diff --git "a/run.ami.log" "b/run.ami.log"
new file mode 100644--- /dev/null
+++ "b/run.ami.log"
@@ -0,0 +1,23038 @@
+/opt/conda/lib/python3.12/site-packages/transformers/training_args.py:1483: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+05/25/2024 17:57:49 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True
+05/25/2024 17:57:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
+_n_gpu=1,
+accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+auto_find_batch_size=False,
+batch_eval_metrics=False,
+bf16=False,
+bf16_full_eval=False,
+data_seed=None,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+dataloader_prefetch_factor=None,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=1800,
+debug=[],
+deepspeed=None,
+disable_tqdm=False,
+dispatch_batches=None,
+do_eval=True,
+do_predict=False,
+do_train=True,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_do_concat_batches=True,
+eval_steps=1000,
+eval_strategy=IntervalStrategy.STEPS,
+evaluation_strategy=steps,
+fp16=True,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+fsdp=[],
+fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+gradient_accumulation_steps=1,
+gradient_checkpointing=True,
+gradient_checkpointing_kwargs=None,
+greater_is_better=None,
+group_by_length=True,
+half_precision_backend=auto,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=False,
+hub_strategy=HubStrategy.EVERY_SAVE,
+hub_token=<HUB_TOKEN>,
+ignore_data_skip=False,
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=0.0003,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=0,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=./runs/May25_17-57-49_tz579-raptorlake,
+logging_first_step=False,
+logging_nan_inf_filter=True,
+logging_steps=1.0,
+logging_strategy=IntervalStrategy.STEPS,
+lr_scheduler_kwargs={},
+lr_scheduler_type=SchedulerType.LINEAR,
+max_grad_norm=1.0,
+max_steps=-1,
+metric_for_best_model=None,
+mp_parameters=,
+neftune_noise_alpha=None,
+no_cuda=False,
+num_train_epochs=2.0,
+optim=OptimizerNames.ADAMW_TORCH,
+optim_args=None,
+optim_target_modules=None,
+output_dir=./,
+overwrite_output_dir=True,
+past_index=-1,
+per_device_eval_batch_size=16,
+per_device_train_batch_size=16,
+prediction_loss_only=False,
+push_to_hub=True,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ray_scope=last,
+remove_unused_columns=True,
+report_to=['tensorboard'],
+restore_callback_states_from_checkpoint=False,
+resume_from_checkpoint=None,
+run_name=./,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=400,
+save_strategy=IntervalStrategy.STEPS,
+save_total_limit=3,
+seed=42,
+skip_memory_metrics=True,
+split_batches=None,
+tf32=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_cpu=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_mps_device=False,
+warmup_ratio=0.0,
+warmup_steps=500,
+weight_decay=0.0,
+)
+/opt/conda/lib/python3.12/site-packages/datasets/load.py:1486: FutureWarning: The repository for edinburghcstr/ami contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/edinburghcstr/ami
+You can avoid this message in future by passing the argument `trust_remote_code=True`.
+Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
+  warnings.warn(
+loading configuration file config.json from cache at /home/Work/common_huggingface/hub/models--facebook--wav2vec2-large-lv60/snapshots/0cde644b64dac88d8416bec1c92a4099b850ba0b/config.json
+Model config Wav2Vec2Config {
+  "_name_or_path": "facebook/wav2vec2-large-lv60",
+  "activation_dropout": 0.1,
+  "adapter_attn_dim": null,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForPreTraining"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.1,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 0,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "transformers_version": "4.42.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 32,
+  "xvector_output_dim": 512
+}
+
+Map:   0%|          | 0/108502 [00:00<?, ? examples/s]Map: 100%|██████████| 108502/108502 [00:00<00:00, 1781935.83 examples/s]
+Map:   0%|          | 0/13098 [00:00<?, ? examples/s]Map: 100%|██████████| 13098/13098 [00:00<00:00, 1738622.50 examples/s]
+`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.
+loading file vocab.json
+loading file tokenizer_config.json
+loading file added_tokens.json
+loading file special_tokens_map.json
+loading file tokenizer.json
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+loading configuration file preprocessor_config.json from cache at /home/Work/common_huggingface/hub/models--facebook--wav2vec2-large-lv60/snapshots/0cde644b64dac88d8416bec1c92a4099b850ba0b/preprocessor_config.json
+loading configuration file config.json from cache at /home/Work/common_huggingface/hub/models--facebook--wav2vec2-large-lv60/snapshots/0cde644b64dac88d8416bec1c92a4099b850ba0b/config.json
+Model config Wav2Vec2Config {
+  "_name_or_path": "facebook/wav2vec2-large-lv60",
+  "activation_dropout": 0.1,
+  "adapter_attn_dim": null,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForPreTraining"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.1,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 0,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "transformers_version": "4.42.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 32,
+  "xvector_output_dim": 512
+}
+
+Feature extractor Wav2Vec2FeatureExtractor {
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
+
+loading weights file pytorch_model.bin from cache at /home/Work/common_huggingface/hub/models--facebook--wav2vec2-large-lv60/snapshots/0cde644b64dac88d8416bec1c92a4099b850ba0b/pytorch_model.bin
+Some weights of the model checkpoint at facebook/wav2vec2-large-lv60 were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
+- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-lv60 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Configuration saved in ./config.json
+loading configuration file ./preprocessor_config.json
+loading configuration file ./preprocessor_config.json
+loading configuration file ./preprocessor_config.json
+Feature extractor Wav2Vec2FeatureExtractor {
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
+
+loading file vocab.json
+loading file tokenizer_config.json
+loading file added_tokens.json
+loading file special_tokens_map.json
+loading file tokenizer.json
+Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+Processor Wav2Vec2Processor:
+- feature_extractor: Wav2Vec2FeatureExtractor {
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
+
+- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='./', vocab_size=30, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
+	28: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
+	29: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
+	30: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+	31: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+}
+
+{
+  "processor_class": "Wav2Vec2Processor"
+}
+
+Using auto half precision backend
+The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running training *****
+  Num examples = 102,201
+  Num Epochs = 2
+  Instantaneous batch size per device = 16
+  Total train batch size (w. parallel, distributed & accumulation) = 16
+  Gradient Accumulation steps = 1
+  Total optimization steps = 12,776
+  Number of trainable parameters = 311,261,344
+  0%|          | 0/12776 [00:00<?, ?it/s]/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+  0%|          | 1/12776 [00:02<8:20:26,  2.35s/it]                                                     0%|          | 1/12776 [00:02<8:20:26,  2.35s/it]  0%|          | 2/12776 [00:03<5:14:51,  1.48s/it]                                                     0%|          | 2/12776 [00:03<5:14:51,  1.48s/it]  0%|          | 3/12776 [00:04<4:14:10,  1.19s/it]                                                     0%|          | 3/12776 [00:04<4:14:10,  1.19s/it]  0%|          | 4/12776 [00:04<3:40:11,  1.03s/it]                                                     0%|          | 4/12776 [00:04<3:40:11,  1.03s/it]  0%|          | 5/12776 [00:05<3:29:10,  1.02it/s]                                                     0%|          | 5/12776 [00:05<3:29:10,  1.02it/s]  0%|          | 6/12776 [00:06<3:06:57,  1.14it/s]                                                     0%|          | 6/12776 [00:06<3:06:57,  1.14it/s]  0%|          | 7/12776 [00:07<2:53:13,  1.23it/s]                                                     0%|          | 7/12776 [00:07<2:53:13,  1.23it/s]  0%|          | 8/12776 [00:07<2:43:19,  1.30it/s]                                                     0%|          | 8/12776 [00:07<2:43:19,  1.30it/s]  0%|          | 9/12776 [00:08<2:30:33,  1.41it/s]                                                     0%|          | 9/12776 [00:08<2:30:33,  1.41it/s]  0%|          | 10/12776 [00:08<2:22:10,  1.50it/s]                                                      0%|          | 10/12776 [00:08<2:22:10,  1.50it/s]  0%|          | 11/12776 [00:09<2:13:00,  1.60it/s]                                                      0%|          | 11/12776 [00:09<2:13:00,  1.60it/s]  0%|          | 12/12776 [00:10<2:12:50,  1.60it/s]                                                      0%|          | 12/12776 [00:10<2:12:50,  1.60it/s]  0%|          | 13/12776 [00:10<2:02:29,  1.74it/s]                                                      0%|          | 13/12776 [00:10<2:02:29,  1.74it/s]  0%|          | 14/12776 [00:10<1:54:11,  1.86it/s]                                                      0%|          | 14/12776 [00:10<1:54:11,  1.86it/s]  0%|          | 15/12776 [00:11<1:48:49,  1.95it/s]                                                      0%|          | 15/12776 [00:11<1:48:49,  1.95it/s]  0%|          | 16/12776 [00:11<1:42:05,  2.08it/s]                                                      0%|          | 16/12776 [00:11<1:42:05,  2.08it/s]  0%|          | 17/12776 [00:12<1:42:30,  2.07it/s]                                                      0%|          | 17/12776 [00:12<1:42:30,  2.07it/s]  0%|          | 18/12776 [00:12<1:36:39,  2.20it/s]                                                      0%|          | 18/12776 [00:12<1:36:39,  2.20it/s]  0%|          | 19/12776 [00:13<1:32:18,  2.30it/s]                                                      0%|          | 19/12776 [00:13<1:32:18,  2.30it/s]  0%|          | 20/12776 [00:13<1:28:01,  2.42it/s]                                                      0%|          | 20/12776 [00:13<1:28:01,  2.42it/s]  0%|          | 21/12776 [00:13<1:23:56,  2.53it/s]                                                      0%|          | 21/12776 [00:13<1:23:56,  2.53it/s]  0%|          | 22/12776 [00:14<1:20:28,  2.64it/s]                                                      0%|          | 22/12776 [00:14<1:20:28,  2.64it/s]  0%|          | 23/12776 [00:14<1:22:55,  2.56it/s]                                                      0%|          | 23/12776 [00:14<1:22:55,  2.56it/s]  0%|          | 24/12776 [00:14<1:17:57,  2.73it/s]                                                      0%|          | 24/12776 [00:14<1:17:57,  2.73it/s]  0%|          | 25/12776 [00:15<1:13:54,  2.88it/s]                                                      0%|          | 25/12776 [00:15<1:13:54,  2.88it/s]  0%|          | 26/12776 [00:15<1:10:33,  3.01it/s]                                                      0%|          | 26/12776 [00:15<1:10:33,  3.01it/s]  0%|          | 27/12776 [00:15<1:12:49,  2.92it/s]                                                      0%|          | 27/12776 [00:15<1:12:49,  2.92it/s]  0%|          | 28/12776 [00:16<1:08:36,  3.10it/s]                                                      0%|          | 28/12776 [00:16<1:08:36,  3.10it/s]  0%|          | 29/12776 [00:16<1:05:21,  3.25it/s]                                                      0%|          | 29/12776 [00:16<1:05:21,  3.25it/s]  0%|          | 30/12776 [00:16<1:02:23,  3.41it/s]                                                      0%|          | 30/12776 [00:16<1:02:23,  3.41it/s]  0%|          | 31/12776 [00:17<1:04:16,  3.30it/s]                                                      0%|          | 31/12776 [00:17<1:04:16,  3.30it/s]  0%|          | 32/12776 [00:17<1:00:56,  3.49it/s]                                                      0%|          | 32/12776 [00:17<1:00:56,  3.49it/s]  0%|          | 33/12776 [00:17<58:19,  3.64it/s]                                                      0%|          | 33/12776 [00:17<58:19,  3.64it/s]  0%|          | 34/12776 [00:17<56:05,  3.79it/s]                                                    0%|          | 34/12776 [00:17<56:05,  3.79it/s]  0%|          | 35/12776 [00:18<57:59,  3.66it/s]                                                    0%|          | 35/12776 [00:18<57:59,  3.66it/s]  0%|          | 36/12776 [00:18<54:50,  3.87it/s]                                                    0%|          | 36/12776 [00:18<54:50,  3.87it/s]  0%|          | 37/12776 [00:18<52:39,  4.03it/s]                                                    0%|          | 37/12776 [00:18<52:39,  4.03it/s]  0%|          | 38/12776 [00:18<50:50,  4.18it/s]                                                    0%|          | 38/12776 [00:18<50:50,  4.18it/s]  0%|          | 39/12776 [00:18<49:34,  4.28it/s]                                                    0%|          | 39/12776 [00:18<49:34,  4.28it/s]  0%|          | 40/12776 [00:19<57:46,  3.67it/s]                                                    0%|          | 40/12776 [00:19<57:46,  3.67it/s]  0%|          | 41/12776 [00:19<53:48,  3.95it/s]                                                    0%|          | 41/12776 [00:19<53:48,  3.95it/s]  0%|          | 42/12776 [00:19<51:04,  4.16it/s]                                                    0%|          | 42/12776 [00:19<51:04,  4.16it/s]  0%|          | 43/12776 [00:19<48:56,  4.34it/s]                                                    0%|          | 43/12776 [00:19<48:56,  4.34it/s]  0%|          | 44/12776 [00:20<47:26,  4.47it/s]                                                    0%|          | 44/12776 [00:20<47:26,  4.47it/s]  0%|          | 45/12776 [00:20<55:49,  3.80it/s]                                                    0%|          | 45/12776 [00:20<55:49,  3.80it/s]  0%|          | 46/12776 [00:20<51:59,  4.08it/s]                                                    0%|          | 46/12776 [00:20<51:59,  4.08it/s]  0%|          | 47/12776 [00:20<49:10,  4.31it/s]                                                    0%|          | 47/12776 [00:20<49:10,  4.31it/s]  0%|          | 48/12776 [00:21<46:52,  4.52it/s]                                                    0%|          | 48/12776 [00:21<46:52,  4.52it/s]  0%|          | 49/12776 [00:21<45:03,  4.71it/s]                                                    0%|          | 49/12776 [00:21<45:03,  4.71it/s]  0%|          | 50/12776 [00:22<1:22:58,  2.56it/s]                                                      0%|          | 50/12776 [00:22<1:22:58,  2.56it/s]  0%|          | 51/12776 [00:23<2:45:23,  1.28it/s]                                                      0%|          | 51/12776 [00:23<2:45:23,  1.28it/s]  0%|          | 52/12776 [00:24<3:07:04,  1.13it/s]                                                      0%|          | 52/12776 [00:24<3:07:04,  1.13it/s]  0%|          | 53/12776 [00:25<3:11:33,  1.11it/s]                                                      0%|          | 53/12776 [00:25<3:11:33,  1.11it/s]  0%|          | 54/12776 [00:26<3:09:10,  1.12it/s]                                                      0%|          | 54/12776 [00:26<3:09:10,  1.12it/s]  0%|          | 55/12776 [00:27<3:07:21,  1.13it/s]                                                      0%|          | 55/12776 [00:27<3:07:21,  1.13it/s]  0%|          | 56/12776 [00:28<3:03:45,  1.15it/s]                                                      0%|          | 56/12776 [00:28<3:03:45,  1.15it/s]  0%|          | 57/12776 [00:29<2:54:10,  1.22it/s]                                                      0%|          | 57/12776 [00:29<2:54:10,  1.22it/s]  0%|          | 58/12776 [00:29<2:46:59,  1.27it/s]                                                      0%|          | 58/12776 [00:29<2:46:59,  1.27it/s]  0%|          | 59/12776 [00:30<2:37:51,  1.34it/s]                                                      0%|          | 59/12776 [00:30<2:37:51,  1.34it/s]  0%|          | 60/12776 [00:31<2:28:45,  1.42it/s]                                                      0%|          | 60/12776 [00:31<2:28:45,  1.42it/s]  0%|          | 61/12776 [00:31<2:21:19,  1.50it/s]                                                      0%|          | 61/12776 [00:31<2:21:19,  1.50it/s]  0%|          | 62/12776 [00:32<2:15:33,  1.56it/s]                                                      0%|          | 62/12776 [00:32<2:15:33,  1.56it/s]  0%|          | 63/12776 [00:32<2:09:58,  1.63it/s]                                                      0%|          | 63/12776 [00:32<2:09:58,  1.63it/s]  1%|          | 64/12776 [00:33<2:03:50,  1.71it/s]                                                      1%|          | 64/12776 [00:33<2:03:50,  1.71it/s]  1%|          | 65/12776 [00:33<1:58:22,  1.79it/s]                                                      1%|          | 65/12776 [00:33<1:58:22,  1.79it/s]  1%|          | 66/12776 [00:34<1:55:18,  1.84it/s]                                                      1%|          | 66/12776 [00:34<1:55:18,  1.84it/s]  1%|          | 67/12776 [00:34<1:48:44,  1.95it/s]                                                      1%|          | 67/12776 [00:34<1:48:44,  1.95it/s]  1%|          | 68/12776 [00:35<1:46:52,  1.98it/s]                                                      1%|          | 68/12776 [00:35<1:46:52,  1.98it/s]  1%|          | 69/12776 [00:35<1:40:18,  2.11it/s]                                                      1%|          | 69/12776 [00:35<1:40:18,  2.11it/s]  1%|          | 70/12776 [00:36<1:34:55,  2.23it/s]                                                      1%|          | 70/12776 [00:36<1:34:55,  2.23it/s]  1%|          | 71/12776 [00:36<1:31:22,  2.32it/s]                                                      1%|          | 71/12776 [00:36<1:31:22,  2.32it/s]  1%|          | 72/12776 [00:36<1:26:02,  2.46it/s]                                                      1%|          | 72/12776 [00:36<1:26:02,  2.46it/s]  1%|          | 73/12776 [00:37<1:21:47,  2.59it/s]                                                      1%|          | 73/12776 [00:37<1:21:47,  2.59it/s]  1%|          | 74/12776 [00:37<1:25:44,  2.47it/s]                                                      1%|          | 74/12776 [00:37<1:25:44,  2.47it/s]  1%|          | 75/12776 [00:37<1:17:13,  2.74it/s]                                                      1%|          | 75/12776 [00:37<1:17:13,  2.74it/s]  1%|          | 76/12776 [00:38<1:12:19,  2.93it/s]                                                      1%|          | 76/12776 [00:38<1:12:19,  2.93it/s]  1%|          | 77/12776 [00:38<1:08:59,  3.07it/s]                                                      1%|          | 77/12776 [00:38<1:08:59,  3.07it/s]  1%|          | 78/12776 [00:38<1:14:25,  2.84it/s]                                                      1%|          | 78/12776 [00:38<1:14:25,  2.84it/s]  1%|          | 79/12776 [00:39<1:09:32,  3.04it/s]                                                      1%|          | 79/12776 [00:39<1:09:32,  3.04it/s]  1%|          | 80/12776 [00:39<1:05:47,  3.22it/s]                                                      1%|          | 80/12776 [00:39<1:05:47,  3.22it/s]  1%|          | 81/12776 [00:39<1:02:31,  3.38it/s]                                                      1%|          | 81/12776 [00:39<1:02:31,  3.38it/s]  1%|          | 82/12776 [00:39<1:05:49,  3.21it/s]                                                      1%|          | 82/12776 [00:39<1:05:49,  3.21it/s]  1%|          | 83/12776 [00:40<1:01:37,  3.43it/s]                                                      1%|          | 83/12776 [00:40<1:01:37,  3.43it/s]  1%|          | 84/12776 [00:40<58:34,  3.61it/s]                                                      1%|          | 84/12776 [00:40<58:34,  3.61it/s]  1%|          | 85/12776 [00:40<56:22,  3.75it/s]                                                  {'loss': 10.2049, 'grad_norm': inf, 'learning_rate': 0.0, 'epoch': 0.0}
+{'loss': 10.1719, 'grad_norm': inf, 'learning_rate': 0.0, 'epoch': 0.0}
+{'loss': 11.8275, 'grad_norm': 10.289196014404297, 'learning_rate': 6e-07, 'epoch': 0.0}
+{'loss': 9.669, 'grad_norm': 8.432656288146973, 'learning_rate': 1.2e-06, 'epoch': 0.0}
+{'loss': 11.8878, 'grad_norm': 10.236042022705078, 'learning_rate': 1.8e-06, 'epoch': 0.0}
+{'loss': 11.0993, 'grad_norm': 9.023536682128906, 'learning_rate': 2.4e-06, 'epoch': 0.0}
+{'loss': 11.0304, 'grad_norm': 9.157316207885742, 'learning_rate': 2.9999999999999997e-06, 'epoch': 0.0}
+{'loss': 18.5492, 'grad_norm': 15.224575996398926, 'learning_rate': 3.6e-06, 'epoch': 0.0}
+{'loss': 9.8113, 'grad_norm': 8.25251579284668, 'learning_rate': 4.2e-06, 'epoch': 0.0}
+{'loss': 10.5777, 'grad_norm': 8.493621826171875, 'learning_rate': 4.8e-06, 'epoch': 0.0}
+{'loss': 9.6918, 'grad_norm': 8.078389167785645, 'learning_rate': 5.399999999999999e-06, 'epoch': 0.0}
+{'loss': 15.5513, 'grad_norm': 12.197300910949707, 'learning_rate': 5.999999999999999e-06, 'epoch': 0.0}
+{'loss': 21.1878, 'grad_norm': 14.715069770812988, 'learning_rate': 6.599999999999999e-06, 'epoch': 0.0}
+{'loss': 20.1223, 'grad_norm': 14.680959701538086, 'learning_rate': 7.2e-06, 'epoch': 0.0}
+{'loss': 25.9204, 'grad_norm': inf, 'learning_rate': 7.2e-06, 'epoch': 0.0}
+{'loss': 9.0609, 'grad_norm': 7.316311359405518, 'learning_rate': 7.799999999999998e-06, 'epoch': 0.0}
+{'loss': 19.6195, 'grad_norm': 14.48494815826416, 'learning_rate': 8.4e-06, 'epoch': 0.0}
+{'loss': 24.8781, 'grad_norm': 24.607234954833984, 'learning_rate': 8.999999999999999e-06, 'epoch': 0.0}
+{'loss': 14.3883, 'grad_norm': 10.66619873046875, 'learning_rate': 9.6e-06, 'epoch': 0.0}
+{'loss': 11.2116, 'grad_norm': 9.02160930633545, 'learning_rate': 1.02e-05, 'epoch': 0.0}
+{'loss': 10.0936, 'grad_norm': 8.273414611816406, 'learning_rate': 1.0799999999999998e-05, 'epoch': 0.0}
+{'loss': 12.5294, 'grad_norm': 9.83197021484375, 'learning_rate': 1.14e-05, 'epoch': 0.0}
+{'loss': 15.1262, 'grad_norm': 11.494529724121094, 'learning_rate': 1.1999999999999999e-05, 'epoch': 0.0}
+{'loss': 11.8818, 'grad_norm': 8.389466285705566, 'learning_rate': 1.26e-05, 'epoch': 0.0}
+{'loss': 16.7747, 'grad_norm': 11.484485626220703, 'learning_rate': 1.3199999999999997e-05, 'epoch': 0.0}
+{'loss': 15.3859, 'grad_norm': 11.260869026184082, 'learning_rate': 1.3799999999999998e-05, 'epoch': 0.0}
+{'loss': 19.4706, 'grad_norm': 17.069162368774414, 'learning_rate': 1.44e-05, 'epoch': 0.0}
+{'loss': 13.0765, 'grad_norm': 8.778854370117188, 'learning_rate': 1.4999999999999999e-05, 'epoch': 0.0}
+{'loss': 14.7478, 'grad_norm': 10.239326477050781, 'learning_rate': 1.5599999999999996e-05, 'epoch': 0.0}
+{'loss': 13.5911, 'grad_norm': 9.703652381896973, 'learning_rate': 1.6199999999999997e-05, 'epoch': 0.0}
+{'loss': 16.7388, 'grad_norm': 10.830334663391113, 'learning_rate': 1.68e-05, 'epoch': 0.0}
+{'loss': 19.3178, 'grad_norm': 14.102766990661621, 'learning_rate': 1.74e-05, 'epoch': 0.01}
+{'loss': 17.4729, 'grad_norm': 12.113805770874023, 'learning_rate': 1.7999999999999997e-05, 'epoch': 0.01}
+{'loss': 14.8432, 'grad_norm': 9.666101455688477, 'learning_rate': 1.8599999999999998e-05, 'epoch': 0.01}
+{'loss': 15.5869, 'grad_norm': 10.271428108215332, 'learning_rate': 1.92e-05, 'epoch': 0.01}
+{'loss': 13.2456, 'grad_norm': 27.508230209350586, 'learning_rate': 1.98e-05, 'epoch': 0.01}
+{'loss': 15.4974, 'grad_norm': 10.40356159210205, 'learning_rate': 2.04e-05, 'epoch': 0.01}
+{'loss': 16.509, 'grad_norm': 10.521185874938965, 'learning_rate': 2.1e-05, 'epoch': 0.01}
+{'loss': 12.5824, 'grad_norm': 9.262508392333984, 'learning_rate': 2.1599999999999996e-05, 'epoch': 0.01}
+{'loss': 13.576, 'grad_norm': 9.204741477966309, 'learning_rate': 2.2199999999999998e-05, 'epoch': 0.01}
+{'loss': 11.8722, 'grad_norm': 6.91443395614624, 'learning_rate': 2.28e-05, 'epoch': 0.01}
+{'loss': 13.263, 'grad_norm': 7.96762752532959, 'learning_rate': 2.34e-05, 'epoch': 0.01}
+{'loss': 12.1481, 'grad_norm': 9.377372741699219, 'learning_rate': 2.3999999999999997e-05, 'epoch': 0.01}
+{'loss': 10.7579, 'grad_norm': 6.440145015716553, 'learning_rate': 2.4599999999999998e-05, 'epoch': 0.01}
+{'loss': 9.8462, 'grad_norm': 5.9850077629089355, 'learning_rate': 2.52e-05, 'epoch': 0.01}
+{'loss': 9.8883, 'grad_norm': 10.260824203491211, 'learning_rate': 2.5799999999999997e-05, 'epoch': 0.01}
+{'loss': 8.8922, 'grad_norm': 10.321931838989258, 'learning_rate': 2.6399999999999995e-05, 'epoch': 0.01}
+{'loss': 8.3128, 'grad_norm': 5.632627964019775, 'learning_rate': 2.6999999999999996e-05, 'epoch': 0.01}
+{'loss': 7.1669, 'grad_norm': 4.798144817352295, 'learning_rate': 2.7599999999999997e-05, 'epoch': 0.01}
+{'loss': 6.2127, 'grad_norm': 4.642441272735596, 'learning_rate': 2.8199999999999998e-05, 'epoch': 0.01}
+{'loss': 11.9784, 'grad_norm': 12.44339370727539, 'learning_rate': 2.88e-05, 'epoch': 0.01}
+{'loss': 10.976, 'grad_norm': 11.49549674987793, 'learning_rate': 2.94e-05, 'epoch': 0.01}
+{'loss': 15.17, 'grad_norm': 16.98020362854004, 'learning_rate': 2.9999999999999997e-05, 'epoch': 0.01}
+{'loss': 8.8607, 'grad_norm': 8.904106140136719, 'learning_rate': 3.06e-05, 'epoch': 0.01}
+{'loss': 9.6081, 'grad_norm': 9.727025985717773, 'learning_rate': 3.119999999999999e-05, 'epoch': 0.01}
+{'loss': 9.9255, 'grad_norm': 9.460471153259277, 'learning_rate': 3.1799999999999994e-05, 'epoch': 0.01}
+{'loss': 9.9013, 'grad_norm': 9.601983070373535, 'learning_rate': 3.2399999999999995e-05, 'epoch': 0.01}
+{'loss': 11.9712, 'grad_norm': 10.275938034057617, 'learning_rate': 3.2999999999999996e-05, 'epoch': 0.01}
+{'loss': 10.6476, 'grad_norm': 9.154601097106934, 'learning_rate': 3.36e-05, 'epoch': 0.01}
+{'loss': 9.7442, 'grad_norm': 8.198284149169922, 'learning_rate': 3.42e-05, 'epoch': 0.01}
+{'loss': 11.3455, 'grad_norm': 10.912142753601074, 'learning_rate': 3.48e-05, 'epoch': 0.01}
+{'loss': 7.7892, 'grad_norm': 6.028509616851807, 'learning_rate': 3.539999999999999e-05, 'epoch': 0.01}
+{'loss': 10.4904, 'grad_norm': 10.75712776184082, 'learning_rate': 3.5999999999999994e-05, 'epoch': 0.01}
+{'loss': 9.4008, 'grad_norm': 11.000633239746094, 'learning_rate': 3.6599999999999995e-05, 'epoch': 0.01}
+{'loss': 7.2246, 'grad_norm': 7.76064395904541, 'learning_rate': 3.7199999999999996e-05, 'epoch': 0.01}
+{'loss': 13.3124, 'grad_norm': 23.41922950744629, 'learning_rate': 3.78e-05, 'epoch': 0.01}
+{'loss': 7.637, 'grad_norm': 11.076509475708008, 'learning_rate': 3.84e-05, 'epoch': 0.01}
+{'loss': 14.3584, 'grad_norm': 29.545185089111328, 'learning_rate': 3.9e-05, 'epoch': 0.01}
+{'loss': 8.6004, 'grad_norm': 17.31962776184082, 'learning_rate': 3.96e-05, 'epoch': 0.01}
+{'loss': 6.3439, 'grad_norm': 11.55864143371582, 'learning_rate': 4.02e-05, 'epoch': 0.01}
+{'loss': 10.298, 'grad_norm': 25.8043212890625, 'learning_rate': 4.08e-05, 'epoch': 0.01}
+{'loss': 12.2877, 'grad_norm': 33.71727752685547, 'learning_rate': 4.14e-05, 'epoch': 0.01}
+{'loss': 7.7747, 'grad_norm': 20.209596633911133, 'learning_rate': 4.2e-05, 'epoch': 0.01}
+{'loss': 7.6235, 'grad_norm': 20.540761947631836, 'learning_rate': 4.259999999999999e-05, 'epoch': 0.01}
+{'loss': 12.327, 'grad_norm': inf, 'learning_rate': 4.259999999999999e-05, 'epoch': 0.01}
+{'loss': 7.1177, 'grad_norm': 19.35993766784668, 'learning_rate': 4.319999999999999e-05, 'epoch': 0.01}
+{'loss': 10.6781, 'grad_norm': 36.380619049072266, 'learning_rate': 4.3799999999999994e-05, 'epoch': 0.01}
+{'loss': 12.4375, 'grad_norm': 45.644840240478516, 'learning_rate': 4.4399999999999995e-05, 'epoch': 0.01}
+{'loss': 8.1513, 'grad_norm': 27.72957992553711, 'learning_rate': 4.4999999999999996e-05, 'epoch': 0.01}
+{'loss': 9.3938, 'grad_norm': 35.70543670654297, 'learning_rate': 4.56e-05, 'epoch': 0.01}
+{'loss': 12.0758, 'grad_norm': 51.08866500854492, 'learning_rate': 4.62e-05, 'epoch': 0.01}
+{'loss': 9.2489, 'grad_norm': 38.18144989013672, 'learning_rate': 4.68e-05, 'epoch': 0.01}
+{'loss': 6.9829, 'grad_norm': 25.754817962646484, 'learning_rate': 4.7399999999999993e-05, 'epoch': 0.01}
+{'loss': 8.0805, 'grad_norm': 33.58719253540039, 'learning_rate': 4.7999999999999994e-05, 'epoch': 0.01}
+  1%|          | 85/12776 [00:40<56:22,  3.75it/s]  1%|          | 86/12776 [00:40<58:25,  3.62it/s]                                                    1%|          | 86/12776 [00:40<58:25,  3.62it/s]  1%|          | 87/12776 [00:41<55:32,  3.81it/s]                                                    1%|          | 87/12776 [00:41<55:32,  3.81it/s]  1%|          | 88/12776 [00:41<53:16,  3.97it/s]                                                    1%|          | 88/12776 [00:41<53:16,  3.97it/s]  1%|          | 89/12776 [00:41<51:01,  4.14it/s]                                                    1%|          | 89/12776 [00:41<51:01,  4.14it/s]  1%|          | 90/12776 [00:41<49:26,  4.28it/s]                                                    1%|          | 90/12776 [00:41<49:26,  4.28it/s]  1%|          | 91/12776 [00:42<57:05,  3.70it/s]                                                    1%|          | 91/12776 [00:42<57:05,  3.70it/s]  1%|          | 92/12776 [00:42<53:33,  3.95it/s]                                                    1%|          | 92/12776 [00:42<53:33,  3.95it/s]  1%|          | 93/12776 [00:42<50:49,  4.16it/s]                                                    1%|          | 93/12776 [00:42<50:49,  4.16it/s]  1%|          | 94/12776 [00:42<48:40,  4.34it/s]                                                    1%|          | 94/12776 [00:42<48:40,  4.34it/s]  1%|          | 95/12776 [00:43<46:40,  4.53it/s]                                                    1%|          | 95/12776 [00:43<46:40,  4.53it/s]  1%|          | 96/12776 [00:43<50:48,  4.16it/s]                                                    1%|          | 96/12776 [00:43<50:48,  4.16it/s]  1%|          | 97/12776 [00:43<48:20,  4.37it/s]                                                    1%|          | 97/12776 [00:43<48:20,  4.37it/s]  1%|          | 98/12776 [00:43<45:28,  4.65it/s]                                                    1%|          | 98/12776 [00:43<45:28,  4.65it/s]  1%|          | 99/12776 [00:43<43:26,  4.86it/s]                                                    1%|          | 99/12776 [00:43<43:26,  4.86it/s]  1%|          | 100/12776 [00:44<1:16:02,  2.78it/s]                                                       1%|          | 100/12776 [00:44<1:16:02,  2.78it/s]  1%|          | 101/12776 [00:46<2:29:52,  1.41it/s]                                                       1%|          | 101/12776 [00:46<2:29:52,  1.41it/s]  1%|          | 102/12776 [00:47<2:50:12,  1.24it/s]                                                       1%|          | 102/12776 [00:47<2:50:12,  1.24it/s]  1%|          | 103/12776 [00:48<3:01:31,  1.16it/s]                                                       1%|          | 103/12776 [00:48<3:01:31,  1.16it/s]  1%|          | 104/12776 [00:49<3:00:22,  1.17it/s]                                                       1%|          | 104/12776 [00:49<3:00:22,  1.17it/s]  1%|          | 105/12776 [00:49<2:52:23,  1.23it/s]                                                       1%|          | 105/12776 [00:49<2:52:23,  1.23it/s]  1%|          | 106/12776 [00:50<2:47:22,  1.26it/s]                                                       1%|          | 106/12776 [00:50<2:47:22,  1.26it/s]  1%|          | 107/12776 [00:51<2:37:44,  1.34it/s]                                                       1%|          | 107/12776 [00:51<2:37:44,  1.34it/s]  1%|          | 108/12776 [00:51<2:29:44,  1.41it/s]                                                       1%|          | 108/12776 [00:51<2:29:44,  1.41it/s]  1%|          | 109/12776 [00:52<2:21:14,  1.49it/s]                                                       1%|          | 109/12776 [00:52<2:21:14,  1.49it/s]  1%|          | 110/12776 [00:52<2:15:39,  1.56it/s]                                                       1%|          | 110/12776 [00:52<2:15:39,  1.56it/s]  1%|          | 111/12776 [00:53<2:08:43,  1.64it/s]                                                       1%|          | 111/12776 [00:53<2:08:43,  1.64it/s]  1%|          | 112/12776 [00:53<2:05:22,  1.68it/s]                                                       1%|          | 112/12776 [00:53<2:05:22,  1.68it/s]  1%|          | 113/12776 [00:54<1:57:25,  1.80it/s]                                                       1%|          | 113/12776 [00:54<1:57:25,  1.80it/s]  1%|          | 114/12776 [00:55<1:58:57,  1.77it/s]                                                       1%|          | 114/12776 [00:55<1:58:57,  1.77it/s]  1%|          | 115/12776 [00:55<1:51:26,  1.89it/s]                                                       1%|          | 115/12776 [00:55<1:51:26,  1.89it/s]  1%|          | 116/12776 [00:55<1:48:15,  1.95it/s]                                                       1%|          | 116/12776 [00:55<1:48:15,  1.95it/s]  1%|          | 117/12776 [00:56<1:41:57,  2.07it/s]                                                       1%|          | 117/12776 [00:56<1:41:57,  2.07it/s]  1%|          | 118/12776 [00:56<1:36:06,  2.20it/s]                                                       1%|          | 118/12776 [00:56<1:36:06,  2.20it/s]  1%|          | 119/12776 [00:57<1:40:33,  2.10it/s]                                                       1%|          | 119/12776 [00:57<1:40:33,  2.10it/s]  1%|          | 120/12776 [00:57<1:34:04,  2.24it/s]                                                       1%|          | 120/12776 [00:57<1:34:04,  2.24it/s]  1%|          | 121/12776 [00:58<1:28:14,  2.39it/s]                                                       1%|          | 121/12776 [00:58<1:28:14,  2.39it/s]  1%|          | 122/12776 [00:58<1:26:41,  2.43it/s]                                                       1%|          | 122/12776 [00:58<1:26:41,  2.43it/s]  1%|          | 123/12776 [00:58<1:21:41,  2.58it/s]                                                       1%|          | 123/12776 [00:58<1:21:41,  2.58it/s]  1%|          | 124/12776 [00:59<1:16:51,  2.74it/s]                                                       1%|          | 124/12776 [00:59<1:16:51,  2.74it/s]  1%|          | 125/12776 [00:59<1:16:18,  2.76it/s]                                                       1%|          | 125/12776 [00:59<1:16:18,  2.76it/s]  1%|          | 126/12776 [00:59<1:12:27,  2.91it/s]                                                       1%|          | 126/12776 [00:59<1:12:27,  2.91it/s]  1%|          | 127/12776 [01:00<1:08:58,  3.06it/s]                                                       1%|          | 127/12776 [01:00<1:08:58,  3.06it/s]  1%|          | 128/12776 [01:00<1:05:15,  3.23it/s]                                                       1%|          | 128/12776 [01:00<1:05:15,  3.23it/s]  1%|          | 129/12776 [01:00<1:11:17,  2.96it/s]                                                       1%|          | 129/12776 [01:00<1:11:17,  2.96it/s]  1%|          | 130/12776 [01:00<1:06:38,  3.16it/s]                                                       1%|          | 130/12776 [01:00<1:06:38,  3.16it/s]  1%|          | 131/12776 [01:01<1:02:44,  3.36it/s]                                                       1%|          | 131/12776 [01:01<1:02:44,  3.36it/s]  1%|          | 132/12776 [01:01<59:32,  3.54it/s]                                                       1%|          | 132/12776 [01:01<59:32,  3.54it/s]  1%|          | 133/12776 [01:01<1:05:23,  3.22it/s]                                                       1%|          | 133/12776 [01:01<1:05:23,  3.22it/s]  1%|          | 134/12776 [01:02<1:01:13,  3.44it/s]                                                       1%|          | 134/12776 [01:02<1:01:13,  3.44it/s]  1%|          | 135/12776 [01:02<57:19,  3.67it/s]                                                       1%|          | 135/12776 [01:02<57:19,  3.67it/s]  1%|          | 136/12776 [01:02<54:59,  3.83it/s]                                                     1%|          | 136/12776 [01:02<54:59,  3.83it/s]  1%|          | 137/12776 [01:02<57:19,  3.67it/s]                                                     1%|          | 137/12776 [01:02<57:19,  3.67it/s]  1%|          | 138/12776 [01:03<53:48,  3.91it/s]                                                     1%|          | 138/12776 [01:03<53:48,  3.91it/s]  1%|          | 139/12776 [01:03<51:18,  4.10it/s]                                                     1%|          | 139/12776 [01:03<51:18,  4.10it/s]  1%|          | 140/12776 [01:03<49:09,  4.28it/s]                                                     1%|          | 140/12776 [01:03<49:09,  4.28it/s]  1%|          | 141/12776 [01:03<47:49,  4.40it/s]                                                     1%|          | 141/12776 [01:03<47:49,  4.40it/s]  1%|          | 142/12776 [01:03<51:22,  4.10it/s]                                                     1%|          | 142/12776 [01:03<51:22,  4.10it/s]  1%|          | 143/12776 [01:04<48:30,  4.34it/s]                                                     1%|          | 143/12776 [01:04<48:30,  4.34it/s]  1%|          | 144/12776 [01:04<47:02,  4.48it/s]                                                     1%|          | 144/12776 [01:04<47:02,  4.48it/s]  1%|          | 145/12776 [01:04<45:15,  4.65it/s]                                                     1%|          | 145/12776 [01:04<45:15,  4.65it/s]  1%|          | 146/12776 [01:04<43:47,  4.81it/s]                                                     1%|          | 146/12776 [01:04<43:47,  4.81it/s]  1%|          | 147/12776 [01:04<42:40,  4.93it/s]                                                     1%|          | 147/12776 [01:04<42:40,  4.93it/s]  1%|          | 148/12776 [01:05<47:04,  4.47it/s]                                                     1%|          | 148/12776 [01:05<47:04,  4.47it/s]  1%|          | 149/12776 [01:05<44:47,  4.70it/s]                                                     1%|          | 149/12776 [01:05<44:47,  4.70it/s]  1%|          | 150/12776 [01:06<1:25:18,  2.47it/s]                                                       1%|          | 150/12776 [01:06<1:25:18,  2.47it/s]  1%|          | 151/12776 [01:07<2:37:49,  1.33it/s]                                                       1%|          | 151/12776 [01:07<2:37:49,  1.33it/s]  1%|          | 152/12776 [01:08<3:03:37,  1.15it/s]                                                       1%|          | 152/12776 [01:08<3:03:37,  1.15it/s]  1%|          | 153/12776 [01:10<3:15:34,  1.08it/s]                                                       1%|          | 153/12776 [01:10<3:15:34,  1.08it/s]  1%|          | 154/12776 [01:10<3:09:40,  1.11it/s]                                                       1%|          | 154/12776 [01:10<3:09:40,  1.11it/s]  1%|          | 155/12776 [01:11<3:01:14,  1.16it/s]                                                       1%|          | 155/12776 [01:11<3:01:14,  1.16it/s]  1%|          | 156/12776 [01:12<2:56:20,  1.19it/s]                                                       1%|          | 156/12776 [01:12<2:56:20,  1.19it/s]  1%|          | 157/12776 [01:13<2:50:14,  1.24it/s]                                                       1%|          | 157/12776 [01:13<2:50:14,  1.24it/s]  1%|          | 158/12776 [01:13<2:39:36,  1.32it/s]                                                       1%|          | 158/12776 [01:13<2:39:36,  1.32it/s]  1%|          | 159/12776 [01:14<2:42:07,  1.30it/s]                                                       1%|          | 159/12776 [01:14<2:42:07,  1.30it/s]  1%|▏         | 160/12776 [01:15<2:30:28,  1.40it/s]                                                       1%|▏         | 160/12776 [01:15<2:30:28,  1.40it/s]  1%|▏         | 161/12776 [01:15<2:24:32,  1.45it/s]                                                       1%|▏         | 161/12776 [01:15<2:24:32,  1.45it/s]  1%|▏         | 162/12776 [01:16<2:15:39,  1.55it/s]                                                       1%|▏         | 162/12776 [01:16<2:15:39,  1.55it/s]  1%|▏         | 163/12776 [01:16<2:12:17,  1.59it/s]                                                       1%|▏         | 163/12776 [01:16<2:12:17,  1.59it/s]  1%|▏         | 164/12776 [01:17<2:03:57,  1.70it/s]                                                       1%|▏         | 164/12776 [01:17<2:03:57,  1.70it/s]  1%|▏         | 165/12776 [01:17<1:59:21,  1.76it/s]                                                       1%|▏         | 165/12776 [01:17<1:59:21,  1.76it/s]  1%|▏         | 166/12776 [01:18<1:52:06,  1.87it/s]                                                       1%|▏         | 166/12776 [01:18<1:52:06,  1.87it/s]  1%|▏         | 167/12776 [01:18<1:50:45,  1.90it/s]                                                       1%|▏         | 167/12776 [01:18<1:50:45,  1.90it/s]  1%|▏         | 168/12776 [01:19<1:43:25,  2.03it/s]                                                     {'loss': 5.7876, 'grad_norm': 19.882631301879883, 'learning_rate': 4.8599999999999995e-05, 'epoch': 0.01}
+{'loss': 6.844, 'grad_norm': 28.052623748779297, 'learning_rate': 4.9199999999999997e-05, 'epoch': 0.01}
+{'loss': 6.2694, 'grad_norm': 24.705547332763672, 'learning_rate': 4.98e-05, 'epoch': 0.01}
+{'loss': 6.6974, 'grad_norm': 27.596519470214844, 'learning_rate': 5.04e-05, 'epoch': 0.01}
+{'loss': 5.9251, 'grad_norm': 24.197368621826172, 'learning_rate': 5.1e-05, 'epoch': 0.01}
+{'loss': 7.2591, 'grad_norm': 33.817039489746094, 'learning_rate': 5.1599999999999994e-05, 'epoch': 0.01}
+{'loss': 6.3747, 'grad_norm': 28.664337158203125, 'learning_rate': 5.2199999999999995e-05, 'epoch': 0.01}
+{'loss': 5.2411, 'grad_norm': 19.04326057434082, 'learning_rate': 5.279999999999999e-05, 'epoch': 0.01}
+{'loss': 5.2001, 'grad_norm': 19.95840835571289, 'learning_rate': 5.339999999999999e-05, 'epoch': 0.01}
+{'loss': 4.9905, 'grad_norm': 17.750680923461914, 'learning_rate': 5.399999999999999e-05, 'epoch': 0.01}
+{'loss': 4.8285, 'grad_norm': 17.195247650146484, 'learning_rate': 5.459999999999999e-05, 'epoch': 0.01}
+{'loss': 4.311, 'grad_norm': 12.27951431274414, 'learning_rate': 5.519999999999999e-05, 'epoch': 0.02}
+{'loss': 4.0848, 'grad_norm': 10.93501091003418, 'learning_rate': 5.5799999999999994e-05, 'epoch': 0.02}
+{'loss': 3.961, 'grad_norm': 8.672501564025879, 'learning_rate': 5.6399999999999995e-05, 'epoch': 0.02}
+{'loss': 3.8516, 'grad_norm': 7.686964988708496, 'learning_rate': 5.6999999999999996e-05, 'epoch': 0.02}
+{'loss': 3.6987, 'grad_norm': 5.318123817443848, 'learning_rate': 5.76e-05, 'epoch': 0.02}
+{'loss': 4.1017, 'grad_norm': 13.304719924926758, 'learning_rate': 5.82e-05, 'epoch': 0.02}
+{'loss': 3.9793, 'grad_norm': 12.244338989257812, 'learning_rate': 5.88e-05, 'epoch': 0.02}
+{'loss': 3.9138, 'grad_norm': 11.684955596923828, 'learning_rate': 5.94e-05, 'epoch': 0.02}
+{'loss': 6.8625, 'grad_norm': 36.793663024902344, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.02}
+{'loss': 4.8527, 'grad_norm': 20.531827926635742, 'learning_rate': 6.0599999999999996e-05, 'epoch': 0.02}
+{'loss': 4.0059, 'grad_norm': 12.65087604522705, 'learning_rate': 6.12e-05, 'epoch': 0.02}
+{'loss': 3.7414, 'grad_norm': 9.573142051696777, 'learning_rate': 6.18e-05, 'epoch': 0.02}
+{'loss': 4.1523, 'grad_norm': 13.197820663452148, 'learning_rate': 6.239999999999999e-05, 'epoch': 0.02}
+{'loss': 3.8368, 'grad_norm': 10.532169342041016, 'learning_rate': 6.299999999999999e-05, 'epoch': 0.02}
+{'loss': 4.1369, 'grad_norm': 12.9858980178833, 'learning_rate': 6.359999999999999e-05, 'epoch': 0.02}
+{'loss': 5.8008, 'grad_norm': 29.88274574279785, 'learning_rate': 6.419999999999999e-05, 'epoch': 0.02}
+{'loss': 4.4349, 'grad_norm': 17.681074142456055, 'learning_rate': 6.479999999999999e-05, 'epoch': 0.02}
+{'loss': 4.3329, 'grad_norm': 14.819682121276855, 'learning_rate': 6.539999999999999e-05, 'epoch': 0.02}
+{'loss': 4.0054, 'grad_norm': 12.398885726928711, 'learning_rate': 6.599999999999999e-05, 'epoch': 0.02}
+{'loss': 3.9655, 'grad_norm': 11.930230140686035, 'learning_rate': 6.659999999999999e-05, 'epoch': 0.02}
+{'loss': 3.6875, 'grad_norm': 8.862527847290039, 'learning_rate': 6.72e-05, 'epoch': 0.02}
+{'loss': 4.9808, 'grad_norm': 21.191814422607422, 'learning_rate': 6.78e-05, 'epoch': 0.02}
+{'loss': 3.446, 'grad_norm': 5.6609392166137695, 'learning_rate': 6.84e-05, 'epoch': 0.02}
+{'loss': 4.3345, 'grad_norm': 15.799832344055176, 'learning_rate': 6.9e-05, 'epoch': 0.02}
+{'loss': 4.0145, 'grad_norm': 12.569608688354492, 'learning_rate': 6.96e-05, 'epoch': 0.02}
+{'loss': 3.531, 'grad_norm': 6.250307559967041, 'learning_rate': 7.02e-05, 'epoch': 0.02}
+{'loss': 3.8133, 'grad_norm': 9.823454856872559, 'learning_rate': 7.079999999999999e-05, 'epoch': 0.02}
+{'loss': 4.0484, 'grad_norm': 14.932891845703125, 'learning_rate': 7.139999999999999e-05, 'epoch': 0.02}
+{'loss': 3.775, 'grad_norm': 8.835564613342285, 'learning_rate': 7.199999999999999e-05, 'epoch': 0.02}
+{'loss': 3.385, 'grad_norm': 2.77044939994812, 'learning_rate': 7.259999999999999e-05, 'epoch': 0.02}
+{'loss': 4.4818, 'grad_norm': 19.207578659057617, 'learning_rate': 7.319999999999999e-05, 'epoch': 0.02}
+{'loss': 4.5493, 'grad_norm': 20.31056785583496, 'learning_rate': 7.379999999999999e-05, 'epoch': 0.02}
+{'loss': 3.7879, 'grad_norm': 9.711579322814941, 'learning_rate': 7.439999999999999e-05, 'epoch': 0.02}
+{'loss': 4.6096, 'grad_norm': 19.968996047973633, 'learning_rate': 7.5e-05, 'epoch': 0.02}
+{'loss': 4.1364, 'grad_norm': 13.640237808227539, 'learning_rate': 7.56e-05, 'epoch': 0.02}
+{'loss': 4.4219, 'grad_norm': 18.92345428466797, 'learning_rate': 7.62e-05, 'epoch': 0.02}
+{'loss': 3.615, 'grad_norm': 5.510997295379639, 'learning_rate': 7.68e-05, 'epoch': 0.02}
+{'loss': 4.0149, 'grad_norm': 11.286620140075684, 'learning_rate': 7.74e-05, 'epoch': 0.02}
+{'loss': 3.8435, 'grad_norm': 8.938575744628906, 'learning_rate': 7.8e-05, 'epoch': 0.02}
+{'loss': 3.8606, 'grad_norm': 8.202083587646484, 'learning_rate': 7.86e-05, 'epoch': 0.02}
+{'loss': 3.6637, 'grad_norm': 7.211292266845703, 'learning_rate': 7.92e-05, 'epoch': 0.02}
+{'loss': 3.602, 'grad_norm': 6.565310001373291, 'learning_rate': 7.98e-05, 'epoch': 0.02}
+{'loss': 3.6108, 'grad_norm': 5.222198009490967, 'learning_rate': 8.04e-05, 'epoch': 0.02}
+{'loss': 3.831, 'grad_norm': 9.35428237915039, 'learning_rate': 8.1e-05, 'epoch': 0.02}
+{'loss': 3.8316, 'grad_norm': 9.887560844421387, 'learning_rate': 8.16e-05, 'epoch': 0.02}
+{'loss': 3.642, 'grad_norm': 8.195279121398926, 'learning_rate': 8.22e-05, 'epoch': 0.02}
+{'loss': 3.44, 'grad_norm': 3.218400478363037, 'learning_rate': 8.28e-05, 'epoch': 0.02}
+{'loss': 3.4347, 'grad_norm': 2.472031354904175, 'learning_rate': 8.34e-05, 'epoch': 0.02}
+{'loss': 3.3126, 'grad_norm': 2.141615390777588, 'learning_rate': 8.4e-05, 'epoch': 0.02}
+{'loss': 3.3058, 'grad_norm': 3.026726007461548, 'learning_rate': 8.459999999999998e-05, 'epoch': 0.02}
+{'loss': 3.3199, 'grad_norm': 3.966043472290039, 'learning_rate': 8.519999999999998e-05, 'epoch': 0.02}
+{'loss': 3.2163, 'grad_norm': 3.906219720840454, 'learning_rate': 8.579999999999998e-05, 'epoch': 0.02}
+{'loss': 3.1645, 'grad_norm': 2.3450818061828613, 'learning_rate': 8.639999999999999e-05, 'epoch': 0.02}
+{'loss': 3.1981, 'grad_norm': 3.949995756149292, 'learning_rate': 8.699999999999999e-05, 'epoch': 0.02}
+{'loss': 3.0328, 'grad_norm': 2.897047996520996, 'learning_rate': 8.759999999999999e-05, 'epoch': 0.02}
+{'loss': 3.4079, 'grad_norm': 9.815628051757812, 'learning_rate': 8.819999999999999e-05, 'epoch': 0.02}
+{'loss': 3.4144, 'grad_norm': 8.956120491027832, 'learning_rate': 8.879999999999999e-05, 'epoch': 0.02}
+{'loss': 3.2383, 'grad_norm': 5.622773170471191, 'learning_rate': 8.939999999999999e-05, 'epoch': 0.02}
+{'loss': 3.6527, 'grad_norm': 10.2916259765625, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.02}
+{'loss': 3.1562, 'grad_norm': 3.466508388519287, 'learning_rate': 9.059999999999999e-05, 'epoch': 0.02}
+{'loss': 3.1937, 'grad_norm': 3.6558587551116943, 'learning_rate': 9.12e-05, 'epoch': 0.02}
+{'loss': 3.0899, 'grad_norm': 1.539302945137024, 'learning_rate': 9.18e-05, 'epoch': 0.02}
+{'loss': 3.3159, 'grad_norm': 4.4443230628967285, 'learning_rate': 9.24e-05, 'epoch': 0.02}
+{'loss': 3.1318, 'grad_norm': 3.2545831203460693, 'learning_rate': 9.3e-05, 'epoch': 0.02}
+{'loss': 3.2075, 'grad_norm': 3.192456007003784, 'learning_rate': 9.36e-05, 'epoch': 0.03}
+{'loss': 3.1137, 'grad_norm': 1.9067258834838867, 'learning_rate': 9.419999999999999e-05, 'epoch': 0.03}
+{'loss': 3.109, 'grad_norm': 1.6764792203903198, 'learning_rate': 9.479999999999999e-05, 'epoch': 0.03}
+{'loss': 4.8797, 'grad_norm': 25.346240997314453, 'learning_rate': 9.539999999999999e-05, 'epoch': 0.03}
+{'loss': 3.0522, 'grad_norm': 1.1866326332092285, 'learning_rate': 9.599999999999999e-05, 'epoch': 0.03}
+{'loss': 3.0789, 'grad_norm': 1.4010124206542969, 'learning_rate': 9.659999999999999e-05, 'epoch': 0.03}
+{'loss': 3.3802, 'grad_norm': 8.419206619262695, 'learning_rate': 9.719999999999999e-05, 'epoch': 0.03}
+{'loss': 3.5289, 'grad_norm': 8.86681842803955, 'learning_rate': 9.779999999999999e-05, 'epoch': 0.03}
+  1%|▏         | 168/12776 [01:19<1:43:25,  2.03it/s]  1%|▏         | 169/12776 [01:19<1:37:43,  2.15it/s]                                                       1%|▏         | 169/12776 [01:19<1:37:43,  2.15it/s]  1%|▏         | 170/12776 [01:20<1:39:32,  2.11it/s]                                                       1%|▏         | 170/12776 [01:20<1:39:32,  2.11it/s]  1%|▏         | 171/12776 [01:20<1:32:00,  2.28it/s]                                                       1%|▏         | 171/12776 [01:20<1:32:00,  2.28it/s]  1%|▏         | 172/12776 [01:20<1:26:10,  2.44it/s]                                                       1%|▏         | 172/12776 [01:20<1:26:10,  2.44it/s]  1%|▏         | 173/12776 [01:21<1:26:10,  2.44it/s]                                                       1%|▏         | 173/12776 [01:21<1:26:10,  2.44it/s]  1%|▏         | 174/12776 [01:21<1:20:54,  2.60it/s]                                                       1%|▏         | 174/12776 [01:21<1:20:54,  2.60it/s]  1%|▏         | 175/12776 [01:21<1:16:00,  2.76it/s]                                                       1%|▏         | 175/12776 [01:21<1:16:00,  2.76it/s]  1%|▏         | 176/12776 [01:22<1:14:43,  2.81it/s]                                                       1%|▏         | 176/12776 [01:22<1:14:43,  2.81it/s]  1%|▏         | 177/12776 [01:22<1:11:16,  2.95it/s]                                                       1%|▏         | 177/12776 [01:22<1:11:16,  2.95it/s]  1%|▏         | 178/12776 [01:22<1:07:25,  3.11it/s]                                                       1%|▏         | 178/12776 [01:22<1:07:25,  3.11it/s]  1%|▏         | 179/12776 [01:23<1:04:35,  3.25it/s]                                                       1%|▏         | 179/12776 [01:23<1:04:35,  3.25it/s]  1%|▏         | 180/12776 [01:23<1:04:54,  3.23it/s]                                                       1%|▏         | 180/12776 [01:23<1:04:54,  3.23it/s]  1%|▏         | 181/12776 [01:23<1:01:35,  3.41it/s]                                                       1%|▏         | 181/12776 [01:23<1:01:35,  3.41it/s]  1%|▏         | 182/12776 [01:24<58:38,  3.58it/s]                                                       1%|▏         | 182/12776 [01:24<58:38,  3.58it/s]  1%|▏         | 183/12776 [01:24<56:54,  3.69it/s]                                                     1%|▏         | 183/12776 [01:24<56:54,  3.69it/s]  1%|▏         | 184/12776 [01:24<1:02:58,  3.33it/s]                                                       1%|▏         | 184/12776 [01:24<1:02:58,  3.33it/s]  1%|▏         | 185/12776 [01:24<58:52,  3.56it/s]                                                       1%|▏         | 185/12776 [01:24<58:52,  3.56it/s]  1%|▏         | 186/12776 [01:25<56:07,  3.74it/s]                                                     1%|▏         | 186/12776 [01:25<56:07,  3.74it/s]  1%|▏         | 187/12776 [01:25<53:26,  3.93it/s]                                                     1%|▏         | 187/12776 [01:25<53:26,  3.93it/s]  1%|▏         | 188/12776 [01:25<51:17,  4.09it/s]                                                     1%|▏         | 188/12776 [01:25<51:17,  4.09it/s]  1%|▏         | 189/12776 [01:25<57:17,  3.66it/s]                                                     1%|▏         | 189/12776 [01:25<57:17,  3.66it/s]  1%|▏         | 190/12776 [01:26<53:33,  3.92it/s]                                                     1%|▏         | 190/12776 [01:26<53:33,  3.92it/s]  1%|▏         | 191/12776 [01:26<50:42,  4.14it/s]                                                     1%|▏         | 191/12776 [01:26<50:42,  4.14it/s]  2%|▏         | 192/12776 [01:26<48:20,  4.34it/s]                                                     2%|▏         | 192/12776 [01:26<48:20,  4.34it/s]  2%|▏         | 193/12776 [01:26<45:57,  4.56it/s]                                                     2%|▏         | 193/12776 [01:26<45:57,  4.56it/s]  2%|▏         | 194/12776 [01:27<51:21,  4.08it/s]                                                     2%|▏         | 194/12776 [01:27<51:21,  4.08it/s]  2%|▏         | 195/12776 [01:27<48:11,  4.35it/s]                                                     2%|▏         | 195/12776 [01:27<48:11,  4.35it/s]  2%|▏         | 196/12776 [01:27<45:54,  4.57it/s]                                                     2%|▏         | 196/12776 [01:27<45:54,  4.57it/s]  2%|▏         | 197/12776 [01:27<44:37,  4.70it/s]                                                     2%|▏         | 197/12776 [01:27<44:37,  4.70it/s]  2%|▏         | 198/12776 [01:27<43:21,  4.83it/s]                                                     2%|▏         | 198/12776 [01:27<43:21,  4.83it/s]  2%|▏         | 199/12776 [01:28<45:59,  4.56it/s]                                                     2%|▏         | 199/12776 [01:28<45:59,  4.56it/s]  2%|▏         | 200/12776 [01:28<1:20:13,  2.61it/s]                                                       2%|▏         | 200/12776 [01:28<1:20:13,  2.61it/s]  2%|▏         | 201/12776 [01:30<2:33:47,  1.36it/s]                                                       2%|▏         | 201/12776 [01:30<2:33:47,  1.36it/s]  2%|▏         | 202/12776 [01:31<2:47:10,  1.25it/s]                                                       2%|▏         | 202/12776 [01:31<2:47:10,  1.25it/s]  2%|▏         | 203/12776 [01:32<2:50:02,  1.23it/s]                                                       2%|▏         | 203/12776 [01:32<2:50:02,  1.23it/s]  2%|▏         | 204/12776 [01:32<2:49:52,  1.23it/s]                                                       2%|▏         | 204/12776 [01:32<2:49:52,  1.23it/s]  2%|▏         | 205/12776 [01:33<2:52:57,  1.21it/s]                                                       2%|▏         | 205/12776 [01:33<2:52:57,  1.21it/s]  2%|▏         | 206/12776 [01:34<2:45:08,  1.27it/s]                                                       2%|▏         | 206/12776 [01:34<2:45:08,  1.27it/s]  2%|▏         | 207/12776 [01:35<2:40:11,  1.31it/s]                                                       2%|▏         | 207/12776 [01:35<2:40:11,  1.31it/s]  2%|▏         | 208/12776 [01:35<2:32:39,  1.37it/s]                                                       2%|▏         | 208/12776 [01:35<2:32:39,  1.37it/s]  2%|▏         | 209/12776 [01:36<2:25:50,  1.44it/s]                                                       2%|▏         | 209/12776 [01:36<2:25:50,  1.44it/s]  2%|▏         | 210/12776 [01:37<2:19:23,  1.50it/s]                                                       2%|▏         | 210/12776 [01:37<2:19:23,  1.50it/s]  2%|▏         | 211/12776 [01:37<2:13:38,  1.57it/s]                                                       2%|▏         | 211/12776 [01:37<2:13:38,  1.57it/s]  2%|▏         | 212/12776 [01:38<2:07:41,  1.64it/s]                                                       2%|▏         | 212/12776 [01:38<2:07:41,  1.64it/s]  2%|▏         | 213/12776 [01:38<2:03:44,  1.69it/s]                                                       2%|▏         | 213/12776 [01:38<2:03:44,  1.69it/s]  2%|▏         | 214/12776 [01:39<1:58:18,  1.77it/s]                                                       2%|▏         | 214/12776 [01:39<1:58:18,  1.77it/s]  2%|▏         | 215/12776 [01:39<1:58:12,  1.77it/s]                                                       2%|▏         | 215/12776 [01:39<1:58:12,  1.77it/s]  2%|▏         | 216/12776 [01:40<1:51:28,  1.88it/s]                                                       2%|▏         | 216/12776 [01:40<1:51:28,  1.88it/s]  2%|▏         | 217/12776 [01:40<1:50:34,  1.89it/s]                                                       2%|▏         | 217/12776 [01:40<1:50:34,  1.89it/s]  2%|▏         | 218/12776 [01:41<1:43:20,  2.03it/s]                                                       2%|▏         | 218/12776 [01:41<1:43:20,  2.03it/s]  2%|▏         | 219/12776 [01:41<1:37:34,  2.14it/s]                                                       2%|▏         | 219/12776 [01:41<1:37:34,  2.14it/s]  2%|▏         | 220/12776 [01:42<1:39:53,  2.09it/s]                                                       2%|▏         | 220/12776 [01:42<1:39:53,  2.09it/s]  2%|▏         | 221/12776 [01:42<1:33:41,  2.23it/s]                                                       2%|▏         | 221/12776 [01:42<1:33:41,  2.23it/s]  2%|▏         | 222/12776 [01:42<1:27:12,  2.40it/s]                                                       2%|▏         | 222/12776 [01:42<1:27:12,  2.40it/s]  2%|▏         | 223/12776 [01:43<1:26:27,  2.42it/s]                                                       2%|▏         | 223/12776 [01:43<1:26:27,  2.42it/s]  2%|▏         | 224/12776 [01:43<1:21:46,  2.56it/s]                                                       2%|▏         | 224/12776 [01:43<1:21:46,  2.56it/s]  2%|▏         | 225/12776 [01:43<1:17:43,  2.69it/s]                                                       2%|▏         | 225/12776 [01:43<1:17:43,  2.69it/s]  2%|▏         | 226/12776 [01:44<1:15:33,  2.77it/s]                                                       2%|▏         | 226/12776 [01:44<1:15:33,  2.77it/s]  2%|▏         | 227/12776 [01:44<1:11:17,  2.93it/s]                                                       2%|▏         | 227/12776 [01:44<1:11:17,  2.93it/s]  2%|▏         | 228/12776 [01:44<1:08:29,  3.05it/s]                                                       2%|▏         | 228/12776 [01:44<1:08:29,  3.05it/s]  2%|▏         | 229/12776 [01:45<1:06:11,  3.16it/s]                                                       2%|▏         | 229/12776 [01:45<1:06:11,  3.16it/s]  2%|▏         | 230/12776 [01:45<1:11:16,  2.93it/s]                                                       2%|▏         | 230/12776 [01:45<1:11:16,  2.93it/s]  2%|▏         | 231/12776 [01:45<1:07:01,  3.12it/s]                                                       2%|▏         | 231/12776 [01:45<1:07:01,  3.12it/s]  2%|▏         | 232/12776 [01:46<1:03:17,  3.30it/s]                                                       2%|▏         | 232/12776 [01:46<1:03:17,  3.30it/s]  2%|▏         | 233/12776 [01:46<59:54,  3.49it/s]                                                       2%|▏         | 233/12776 [01:46<59:54,  3.49it/s]  2%|▏         | 234/12776 [01:46<1:04:05,  3.26it/s]                                                       2%|▏         | 234/12776 [01:46<1:04:05,  3.26it/s]  2%|▏         | 235/12776 [01:46<59:59,  3.48it/s]                                                       2%|▏         | 235/12776 [01:46<59:59,  3.48it/s]  2%|▏         | 236/12776 [01:47<56:42,  3.69it/s]                                                     2%|▏         | 236/12776 [01:47<56:42,  3.69it/s]  2%|▏         | 237/12776 [01:47<53:59,  3.87it/s]                                                     2%|▏         | 237/12776 [01:47<53:59,  3.87it/s]  2%|▏         | 238/12776 [01:47<57:37,  3.63it/s]                                                     2%|▏         | 238/12776 [01:47<57:37,  3.63it/s]  2%|▏         | 239/12776 [01:47<53:52,  3.88it/s]                                                     2%|▏         | 239/12776 [01:47<53:52,  3.88it/s]  2%|▏         | 240/12776 [01:48<50:39,  4.12it/s]                                                     2%|▏         | 240/12776 [01:48<50:39,  4.12it/s]  2%|▏         | 241/12776 [01:48<48:48,  4.28it/s]                                                     2%|▏         | 241/12776 [01:48<48:48,  4.28it/s]  2%|▏         | 242/12776 [01:48<47:28,  4.40it/s]                                                     2%|▏         | 242/12776 [01:48<47:28,  4.40it/s]  2%|▏         | 243/12776 [01:48<51:48,  4.03it/s]                                                     2%|▏         | 243/12776 [01:48<51:48,  4.03it/s]  2%|▏         | 244/12776 [01:49<48:43,  4.29it/s]                                                     2%|▏         | 244/12776 [01:49<48:43,  4.29it/s]  2%|▏         | 245/12776 [01:49<46:16,  4.51it/s]                                                     2%|▏         | 245/12776 [01:49<46:16,  4.51it/s]  2%|▏         | 246/12776 [01:49<44:26,  4.70it/s]                                                     2%|▏         | 246/12776 [01:49<44:26,  4.70it/s]  2%|▏         | 247/12776 [01:49<42:59,  4.86it/s]                                                     2%|▏         | 247/12776 [01:49<42:59,  4.86it/s]  2%|▏         | 248/12776 [01:49<41:54,  4.98it/s]                                                     2%|▏         | 248/12776 [01:49<41:54,  4.98it/s]  2%|▏         | 249/12776 [01:50<48:06,  4.34it/s]                                                     2%|▏         | 249/12776 [01:50<48:06,  4.34it/s]  2%|▏         | 250/12776 [01:50<1:17:10,  2.71it/s]                                                     {'loss': 4.5939, 'grad_norm': 23.243457794189453, 'learning_rate': 9.839999999999999e-05, 'epoch': 0.03}
+{'loss': 3.0461, 'grad_norm': 2.583829164505005, 'learning_rate': 9.9e-05, 'epoch': 0.03}
+{'loss': 3.1916, 'grad_norm': 3.8143973350524902, 'learning_rate': 9.96e-05, 'epoch': 0.03}
+{'loss': 3.2371, 'grad_norm': 3.9216201305389404, 'learning_rate': 0.0001002, 'epoch': 0.03}
+{'loss': 3.7303, 'grad_norm': 13.120063781738281, 'learning_rate': 0.0001008, 'epoch': 0.03}
+{'loss': 3.1096, 'grad_norm': 1.0374871492385864, 'learning_rate': 0.0001014, 'epoch': 0.03}
+{'loss': 3.2397, 'grad_norm': 2.4160315990448, 'learning_rate': 0.000102, 'epoch': 0.03}
+{'loss': 3.6207, 'grad_norm': 10.37614631652832, 'learning_rate': 0.0001026, 'epoch': 0.03}
+{'loss': 3.3122, 'grad_norm': 3.1003804206848145, 'learning_rate': 0.00010319999999999999, 'epoch': 0.03}
+{'loss': 3.0935, 'grad_norm': 1.6606565713882446, 'learning_rate': 0.00010379999999999999, 'epoch': 0.03}
+{'loss': 3.2649, 'grad_norm': 1.8800925016403198, 'learning_rate': 0.00010439999999999999, 'epoch': 0.03}
+{'loss': 3.4573, 'grad_norm': 6.265193462371826, 'learning_rate': 0.00010499999999999999, 'epoch': 0.03}
+{'loss': 3.6856, 'grad_norm': 8.842411994934082, 'learning_rate': 0.00010559999999999998, 'epoch': 0.03}
+{'loss': 3.2177, 'grad_norm': 1.783496379852295, 'learning_rate': 0.00010619999999999998, 'epoch': 0.03}
+{'loss': 3.2863, 'grad_norm': 3.2805280685424805, 'learning_rate': 0.00010679999999999998, 'epoch': 0.03}
+{'loss': 3.428, 'grad_norm': 3.0294930934906006, 'learning_rate': 0.00010739999999999998, 'epoch': 0.03}
+{'loss': 3.3734, 'grad_norm': 1.3118689060211182, 'learning_rate': 0.00010799999999999998, 'epoch': 0.03}
+{'loss': 3.4229, 'grad_norm': 4.640561103820801, 'learning_rate': 0.00010859999999999998, 'epoch': 0.03}
+{'loss': 3.1565, 'grad_norm': 2.2550694942474365, 'learning_rate': 0.00010919999999999998, 'epoch': 0.03}
+{'loss': 3.1083, 'grad_norm': 2.734482765197754, 'learning_rate': 0.00010979999999999999, 'epoch': 0.03}
+{'loss': 3.1897, 'grad_norm': 2.0161116123199463, 'learning_rate': 0.00011039999999999999, 'epoch': 0.03}
+{'loss': 3.1898, 'grad_norm': 1.5846065282821655, 'learning_rate': 0.00011099999999999999, 'epoch': 0.03}
+{'loss': 3.3738, 'grad_norm': 7.154541015625, 'learning_rate': 0.00011159999999999999, 'epoch': 0.03}
+{'loss': 3.2111, 'grad_norm': 2.3260960578918457, 'learning_rate': 0.00011219999999999999, 'epoch': 0.03}
+{'loss': 3.2184, 'grad_norm': 2.29046630859375, 'learning_rate': 0.00011279999999999999, 'epoch': 0.03}
+{'loss': 3.1476, 'grad_norm': 3.9989373683929443, 'learning_rate': 0.00011339999999999999, 'epoch': 0.03}
+{'loss': 3.098, 'grad_norm': 1.7007783651351929, 'learning_rate': 0.00011399999999999999, 'epoch': 0.03}
+{'loss': 3.1793, 'grad_norm': 1.5231850147247314, 'learning_rate': 0.0001146, 'epoch': 0.03}
+{'loss': 3.0494, 'grad_norm': 3.336979627609253, 'learning_rate': 0.0001152, 'epoch': 0.03}
+{'loss': 2.9805, 'grad_norm': 2.053584098815918, 'learning_rate': 0.0001158, 'epoch': 0.03}
+{'loss': 2.9734, 'grad_norm': 1.7036573886871338, 'learning_rate': 0.0001164, 'epoch': 0.03}
+{'loss': 2.9142, 'grad_norm': 3.5702524185180664, 'learning_rate': 0.000117, 'epoch': 0.03}
+{'loss': 2.954, 'grad_norm': 3.3421378135681152, 'learning_rate': 0.0001176, 'epoch': 0.03}
+{'loss': 8.3104, 'grad_norm': 59.050907135009766, 'learning_rate': 0.0001182, 'epoch': 0.03}
+{'loss': 4.2312, 'grad_norm': 17.812931060791016, 'learning_rate': 0.0001188, 'epoch': 0.03}
+{'loss': 3.0879, 'grad_norm': 3.1730802059173584, 'learning_rate': 0.0001194, 'epoch': 0.03}
+{'loss': 3.0617, 'grad_norm': 2.468261241912842, 'learning_rate': 0.00011999999999999999, 'epoch': 0.03}
+{'loss': 3.1855, 'grad_norm': 6.175726890563965, 'learning_rate': 0.00012059999999999999, 'epoch': 0.03}
+{'loss': 3.237, 'grad_norm': 3.9613351821899414, 'learning_rate': 0.00012119999999999999, 'epoch': 0.03}
+{'loss': 3.21, 'grad_norm': 3.2016634941101074, 'learning_rate': 0.00012179999999999999, 'epoch': 0.03}
+{'loss': 3.0034, 'grad_norm': 2.0282530784606934, 'learning_rate': 0.0001224, 'epoch': 0.03}
+{'loss': 3.0214, 'grad_norm': 1.9264556169509888, 'learning_rate': 0.00012299999999999998, 'epoch': 0.03}
+{'loss': 3.0177, 'grad_norm': 1.9033787250518799, 'learning_rate': 0.0001236, 'epoch': 0.03}
+{'loss': 3.0139, 'grad_norm': 1.997543215751648, 'learning_rate': 0.00012419999999999998, 'epoch': 0.03}
+{'loss': 3.069, 'grad_norm': 3.2476229667663574, 'learning_rate': 0.00012479999999999997, 'epoch': 0.03}
+{'loss': 2.9909, 'grad_norm': 2.2744576930999756, 'learning_rate': 0.00012539999999999999, 'epoch': 0.03}
+{'loss': 2.9783, 'grad_norm': 1.2331576347351074, 'learning_rate': 0.00012599999999999997, 'epoch': 0.03}
+{'loss': 2.9448, 'grad_norm': 2.7535603046417236, 'learning_rate': 0.0001266, 'epoch': 0.03}
+{'loss': 2.9163, 'grad_norm': 0.8149605989456177, 'learning_rate': 0.00012719999999999997, 'epoch': 0.03}
+{'loss': 3.6752, 'grad_norm': 12.775609970092773, 'learning_rate': 0.0001278, 'epoch': 0.03}
+{'loss': 3.0796, 'grad_norm': 3.7931580543518066, 'learning_rate': 0.00012839999999999998, 'epoch': 0.03}
+{'loss': 3.0605, 'grad_norm': 4.115076065063477, 'learning_rate': 0.000129, 'epoch': 0.03}
+{'loss': 2.994, 'grad_norm': 0.8560850024223328, 'learning_rate': 0.00012959999999999998, 'epoch': 0.03}
+{'loss': 3.2076, 'grad_norm': 4.085236072540283, 'learning_rate': 0.0001302, 'epoch': 0.03}
+{'loss': 3.0968, 'grad_norm': 1.9408749341964722, 'learning_rate': 0.00013079999999999998, 'epoch': 0.03}
+{'loss': 3.0112, 'grad_norm': 2.0618252754211426, 'learning_rate': 0.0001314, 'epoch': 0.03}
+{'loss': 2.9357, 'grad_norm': 2.527026414871216, 'learning_rate': 0.00013199999999999998, 'epoch': 0.04}
+{'loss': 3.2706, 'grad_norm': 8.99402904510498, 'learning_rate': 0.0001326, 'epoch': 0.04}
+{'loss': 3.282, 'grad_norm': 7.271577835083008, 'learning_rate': 0.00013319999999999999, 'epoch': 0.04}
+{'loss': 2.9374, 'grad_norm': 0.7550258040428162, 'learning_rate': 0.0001338, 'epoch': 0.04}
+{'loss': 3.0337, 'grad_norm': 2.4851882457733154, 'learning_rate': 0.0001344, 'epoch': 0.04}
+{'loss': 2.9723, 'grad_norm': 2.233167886734009, 'learning_rate': 0.000135, 'epoch': 0.04}
+{'loss': 3.0346, 'grad_norm': 1.3427929878234863, 'learning_rate': 0.0001356, 'epoch': 0.04}
+{'loss': 2.9946, 'grad_norm': 1.4141734838485718, 'learning_rate': 0.0001362, 'epoch': 0.04}
+{'loss': 3.0399, 'grad_norm': 0.9676278233528137, 'learning_rate': 0.0001368, 'epoch': 0.04}
+{'loss': 2.9977, 'grad_norm': 0.9752649664878845, 'learning_rate': 0.0001374, 'epoch': 0.04}
+{'loss': 3.006, 'grad_norm': 1.0055956840515137, 'learning_rate': 0.000138, 'epoch': 0.04}
+{'loss': 2.9964, 'grad_norm': 2.232508659362793, 'learning_rate': 0.0001386, 'epoch': 0.04}
+{'loss': 2.958, 'grad_norm': 1.2427127361297607, 'learning_rate': 0.0001392, 'epoch': 0.04}
+{'loss': 3.0754, 'grad_norm': 1.305584192276001, 'learning_rate': 0.00013979999999999998, 'epoch': 0.04}
+{'loss': 3.0478, 'grad_norm': 2.785289764404297, 'learning_rate': 0.0001404, 'epoch': 0.04}
+{'loss': 3.2, 'grad_norm': 4.857529640197754, 'learning_rate': 0.00014099999999999998, 'epoch': 0.04}
+{'loss': 3.0873, 'grad_norm': 2.906057119369507, 'learning_rate': 0.00014159999999999997, 'epoch': 0.04}
+{'loss': 3.0688, 'grad_norm': 1.3470922708511353, 'learning_rate': 0.0001422, 'epoch': 0.04}
+{'loss': 3.1128, 'grad_norm': 3.2114920616149902, 'learning_rate': 0.00014279999999999997, 'epoch': 0.04}
+{'loss': 3.0977, 'grad_norm': 2.603756904602051, 'learning_rate': 0.0001434, 'epoch': 0.04}
+{'loss': 3.1806, 'grad_norm': 4.524052143096924, 'learning_rate': 0.00014399999999999998, 'epoch': 0.04}
+{'loss': 3.0789, 'grad_norm': 2.9187328815460205, 'learning_rate': 0.0001446, 'epoch': 0.04}
+{'loss': 2.9883, 'grad_norm': 3.1184439659118652, 'learning_rate': 0.00014519999999999998, 'epoch': 0.04}
+{'loss': 2.9124, 'grad_norm': 3.0836615562438965, 'learning_rate': 0.0001458, 'epoch': 0.04}
+{'loss': 2.9268, 'grad_norm': 2.7621877193450928, 'learning_rate': 0.00014639999999999998, 'epoch': 0.04}
+{'loss': 2.8235, 'grad_norm': 1.8909045457839966, 'learning_rate': 0.000147, 'epoch': 0.04}
+  2%|▏         | 250/12776 [01:50<1:17:10,  2.71it/s]  2%|▏         | 251/12776 [01:52<2:28:53,  1.40it/s]                                                       2%|▏         | 251/12776 [01:52<2:28:53,  1.40it/s]  2%|▏         | 252/12776 [01:53<2:43:06,  1.28it/s]                                                       2%|▏         | 252/12776 [01:53<2:43:06,  1.28it/s]  2%|▏         | 253/12776 [01:54<2:48:39,  1.24it/s]                                                       2%|▏         | 253/12776 [01:54<2:48:39,  1.24it/s]  2%|▏         | 254/12776 [01:55<2:54:27,  1.20it/s]                                                       2%|▏         | 254/12776 [01:55<2:54:27,  1.20it/s]  2%|▏         | 255/12776 [01:55<2:49:02,  1.23it/s]                                                       2%|▏         | 255/12776 [01:55<2:49:02,  1.23it/s]  2%|▏         | 256/12776 [01:56<2:42:07,  1.29it/s]                                                       2%|▏         | 256/12776 [01:56<2:42:07,  1.29it/s]  2%|▏         | 257/12776 [01:57<2:37:00,  1.33it/s]                                                       2%|▏         | 257/12776 [01:57<2:37:00,  1.33it/s]  2%|▏         | 258/12776 [01:57<2:29:21,  1.40it/s]                                                       2%|▏         | 258/12776 [01:57<2:29:21,  1.40it/s]  2%|▏         | 259/12776 [01:58<2:21:55,  1.47it/s]                                                       2%|▏         | 259/12776 [01:58<2:21:55,  1.47it/s]  2%|▏         | 260/12776 [01:58<2:15:20,  1.54it/s]                                                       2%|▏         | 260/12776 [01:58<2:15:20,  1.54it/s]  2%|▏         | 261/12776 [01:59<2:10:45,  1.60it/s]                                                       2%|▏         | 261/12776 [01:59<2:10:45,  1.60it/s]  2%|▏         | 262/12776 [02:00<2:04:52,  1.67it/s]                                                       2%|▏         | 262/12776 [02:00<2:04:52,  1.67it/s]  2%|▏         | 263/12776 [02:00<2:03:51,  1.68it/s]                                                       2%|▏         | 263/12776 [02:00<2:03:51,  1.68it/s]  2%|▏         | 264/12776 [02:01<1:56:15,  1.79it/s]                                                       2%|▏         | 264/12776 [02:01<1:56:15,  1.79it/s]  2%|▏         | 265/12776 [02:01<1:55:30,  1.81it/s]                                                       2%|▏         | 265/12776 [02:01<1:55:30,  1.81it/s]  2%|▏         | 266/12776 [02:02<1:48:14,  1.93it/s]                                                       2%|▏         | 266/12776 [02:02<1:48:14,  1.93it/s]  2%|▏         | 267/12776 [02:02<1:50:12,  1.89it/s]                                                       2%|▏         | 267/12776 [02:02<1:50:12,  1.89it/s]  2%|▏         | 268/12776 [02:03<1:42:04,  2.04it/s]                                                       2%|▏         | 268/12776 [02:03<1:42:04,  2.04it/s]  2%|▏         | 269/12776 [02:03<1:35:58,  2.17it/s]                                                       2%|▏         | 269/12776 [02:03<1:35:58,  2.17it/s]  2%|▏         | 270/12776 [02:03<1:32:13,  2.26it/s]                                                       2%|▏         | 270/12776 [02:03<1:32:13,  2.26it/s]  2%|▏         | 271/12776 [02:04<1:26:24,  2.41it/s]                                                       2%|▏         | 271/12776 [02:04<1:26:24,  2.41it/s]  2%|▏         | 272/12776 [02:04<1:21:51,  2.55it/s]                                                       2%|▏         | 272/12776 [02:04<1:21:51,  2.55it/s]  2%|▏         | 273/12776 [02:04<1:25:41,  2.43it/s]                                                       2%|▏         | 273/12776 [02:04<1:25:41,  2.43it/s]  2%|▏         | 274/12776 [02:05<1:20:50,  2.58it/s]                                                       2%|▏         | 274/12776 [02:05<1:20:50,  2.58it/s]  2%|▏         | 275/12776 [02:05<1:16:41,  2.72it/s]                                                       2%|▏         | 275/12776 [02:05<1:16:41,  2.72it/s]  2%|▏         | 276/12776 [02:05<1:12:52,  2.86it/s]                                                       2%|▏         | 276/12776 [02:05<1:12:52,  2.86it/s]  2%|▏         | 277/12776 [02:06<1:10:55,  2.94it/s]                                                       2%|▏         | 277/12776 [02:06<1:10:55,  2.94it/s]  2%|▏         | 278/12776 [02:06<1:07:36,  3.08it/s]                                                       2%|▏         | 278/12776 [02:06<1:07:36,  3.08it/s]  2%|▏         | 279/12776 [02:06<1:04:07,  3.25it/s]                                                       2%|▏         | 279/12776 [02:06<1:04:07,  3.25it/s]  2%|▏         | 280/12776 [02:07<1:01:11,  3.40it/s]                                                       2%|▏         | 280/12776 [02:07<1:01:11,  3.40it/s]  2%|▏         | 281/12776 [02:07<1:02:52,  3.31it/s]                                                       2%|▏         | 281/12776 [02:07<1:02:52,  3.31it/s]  2%|▏         | 282/12776 [02:07<59:49,  3.48it/s]                                                       2%|▏         | 282/12776 [02:07<59:49,  3.48it/s]  2%|▏         | 283/12776 [02:07<57:37,  3.61it/s]                                                     2%|▏         | 283/12776 [02:07<57:37,  3.61it/s]  2%|▏         | 284/12776 [02:08<55:31,  3.75it/s]                                                     2%|▏         | 284/12776 [02:08<55:31,  3.75it/s]  2%|▏         | 285/12776 [02:08<56:53,  3.66it/s]                                                     2%|▏         | 285/12776 [02:08<56:53,  3.66it/s]  2%|▏         | 286/12776 [02:08<53:57,  3.86it/s]                                                     2%|▏         | 286/12776 [02:08<53:57,  3.86it/s]  2%|▏         | 287/12776 [02:08<51:20,  4.05it/s]                                                     2%|▏         | 287/12776 [02:08<51:20,  4.05it/s]  2%|▏         | 288/12776 [02:09<49:14,  4.23it/s]                                                     2%|▏         | 288/12776 [02:09<49:14,  4.23it/s]  2%|▏         | 289/12776 [02:09<47:38,  4.37it/s]                                                     2%|▏         | 289/12776 [02:09<47:38,  4.37it/s]  2%|▏         | 290/12776 [02:09<50:07,  4.15it/s]                                                     2%|▏         | 290/12776 [02:09<50:07,  4.15it/s]  2%|▏         | 291/12776 [02:09<47:52,  4.35it/s]                                                     2%|▏         | 291/12776 [02:09<47:52,  4.35it/s]  2%|▏         | 292/12776 [02:10<46:47,  4.45it/s]                                                     2%|▏         | 292/12776 [02:10<46:47,  4.45it/s]  2%|▏         | 293/12776 [02:10<45:31,  4.57it/s]                                                     2%|▏         | 293/12776 [02:10<45:31,  4.57it/s]  2%|▏         | 294/12776 [02:10<44:11,  4.71it/s]                                                     2%|▏         | 294/12776 [02:10<44:11,  4.71it/s]  2%|▏         | 295/12776 [02:10<49:23,  4.21it/s]                                                     2%|▏         | 295/12776 [02:10<49:23,  4.21it/s]  2%|▏         | 296/12776 [02:10<46:37,  4.46it/s]                                                     2%|▏         | 296/12776 [02:10<46:37,  4.46it/s]  2%|▏         | 297/12776 [02:11<44:31,  4.67it/s]                                                     2%|▏         | 297/12776 [02:11<44:31,  4.67it/s]  2%|▏         | 298/12776 [02:11<42:41,  4.87it/s]                                                     2%|▏         | 298/12776 [02:11<42:41,  4.87it/s]  2%|▏         | 299/12776 [02:11<41:10,  5.05it/s]                                                     2%|▏         | 299/12776 [02:11<41:10,  5.05it/s]  2%|▏         | 300/12776 [02:12<1:14:58,  2.77it/s]                                                       2%|▏         | 300/12776 [02:12<1:14:58,  2.77it/s]  2%|▏         | 301/12776 [02:13<2:26:41,  1.42it/s]                                                       2%|▏         | 301/12776 [02:13<2:26:41,  1.42it/s]  2%|▏         | 302/12776 [02:14<2:47:02,  1.24it/s]                                                       2%|▏         | 302/12776 [02:14<2:47:02,  1.24it/s]  2%|▏         | 303/12776 [02:15<2:54:33,  1.19it/s]                                                       2%|▏         | 303/12776 [02:15<2:54:33,  1.19it/s]  2%|▏         | 304/12776 [02:16<2:51:18,  1.21it/s]                                                       2%|▏         | 304/12776 [02:16<2:51:18,  1.21it/s]  2%|▏         | 305/12776 [02:17<2:46:22,  1.25it/s]                                                       2%|▏         | 305/12776 [02:17<2:46:22,  1.25it/s]  2%|▏         | 306/12776 [02:17<2:44:56,  1.26it/s]                                                       2%|▏         | 306/12776 [02:17<2:44:56,  1.26it/s]  2%|▏         | 307/12776 [02:18<2:42:17,  1.28it/s]                                                       2%|▏         | 307/12776 [02:18<2:42:17,  1.28it/s]  2%|▏         | 308/12776 [02:19<2:33:24,  1.35it/s]                                                       2%|▏         | 308/12776 [02:19<2:33:24,  1.35it/s]  2%|▏         | 309/12776 [02:20<2:32:44,  1.36it/s]                                                       2%|▏         | 309/12776 [02:20<2:32:44,  1.36it/s]  2%|▏         | 310/12776 [02:20<2:21:44,  1.47it/s]                                                       2%|▏         | 310/12776 [02:20<2:21:44,  1.47it/s]  2%|▏         | 311/12776 [02:21<2:16:20,  1.52it/s]                                                       2%|▏         | 311/12776 [02:21<2:16:20,  1.52it/s]  2%|▏         | 312/12776 [02:21<2:08:06,  1.62it/s]                                                       2%|▏         | 312/12776 [02:21<2:08:06,  1.62it/s]  2%|▏         | 313/12776 [02:22<2:05:42,  1.65it/s]                                                       2%|▏         | 313/12776 [02:22<2:05:42,  1.65it/s]  2%|▏         | 314/12776 [02:22<1:56:33,  1.78it/s]                                                       2%|▏         | 314/12776 [02:22<1:56:33,  1.78it/s]  2%|▏         | 315/12776 [02:23<1:52:21,  1.85it/s]                                                       2%|▏         | 315/12776 [02:23<1:52:21,  1.85it/s]  2%|▏         | 316/12776 [02:23<1:45:02,  1.98it/s]                                                       2%|▏         | 316/12776 [02:23<1:45:02,  1.98it/s]  2%|▏         | 317/12776 [02:24<1:39:11,  2.09it/s]                                                       2%|▏         | 317/12776 [02:24<1:39:11,  2.09it/s]  2%|▏         | 318/12776 [02:24<1:39:00,  2.10it/s]                                                       2%|▏         | 318/12776 [02:24<1:39:00,  2.10it/s]  2%|▏         | 319/12776 [02:24<1:33:35,  2.22it/s]                                                       2%|▏         | 319/12776 [02:24<1:33:35,  2.22it/s]  3%|▎         | 320/12776 [02:25<1:29:00,  2.33it/s]                                                       3%|▎         | 320/12776 [02:25<1:29:00,  2.33it/s]  3%|▎         | 321/12776 [02:25<1:28:43,  2.34it/s]                                                       3%|▎         | 321/12776 [02:25<1:28:43,  2.34it/s]  3%|▎         | 322/12776 [02:26<1:24:37,  2.45it/s]                                                       3%|▎         | 322/12776 [02:26<1:24:37,  2.45it/s]  3%|▎         | 323/12776 [02:26<1:21:29,  2.55it/s]                                                       3%|▎         | 323/12776 [02:26<1:21:29,  2.55it/s]  3%|▎         | 324/12776 [02:26<1:23:26,  2.49it/s]                                                       3%|▎         | 324/12776 [02:26<1:23:26,  2.49it/s]  3%|▎         | 325/12776 [02:27<1:19:12,  2.62it/s]                                                       3%|▎         | 325/12776 [02:27<1:19:12,  2.62it/s]  3%|▎         | 326/12776 [02:27<1:14:47,  2.77it/s]                                                       3%|▎         | 326/12776 [02:27<1:14:47,  2.77it/s]  3%|▎         | 327/12776 [02:27<1:17:54,  2.66it/s]                                                       3%|▎         | 327/12776 [02:27<1:17:54,  2.66it/s]  3%|▎         | 328/12776 [02:28<1:12:15,  2.87it/s]                                                       3%|▎         | 328/12776 [02:28<1:12:15,  2.87it/s]  3%|▎         | 329/12776 [02:28<1:07:47,  3.06it/s]                                                       3%|▎         | 329/12776 [02:28<1:07:47,  3.06it/s]  3%|▎         | 330/12776 [02:28<1:04:28,  3.22it/s]                                                       3%|▎         | 330/12776 [02:28<1:04:28,  3.22it/s]  3%|▎         | 331/12776 [02:29<1:05:11,  3.18it/s]                                                       3%|▎         | 331/12776 [02:29<1:05:11,  3.18it/s]  3%|▎         | 332/12776 [02:29<1:01:49,  3.35it/s]                                                     {'loss': 2.7703, 'grad_norm': 3.872973680496216, 'learning_rate': 0.00014759999999999998, 'epoch': 0.04}
+{'loss': 3.046, 'grad_norm': 4.677459716796875, 'learning_rate': 0.0001482, 'epoch': 0.04}
+{'loss': 3.038, 'grad_norm': 3.410970449447632, 'learning_rate': 0.00014879999999999998, 'epoch': 0.04}
+{'loss': 3.2031, 'grad_norm': 6.155794143676758, 'learning_rate': 0.0001494, 'epoch': 0.04}
+{'loss': 2.9807, 'grad_norm': 2.345693588256836, 'learning_rate': 0.00015, 'epoch': 0.04}
+{'loss': 2.9993, 'grad_norm': 1.6279584169387817, 'learning_rate': 0.00015059999999999997, 'epoch': 0.04}
+{'loss': 2.9005, 'grad_norm': 1.0136637687683105, 'learning_rate': 0.0001512, 'epoch': 0.04}
+{'loss': 2.9279, 'grad_norm': 1.4269553422927856, 'learning_rate': 0.00015179999999999998, 'epoch': 0.04}
+{'loss': 2.8742, 'grad_norm': 1.676174283027649, 'learning_rate': 0.0001524, 'epoch': 0.04}
+{'loss': 3.4308, 'grad_norm': 9.41197395324707, 'learning_rate': 0.00015299999999999998, 'epoch': 0.04}
+{'loss': 2.9597, 'grad_norm': 3.7625269889831543, 'learning_rate': 0.0001536, 'epoch': 0.04}
+{'loss': 2.9289, 'grad_norm': 1.9438997507095337, 'learning_rate': 0.00015419999999999998, 'epoch': 0.04}
+{'loss': 2.8869, 'grad_norm': 1.4095935821533203, 'learning_rate': 0.0001548, 'epoch': 0.04}
+{'loss': 2.9053, 'grad_norm': 1.280892252922058, 'learning_rate': 0.00015539999999999998, 'epoch': 0.04}
+{'loss': 2.8905, 'grad_norm': 0.9870156049728394, 'learning_rate': 0.000156, 'epoch': 0.04}
+{'loss': 2.9066, 'grad_norm': 2.140601634979248, 'learning_rate': 0.00015659999999999998, 'epoch': 0.04}
+{'loss': 2.9958, 'grad_norm': 3.046438217163086, 'learning_rate': 0.0001572, 'epoch': 0.04}
+{'loss': 3.2312, 'grad_norm': 6.271746635437012, 'learning_rate': 0.0001578, 'epoch': 0.04}
+{'loss': 2.928, 'grad_norm': 0.49755653738975525, 'learning_rate': 0.0001584, 'epoch': 0.04}
+{'loss': 2.9341, 'grad_norm': 1.545334815979004, 'learning_rate': 0.000159, 'epoch': 0.04}
+{'loss': 2.9309, 'grad_norm': 0.7185292840003967, 'learning_rate': 0.0001596, 'epoch': 0.04}
+{'loss': 3.1542, 'grad_norm': 5.970613479614258, 'learning_rate': 0.0001602, 'epoch': 0.04}
+{'loss': 3.0439, 'grad_norm': 3.6000194549560547, 'learning_rate': 0.0001608, 'epoch': 0.04}
+{'loss': 2.982, 'grad_norm': 3.2971136569976807, 'learning_rate': 0.0001614, 'epoch': 0.04}
+{'loss': 3.0709, 'grad_norm': 1.305440068244934, 'learning_rate': 0.000162, 'epoch': 0.04}
+{'loss': 3.0722, 'grad_norm': 3.8561184406280518, 'learning_rate': 0.0001626, 'epoch': 0.04}
+{'loss': 2.9751, 'grad_norm': 4.944895267486572, 'learning_rate': 0.0001632, 'epoch': 0.04}
+{'loss': 3.1554, 'grad_norm': 5.420694351196289, 'learning_rate': 0.0001638, 'epoch': 0.04}
+{'loss': 3.1093, 'grad_norm': 1.4312067031860352, 'learning_rate': 0.0001644, 'epoch': 0.04}
+{'loss': 3.016, 'grad_norm': 1.852827548980713, 'learning_rate': 0.000165, 'epoch': 0.04}
+{'loss': 2.9848, 'grad_norm': 1.318656325340271, 'learning_rate': 0.0001656, 'epoch': 0.04}
+{'loss': 2.9815, 'grad_norm': 2.9079530239105225, 'learning_rate': 0.0001662, 'epoch': 0.04}
+{'loss': 2.9969, 'grad_norm': 2.3865318298339844, 'learning_rate': 0.0001668, 'epoch': 0.04}
+{'loss': 3.042, 'grad_norm': 2.2731151580810547, 'learning_rate': 0.0001674, 'epoch': 0.04}
+{'loss': 2.9983, 'grad_norm': 2.308046340942383, 'learning_rate': 0.000168, 'epoch': 0.04}
+{'loss': 3.0874, 'grad_norm': 2.4324429035186768, 'learning_rate': 0.0001686, 'epoch': 0.04}
+{'loss': 3.002, 'grad_norm': 5.1173505783081055, 'learning_rate': 0.00016919999999999997, 'epoch': 0.04}
+{'loss': 3.1237, 'grad_norm': 3.3832643032073975, 'learning_rate': 0.00016979999999999998, 'epoch': 0.04}
+{'loss': 3.1004, 'grad_norm': 1.6564899682998657, 'learning_rate': 0.00017039999999999997, 'epoch': 0.05}
+{'loss': 3.0704, 'grad_norm': 1.8179957866668701, 'learning_rate': 0.00017099999999999998, 'epoch': 0.05}
+{'loss': 3.088, 'grad_norm': 1.3986287117004395, 'learning_rate': 0.00017159999999999997, 'epoch': 0.05}
+{'loss': 2.9509, 'grad_norm': 1.6225603818893433, 'learning_rate': 0.00017219999999999998, 'epoch': 0.05}
+{'loss': 2.8125, 'grad_norm': 1.9594841003417969, 'learning_rate': 0.00017279999999999997, 'epoch': 0.05}
+{'loss': 2.982, 'grad_norm': 2.098877191543579, 'learning_rate': 0.00017339999999999996, 'epoch': 0.05}
+{'loss': 2.8949, 'grad_norm': 2.2931160926818848, 'learning_rate': 0.00017399999999999997, 'epoch': 0.05}
+{'loss': 3.0056, 'grad_norm': 1.8731071949005127, 'learning_rate': 0.00017459999999999996, 'epoch': 0.05}
+{'loss': 2.8271, 'grad_norm': 1.8290430307388306, 'learning_rate': 0.00017519999999999998, 'epoch': 0.05}
+{'loss': 2.5751, 'grad_norm': 2.112307548522949, 'learning_rate': 0.00017579999999999996, 'epoch': 0.05}
+{'loss': 2.5481, 'grad_norm': 1.703934907913208, 'learning_rate': 0.00017639999999999998, 'epoch': 0.05}
+{'loss': 2.3719, 'grad_norm': 2.3497684001922607, 'learning_rate': 0.00017699999999999997, 'epoch': 0.05}
+{'loss': 2.6147, 'grad_norm': 2.5405707359313965, 'learning_rate': 0.00017759999999999998, 'epoch': 0.05}
+{'loss': 5.9212, 'grad_norm': 41.07537841796875, 'learning_rate': 0.00017819999999999997, 'epoch': 0.05}
+{'loss': 2.9271, 'grad_norm': 3.55108904838562, 'learning_rate': 0.00017879999999999998, 'epoch': 0.05}
+{'loss': 3.9858, 'grad_norm': 16.451351165771484, 'learning_rate': 0.00017939999999999997, 'epoch': 0.05}
+{'loss': 2.9336, 'grad_norm': 1.0126781463623047, 'learning_rate': 0.00017999999999999998, 'epoch': 0.05}
+{'loss': 2.8967, 'grad_norm': 1.0213969945907593, 'learning_rate': 0.00018059999999999997, 'epoch': 0.05}
+{'loss': 2.8706, 'grad_norm': 2.1471173763275146, 'learning_rate': 0.00018119999999999999, 'epoch': 0.05}
+{'loss': 2.9031, 'grad_norm': 1.895684003829956, 'learning_rate': 0.00018179999999999997, 'epoch': 0.05}
+{'loss': 2.9365, 'grad_norm': 2.477410316467285, 'learning_rate': 0.0001824, 'epoch': 0.05}
+{'loss': 2.8607, 'grad_norm': 1.3221207857131958, 'learning_rate': 0.00018299999999999998, 'epoch': 0.05}
+{'loss': 2.9193, 'grad_norm': 1.6282726526260376, 'learning_rate': 0.0001836, 'epoch': 0.05}
+{'loss': 2.8905, 'grad_norm': 0.8280702829360962, 'learning_rate': 0.00018419999999999998, 'epoch': 0.05}
+{'loss': 2.8843, 'grad_norm': 1.868796706199646, 'learning_rate': 0.0001848, 'epoch': 0.05}
+{'loss': 2.9584, 'grad_norm': 2.179255723953247, 'learning_rate': 0.00018539999999999998, 'epoch': 0.05}
+{'loss': 2.8541, 'grad_norm': 1.0927103757858276, 'learning_rate': 0.000186, 'epoch': 0.05}
+{'loss': 2.8738, 'grad_norm': 1.2639646530151367, 'learning_rate': 0.00018659999999999998, 'epoch': 0.05}
+{'loss': 2.9175, 'grad_norm': 0.666633665561676, 'learning_rate': 0.0001872, 'epoch': 0.05}
+{'loss': 2.9048, 'grad_norm': 3.4493837356567383, 'learning_rate': 0.00018779999999999998, 'epoch': 0.05}
+{'loss': 2.8725, 'grad_norm': 0.9418443441390991, 'learning_rate': 0.00018839999999999997, 'epoch': 0.05}
+{'loss': 2.8598, 'grad_norm': 1.8465272188186646, 'learning_rate': 0.00018899999999999999, 'epoch': 0.05}
+{'loss': 2.9232, 'grad_norm': 1.2343814373016357, 'learning_rate': 0.00018959999999999997, 'epoch': 0.05}
+{'loss': 2.9062, 'grad_norm': 1.1628434658050537, 'learning_rate': 0.0001902, 'epoch': 0.05}
+{'loss': 2.993, 'grad_norm': 3.26182222366333, 'learning_rate': 0.00019079999999999998, 'epoch': 0.05}
+{'loss': 2.9229, 'grad_norm': 2.6465306282043457, 'learning_rate': 0.0001914, 'epoch': 0.05}
+{'loss': 2.9405, 'grad_norm': 1.6786047220230103, 'learning_rate': 0.00019199999999999998, 'epoch': 0.05}
+{'loss': 2.8359, 'grad_norm': 2.313570022583008, 'learning_rate': 0.0001926, 'epoch': 0.05}
+{'loss': 2.9089, 'grad_norm': 1.3497850894927979, 'learning_rate': 0.00019319999999999998, 'epoch': 0.05}
+{'loss': 2.8491, 'grad_norm': 0.9331809282302856, 'learning_rate': 0.0001938, 'epoch': 0.05}
+{'loss': 2.897, 'grad_norm': 1.8141287565231323, 'learning_rate': 0.00019439999999999998, 'epoch': 0.05}
+{'loss': 2.8376, 'grad_norm': 1.0781328678131104, 'learning_rate': 0.000195, 'epoch': 0.05}
+{'loss': 2.8983, 'grad_norm': 0.9681340456008911, 'learning_rate': 0.00019559999999999998, 'epoch': 0.05}
+{'loss': 2.8991, 'grad_norm': 4.107593059539795, 'learning_rate': 0.0001962, 'epoch': 0.05}
+  3%|▎         | 332/12776 [02:29<1:01:49,  3.35it/s]  3%|▎         | 333/12776 [02:29<58:53,  3.52it/s]                                                       3%|▎         | 333/12776 [02:29<58:53,  3.52it/s]  3%|▎         | 334/12776 [02:29<56:58,  3.64it/s]                                                     3%|▎         | 334/12776 [02:29<56:58,  3.64it/s]  3%|▎         | 335/12776 [02:30<1:02:31,  3.32it/s]                                                       3%|▎         | 335/12776 [02:30<1:02:31,  3.32it/s]  3%|▎         | 336/12776 [02:30<58:25,  3.55it/s]                                                       3%|▎         | 336/12776 [02:30<58:25,  3.55it/s]  3%|▎         | 337/12776 [02:30<54:53,  3.78it/s]                                                     3%|▎         | 337/12776 [02:30<54:53,  3.78it/s]  3%|▎         | 338/12776 [02:30<51:56,  3.99it/s]                                                     3%|▎         | 338/12776 [02:30<51:56,  3.99it/s]  3%|▎         | 339/12776 [02:31<55:47,  3.72it/s]                                                     3%|▎         | 339/12776 [02:31<55:47,  3.72it/s]  3%|▎         | 340/12776 [02:31<52:05,  3.98it/s]                                                     3%|▎         | 340/12776 [02:31<52:05,  3.98it/s]  3%|▎         | 341/12776 [02:31<49:18,  4.20it/s]                                                     3%|▎         | 341/12776 [02:31<49:18,  4.20it/s]  3%|▎         | 342/12776 [02:31<46:56,  4.41it/s]                                                     3%|▎         | 342/12776 [02:31<46:56,  4.41it/s]  3%|▎         | 343/12776 [02:32<45:28,  4.56it/s]                                                     3%|▎         | 343/12776 [02:32<45:28,  4.56it/s]  3%|▎         | 344/12776 [02:32<50:08,  4.13it/s]                                                     3%|▎         | 344/12776 [02:32<50:08,  4.13it/s]  3%|▎         | 345/12776 [02:32<47:12,  4.39it/s]                                                     3%|▎         | 345/12776 [02:32<47:12,  4.39it/s]  3%|▎         | 346/12776 [02:32<45:33,  4.55it/s]                                                     3%|▎         | 346/12776 [02:32<45:33,  4.55it/s]  3%|▎         | 347/12776 [02:32<43:32,  4.76it/s]                                                     3%|▎         | 347/12776 [02:32<43:32,  4.76it/s]  3%|▎         | 348/12776 [02:33<41:47,  4.96it/s]                                                     3%|▎         | 348/12776 [02:33<41:47,  4.96it/s]  3%|▎         | 349/12776 [02:33<40:33,  5.11it/s]                                                     3%|▎         | 349/12776 [02:33<40:33,  5.11it/s]  3%|▎         | 350/12776 [02:34<1:12:48,  2.84it/s]                                                       3%|▎         | 350/12776 [02:34<1:12:48,  2.84it/s]  3%|▎         | 351/12776 [02:35<2:20:09,  1.48it/s]                                                       3%|▎         | 351/12776 [02:35<2:20:09,  1.48it/s]  3%|▎         | 352/12776 [02:36<2:36:53,  1.32it/s]                                                       3%|▎         | 352/12776 [02:36<2:36:53,  1.32it/s]  3%|▎         | 353/12776 [02:37<2:42:31,  1.27it/s]                                                       3%|▎         | 353/12776 [02:37<2:42:31,  1.27it/s]  3%|▎         | 354/12776 [02:38<2:44:02,  1.26it/s]                                                       3%|▎         | 354/12776 [02:38<2:44:02,  1.26it/s]  3%|▎         | 355/12776 [02:38<2:40:36,  1.29it/s]                                                       3%|▎         | 355/12776 [02:38<2:40:36,  1.29it/s]  3%|▎         | 356/12776 [02:39<2:40:55,  1.29it/s]                                                       3%|▎         | 356/12776 [02:39<2:40:55,  1.29it/s]  3%|▎         | 357/12776 [02:40<2:41:26,  1.28it/s]                                                       3%|▎         | 357/12776 [02:40<2:41:26,  1.28it/s]  3%|▎         | 358/12776 [02:41<2:33:11,  1.35it/s]                                                       3%|▎         | 358/12776 [02:41<2:33:11,  1.35it/s]  3%|▎         | 359/12776 [02:41<2:35:18,  1.33it/s]                                                       3%|▎         | 359/12776 [02:41<2:35:18,  1.33it/s]  3%|▎         | 360/12776 [02:42<2:24:38,  1.43it/s]                                                       3%|▎         | 360/12776 [02:42<2:24:38,  1.43it/s]  3%|▎         | 361/12776 [02:43<2:19:58,  1.48it/s]                                                       3%|▎         | 361/12776 [02:43<2:19:58,  1.48it/s]  3%|▎         | 362/12776 [02:43<2:11:03,  1.58it/s]                                                       3%|▎         | 362/12776 [02:43<2:11:03,  1.58it/s]  3%|▎         | 363/12776 [02:44<2:06:54,  1.63it/s]                                                       3%|▎         | 363/12776 [02:44<2:06:54,  1.63it/s]  3%|▎         | 364/12776 [02:44<1:57:41,  1.76it/s]                                                       3%|▎         | 364/12776 [02:44<1:57:41,  1.76it/s]  3%|▎         | 365/12776 [02:45<1:55:31,  1.79it/s]                                                       3%|▎         | 365/12776 [02:45<1:55:31,  1.79it/s]  3%|▎         | 366/12776 [02:45<1:47:08,  1.93it/s]                                                       3%|▎         | 366/12776 [02:45<1:47:08,  1.93it/s]  3%|▎         | 367/12776 [02:46<1:47:19,  1.93it/s]                                                       3%|▎         | 367/12776 [02:46<1:47:19,  1.93it/s]  3%|▎         | 368/12776 [02:46<1:40:12,  2.06it/s]                                                       3%|▎         | 368/12776 [02:46<1:40:12,  2.06it/s]  3%|▎         | 369/12776 [02:46<1:33:58,  2.20it/s]                                                       3%|▎         | 369/12776 [02:46<1:33:58,  2.20it/s]  3%|▎         | 370/12776 [02:47<1:37:47,  2.11it/s]                                                       3%|▎         | 370/12776 [02:47<1:37:47,  2.11it/s]  3%|▎         | 371/12776 [02:47<1:30:23,  2.29it/s]                                                       3%|▎         | 371/12776 [02:47<1:30:23,  2.29it/s]  3%|▎         | 372/12776 [02:48<1:24:55,  2.43it/s]                                                       3%|▎         | 372/12776 [02:48<1:24:55,  2.43it/s]  3%|▎         | 373/12776 [02:48<1:25:29,  2.42it/s]                                                       3%|▎         | 373/12776 [02:48<1:25:29,  2.42it/s]  3%|▎         | 374/12776 [02:48<1:20:59,  2.55it/s]                                                       3%|▎         | 374/12776 [02:48<1:20:59,  2.55it/s]  3%|▎         | 375/12776 [02:49<1:16:28,  2.70it/s]                                                       3%|▎         | 375/12776 [02:49<1:16:28,  2.70it/s]  3%|▎         | 376/12776 [02:49<1:13:29,  2.81it/s]                                                       3%|▎         | 376/12776 [02:49<1:13:29,  2.81it/s]  3%|▎         | 377/12776 [02:49<1:09:34,  2.97it/s]                                                       3%|▎         | 377/12776 [02:49<1:09:34,  2.97it/s]  3%|▎         | 378/12776 [02:50<1:06:11,  3.12it/s]                                                       3%|▎         | 378/12776 [02:50<1:06:11,  3.12it/s]  3%|▎         | 379/12776 [02:50<1:03:32,  3.25it/s]                                                       3%|▎         | 379/12776 [02:50<1:03:32,  3.25it/s]  3%|▎         | 380/12776 [02:50<1:09:19,  2.98it/s]                                                       3%|▎         | 380/12776 [02:50<1:09:19,  2.98it/s]  3%|▎         | 381/12776 [02:50<1:04:37,  3.20it/s]                                                       3%|▎         | 381/12776 [02:50<1:04:37,  3.20it/s]  3%|▎         | 382/12776 [02:51<1:00:19,  3.42it/s]                                                       3%|▎         | 382/12776 [02:51<1:00:19,  3.42it/s]  3%|▎         | 383/12776 [02:51<57:30,  3.59it/s]                                                       3%|▎         | 383/12776 [02:51<57:30,  3.59it/s]  3%|▎         | 384/12776 [02:51<57:16,  3.61it/s]                                                     3%|▎         | 384/12776 [02:51<57:16,  3.61it/s]  3%|▎         | 385/12776 [02:51<54:45,  3.77it/s]                                                     3%|▎         | 385/12776 [02:51<54:45,  3.77it/s]  3%|▎         | 386/12776 [02:52<52:43,  3.92it/s]                                                     3%|▎         | 386/12776 [02:52<52:43,  3.92it/s]  3%|▎         | 387/12776 [02:52<50:53,  4.06it/s]                                                     3%|▎         | 387/12776 [02:52<50:53,  4.06it/s]  3%|▎         | 388/12776 [02:52<49:31,  4.17it/s]                                                     3%|▎         | 388/12776 [02:52<49:31,  4.17it/s]  3%|▎         | 389/12776 [02:53<55:07,  3.74it/s]                                                     3%|▎         | 389/12776 [02:53<55:07,  3.74it/s]  3%|▎         | 390/12776 [02:53<51:18,  4.02it/s]                                                     3%|▎         | 390/12776 [02:53<51:18,  4.02it/s]  3%|▎         | 391/12776 [02:53<48:45,  4.23it/s]                                                     3%|▎         | 391/12776 [02:53<48:45,  4.23it/s]  3%|▎         | 392/12776 [02:53<46:40,  4.42it/s]                                                     3%|▎         | 392/12776 [02:53<46:40,  4.42it/s]  3%|▎         | 393/12776 [02:53<44:54,  4.60it/s]                                                     3%|▎         | 393/12776 [02:53<44:54,  4.60it/s]  3%|▎         | 394/12776 [02:54<50:24,  4.09it/s]                                                     3%|▎         | 394/12776 [02:54<50:24,  4.09it/s]  3%|▎         | 395/12776 [02:54<47:13,  4.37it/s]                                                     3%|▎         | 395/12776 [02:54<47:13,  4.37it/s]  3%|▎         | 396/12776 [02:54<44:45,  4.61it/s]                                                     3%|▎         | 396/12776 [02:54<44:45,  4.61it/s]  3%|▎         | 397/12776 [02:54<43:01,  4.80it/s]                                                     3%|▎         | 397/12776 [02:54<43:01,  4.80it/s]  3%|▎         | 398/12776 [02:54<41:41,  4.95it/s]                                                     3%|��         | 398/12776 [02:54<41:41,  4.95it/s]  3%|▎         | 399/12776 [02:55<45:07,  4.57it/s]                                                     3%|▎         | 399/12776 [02:55<45:07,  4.57it/s]  3%|▎         | 400/12776 [02:55<1:19:32,  2.59it/s]                                                       3%|▎         | 400/12776 [02:55<1:19:32,  2.59it/s]Saving model checkpoint to ./checkpoint-400
+Configuration saved in ./checkpoint-400/config.json
+Model weights saved in ./checkpoint-400/model.safetensors
+Feature extractor saved in ./checkpoint-400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-400/special_tokens_map.json
+added tokens file saved in ./checkpoint-400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+  3%|▎         | 401/12776 [03:10<16:02:05,  4.66s/it]                                                        3%|▎         | 401/12776 [03:10<16:02:05,  4.66s/it]  3%|▎         | 402/12776 [03:11<12:15:47,  3.57s/it]                                                        3%|▎         | 402/12776 [03:11<12:15:47,  3.57s/it]  3%|▎         | 403/12776 [03:12<9:33:02,  2.78s/it]                                                        3%|▎         | 403/12776 [03:12<9:33:02,  2.78s/it]  3%|▎         | 404/12776 [03:13<7:26:24,  2.16s/it]                                                       3%|▎         | 404/12776 [03:13<7:26:24,  2.16s/it]  3%|▎         | 405/12776 [03:13<5:56:06,  1.73s/it]                                                       3%|▎         | 405/12776 [03:13<5:56:06,  1.73s/it]  3%|▎         | 406/12776 [03:14<4:53:08,  1.42s/it]                                                       3%|▎         | 406/12776 [03:14<4:53:08,  1.42s/it]  3%|▎         | 407/12776 [03:15<4:13:23,  1.23s/it]                                                       3%|▎         | 407/12776 [03:15<4:13:23,  1.23s/it]  3%|▎         | 408/12776 [03:16<3:35:10,  1.04s/it]                                                       3%|▎         | 408/12776 [03:16<3:35:10,  1.04s/it]  3%|▎         | 409/12776 [03:16<3:07:13,  1.10it/s]                                                       3%|▎         | 409/12776 [03:16<3:07:13,  1.10it/s]  3%|▎         | 410/12776 [03:17<2:43:55,  1.26it/s]                                                       3%|▎         | 410/12776 [03:17<2:43:55,  1.26it/s]  3%|▎         | 411/12776 [03:17<2:32:26,  1.35it/s]                                                       3%|▎         | 411/12776 [03:17<2:32:26,  1.35it/s]  3%|▎         | 412/12776 [03:18<2:17:06,  1.50it/s]                                                       3%|▎         | 412/12776 [03:18<2:17:06,  1.50it/s]  3%|▎         | 413/12776 [03:18<2:04:21,  1.66it/s]                                                     {'loss': 2.8443, 'grad_norm': 2.0109505653381348, 'learning_rate': 0.00019679999999999999, 'epoch': 0.05}
+{'loss': 2.8434, 'grad_norm': 3.9468677043914795, 'learning_rate': 0.0001974, 'epoch': 0.05}
+{'loss': 2.7809, 'grad_norm': 1.277677059173584, 'learning_rate': 0.000198, 'epoch': 0.05}
+{'loss': 2.9161, 'grad_norm': 1.9392768144607544, 'learning_rate': 0.0001986, 'epoch': 0.05}
+{'loss': 2.7724, 'grad_norm': 3.3220739364624023, 'learning_rate': 0.0001992, 'epoch': 0.05}
+{'loss': 2.8016, 'grad_norm': 2.631103754043579, 'learning_rate': 0.0001998, 'epoch': 0.05}
+{'loss': 2.8939, 'grad_norm': 2.869077444076538, 'learning_rate': 0.0002004, 'epoch': 0.05}
+{'loss': 2.7951, 'grad_norm': 1.6763360500335693, 'learning_rate': 0.000201, 'epoch': 0.05}
+{'loss': 2.6876, 'grad_norm': 1.35333251953125, 'learning_rate': 0.0002016, 'epoch': 0.05}
+{'loss': 2.7753, 'grad_norm': 1.591059684753418, 'learning_rate': 0.0002022, 'epoch': 0.05}
+{'loss': 2.9564, 'grad_norm': 2.1885745525360107, 'learning_rate': 0.0002028, 'epoch': 0.05}
+{'loss': 3.0515, 'grad_norm': 6.893604755401611, 'learning_rate': 0.00020339999999999998, 'epoch': 0.05}
+{'loss': 2.6812, 'grad_norm': 4.528074264526367, 'learning_rate': 0.000204, 'epoch': 0.05}
+{'loss': 2.6635, 'grad_norm': 1.5735338926315308, 'learning_rate': 0.00020459999999999999, 'epoch': 0.05}
+{'loss': 2.5343, 'grad_norm': 2.046394109725952, 'learning_rate': 0.0002052, 'epoch': 0.05}
+{'loss': 2.4338, 'grad_norm': 1.8369107246398926, 'learning_rate': 0.0002058, 'epoch': 0.05}
+{'loss': 2.2824, 'grad_norm': 1.8167654275894165, 'learning_rate': 0.00020639999999999998, 'epoch': 0.05}
+{'loss': 2.6324, 'grad_norm': 3.679330825805664, 'learning_rate': 0.00020699999999999996, 'epoch': 0.05}
+{'loss': 2.2228, 'grad_norm': 3.2513327598571777, 'learning_rate': 0.00020759999999999998, 'epoch': 0.05}
+{'loss': 2.9826, 'grad_norm': 3.872000217437744, 'learning_rate': 0.00020819999999999996, 'epoch': 0.05}
+{'loss': 3.3657, 'grad_norm': 4.465301036834717, 'learning_rate': 0.00020879999999999998, 'epoch': 0.06}
+{'loss': 3.4431, 'grad_norm': 7.596004486083984, 'learning_rate': 0.00020939999999999997, 'epoch': 0.06}
+{'loss': 2.8607, 'grad_norm': 0.8127701282501221, 'learning_rate': 0.00020999999999999998, 'epoch': 0.06}
+{'loss': 2.9144, 'grad_norm': 3.6727442741394043, 'learning_rate': 0.00021059999999999997, 'epoch': 0.06}
+{'loss': 2.9496, 'grad_norm': 3.5418031215667725, 'learning_rate': 0.00021119999999999996, 'epoch': 0.06}
+{'loss': 2.8433, 'grad_norm': 0.981769323348999, 'learning_rate': 0.00021179999999999997, 'epoch': 0.06}
+{'loss': 2.8561, 'grad_norm': 0.742219090461731, 'learning_rate': 0.00021239999999999996, 'epoch': 0.06}
+{'loss': 3.9582, 'grad_norm': 14.127908706665039, 'learning_rate': 0.00021299999999999997, 'epoch': 0.06}
+{'loss': 2.9154, 'grad_norm': 1.440157175064087, 'learning_rate': 0.00021359999999999996, 'epoch': 0.06}
+{'loss': 3.58, 'grad_norm': 12.539861679077148, 'learning_rate': 0.00021419999999999998, 'epoch': 0.06}
+{'loss': 3.0251, 'grad_norm': 0.9830080270767212, 'learning_rate': 0.00021479999999999996, 'epoch': 0.06}
+{'loss': 2.8486, 'grad_norm': 0.5790128707885742, 'learning_rate': 0.00021539999999999998, 'epoch': 0.06}
+{'loss': 3.0627, 'grad_norm': 4.547713279724121, 'learning_rate': 0.00021599999999999996, 'epoch': 0.06}
+{'loss': 2.8365, 'grad_norm': 1.8868635892868042, 'learning_rate': 0.00021659999999999998, 'epoch': 0.06}
+{'loss': 2.8774, 'grad_norm': 1.4860725402832031, 'learning_rate': 0.00021719999999999997, 'epoch': 0.06}
+{'loss': 2.9113, 'grad_norm': 1.064705729484558, 'learning_rate': 0.00021779999999999998, 'epoch': 0.06}
+{'loss': 2.9108, 'grad_norm': 1.752041220664978, 'learning_rate': 0.00021839999999999997, 'epoch': 0.06}
+{'loss': 2.8598, 'grad_norm': 0.8162034153938293, 'learning_rate': 0.00021899999999999998, 'epoch': 0.06}
+{'loss': 2.856, 'grad_norm': 0.7272463440895081, 'learning_rate': 0.00021959999999999997, 'epoch': 0.06}
+{'loss': 2.8943, 'grad_norm': 0.9358817338943481, 'learning_rate': 0.00022019999999999999, 'epoch': 0.06}
+{'loss': 2.822, 'grad_norm': 0.799842119216919, 'learning_rate': 0.00022079999999999997, 'epoch': 0.06}
+{'loss': 2.9312, 'grad_norm': 2.1993589401245117, 'learning_rate': 0.0002214, 'epoch': 0.06}
+{'loss': 3.2528, 'grad_norm': 7.407040596008301, 'learning_rate': 0.00022199999999999998, 'epoch': 0.06}
+{'loss': 2.8877, 'grad_norm': 0.9808345437049866, 'learning_rate': 0.0002226, 'epoch': 0.06}
+{'loss': 2.779, 'grad_norm': 1.277418613433838, 'learning_rate': 0.00022319999999999998, 'epoch': 0.06}
+{'loss': 2.8769, 'grad_norm': 1.093755841255188, 'learning_rate': 0.0002238, 'epoch': 0.06}
+{'loss': 2.8662, 'grad_norm': 2.1104671955108643, 'learning_rate': 0.00022439999999999998, 'epoch': 0.06}
+{'loss': 2.8349, 'grad_norm': 1.0569154024124146, 'learning_rate': 0.000225, 'epoch': 0.06}
+{'loss': 2.8008, 'grad_norm': 1.0086591243743896, 'learning_rate': 0.00022559999999999998, 'epoch': 0.06}
+{'loss': 2.7307, 'grad_norm': 2.0983245372772217, 'learning_rate': 0.00022619999999999997, 'epoch': 0.06}
+{'loss': 2.8152, 'grad_norm': 1.8184118270874023, 'learning_rate': 0.00022679999999999998, 'epoch': 0.06}
+{'loss': 2.6611, 'grad_norm': 1.5371543169021606, 'learning_rate': 0.00022739999999999997, 'epoch': 0.06}
+{'loss': 2.9117, 'grad_norm': 2.030109405517578, 'learning_rate': 0.00022799999999999999, 'epoch': 0.06}
+{'loss': 2.7593, 'grad_norm': 1.1472651958465576, 'learning_rate': 0.00022859999999999997, 'epoch': 0.06}
+{'loss': 2.9494, 'grad_norm': 2.547860622406006, 'learning_rate': 0.0002292, 'epoch': 0.06}
+{'loss': 2.7367, 'grad_norm': 4.287590980529785, 'learning_rate': 0.00022979999999999997, 'epoch': 0.06}
+{'loss': 2.8195, 'grad_norm': 2.5711324214935303, 'learning_rate': 0.0002304, 'epoch': 0.06}
+{'loss': 2.6613, 'grad_norm': 1.1869808435440063, 'learning_rate': 0.00023099999999999998, 'epoch': 0.06}
+{'loss': 2.7794, 'grad_norm': 2.388160467147827, 'learning_rate': 0.0002316, 'epoch': 0.06}
+{'loss': 2.9327, 'grad_norm': 2.665323257446289, 'learning_rate': 0.00023219999999999998, 'epoch': 0.06}
+{'loss': 2.7387, 'grad_norm': 1.2686117887496948, 'learning_rate': 0.0002328, 'epoch': 0.06}
+{'loss': 2.6176, 'grad_norm': 2.919185161590576, 'learning_rate': 0.00023339999999999998, 'epoch': 0.06}
+{'loss': 2.4814, 'grad_norm': 1.6040527820587158, 'learning_rate': 0.000234, 'epoch': 0.06}
+{'loss': 2.3291, 'grad_norm': 1.9547075033187866, 'learning_rate': 0.00023459999999999998, 'epoch': 0.06}
+{'loss': 2.3533, 'grad_norm': 2.361668109893799, 'learning_rate': 0.0002352, 'epoch': 0.06}
+{'loss': 2.1218, 'grad_norm': 3.003882646560669, 'learning_rate': 0.00023579999999999999, 'epoch': 0.06}
+{'loss': 2.3472, 'grad_norm': 2.0260565280914307, 'learning_rate': 0.0002364, 'epoch': 0.06}
+{'loss': 2.2098, 'grad_norm': 1.2082924842834473, 'learning_rate': 0.000237, 'epoch': 0.06}
+{'loss': 2.006, 'grad_norm': 2.548179864883423, 'learning_rate': 0.0002376, 'epoch': 0.06}
+{'loss': 5.0331, 'grad_norm': 25.11155128479004, 'learning_rate': 0.0002382, 'epoch': 0.06}
+{'loss': 3.1208, 'grad_norm': 7.020974159240723, 'learning_rate': 0.0002388, 'epoch': 0.06}
+{'loss': 3.0382, 'grad_norm': 5.667887210845947, 'learning_rate': 0.0002394, 'epoch': 0.06}
+{'loss': 2.8571, 'grad_norm': 1.9799600839614868, 'learning_rate': 0.00023999999999999998, 'epoch': 0.06}
+{'loss': 2.8546, 'grad_norm': 2.0452182292938232, 'learning_rate': 0.0002406, 'epoch': 0.06}
+{'loss': 2.9338, 'grad_norm': 3.450450897216797, 'learning_rate': 0.00024119999999999998, 'epoch': 0.06}
+{'loss': 2.8975, 'grad_norm': 3.7206132411956787, 'learning_rate': 0.0002418, 'epoch': 0.06}
+{'loss': 2.8855, 'grad_norm': 2.4928946495056152, 'learning_rate': 0.00024239999999999998, 'epoch': 0.06}
+{'loss': 2.8278, 'grad_norm': 1.6292774677276611, 'learning_rate': 0.000243, 'epoch': 0.06}
+{'loss': 2.8089, 'grad_norm': 1.6044974327087402, 'learning_rate': 0.00024359999999999999, 'epoch': 0.06}
+{'loss': 2.97, 'grad_norm': 3.7000677585601807, 'learning_rate': 0.00024419999999999997, 'epoch': 0.06}
+{'loss': 3.5765, 'grad_norm': 12.418329238891602, 'learning_rate': 0.0002448, 'epoch': 0.06}
+  3%|▎         | 413/12776 [03:18<2:04:21,  1.66it/s]  3%|▎         | 414/12776 [03:19<1:54:51,  1.79it/s]                                                       3%|▎         | 414/12776 [03:19<1:54:51,  1.79it/s]  3%|▎         | 415/12776 [03:19<1:46:16,  1.94it/s]                                                       3%|▎         | 415/12776 [03:19<1:46:16,  1.94it/s]  3%|▎         | 416/12776 [03:20<1:44:34,  1.97it/s]                                                       3%|▎         | 416/12776 [03:20<1:44:34,  1.97it/s]  3%|▎         | 417/12776 [03:20<1:37:09,  2.12it/s]                                                       3%|▎         | 417/12776 [03:20<1:37:09,  2.12it/s]  3%|▎         | 418/12776 [03:20<1:30:46,  2.27it/s]                                                       3%|▎         | 418/12776 [03:20<1:30:46,  2.27it/s]  3%|▎         | 419/12776 [03:21<1:29:10,  2.31it/s]                                                       3%|▎         | 419/12776 [03:21<1:29:10,  2.31it/s]  3%|▎         | 420/12776 [03:21<1:24:26,  2.44it/s]                                                       3%|▎         | 420/12776 [03:21<1:24:26,  2.44it/s]  3%|▎         | 421/12776 [03:21<1:19:56,  2.58it/s]                                                       3%|▎         | 421/12776 [03:21<1:19:56,  2.58it/s]  3%|▎         | 422/12776 [03:22<1:22:51,  2.48it/s]                                                       3%|▎         | 422/12776 [03:22<1:22:51,  2.48it/s]  3%|▎         | 423/12776 [03:22<1:17:21,  2.66it/s]                                                       3%|▎         | 423/12776 [03:22<1:17:21,  2.66it/s]  3%|▎         | 424/12776 [03:22<1:12:06,  2.86it/s]                                                       3%|▎         | 424/12776 [03:22<1:12:06,  2.86it/s]  3%|▎         | 425/12776 [03:23<1:07:38,  3.04it/s]                                                       3%|▎         | 425/12776 [03:23<1:07:38,  3.04it/s]  3%|▎         | 426/12776 [03:23<1:11:09,  2.89it/s]                                                       3%|▎         | 426/12776 [03:23<1:11:09,  2.89it/s]  3%|▎         | 427/12776 [03:23<1:07:02,  3.07it/s]                                                       3%|▎         | 427/12776 [03:23<1:07:02,  3.07it/s]  3%|▎         | 428/12776 [03:24<1:03:00,  3.27it/s]                                                       3%|▎         | 428/12776 [03:24<1:03:00,  3.27it/s]  3%|▎         | 429/12776 [03:24<1:00:32,  3.40it/s]                                                       3%|▎         | 429/12776 [03:24<1:00:32,  3.40it/s]  3%|▎         | 430/12776 [03:24<1:03:53,  3.22it/s]                                                       3%|▎         | 430/12776 [03:24<1:03:53,  3.22it/s]  3%|▎         | 431/12776 [03:25<59:40,  3.45it/s]                                                       3%|▎         | 431/12776 [03:25<59:40,  3.45it/s]  3%|▎         | 432/12776 [03:25<55:57,  3.68it/s]                                                     3%|▎         | 432/12776 [03:25<55:57,  3.68it/s]  3%|▎         | 433/12776 [03:25<53:10,  3.87it/s]                                                     3%|▎         | 433/12776 [03:25<53:10,  3.87it/s]  3%|▎         | 434/12776 [03:25<56:50,  3.62it/s]                                                     3%|▎         | 434/12776 [03:25<56:50,  3.62it/s]  3%|▎         | 435/12776 [03:26<53:00,  3.88it/s]                                                     3%|▎         | 435/12776 [03:26<53:00,  3.88it/s]  3%|▎         | 436/12776 [03:26<50:01,  4.11it/s]                                                     3%|▎         | 436/12776 [03:26<50:01,  4.11it/s]  3%|▎         | 437/12776 [03:26<47:36,  4.32it/s]                                                     3%|▎         | 437/12776 [03:26<47:36,  4.32it/s]  3%|▎         | 438/12776 [03:26<45:28,  4.52it/s]                                                     3%|▎         | 438/12776 [03:26<45:28,  4.52it/s]  3%|▎         | 439/12776 [03:26<50:04,  4.11it/s]                                                     3%|▎         | 439/12776 [03:26<50:04,  4.11it/s]  3%|▎         | 440/12776 [03:27<44:13,  4.65it/s]                                                     3%|▎         | 440/12776 [03:27<44:13,  4.65it/s]  3%|▎         | 441/12776 [03:27<41:46,  4.92it/s]                                                     3%|▎         | 441/12776 [03:27<41:46,  4.92it/s]  3%|▎         | 442/12776 [03:27<40:35,  5.06it/s]                                                     3%|▎         | 442/12776 [03:27<40:35,  5.06it/s]  3%|▎         | 443/12776 [03:27<39:31,  5.20it/s]                                                     3%|▎         | 443/12776 [03:27<39:31,  5.20it/s]  3%|▎         | 444/12776 [03:27<38:28,  5.34it/s]                                                     3%|▎         | 444/12776 [03:27<38:28,  5.34it/s]  3%|▎         | 445/12776 [03:28<43:41,  4.70it/s]                                                     3%|▎         | 445/12776 [03:28<43:41,  4.70it/s]  3%|▎         | 446/12776 [03:28<41:10,  4.99it/s]                                                     3%|▎         | 446/12776 [03:28<41:10,  4.99it/s]  3%|▎         | 447/12776 [03:28<39:13,  5.24it/s]                                                     3%|▎         | 447/12776 [03:28<39:13,  5.24it/s]  4%|▎         | 448/12776 [03:28<37:27,  5.49it/s]                                                     4%|▎         | 448/12776 [03:28<37:27,  5.49it/s]  4%|▎         | 449/12776 [03:28<36:13,  5.67it/s]                                                     4%|▎         | 449/12776 [03:28<36:13,  5.67it/s]  4%|▎         | 450/12776 [03:29<1:08:11,  3.01it/s]                                                       4%|▎         | 450/12776 [03:29<1:08:11,  3.01it/s]  4%|▎         | 451/12776 [03:30<2:19:30,  1.47it/s]                                                       4%|▎         | 451/12776 [03:30<2:19:30,  1.47it/s]  4%|▎         | 452/12776 [03:31<2:37:44,  1.30it/s]                                                       4%|▎         | 452/12776 [03:31<2:37:44,  1.30it/s]  4%|▎         | 453/12776 [03:32<2:43:14,  1.26it/s]                                                       4%|▎         | 453/12776 [03:32<2:43:14,  1.26it/s]  4%|▎         | 454/12776 [03:33<2:41:44,  1.27it/s]                                                       4%|▎         | 454/12776 [03:33<2:41:44,  1.27it/s]  4%|▎         | 455/12776 [03:34<2:37:54,  1.30it/s]                                                       4%|▎         | 455/12776 [03:34<2:37:54,  1.30it/s]  4%|▎         | 456/12776 [03:35<2:36:18,  1.31it/s]                                                       4%|▎         | 456/12776 [03:35<2:36:18,  1.31it/s]  4%|▎         | 457/12776 [03:35<2:30:45,  1.36it/s]                                                       4%|▎         | 457/12776 [03:35<2:30:45,  1.36it/s]  4%|▎         | 458/12776 [03:36<2:31:37,  1.35it/s]                                                       4%|▎         | 458/12776 [03:36<2:31:37,  1.35it/s]  4%|▎         | 459/12776 [03:37<2:23:32,  1.43it/s]                                                       4%|▎         | 459/12776 [03:37<2:23:32,  1.43it/s]  4%|▎         | 460/12776 [03:37<2:18:06,  1.49it/s]                                                       4%|▎         | 460/12776 [03:37<2:18:06,  1.49it/s]  4%|▎         | 461/12776 [03:38<2:10:54,  1.57it/s]                                                       4%|▎         | 461/12776 [03:38<2:10:54,  1.57it/s]  4%|▎         | 462/12776 [03:38<2:06:45,  1.62it/s]                                                       4%|▎         | 462/12776 [03:38<2:06:45,  1.62it/s]  4%|▎         | 463/12776 [03:39<2:00:21,  1.71it/s]                                                       4%|▎         | 463/12776 [03:39<2:00:21,  1.71it/s]  4%|▎         | 464/12776 [03:39<2:00:30,  1.70it/s]                                                       4%|▎         | 464/12776 [03:39<2:00:30,  1.70it/s]  4%|▎         | 465/12776 [03:40<1:52:25,  1.82it/s]                                                       4%|▎         | 465/12776 [03:40<1:52:25,  1.82it/s]  4%|▎         | 466/12776 [03:40<1:54:15,  1.80it/s]                                                       4%|▎         | 466/12776 [03:40<1:54:15,  1.80it/s]  4%|▎         | 467/12776 [03:41<1:46:11,  1.93it/s]                                                       4%|▎         | 467/12776 [03:41<1:46:11,  1.93it/s]  4%|▎         | 468/12776 [03:41<1:48:33,  1.89it/s]                                                       4%|▎         | 468/12776 [03:41<1:48:33,  1.89it/s]  4%|▎         | 469/12776 [03:42<1:39:59,  2.05it/s]                                                       4%|▎         | 469/12776 [03:42<1:39:59,  2.05it/s]  4%|▎         | 470/12776 [03:42<1:32:56,  2.21it/s]                                                       4%|▎         | 470/12776 [03:42<1:32:56,  2.21it/s]  4%|▎         | 471/12776 [03:43<1:29:33,  2.29it/s]                                                       4%|▎         | 471/12776 [03:43<1:29:33,  2.29it/s]  4%|▎         | 472/12776 [03:43<1:23:56,  2.44it/s]                                                       4%|▎         | 472/12776 [03:43<1:23:56,  2.44it/s]  4%|▎         | 473/12776 [03:43<1:20:00,  2.56it/s]                                                       4%|▎         | 473/12776 [03:43<1:20:00,  2.56it/s]  4%|▎         | 474/12776 [03:44<1:22:37,  2.48it/s]                                                       4%|▎         | 474/12776 [03:44<1:22:37,  2.48it/s]  4%|▎         | 475/12776 [03:44<1:17:11,  2.66it/s]                                                       4%|▎         | 475/12776 [03:44<1:17:11,  2.66it/s]  4%|▎         | 476/12776 [03:44<1:12:42,  2.82it/s]                                                       4%|▎         | 476/12776 [03:44<1:12:42,  2.82it/s]  4%|▎         | 477/12776 [03:45<1:08:46,  2.98it/s]                                                       4%|▎         | 477/12776 [03:45<1:08:46,  2.98it/s]  4%|▎         | 478/12776 [03:45<1:09:59,  2.93it/s]                                                       4%|▎         | 478/12776 [03:45<1:09:59,  2.93it/s]  4%|▎         | 479/12776 [03:45<1:05:46,  3.12it/s]                                                       4%|▎         | 479/12776 [03:45<1:05:46,  3.12it/s]  4%|▍         | 480/12776 [03:45<1:02:35,  3.27it/s]                                                       4%|▍         | 480/12776 [03:45<1:02:35,  3.27it/s]  4%|▍         | 481/12776 [03:46<59:30,  3.44it/s]                                                       4%|▍         | 481/12776 [03:46<59:30,  3.44it/s]  4%|▍         | 482/12776 [03:46<1:00:47,  3.37it/s]                                                       4%|▍         | 482/12776 [03:46<1:00:47,  3.37it/s]  4%|▍         | 483/12776 [03:46<57:38,  3.55it/s]                                                       4%|▍         | 483/12776 [03:46<57:38,  3.55it/s]  4%|▍         | 484/12776 [03:47<55:14,  3.71it/s]                                                     4%|▍         | 484/12776 [03:47<55:14,  3.71it/s]  4%|▍         | 485/12776 [03:47<52:50,  3.88it/s]                                                     4%|▍         | 485/12776 [03:47<52:50,  3.88it/s]  4%|▍         | 486/12776 [03:47<54:37,  3.75it/s]                                                     4%|▍         | 486/12776 [03:47<54:37,  3.75it/s]  4%|▍         | 487/12776 [03:47<51:33,  3.97it/s]                                                     4%|▍         | 487/12776 [03:47<51:33,  3.97it/s]  4%|▍         | 488/12776 [03:47<49:01,  4.18it/s]                                                     4%|▍         | 488/12776 [03:47<49:01,  4.18it/s]  4%|▍         | 489/12776 [03:48<47:04,  4.35it/s]                                                     4%|▍         | 489/12776 [03:48<47:04,  4.35it/s]  4%|▍         | 490/12776 [03:48<45:43,  4.48it/s]                                                     4%|▍         | 490/12776 [03:48<45:43,  4.48it/s]  4%|▍         | 491/12776 [03:48<50:38,  4.04it/s]                                                     4%|▍         | 491/12776 [03:48<50:38,  4.04it/s]  4%|▍         | 492/12776 [03:48<47:51,  4.28it/s]                                                     4%|▍         | 492/12776 [03:48<47:51,  4.28it/s]  4%|▍         | 493/12776 [03:49<45:44,  4.48it/s]                                                     4%|▍         | 493/12776 [03:49<45:44,  4.48it/s]  4%|▍         | 494/12776 [03:49<44:05,  4.64it/s]                                                     4%|▍         | 494/12776 [03:49<44:05,  4.64it/s]  4%|▍         | 495/12776 [03:49<42:49,  4.78it/s]                                                     4%|▍         | 495/12776 [03:49<42:49,  4.78it/s]  4%|▍         | 496/12776 [03:49<41:44,  4.90it/s]                                                   {'loss': 2.889, 'grad_norm': 3.708932638168335, 'learning_rate': 0.00024539999999999995, 'epoch': 0.06}
+{'loss': 2.8552, 'grad_norm': 2.5494682788848877, 'learning_rate': 0.00024599999999999996, 'epoch': 0.06}
+{'loss': 2.8204, 'grad_norm': 0.5013105273246765, 'learning_rate': 0.0002466, 'epoch': 0.06}
+{'loss': 2.7921, 'grad_norm': 0.6971216201782227, 'learning_rate': 0.0002472, 'epoch': 0.07}
+{'loss': 2.8253, 'grad_norm': 1.244098424911499, 'learning_rate': 0.00024779999999999995, 'epoch': 0.07}
+{'loss': 2.7789, 'grad_norm': 1.136309027671814, 'learning_rate': 0.00024839999999999997, 'epoch': 0.07}
+{'loss': 2.7981, 'grad_norm': 1.103389859199524, 'learning_rate': 0.000249, 'epoch': 0.07}
+{'loss': 2.777, 'grad_norm': 0.6065533757209778, 'learning_rate': 0.00024959999999999994, 'epoch': 0.07}
+{'loss': 2.779, 'grad_norm': 1.2943936586380005, 'learning_rate': 0.00025019999999999996, 'epoch': 0.07}
+{'loss': 2.7543, 'grad_norm': 1.5812445878982544, 'learning_rate': 0.00025079999999999997, 'epoch': 0.07}
+{'loss': 2.7617, 'grad_norm': 0.6058809757232666, 'learning_rate': 0.0002514, 'epoch': 0.07}
+{'loss': 2.776, 'grad_norm': 0.6701329946517944, 'learning_rate': 0.00025199999999999995, 'epoch': 0.07}
+{'loss': 2.8081, 'grad_norm': 1.4818625450134277, 'learning_rate': 0.00025259999999999996, 'epoch': 0.07}
+{'loss': 2.7473, 'grad_norm': 0.9696059226989746, 'learning_rate': 0.0002532, 'epoch': 0.07}
+{'loss': 2.7107, 'grad_norm': 0.6371885538101196, 'learning_rate': 0.0002538, 'epoch': 0.07}
+{'loss': 2.7509, 'grad_norm': 2.670564889907837, 'learning_rate': 0.00025439999999999995, 'epoch': 0.07}
+{'loss': 2.5508, 'grad_norm': 1.7934705018997192, 'learning_rate': 0.00025499999999999996, 'epoch': 0.07}
+{'loss': 2.7541, 'grad_norm': 1.4174633026123047, 'learning_rate': 0.0002556, 'epoch': 0.07}
+{'loss': 2.6337, 'grad_norm': 1.3677154779434204, 'learning_rate': 0.0002562, 'epoch': 0.07}
+{'loss': 2.6692, 'grad_norm': 3.6003217697143555, 'learning_rate': 0.00025679999999999995, 'epoch': 0.07}
+{'loss': 2.5543, 'grad_norm': 1.2599115371704102, 'learning_rate': 0.00025739999999999997, 'epoch': 0.07}
+{'loss': 2.6359, 'grad_norm': 0.8455353379249573, 'learning_rate': 0.000258, 'epoch': 0.07}
+{'loss': 2.7305, 'grad_norm': 0.9858604073524475, 'learning_rate': 0.0002586, 'epoch': 0.07}
+{'loss': 2.7262, 'grad_norm': 1.948320984840393, 'learning_rate': 0.00025919999999999996, 'epoch': 0.07}
+{'loss': 2.7447, 'grad_norm': 1.0108580589294434, 'learning_rate': 0.00025979999999999997, 'epoch': 0.07}
+{'loss': 2.6928, 'grad_norm': 1.686102032661438, 'learning_rate': 0.0002604, 'epoch': 0.07}
+{'loss': 2.4659, 'grad_norm': 1.4326525926589966, 'learning_rate': 0.000261, 'epoch': 0.07}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.000261, 'epoch': 0.07}
+{'loss': 2.502, 'grad_norm': 3.392507791519165, 'learning_rate': 0.00026159999999999996, 'epoch': 0.07}
+{'loss': 2.3608, 'grad_norm': 1.325119137763977, 'learning_rate': 0.0002622, 'epoch': 0.07}
+{'loss': 2.7061, 'grad_norm': 2.0018181800842285, 'learning_rate': 0.0002628, 'epoch': 0.07}
+{'loss': 2.0728, 'grad_norm': 2.351816415786743, 'learning_rate': 0.00026339999999999995, 'epoch': 0.07}
+{'loss': 2.4649, 'grad_norm': 2.392810583114624, 'learning_rate': 0.00026399999999999997, 'epoch': 0.07}
+{'loss': 1.9439, 'grad_norm': 2.2967441082000732, 'learning_rate': 0.0002646, 'epoch': 0.07}
+{'loss': 2.0041, 'grad_norm': 1.640772819519043, 'learning_rate': 0.0002652, 'epoch': 0.07}
+{'loss': 1.9843, 'grad_norm': 2.1847622394561768, 'learning_rate': 0.00026579999999999996, 'epoch': 0.07}
+{'loss': 2.0595, 'grad_norm': 1.9607410430908203, 'learning_rate': 0.00026639999999999997, 'epoch': 0.07}
+{'loss': 1.9382, 'grad_norm': 1.476098656654358, 'learning_rate': 0.000267, 'epoch': 0.07}
+{'loss': 3.0205, 'grad_norm': 5.004290580749512, 'learning_rate': 0.0002676, 'epoch': 0.07}
+{'loss': 3.5319, 'grad_norm': 11.529528617858887, 'learning_rate': 0.00026819999999999996, 'epoch': 0.07}
+{'loss': 2.7755, 'grad_norm': 1.561353325843811, 'learning_rate': 0.0002688, 'epoch': 0.07}
+{'loss': 2.7657, 'grad_norm': 1.1581677198410034, 'learning_rate': 0.0002694, 'epoch': 0.07}
+{'loss': 2.6956, 'grad_norm': 0.837934136390686, 'learning_rate': 0.00027, 'epoch': 0.07}
+{'loss': 2.5976, 'grad_norm': 1.2540568113327026, 'learning_rate': 0.00027059999999999996, 'epoch': 0.07}
+{'loss': 2.5529, 'grad_norm': 1.265945315361023, 'learning_rate': 0.0002712, 'epoch': 0.07}
+{'loss': 2.5779, 'grad_norm': 0.9604879021644592, 'learning_rate': 0.0002718, 'epoch': 0.07}
+{'loss': 2.6235, 'grad_norm': 0.6527476906776428, 'learning_rate': 0.0002724, 'epoch': 0.07}
+{'loss': 2.4643, 'grad_norm': 0.6187277436256409, 'learning_rate': 0.00027299999999999997, 'epoch': 0.07}
+{'loss': 2.4603, 'grad_norm': 0.8495888710021973, 'learning_rate': 0.0002736, 'epoch': 0.07}
+{'loss': 2.4654, 'grad_norm': 0.812981128692627, 'learning_rate': 0.0002742, 'epoch': 0.07}
+{'loss': 2.7148, 'grad_norm': 3.680403709411621, 'learning_rate': 0.0002748, 'epoch': 0.07}
+{'loss': 2.4459, 'grad_norm': 0.837788462638855, 'learning_rate': 0.00027539999999999997, 'epoch': 0.07}
+{'loss': 2.3291, 'grad_norm': 0.9086034893989563, 'learning_rate': 0.000276, 'epoch': 0.07}
+{'loss': 2.4085, 'grad_norm': 1.12948739528656, 'learning_rate': 0.0002766, 'epoch': 0.07}
+{'loss': 2.2966, 'grad_norm': 0.9835497140884399, 'learning_rate': 0.0002772, 'epoch': 0.07}
+{'loss': 2.3619, 'grad_norm': 2.1482276916503906, 'learning_rate': 0.0002778, 'epoch': 0.07}
+{'loss': 2.3791, 'grad_norm': 3.3188507556915283, 'learning_rate': 0.0002784, 'epoch': 0.07}
+{'loss': 2.3225, 'grad_norm': 1.2607046365737915, 'learning_rate': 0.000279, 'epoch': 0.07}
+{'loss': 2.4383, 'grad_norm': 3.159202814102173, 'learning_rate': 0.00027959999999999997, 'epoch': 0.07}
+{'loss': 2.3881, 'grad_norm': 1.8510031700134277, 'learning_rate': 0.0002802, 'epoch': 0.07}
+{'loss': 2.2954, 'grad_norm': 1.1104274988174438, 'learning_rate': 0.0002808, 'epoch': 0.07}
+{'loss': 2.2092, 'grad_norm': 1.1513651609420776, 'learning_rate': 0.00028139999999999996, 'epoch': 0.07}
+{'loss': 2.2104, 'grad_norm': 1.2425847053527832, 'learning_rate': 0.00028199999999999997, 'epoch': 0.07}
+{'loss': 2.4311, 'grad_norm': 1.6782479286193848, 'learning_rate': 0.0002826, 'epoch': 0.07}
+{'loss': 2.3067, 'grad_norm': 3.411275625228882, 'learning_rate': 0.00028319999999999994, 'epoch': 0.07}
+{'loss': 2.2745, 'grad_norm': 1.4562594890594482, 'learning_rate': 0.00028379999999999996, 'epoch': 0.07}
+{'loss': 2.2636, 'grad_norm': 1.7229201793670654, 'learning_rate': 0.0002844, 'epoch': 0.07}
+{'loss': 2.2304, 'grad_norm': 1.804953932762146, 'learning_rate': 0.000285, 'epoch': 0.08}
+{'loss': 2.9409, 'grad_norm': 6.132996559143066, 'learning_rate': 0.00028559999999999995, 'epoch': 0.08}
+{'loss': 2.2299, 'grad_norm': 1.6565462350845337, 'learning_rate': 0.00028619999999999996, 'epoch': 0.08}
+{'loss': 2.3357, 'grad_norm': 1.50054132938385, 'learning_rate': 0.0002868, 'epoch': 0.08}
+{'loss': 2.4912, 'grad_norm': 2.0970003604888916, 'learning_rate': 0.00028739999999999994, 'epoch': 0.08}
+{'loss': 2.2346, 'grad_norm': 1.7060633897781372, 'learning_rate': 0.00028799999999999995, 'epoch': 0.08}
+{'loss': 2.2125, 'grad_norm': 3.9570698738098145, 'learning_rate': 0.00028859999999999997, 'epoch': 0.08}
+{'loss': 2.4241, 'grad_norm': 3.4521429538726807, 'learning_rate': 0.0002892, 'epoch': 0.08}
+{'loss': 2.4198, 'grad_norm': 2.9334945678710938, 'learning_rate': 0.00028979999999999994, 'epoch': 0.08}
+{'loss': 2.3674, 'grad_norm': 2.5098342895507812, 'learning_rate': 0.00029039999999999996, 'epoch': 0.08}
+{'loss': 2.1336, 'grad_norm': 2.8507261276245117, 'learning_rate': 0.00029099999999999997, 'epoch': 0.08}
+{'loss': 2.3893, 'grad_norm': 2.2666025161743164, 'learning_rate': 0.0002916, 'epoch': 0.08}
+{'loss': 1.7857, 'grad_norm': 2.3513152599334717, 'learning_rate': 0.00029219999999999995, 'epoch': 0.08}
+{'loss': 2.3626, 'grad_norm': 2.0074493885040283, 'learning_rate': 0.00029279999999999996, 'epoch': 0.08}
+{'loss': 2.0882, 'grad_norm': 1.9822458028793335, 'learning_rate': 0.0002934, 'epoch': 0.08}
+{'loss': 2.1448, 'grad_norm': 3.779900074005127, 'learning_rate': 0.000294, 'epoch': 0.08}
+  4%|▍         | 496/12776 [03:49<41:44,  4.90it/s]  4%|▍         | 497/12776 [03:49<46:15,  4.42it/s]                                                     4%|▍         | 497/12776 [03:49<46:15,  4.42it/s]  4%|▍         | 498/12776 [03:50<43:46,  4.67it/s]                                                     4%|▍         | 498/12776 [03:50<43:46,  4.67it/s]  4%|▍         | 499/12776 [03:50<41:57,  4.88it/s]                                                     4%|▍         | 499/12776 [03:50<41:57,  4.88it/s]  4%|▍         | 500/12776 [03:51<1:17:10,  2.65it/s]                                                       4%|▍         | 500/12776 [03:51<1:17:10,  2.65it/s]  4%|▍         | 501/12776 [03:52<2:20:45,  1.45it/s]                                                       4%|▍         | 501/12776 [03:52<2:20:45,  1.45it/s]  4%|▍         | 502/12776 [03:53<2:35:48,  1.31it/s]                                                       4%|▍         | 502/12776 [03:53<2:35:48,  1.31it/s]  4%|▍         | 503/12776 [03:54<2:51:55,  1.19it/s]                                                       4%|▍         | 503/12776 [03:54<2:51:55,  1.19it/s]  4%|▍         | 504/12776 [03:55<2:48:14,  1.22it/s]                                                       4%|▍         | 504/12776 [03:55<2:48:14,  1.22it/s]  4%|▍         | 505/12776 [03:56<2:47:36,  1.22it/s]                                                       4%|▍         | 505/12776 [03:56<2:47:36,  1.22it/s]  4%|▍         | 506/12776 [03:56<2:40:32,  1.27it/s]                                                       4%|▍         | 506/12776 [03:56<2:40:32,  1.27it/s]  4%|▍         | 507/12776 [03:57<2:32:38,  1.34it/s]                                                       4%|▍         | 507/12776 [03:57<2:32:38,  1.34it/s]  4%|▍         | 508/12776 [03:58<2:33:09,  1.34it/s]                                                       4%|▍         | 508/12776 [03:58<2:33:09,  1.34it/s]  4%|▍         | 509/12776 [03:58<2:24:45,  1.41it/s]                                                       4%|▍         | 509/12776 [03:58<2:24:45,  1.41it/s]  4%|▍         | 510/12776 [03:59<2:19:16,  1.47it/s]                                                       4%|▍         | 510/12776 [03:59<2:19:16,  1.47it/s]  4%|▍         | 511/12776 [03:59<2:12:21,  1.54it/s]                                                       4%|▍         | 511/12776 [03:59<2:12:21,  1.54it/s]  4%|▍         | 512/12776 [04:00<2:09:26,  1.58it/s]                                                       4%|▍         | 512/12776 [04:00<2:09:26,  1.58it/s]  4%|▍         | 513/12776 [04:01<2:02:32,  1.67it/s]                                                       4%|▍         | 513/12776 [04:01<2:02:32,  1.67it/s]  4%|▍         | 514/12776 [04:01<2:03:27,  1.66it/s]                                                       4%|▍         | 514/12776 [04:01<2:03:27,  1.66it/s]  4%|▍         | 515/12776 [04:02<1:56:00,  1.76it/s]                                                       4%|▍         | 515/12776 [04:02<1:56:00,  1.76it/s]  4%|▍         | 516/12776 [04:02<1:54:04,  1.79it/s]                                                       4%|▍         | 516/12776 [04:02<1:54:04,  1.79it/s]  4%|▍         | 517/12776 [04:03<1:46:59,  1.91it/s]                                                       4%|▍         | 517/12776 [04:03<1:46:59,  1.91it/s]  4%|▍         | 518/12776 [04:03<1:46:28,  1.92it/s]                                                       4%|▍         | 518/12776 [04:03<1:46:28,  1.92it/s]  4%|▍         | 519/12776 [04:04<1:39:13,  2.06it/s]                                                       4%|▍         | 519/12776 [04:04<1:39:13,  2.06it/s]  4%|▍         | 520/12776 [04:04<1:33:47,  2.18it/s]                                                       4%|▍         | 520/12776 [04:04<1:33:47,  2.18it/s]  4%|▍         | 521/12776 [04:05<1:37:53,  2.09it/s]                                                       4%|▍         | 521/12776 [04:05<1:37:53,  2.09it/s]  4%|▍         | 522/12776 [04:05<1:31:32,  2.23it/s]                                                       4%|▍         | 522/12776 [04:05<1:31:32,  2.23it/s]  4%|▍         | 523/12776 [04:05<1:25:55,  2.38it/s]                                                       4%|▍         | 523/12776 [04:05<1:25:55,  2.38it/s]  4%|▍         | 524/12776 [04:06<1:29:53,  2.27it/s]                                                       4%|▍         | 524/12776 [04:06<1:29:53,  2.27it/s]  4%|▍         | 525/12776 [04:06<1:23:10,  2.45it/s]                                                       4%|▍         | 525/12776 [04:06<1:23:10,  2.45it/s]  4%|▍         | 526/12776 [04:06<1:18:14,  2.61it/s]                                                       4%|▍         | 526/12776 [04:06<1:18:14,  2.61it/s]  4%|▍         | 527/12776 [04:07<1:19:39,  2.56it/s]                                                       4%|▍         | 527/12776 [04:07<1:19:39,  2.56it/s]  4%|▍         | 528/12776 [04:07<1:14:04,  2.76it/s]                                                       4%|▍         | 528/12776 [04:07<1:14:04,  2.76it/s]  4%|▍         | 529/12776 [04:07<1:09:52,  2.92it/s]                                                       4%|▍         | 529/12776 [04:07<1:09:52,  2.92it/s]  4%|▍         | 530/12776 [04:08<1:13:38,  2.77it/s]                                                       4%|▍         | 530/12776 [04:08<1:13:38,  2.77it/s]  4%|▍         | 531/12776 [04:08<1:07:49,  3.01it/s]                                                       4%|▍         | 531/12776 [04:08<1:07:49,  3.01it/s]  4%|▍         | 532/12776 [04:08<1:03:39,  3.21it/s]                                                       4%|▍         | 532/12776 [04:08<1:03:39,  3.21it/s]  4%|▍         | 533/12776 [04:09<1:00:08,  3.39it/s]                                                       4%|▍         | 533/12776 [04:09<1:00:08,  3.39it/s]  4%|▍         | 534/12776 [04:09<1:02:59,  3.24it/s]                                                       4%|▍         | 534/12776 [04:09<1:02:59,  3.24it/s]  4%|▍         | 535/12776 [04:09<58:56,  3.46it/s]                                                       4%|▍         | 535/12776 [04:09<58:56,  3.46it/s]  4%|▍         | 536/12776 [04:09<55:46,  3.66it/s]                                                     4%|▍         | 536/12776 [04:09<55:46,  3.66it/s]  4%|▍         | 537/12776 [04:10<53:34,  3.81it/s]                                                     4%|▍         | 537/12776 [04:10<53:34,  3.81it/s]  4%|▍         | 538/12776 [04:10<51:12,  3.98it/s]                                                     4%|▍         | 538/12776 [04:10<51:12,  3.98it/s]  4%|▍         | 539/12776 [04:10<55:34,  3.67it/s]                                                     4%|▍         | 539/12776 [04:10<55:34,  3.67it/s]  4%|▍         | 540/12776 [04:10<51:46,  3.94it/s]                                                     4%|▍         | 540/12776 [04:10<51:46,  3.94it/s]  4%|▍         | 541/12776 [04:11<49:04,  4.15it/s]                                                     4%|▍         | 541/12776 [04:11<49:04,  4.15it/s]  4%|▍         | 542/12776 [04:11<47:26,  4.30it/s]                                                     4%|▍         | 542/12776 [04:11<47:26,  4.30it/s]  4%|▍         | 543/12776 [04:11<45:37,  4.47it/s]                                                     4%|▍         | 543/12776 [04:11<45:37,  4.47it/s]  4%|▍         | 544/12776 [04:11<49:41,  4.10it/s]                                                     4%|▍         | 544/12776 [04:11<49:41,  4.10it/s]  4%|▍         | 545/12776 [04:12<46:41,  4.37it/s]                                                     4%|▍         | 545/12776 [04:12<46:41,  4.37it/s]  4%|▍         | 546/12776 [04:12<44:28,  4.58it/s]                                                     4%|▍         | 546/12776 [04:12<44:28,  4.58it/s]  4%|▍         | 547/12776 [04:12<42:43,  4.77it/s]                                                     4%|▍         | 547/12776 [04:12<42:43,  4.77it/s]  4%|▍         | 548/12776 [04:12<41:10,  4.95it/s]                                                     4%|▍         | 548/12776 [04:12<41:10,  4.95it/s]  4%|▍         | 549/12776 [04:12<45:39,  4.46it/s]                                                     4%|▍         | 549/12776 [04:12<45:39,  4.46it/s]  4%|▍         | 550/12776 [04:13<1:14:44,  2.73it/s]                                                       4%|▍         | 550/12776 [04:13<1:14:44,  2.73it/s]  4%|▍         | 551/12776 [04:15<2:24:52,  1.41it/s]                                                       4%|▍         | 551/12776 [04:15<2:24:52,  1.41it/s]  4%|▍         | 552/12776 [04:16<2:40:26,  1.27it/s]                                                       4%|▍         | 552/12776 [04:16<2:40:26,  1.27it/s]  4%|▍         | 553/12776 [04:16<2:44:04,  1.24it/s]                                                       4%|▍         | 553/12776 [04:16<2:44:04,  1.24it/s]  4%|▍         | 554/12776 [04:17<2:47:35,  1.22it/s]                                                       4%|▍         | 554/12776 [04:17<2:47:35,  1.22it/s]  4%|▍         | 555/12776 [04:18<2:49:00,  1.21it/s]                                                       4%|▍         | 555/12776 [04:18<2:49:00,  1.21it/s]  4%|▍         | 556/12776 [04:19<2:40:29,  1.27it/s]                                                       4%|▍         | 556/12776 [04:19<2:40:29,  1.27it/s]  4%|▍         | 557/12776 [04:20<2:36:04,  1.30it/s]                                                       4%|▍         | 557/12776 [04:20<2:36:04,  1.30it/s]  4%|▍         | 558/12776 [04:20<2:28:17,  1.37it/s]                                                       4%|▍         | 558/12776 [04:20<2:28:17,  1.37it/s]  4%|▍         | 559/12776 [04:21<2:22:32,  1.43it/s]                                                       4%|▍         | 559/12776 [04:21<2:22:32,  1.43it/s]  4%|▍         | 560/12776 [04:21<2:15:04,  1.51it/s]                                                       4%|▍         | 560/12776 [04:21<2:15:04,  1.51it/s]  4%|▍         | 561/12776 [04:22<2:09:23,  1.57it/s]                                                       4%|▍         | 561/12776 [04:22<2:09:23,  1.57it/s]  4%|▍         | 562/12776 [04:22<2:03:28,  1.65it/s]                                                       4%|▍         | 562/12776 [04:22<2:03:28,  1.65it/s]  4%|▍         | 563/12776 [04:23<2:00:53,  1.68it/s]                                                       4%|▍         | 563/12776 [04:23<2:00:53,  1.68it/s]  4%|▍         | 564/12776 [04:24<1:54:57,  1.77it/s]                                                       4%|▍         | 564/12776 [04:24<1:54:57,  1.77it/s]  4%|▍         | 565/12776 [04:24<1:51:48,  1.82it/s]                                                       4%|▍         | 565/12776 [04:24<1:51:48,  1.82it/s]  4%|▍         | 566/12776 [04:24<1:44:29,  1.95it/s]                                                       4%|▍         | 566/12776 [04:24<1:44:29,  1.95it/s]  4%|▍         | 567/12776 [04:25<1:44:56,  1.94it/s]                                                       4%|▍         | 567/12776 [04:25<1:44:56,  1.94it/s]  4%|▍         | 568/12776 [04:25<1:37:57,  2.08it/s]                                                       4%|▍         | 568/12776 [04:25<1:37:57,  2.08it/s]  4%|▍         | 569/12776 [04:26<1:31:54,  2.21it/s]                                                       4%|▍         | 569/12776 [04:26<1:31:54,  2.21it/s]  4%|▍         | 570/12776 [04:26<1:27:35,  2.32it/s]                                                       4%|▍         | 570/12776 [04:26<1:27:35,  2.32it/s]  4%|▍         | 571/12776 [04:27<1:23:00,  2.45it/s]                                                       4%|▍         | 571/12776 [04:27<1:23:00,  2.45it/s]  4%|▍         | 572/12776 [04:27<1:19:09,  2.57it/s]                                                       4%|▍         | 572/12776 [04:27<1:19:09,  2.57it/s]  4%|▍         | 573/12776 [04:27<1:22:18,  2.47it/s]                                                       4%|▍         | 573/12776 [04:27<1:22:18,  2.47it/s]  4%|▍         | 574/12776 [04:28<1:17:37,  2.62it/s]                                                     {'loss': 1.6344, 'grad_norm': 2.198042869567871, 'learning_rate': 0.00029459999999999995, 'epoch': 0.08}
+{'loss': 1.8025, 'grad_norm': 1.7770249843597412, 'learning_rate': 0.00029519999999999997, 'epoch': 0.08}
+{'loss': 1.6183, 'grad_norm': 3.967764139175415, 'learning_rate': 0.0002958, 'epoch': 0.08}
+{'loss': 1.6993, 'grad_norm': 3.88845157623291, 'learning_rate': 0.0002964, 'epoch': 0.08}
+{'loss': 1.5443, 'grad_norm': 4.8480963706970215, 'learning_rate': 0.00029699999999999996, 'epoch': 0.08}
+{'loss': 2.4391, 'grad_norm': 1.4311107397079468, 'learning_rate': 0.00029759999999999997, 'epoch': 0.08}
+{'loss': 2.1257, 'grad_norm': 1.0577871799468994, 'learning_rate': 0.0002982, 'epoch': 0.08}
+{'loss': 2.0302, 'grad_norm': 0.7990356087684631, 'learning_rate': 0.0002988, 'epoch': 0.08}
+{'loss': 2.0466, 'grad_norm': 0.9843720197677612, 'learning_rate': 0.00029939999999999996, 'epoch': 0.08}
+{'loss': 1.7693, 'grad_norm': 0.8615551590919495, 'learning_rate': 0.0003, 'epoch': 0.08}
+{'loss': 1.7733, 'grad_norm': 1.1965469121932983, 'learning_rate': 0.0002999755620723362, 'epoch': 0.08}
+{'loss': 1.8785, 'grad_norm': 1.6512717008590698, 'learning_rate': 0.0002999511241446725, 'epoch': 0.08}
+{'loss': 1.807, 'grad_norm': 1.4600224494934082, 'learning_rate': 0.0002999266862170088, 'epoch': 0.08}
+{'loss': 1.6916, 'grad_norm': 0.969097375869751, 'learning_rate': 0.00029990224828934503, 'epoch': 0.08}
+{'loss': 2.2747, 'grad_norm': 10.695720672607422, 'learning_rate': 0.0002998778103616813, 'epoch': 0.08}
+{'loss': 1.7943, 'grad_norm': 1.4272046089172363, 'learning_rate': 0.0002998533724340176, 'epoch': 0.08}
+{'loss': 1.6995, 'grad_norm': 1.0465346574783325, 'learning_rate': 0.00029982893450635384, 'epoch': 0.08}
+{'loss': 1.6331, 'grad_norm': 1.2503383159637451, 'learning_rate': 0.0002998044965786901, 'epoch': 0.08}
+{'loss': 1.593, 'grad_norm': 1.0055828094482422, 'learning_rate': 0.00029978005865102634, 'epoch': 0.08}
+{'loss': 1.6474, 'grad_norm': 0.7897710800170898, 'learning_rate': 0.00029975562072336265, 'epoch': 0.08}
+{'loss': 1.8418, 'grad_norm': 2.2324774265289307, 'learning_rate': 0.0002997311827956989, 'epoch': 0.08}
+{'loss': 1.5869, 'grad_norm': 1.3233245611190796, 'learning_rate': 0.00029970674486803515, 'epoch': 0.08}
+{'loss': 1.604, 'grad_norm': 1.2391108274459839, 'learning_rate': 0.00029968230694037146, 'epoch': 0.08}
+{'loss': 1.7185, 'grad_norm': 1.746536135673523, 'learning_rate': 0.0002996578690127077, 'epoch': 0.08}
+{'loss': 1.7549, 'grad_norm': 2.1696672439575195, 'learning_rate': 0.00029963343108504396, 'epoch': 0.08}
+{'loss': 1.7399, 'grad_norm': 2.327106237411499, 'learning_rate': 0.00029960899315738026, 'epoch': 0.08}
+{'loss': 1.6924, 'grad_norm': 2.274477481842041, 'learning_rate': 0.00029958455522971646, 'epoch': 0.08}
+{'loss': 1.7547, 'grad_norm': 2.649080991744995, 'learning_rate': 0.00029956011730205277, 'epoch': 0.08}
+{'loss': 1.7752, 'grad_norm': 2.5556795597076416, 'learning_rate': 0.000299535679374389, 'epoch': 0.08}
+{'loss': 2.1346, 'grad_norm': 3.2737300395965576, 'learning_rate': 0.00029951124144672527, 'epoch': 0.08}
+{'loss': 1.6453, 'grad_norm': 1.695064663887024, 'learning_rate': 0.0002994868035190616, 'epoch': 0.08}
+{'loss': 2.3186, 'grad_norm': 9.204750061035156, 'learning_rate': 0.0002994623655913978, 'epoch': 0.08}
+{'loss': 2.1661, 'grad_norm': 3.981736660003662, 'learning_rate': 0.0002994379276637341, 'epoch': 0.08}
+{'loss': 1.9903, 'grad_norm': 4.276581764221191, 'learning_rate': 0.00029941348973607033, 'epoch': 0.08}
+{'loss': 1.9789, 'grad_norm': 3.1463425159454346, 'learning_rate': 0.00029938905180840663, 'epoch': 0.08}
+{'loss': 1.916, 'grad_norm': 2.7212626934051514, 'learning_rate': 0.0002993646138807429, 'epoch': 0.08}
+{'loss': 2.077, 'grad_norm': 3.936119794845581, 'learning_rate': 0.00029934017595307914, 'epoch': 0.08}
+{'loss': 1.8905, 'grad_norm': 3.289414882659912, 'learning_rate': 0.00029931573802541544, 'epoch': 0.08}
+{'loss': 1.9716, 'grad_norm': 4.605769157409668, 'learning_rate': 0.0002992913000977517, 'epoch': 0.08}
+{'loss': 1.9615, 'grad_norm': 1.6472567319869995, 'learning_rate': 0.00029926686217008794, 'epoch': 0.08}
+{'loss': 1.9301, 'grad_norm': 3.2355268001556396, 'learning_rate': 0.00029924242424242425, 'epoch': 0.08}
+{'loss': 1.8207, 'grad_norm': 1.935306429862976, 'learning_rate': 0.00029921798631476045, 'epoch': 0.08}
+{'loss': 1.9752, 'grad_norm': 2.387748956680298, 'learning_rate': 0.00029919354838709675, 'epoch': 0.08}
+{'loss': 2.2058, 'grad_norm': 3.236212730407715, 'learning_rate': 0.000299169110459433, 'epoch': 0.08}
+{'loss': 2.0538, 'grad_norm': 3.333184003829956, 'learning_rate': 0.00029914467253176926, 'epoch': 0.08}
+{'loss': 1.9744, 'grad_norm': 2.5119690895080566, 'learning_rate': 0.00029912023460410556, 'epoch': 0.08}
+{'loss': 1.9949, 'grad_norm': 3.832310438156128, 'learning_rate': 0.0002990957966764418, 'epoch': 0.08}
+{'loss': 1.8598, 'grad_norm': 3.179901599884033, 'learning_rate': 0.00029907135874877806, 'epoch': 0.09}
+{'loss': 2.1715, 'grad_norm': 4.294993877410889, 'learning_rate': 0.00029904692082111437, 'epoch': 0.09}
+{'loss': 2.3794, 'grad_norm': 5.152624607086182, 'learning_rate': 0.0002990224828934506, 'epoch': 0.09}
+{'loss': 1.3454, 'grad_norm': 3.927077054977417, 'learning_rate': 0.00029899804496578687, 'epoch': 0.09}
+{'loss': 1.8446, 'grad_norm': 2.089223623275757, 'learning_rate': 0.0002989736070381231, 'epoch': 0.09}
+{'loss': 1.7771, 'grad_norm': 3.640803813934326, 'learning_rate': 0.00029894916911045943, 'epoch': 0.09}
+{'loss': 1.6782, 'grad_norm': 2.4625141620635986, 'learning_rate': 0.0002989247311827957, 'epoch': 0.09}
+{'loss': 2.3454, 'grad_norm': 4.571941375732422, 'learning_rate': 0.00029890029325513193, 'epoch': 0.09}
+{'loss': 1.7439, 'grad_norm': 1.4779797792434692, 'learning_rate': 0.00029887585532746824, 'epoch': 0.09}
+{'loss': 1.6772, 'grad_norm': 1.2959303855895996, 'learning_rate': 0.0002988514173998045, 'epoch': 0.09}
+{'loss': 1.6696, 'grad_norm': 1.5447274446487427, 'learning_rate': 0.00029882697947214074, 'epoch': 0.09}
+{'loss': 1.4124, 'grad_norm': 1.5669810771942139, 'learning_rate': 0.000298802541544477, 'epoch': 0.09}
+{'loss': 1.2168, 'grad_norm': 1.109596848487854, 'learning_rate': 0.00029877810361681324, 'epoch': 0.09}
+{'loss': 1.35, 'grad_norm': 1.1062586307525635, 'learning_rate': 0.00029875366568914955, 'epoch': 0.09}
+{'loss': 1.2897, 'grad_norm': 1.2158498764038086, 'learning_rate': 0.0002987292277614858, 'epoch': 0.09}
+{'loss': 1.3757, 'grad_norm': 1.4337570667266846, 'learning_rate': 0.00029870478983382205, 'epoch': 0.09}
+{'loss': 1.5378, 'grad_norm': 1.0046571493148804, 'learning_rate': 0.00029868035190615835, 'epoch': 0.09}
+{'loss': 1.2713, 'grad_norm': 1.03334641456604, 'learning_rate': 0.0002986559139784946, 'epoch': 0.09}
+{'loss': 1.3718, 'grad_norm': 1.219976782798767, 'learning_rate': 0.00029863147605083086, 'epoch': 0.09}
+{'loss': 1.3139, 'grad_norm': 1.4024769067764282, 'learning_rate': 0.0002986070381231671, 'epoch': 0.09}
+{'loss': 1.4525, 'grad_norm': 1.8966326713562012, 'learning_rate': 0.0002985826001955034, 'epoch': 0.09}
+{'loss': 1.2881, 'grad_norm': 1.385165810585022, 'learning_rate': 0.00029855816226783966, 'epoch': 0.09}
+{'loss': 1.1926, 'grad_norm': 1.713485598564148, 'learning_rate': 0.0002985337243401759, 'epoch': 0.09}
+{'loss': 1.3264, 'grad_norm': 1.2496473789215088, 'learning_rate': 0.0002985092864125122, 'epoch': 0.09}
+{'loss': 1.5377, 'grad_norm': 2.36457896232605, 'learning_rate': 0.00029848484848484847, 'epoch': 0.09}
+{'loss': 1.562, 'grad_norm': 1.7575597763061523, 'learning_rate': 0.0002984604105571847, 'epoch': 0.09}
+{'loss': 1.2985, 'grad_norm': 1.9847209453582764, 'learning_rate': 0.00029843597262952103, 'epoch': 0.09}
+{'loss': 1.4982, 'grad_norm': 1.8547005653381348, 'learning_rate': 0.0002984115347018572, 'epoch': 0.09}
+{'loss': 1.3506, 'grad_norm': 1.179679036140442, 'learning_rate': 0.00029838709677419353, 'epoch': 0.09}
+{'loss': 1.6808, 'grad_norm': 2.9201154708862305, 'learning_rate': 0.0002983626588465298, 'epoch': 0.09}
+{'loss': 1.6371, 'grad_norm': 2.8319106101989746, 'learning_rate': 0.00029833822091886603, 'epoch': 0.09}
+  4%|▍         | 574/12776 [04:28<1:17:37,  2.62it/s]  5%|▍         | 575/12776 [04:28<1:12:57,  2.79it/s]                                                       5%|▍         | 575/12776 [04:28<1:12:57,  2.79it/s]  5%|▍         | 576/12776 [04:28<1:09:19,  2.93it/s]                                                       5%|▍         | 576/12776 [04:28<1:09:19,  2.93it/s]  5%|▍         | 577/12776 [04:29<1:09:37,  2.92it/s]                                                       5%|▍         | 577/12776 [04:29<1:09:37,  2.92it/s]  5%|▍         | 578/12776 [04:29<1:06:06,  3.07it/s]                                                       5%|▍         | 578/12776 [04:29<1:06:06,  3.07it/s]  5%|▍         | 579/12776 [04:29<1:02:55,  3.23it/s]                                                       5%|▍         | 579/12776 [04:29<1:02:55,  3.23it/s]  5%|▍         | 580/12776 [04:29<1:00:17,  3.37it/s]                                                       5%|▍         | 580/12776 [04:29<1:00:17,  3.37it/s]  5%|▍         | 581/12776 [04:30<1:01:46,  3.29it/s]                                                       5%|▍         | 581/12776 [04:30<1:01:46,  3.29it/s]  5%|▍         | 582/12776 [04:30<58:39,  3.47it/s]                                                       5%|▍         | 582/12776 [04:30<58:39,  3.47it/s]  5%|▍         | 583/12776 [04:30<56:07,  3.62it/s]                                                     5%|▍         | 583/12776 [04:30<56:07,  3.62it/s]  5%|▍         | 584/12776 [04:30<54:06,  3.75it/s]                                                     5%|▍         | 584/12776 [04:30<54:06,  3.75it/s]  5%|▍         | 585/12776 [04:31<1:01:12,  3.32it/s]                                                       5%|▍         | 585/12776 [04:31<1:01:12,  3.32it/s]  5%|▍         | 586/12776 [04:31<56:47,  3.58it/s]                                                       5%|▍         | 586/12776 [04:31<56:47,  3.58it/s]  5%|▍         | 587/12776 [04:31<53:11,  3.82it/s]                                                     5%|▍         | 587/12776 [04:31<53:11,  3.82it/s]  5%|▍         | 588/12776 [04:32<50:12,  4.05it/s]                                                     5%|▍         | 588/12776 [04:32<50:12,  4.05it/s]  5%|▍         | 589/12776 [04:32<47:50,  4.25it/s]                                                     5%|▍         | 589/12776 [04:32<47:50,  4.25it/s]  5%|▍         | 590/12776 [04:32<52:01,  3.90it/s]                                                     5%|▍         | 590/12776 [04:32<52:01,  3.90it/s]  5%|▍         | 591/12776 [04:32<48:54,  4.15it/s]                                                     5%|▍         | 591/12776 [04:32<48:54,  4.15it/s]  5%|▍         | 592/12776 [04:32<46:31,  4.36it/s]                                                     5%|▍         | 592/12776 [04:32<46:31,  4.36it/s]  5%|▍         | 593/12776 [04:33<44:40,  4.55it/s]                                                     5%|▍         | 593/12776 [04:33<44:40,  4.55it/s]  5%|▍         | 594/12776 [04:33<43:16,  4.69it/s]                                                     5%|▍         | 594/12776 [04:33<43:16,  4.69it/s]  5%|▍         | 595/12776 [04:33<49:09,  4.13it/s]                                                     5%|▍         | 595/12776 [04:33<49:09,  4.13it/s]  5%|▍         | 596/12776 [04:33<46:05,  4.40it/s]                                                     5%|▍         | 596/12776 [04:33<46:05,  4.40it/s]  5%|▍         | 597/12776 [04:34<43:54,  4.62it/s]                                                     5%|▍         | 597/12776 [04:34<43:54,  4.62it/s]  5%|▍         | 598/12776 [04:34<42:09,  4.81it/s]                                                     5%|▍         | 598/12776 [04:34<42:09,  4.81it/s]  5%|▍         | 599/12776 [04:34<40:42,  4.99it/s]                                                     5%|▍         | 599/12776 [04:34<40:42,  4.99it/s]  5%|▍         | 600/12776 [04:35<1:11:46,  2.83it/s]                                                       5%|▍         | 600/12776 [04:35<1:11:46,  2.83it/s]  5%|▍         | 601/12776 [04:36<2:25:07,  1.40it/s]                                                       5%|▍         | 601/12776 [04:36<2:25:07,  1.40it/s]  5%|▍         | 602/12776 [04:37<2:41:14,  1.26it/s]                                                       5%|▍         | 602/12776 [04:37<2:41:14,  1.26it/s]  5%|▍         | 603/12776 [04:38<2:53:22,  1.17it/s]                                                       5%|▍         | 603/12776 [04:38<2:53:22,  1.17it/s]  5%|▍         | 604/12776 [04:39<2:59:14,  1.13it/s]                                                       5%|▍         | 604/12776 [04:39<2:59:14,  1.13it/s]  5%|▍         | 605/12776 [04:40<2:56:02,  1.15it/s]                                                       5%|▍         | 605/12776 [04:40<2:56:02,  1.15it/s]  5%|▍         | 606/12776 [04:41<2:46:15,  1.22it/s]                                                       5%|▍         | 606/12776 [04:41<2:46:15,  1.22it/s]  5%|▍         | 607/12776 [04:41<2:38:43,  1.28it/s]                                                       5%|▍         | 607/12776 [04:41<2:38:43,  1.28it/s]  5%|▍         | 608/12776 [04:42<2:28:45,  1.36it/s]                                                       5%|▍         | 608/12776 [04:42<2:28:45,  1.36it/s]  5%|▍         | 609/12776 [04:43<2:19:21,  1.46it/s]                                                       5%|▍         | 609/12776 [04:43<2:19:21,  1.46it/s]  5%|▍         | 610/12776 [04:43<2:12:33,  1.53it/s]                                                       5%|▍         | 610/12776 [04:43<2:12:33,  1.53it/s]  5%|▍         | 611/12776 [04:44<2:05:22,  1.62it/s]                                                       5%|▍         | 611/12776 [04:44<2:05:22,  1.62it/s]  5%|▍         | 612/12776 [04:44<1:58:09,  1.72it/s]                                                       5%|▍         | 612/12776 [04:44<1:58:09,  1.72it/s]  5%|▍         | 613/12776 [04:45<1:54:16,  1.77it/s]                                                       5%|▍         | 613/12776 [04:45<1:54:16,  1.77it/s]  5%|▍         | 614/12776 [04:45<1:47:18,  1.89it/s]                                                       5%|▍         | 614/12776 [04:45<1:47:18,  1.89it/s]  5%|▍         | 615/12776 [04:46<1:45:45,  1.92it/s]                                                       5%|▍         | 615/12776 [04:46<1:45:45,  1.92it/s]  5%|▍         | 616/12776 [04:46<1:39:14,  2.04it/s]                                                       5%|▍         | 616/12776 [04:46<1:39:14,  2.04it/s]  5%|▍         | 617/12776 [04:46<1:35:17,  2.13it/s]                                                       5%|▍         | 617/12776 [04:46<1:35:17,  2.13it/s]  5%|▍         | 618/12776 [04:47<1:34:14,  2.15it/s]                                                       5%|▍         | 618/12776 [04:47<1:34:14,  2.15it/s]  5%|▍         | 619/12776 [04:47<1:28:54,  2.28it/s]                                                       5%|▍         | 619/12776 [04:47<1:28:54,  2.28it/s]  5%|▍         | 620/12776 [04:48<1:24:16,  2.40it/s]                                                       5%|▍         | 620/12776 [04:48<1:24:16,  2.40it/s]  5%|▍         | 621/12776 [04:48<1:25:10,  2.38it/s]                                                       5%|▍         | 621/12776 [04:48<1:25:10,  2.38it/s]  5%|▍         | 622/12776 [04:48<1:20:19,  2.52it/s]                                                       5%|▍         | 622/12776 [04:48<1:20:19,  2.52it/s]  5%|▍         | 623/12776 [04:49<1:16:08,  2.66it/s]                                                       5%|▍         | 623/12776 [04:49<1:16:08,  2.66it/s]  5%|▍         | 624/12776 [04:49<1:15:48,  2.67it/s]                                                       5%|▍         | 624/12776 [04:49<1:15:48,  2.67it/s]  5%|▍         | 625/12776 [04:49<1:11:05,  2.85it/s]                                                       5%|▍         | 625/12776 [04:49<1:11:05,  2.85it/s]  5%|▍         | 626/12776 [04:50<1:08:02,  2.98it/s]                                                       5%|▍         | 626/12776 [04:50<1:08:02,  2.98it/s]  5%|▍         | 627/12776 [04:50<1:09:29,  2.91it/s]                                                       5%|▍         | 627/12776 [04:50<1:09:29,  2.91it/s]  5%|▍         | 628/12776 [04:50<1:05:11,  3.11it/s]                                                       5%|▍         | 628/12776 [04:50<1:05:11,  3.11it/s]  5%|▍         | 629/12776 [04:51<1:01:38,  3.28it/s]                                                       5%|▍         | 629/12776 [04:51<1:01:38,  3.28it/s]  5%|▍         | 630/12776 [04:51<58:17,  3.47it/s]                                                       5%|▍         | 630/12776 [04:51<58:17,  3.47it/s]  5%|▍         | 631/12776 [04:51<1:00:56,  3.32it/s]                                                       5%|▍         | 631/12776 [04:51<1:00:56,  3.32it/s]  5%|▍         | 632/12776 [04:51<57:47,  3.50it/s]                                                       5%|▍         | 632/12776 [04:51<57:47,  3.50it/s]  5%|▍         | 633/12776 [04:52<55:23,  3.65it/s]                                                     5%|▍         | 633/12776 [04:52<55:23,  3.65it/s]  5%|▍         | 634/12776 [04:52<53:10,  3.81it/s]                                                     5%|▍         | 634/12776 [04:52<53:10,  3.81it/s]  5%|▍         | 635/12776 [04:52<51:18,  3.94it/s]                                                     5%|▍         | 635/12776 [04:52<51:18,  3.94it/s]  5%|▍         | 636/12776 [04:52<52:31,  3.85it/s]                                                     5%|▍         | 636/12776 [04:52<52:31,  3.85it/s]  5%|▍         | 637/12776 [04:53<50:04,  4.04it/s]                                                     5%|▍         | 637/12776 [04:53<50:04,  4.04it/s]  5%|▍         | 638/12776 [04:53<47:59,  4.22it/s]                                                     5%|▍         | 638/12776 [04:53<47:59,  4.22it/s]  5%|▌         | 639/12776 [04:53<46:17,  4.37it/s]                                                     5%|▌         | 639/12776 [04:53<46:17,  4.37it/s]  5%|▌         | 640/12776 [04:53<45:06,  4.48it/s]                                                     5%|▌         | 640/12776 [04:53<45:06,  4.48it/s]  5%|▌         | 641/12776 [04:54<48:48,  4.14it/s]                                                     5%|▌         | 641/12776 [04:54<48:48,  4.14it/s]  5%|▌         | 642/12776 [04:54<46:21,  4.36it/s]                                                     5%|▌         | 642/12776 [04:54<46:21,  4.36it/s]  5%|▌         | 643/12776 [04:54<44:55,  4.50it/s]                                                     5%|▌         | 643/12776 [04:54<44:55,  4.50it/s]  5%|▌         | 644/12776 [04:54<43:26,  4.65it/s]                                                     5%|▌         | 644/12776 [04:54<43:26,  4.65it/s]  5%|▌         | 645/12776 [04:54<42:12,  4.79it/s]                                                     5%|▌         | 645/12776 [04:54<42:12,  4.79it/s]  5%|▌         | 646/12776 [04:55<48:41,  4.15it/s]                                                     5%|▌         | 646/12776 [04:55<48:41,  4.15it/s]  5%|▌         | 647/12776 [04:55<45:44,  4.42it/s]                                                     5%|▌         | 647/12776 [04:55<45:44,  4.42it/s]  5%|▌         | 648/12776 [04:55<43:12,  4.68it/s]                                                     5%|▌         | 648/12776 [04:55<43:12,  4.68it/s]  5%|▌         | 649/12776 [04:55<41:13,  4.90it/s]                                                     5%|▌         | 649/12776 [04:55<41:13,  4.90it/s]  5%|▌         | 650/12776 [04:56<1:12:50,  2.77it/s]                                                       5%|▌         | 650/12776 [04:56<1:12:50,  2.77it/s]  5%|▌         | 651/12776 [04:57<2:21:03,  1.43it/s]                                                       5%|▌         | 651/12776 [04:57<2:21:03,  1.43it/s]  5%|▌         | 652/12776 [04:58<2:42:06,  1.25it/s]                                                     {'loss': 2.353, 'grad_norm': 3.5341687202453613, 'learning_rate': 0.00029831378299120234, 'epoch': 0.09}
+{'loss': 1.3278, 'grad_norm': 1.8579005002975464, 'learning_rate': 0.0002982893450635386, 'epoch': 0.09}
+{'loss': 1.9979, 'grad_norm': 3.4177498817443848, 'learning_rate': 0.00029826490713587484, 'epoch': 0.09}
+{'loss': 1.9626, 'grad_norm': 3.2669591903686523, 'learning_rate': 0.0002982404692082111, 'epoch': 0.09}
+{'loss': 1.724, 'grad_norm': 1.9549978971481323, 'learning_rate': 0.0002982160312805474, 'epoch': 0.09}
+{'loss': 1.3989, 'grad_norm': 1.7460333108901978, 'learning_rate': 0.00029819159335288365, 'epoch': 0.09}
+{'loss': 1.8983, 'grad_norm': 3.614212989807129, 'learning_rate': 0.0002981671554252199, 'epoch': 0.09}
+{'loss': 1.606, 'grad_norm': 3.4498374462127686, 'learning_rate': 0.0002981427174975562, 'epoch': 0.09}
+{'loss': 1.8912, 'grad_norm': 4.136172771453857, 'learning_rate': 0.00029811827956989246, 'epoch': 0.09}
+{'loss': 1.941, 'grad_norm': 3.7499725818634033, 'learning_rate': 0.0002980938416422287, 'epoch': 0.09}
+{'loss': 1.6427, 'grad_norm': 4.676743507385254, 'learning_rate': 0.000298069403714565, 'epoch': 0.09}
+{'loss': 1.8666, 'grad_norm': 2.1480143070220947, 'learning_rate': 0.0002980449657869012, 'epoch': 0.09}
+{'loss': 2.0098, 'grad_norm': 2.891366481781006, 'learning_rate': 0.0002980205278592375, 'epoch': 0.09}
+{'loss': 1.7633, 'grad_norm': 2.557764768600464, 'learning_rate': 0.00029799608993157377, 'epoch': 0.09}
+{'loss': 1.7614, 'grad_norm': 2.8896427154541016, 'learning_rate': 0.00029797165200391, 'epoch': 0.09}
+{'loss': 1.5253, 'grad_norm': 2.5814003944396973, 'learning_rate': 0.0002979472140762463, 'epoch': 0.09}
+{'loss': 2.0882, 'grad_norm': 8.051748275756836, 'learning_rate': 0.0002979227761485826, 'epoch': 0.09}
+{'loss': 1.7916, 'grad_norm': 2.6856775283813477, 'learning_rate': 0.00029789833822091883, 'epoch': 0.09}
+{'loss': 1.8805, 'grad_norm': 2.2569031715393066, 'learning_rate': 0.00029787390029325513, 'epoch': 0.09}
+{'loss': 1.635, 'grad_norm': 11.610511779785156, 'learning_rate': 0.0002978494623655914, 'epoch': 0.09}
+{'loss': 1.8779, 'grad_norm': 2.5398268699645996, 'learning_rate': 0.00029782502443792764, 'epoch': 0.09}
+{'loss': 1.9912, 'grad_norm': 2.323976516723633, 'learning_rate': 0.0002978005865102639, 'epoch': 0.09}
+{'loss': 1.0067, 'grad_norm': 2.0534749031066895, 'learning_rate': 0.0002977761485826002, 'epoch': 0.09}
+{'loss': 1.5773, 'grad_norm': 3.8560521602630615, 'learning_rate': 0.00029775171065493644, 'epoch': 0.09}
+{'loss': 1.7498, 'grad_norm': 3.9110920429229736, 'learning_rate': 0.0002977272727272727, 'epoch': 0.09}
+{'loss': 1.7439, 'grad_norm': 3.5379350185394287, 'learning_rate': 0.000297702834799609, 'epoch': 0.09}
+{'loss': 1.3979, 'grad_norm': 3.4338223934173584, 'learning_rate': 0.0002976783968719452, 'epoch': 0.09}
+{'loss': 1.4324, 'grad_norm': 1.8097113370895386, 'learning_rate': 0.0002976539589442815, 'epoch': 0.09}
+{'loss': 1.3141, 'grad_norm': 1.1481614112854004, 'learning_rate': 0.00029762952101661775, 'epoch': 0.09}
+{'loss': 1.1048, 'grad_norm': 0.7774419784545898, 'learning_rate': 0.000297605083088954, 'epoch': 0.09}
+{'loss': 1.2183, 'grad_norm': 1.0336264371871948, 'learning_rate': 0.0002975806451612903, 'epoch': 0.09}
+{'loss': 1.0482, 'grad_norm': 0.7348364591598511, 'learning_rate': 0.00029755620723362656, 'epoch': 0.09}
+{'loss': 1.0892, 'grad_norm': 0.8298443555831909, 'learning_rate': 0.0002975317693059628, 'epoch': 0.09}
+{'loss': 0.9726, 'grad_norm': 0.7758945226669312, 'learning_rate': 0.0002975073313782991, 'epoch': 0.1}
+{'loss': 1.0492, 'grad_norm': 1.721008062362671, 'learning_rate': 0.00029748289345063537, 'epoch': 0.1}
+{'loss': 1.0549, 'grad_norm': 1.345603585243225, 'learning_rate': 0.0002974584555229716, 'epoch': 0.1}
+{'loss': 1.0506, 'grad_norm': 1.4059998989105225, 'learning_rate': 0.00029743401759530787, 'epoch': 0.1}
+{'loss': 1.0309, 'grad_norm': 1.0252139568328857, 'learning_rate': 0.0002974095796676442, 'epoch': 0.1}
+{'loss': 1.0449, 'grad_norm': 1.4364956617355347, 'learning_rate': 0.00029738514173998043, 'epoch': 0.1}
+{'loss': 1.2821, 'grad_norm': 1.9643033742904663, 'learning_rate': 0.0002973607038123167, 'epoch': 0.1}
+{'loss': 1.2619, 'grad_norm': 1.3634884357452393, 'learning_rate': 0.000297336265884653, 'epoch': 0.1}
+{'loss': 1.4142, 'grad_norm': 2.063399314880371, 'learning_rate': 0.00029731182795698924, 'epoch': 0.1}
+{'loss': 1.0253, 'grad_norm': 1.287653923034668, 'learning_rate': 0.0002972873900293255, 'epoch': 0.1}
+{'loss': 1.2315, 'grad_norm': 1.6113243103027344, 'learning_rate': 0.00029726295210166174, 'epoch': 0.1}
+{'loss': 1.1453, 'grad_norm': 1.7345733642578125, 'learning_rate': 0.000297238514173998, 'epoch': 0.1}
+{'loss': 1.0814, 'grad_norm': 1.8146443367004395, 'learning_rate': 0.0002972140762463343, 'epoch': 0.1}
+{'loss': 1.4222, 'grad_norm': 1.6718642711639404, 'learning_rate': 0.00029718963831867055, 'epoch': 0.1}
+{'loss': 1.5162, 'grad_norm': 1.9962431192398071, 'learning_rate': 0.0002971652003910068, 'epoch': 0.1}
+{'loss': 1.5312, 'grad_norm': 1.7549268007278442, 'learning_rate': 0.0002971407624633431, 'epoch': 0.1}
+{'loss': 1.5691, 'grad_norm': 2.5582468509674072, 'learning_rate': 0.00029711632453567936, 'epoch': 0.1}
+{'loss': 1.1799, 'grad_norm': 2.2081313133239746, 'learning_rate': 0.0002970918866080156, 'epoch': 0.1}
+{'loss': 1.1547, 'grad_norm': 1.4021954536437988, 'learning_rate': 0.00029706744868035186, 'epoch': 0.1}
+{'loss': 1.5232, 'grad_norm': 1.895840048789978, 'learning_rate': 0.00029704301075268816, 'epoch': 0.1}
+{'loss': 1.6906, 'grad_norm': 2.2723934650421143, 'learning_rate': 0.0002970185728250244, 'epoch': 0.1}
+{'loss': 1.755, 'grad_norm': 2.5103864669799805, 'learning_rate': 0.00029699413489736067, 'epoch': 0.1}
+{'loss': 1.3139, 'grad_norm': 1.7477445602416992, 'learning_rate': 0.00029696969696969697, 'epoch': 0.1}
+{'loss': 1.5794, 'grad_norm': 2.9243004322052, 'learning_rate': 0.0002969452590420332, 'epoch': 0.1}
+{'loss': 1.5047, 'grad_norm': 3.4790265560150146, 'learning_rate': 0.0002969208211143695, 'epoch': 0.1}
+{'loss': 1.9341, 'grad_norm': 2.3917460441589355, 'learning_rate': 0.0002968963831867058, 'epoch': 0.1}
+{'loss': 1.6668, 'grad_norm': 3.003119707107544, 'learning_rate': 0.000296871945259042, 'epoch': 0.1}
+{'loss': 1.3301, 'grad_norm': 1.7967157363891602, 'learning_rate': 0.0002968475073313783, 'epoch': 0.1}
+{'loss': 2.0308, 'grad_norm': 9.354352951049805, 'learning_rate': 0.00029682306940371453, 'epoch': 0.1}
+{'loss': 2.2328, 'grad_norm': 2.55900502204895, 'learning_rate': 0.0002967986314760508, 'epoch': 0.1}
+{'loss': 1.5009, 'grad_norm': 3.0861406326293945, 'learning_rate': 0.0002967741935483871, 'epoch': 0.1}
+{'loss': 1.8303, 'grad_norm': 2.7047877311706543, 'learning_rate': 0.00029674975562072334, 'epoch': 0.1}
+{'loss': 1.4659, 'grad_norm': 1.7027578353881836, 'learning_rate': 0.0002967253176930596, 'epoch': 0.1}
+{'loss': 1.7596, 'grad_norm': 3.0926079750061035, 'learning_rate': 0.0002967008797653959, 'epoch': 0.1}
+{'loss': 2.2201, 'grad_norm': 3.1565775871276855, 'learning_rate': 0.00029667644183773215, 'epoch': 0.1}
+{'loss': 1.4988, 'grad_norm': 2.1882317066192627, 'learning_rate': 0.0002966520039100684, 'epoch': 0.1}
+{'loss': 1.6186, 'grad_norm': 3.5301172733306885, 'learning_rate': 0.00029662756598240465, 'epoch': 0.1}
+{'loss': 2.141, 'grad_norm': 3.971935272216797, 'learning_rate': 0.00029660312805474096, 'epoch': 0.1}
+{'loss': 1.8005, 'grad_norm': 4.350378513336182, 'learning_rate': 0.0002965786901270772, 'epoch': 0.1}
+{'loss': 1.7147, 'grad_norm': 4.860312461853027, 'learning_rate': 0.00029655425219941346, 'epoch': 0.1}
+{'loss': 1.8563, 'grad_norm': 4.879358768463135, 'learning_rate': 0.00029652981427174977, 'epoch': 0.1}
+{'loss': 1.569, 'grad_norm': 2.9095821380615234, 'learning_rate': 0.00029650537634408596, 'epoch': 0.1}
+{'loss': 1.724, 'grad_norm': 2.8750619888305664, 'learning_rate': 0.00029648093841642227, 'epoch': 0.1}
+{'loss': 1.792, 'grad_norm': 4.468817234039307, 'learning_rate': 0.0002964565004887585, 'epoch': 0.1}
+{'loss': 1.322, 'grad_norm': 2.051830768585205, 'learning_rate': 0.00029643206256109477, 'epoch': 0.1}
+  5%|▌         | 652/12776 [04:58<2:42:06,  1.25it/s]  5%|▌         | 653/12776 [04:59<2:50:17,  1.19it/s]                                                       5%|▌         | 653/12776 [04:59<2:50:17,  1.19it/s]  5%|▌         | 654/12776 [05:00<2:48:25,  1.20it/s]                                                       5%|▌         | 654/12776 [05:00<2:48:25,  1.20it/s]  5%|▌         | 655/12776 [05:01<2:43:45,  1.23it/s]                                                       5%|▌         | 655/12776 [05:01<2:43:45,  1.23it/s]  5%|▌         | 656/12776 [05:02<2:43:14,  1.24it/s]                                                       5%|▌         | 656/12776 [05:02<2:43:14,  1.24it/s]  5%|▌         | 657/12776 [05:03<2:37:26,  1.28it/s]                                                       5%|▌         | 657/12776 [05:03<2:37:26,  1.28it/s]  5%|▌         | 658/12776 [05:03<2:28:50,  1.36it/s]                                                       5%|▌         | 658/12776 [05:03<2:28:50,  1.36it/s]  5%|▌         | 659/12776 [05:04<2:20:28,  1.44it/s]                                                       5%|▌         | 659/12776 [05:04<2:20:28,  1.44it/s]  5%|▌         | 660/12776 [05:04<2:12:23,  1.53it/s]                                                       5%|▌         | 660/12776 [05:04<2:12:23,  1.53it/s]  5%|▌         | 661/12776 [05:05<2:06:46,  1.59it/s]                                                       5%|▌         | 661/12776 [05:05<2:06:46,  1.59it/s]  5%|▌         | 662/12776 [05:05<2:00:28,  1.68it/s]                                                       5%|▌         | 662/12776 [05:05<2:00:28,  1.68it/s]  5%|▌         | 663/12776 [05:06<2:01:31,  1.66it/s]                                                       5%|▌         | 663/12776 [05:06<2:01:31,  1.66it/s]  5%|▌         | 664/12776 [05:06<1:53:37,  1.78it/s]                                                       5%|▌         | 664/12776 [05:06<1:53:37,  1.78it/s]  5%|▌         | 665/12776 [05:07<1:46:17,  1.90it/s]                                                       5%|▌         | 665/12776 [05:07<1:46:17,  1.90it/s]  5%|▌         | 666/12776 [05:07<1:45:53,  1.91it/s]                                                       5%|▌         | 666/12776 [05:07<1:45:53,  1.91it/s]  5%|▌         | 667/12776 [05:08<1:38:33,  2.05it/s]                                                       5%|▌         | 667/12776 [05:08<1:38:33,  2.05it/s]  5%|▌         | 668/12776 [05:08<1:41:10,  1.99it/s]                                                       5%|▌         | 668/12776 [05:08<1:41:10,  1.99it/s]  5%|▌         | 669/12776 [05:09<1:33:16,  2.16it/s]                                                       5%|▌         | 669/12776 [05:09<1:33:16,  2.16it/s]  5%|▌         | 670/12776 [05:09<1:26:03,  2.34it/s]                                                       5%|▌         | 670/12776 [05:09<1:26:03,  2.34it/s]  5%|▌         | 671/12776 [05:10<1:26:35,  2.33it/s]                                                       5%|▌         | 671/12776 [05:10<1:26:35,  2.33it/s]  5%|▌         | 672/12776 [05:10<1:20:30,  2.51it/s]                                                       5%|▌         | 672/12776 [05:10<1:20:30,  2.51it/s]  5%|▌         | 673/12776 [05:10<1:15:30,  2.67it/s]                                                       5%|▌         | 673/12776 [05:10<1:15:30,  2.67it/s]  5%|▌         | 674/12776 [05:10<1:11:09,  2.83it/s]                                                       5%|▌         | 674/12776 [05:10<1:11:09,  2.83it/s]  5%|▌         | 675/12776 [05:11<1:11:22,  2.83it/s]                                                       5%|▌         | 675/12776 [05:11<1:11:22,  2.83it/s]  5%|▌         | 676/12776 [05:11<1:07:37,  2.98it/s]                                                       5%|▌         | 676/12776 [05:11<1:07:37,  2.98it/s]  5%|▌         | 677/12776 [05:11<1:04:06,  3.15it/s]                                                       5%|▌         | 677/12776 [05:11<1:04:06,  3.15it/s]  5%|▌         | 678/12776 [05:12<1:01:36,  3.27it/s]                                                       5%|▌         | 678/12776 [05:12<1:01:36,  3.27it/s]  5%|▌         | 679/12776 [05:12<1:01:39,  3.27it/s]                                                       5%|▌         | 679/12776 [05:12<1:01:39,  3.27it/s]  5%|▌         | 680/12776 [05:12<59:01,  3.42it/s]                                                       5%|▌         | 680/12776 [05:12<59:01,  3.42it/s]  5%|▌         | 681/12776 [05:12<56:37,  3.56it/s]                                                     5%|▌         | 681/12776 [05:13<56:37,  3.56it/s]  5%|▌         | 682/12776 [05:13<54:45,  3.68it/s]                                                     5%|▌         | 682/12776 [05:13<54:45,  3.68it/s]  5%|▌         | 683/12776 [05:13<1:00:36,  3.33it/s]                                                       5%|▌         | 683/12776 [05:13<1:00:36,  3.33it/s]  5%|▌         | 684/12776 [05:13<56:44,  3.55it/s]                                                       5%|▌         | 684/12776 [05:13<56:44,  3.55it/s]  5%|▌         | 685/12776 [05:14<53:30,  3.77it/s]                                                     5%|▌         | 685/12776 [05:14<53:30,  3.77it/s]  5%|▌         | 686/12776 [05:14<50:52,  3.96it/s]                                                     5%|▌         | 686/12776 [05:14<50:52,  3.96it/s]  5%|▌         | 687/12776 [05:14<54:57,  3.67it/s]                                                     5%|▌         | 687/12776 [05:14<54:57,  3.67it/s]  5%|▌         | 688/12776 [05:14<51:17,  3.93it/s]                                                     5%|▌         | 688/12776 [05:14<51:17,  3.93it/s]  5%|▌         | 689/12776 [05:15<48:29,  4.15it/s]                                                     5%|▌         | 689/12776 [05:15<48:29,  4.15it/s]  5%|▌         | 690/12776 [05:15<46:29,  4.33it/s]                                                     5%|▌         | 690/12776 [05:15<46:29,  4.33it/s]  5%|▌         | 691/12776 [05:15<44:46,  4.50it/s]                                                     5%|▌         | 691/12776 [05:15<44:46,  4.50it/s]  5%|▌         | 692/12776 [05:15<50:27,  3.99it/s]                                                     5%|▌         | 692/12776 [05:15<50:27,  3.99it/s]  5%|▌         | 693/12776 [05:15<47:13,  4.26it/s]                                                     5%|▌         | 693/12776 [05:15<47:13,  4.26it/s]  5%|▌         | 694/12776 [05:16<45:01,  4.47it/s]                                                     5%|▌         | 694/12776 [05:16<45:01,  4.47it/s]  5%|▌         | 695/12776 [05:16<43:15,  4.65it/s]                                                     5%|▌         | 695/12776 [05:16<43:15,  4.65it/s]  5%|▌         | 696/12776 [05:16<41:52,  4.81it/s]                                                     5%|▌         | 696/12776 [05:16<41:52,  4.81it/s]  5%|▌         | 697/12776 [05:16<40:48,  4.93it/s]                                                     5%|▌         | 697/12776 [05:16<40:48,  4.93it/s]  5%|▌         | 698/12776 [05:17<45:32,  4.42it/s]                                                     5%|▌         | 698/12776 [05:17<45:32,  4.42it/s]  5%|▌         | 699/12776 [05:17<42:59,  4.68it/s]                                                     5%|▌         | 699/12776 [05:17<42:59,  4.68it/s]  5%|▌         | 700/12776 [05:18<1:20:40,  2.49it/s]                                                       5%|▌         | 700/12776 [05:18<1:20:40,  2.49it/s]  5%|▌         | 701/12776 [05:19<2:23:20,  1.40it/s]                                                       5%|▌         | 701/12776 [05:19<2:23:20,  1.40it/s]  5%|▌         | 702/12776 [05:20<2:37:19,  1.28it/s]                                                       5%|▌         | 702/12776 [05:20<2:37:19,  1.28it/s]  6%|▌         | 703/12776 [05:21<2:42:13,  1.24it/s]                                                       6%|▌         | 703/12776 [05:21<2:42:13,  1.24it/s]  6%|▌         | 704/12776 [05:22<2:39:59,  1.26it/s]                                                       6%|▌         | 704/12776 [05:22<2:39:59,  1.26it/s]  6%|▌         | 705/12776 [05:22<2:36:22,  1.29it/s]                                                       6%|▌         | 705/12776 [05:22<2:36:22,  1.29it/s]  6%|▌         | 706/12776 [05:23<2:29:50,  1.34it/s]                                                       6%|▌         | 706/12776 [05:23<2:29:50,  1.34it/s]  6%|▌         | 707/12776 [05:24<2:26:09,  1.38it/s]                                                       6%|▌         | 707/12776 [05:24<2:26:09,  1.38it/s]  6%|▌         | 708/12776 [05:24<2:18:18,  1.45it/s]                                                       6%|▌         | 708/12776 [05:24<2:18:18,  1.45it/s]  6%|▌         | 709/12776 [05:25<2:16:12,  1.48it/s]                                                       6%|▌         | 709/12776 [05:25<2:16:12,  1.48it/s]  6%|▌         | 710/12776 [05:25<2:08:48,  1.56it/s]                                                       6%|▌         | 710/12776 [05:25<2:08:48,  1.56it/s]  6%|▌         | 711/12776 [05:26<2:04:53,  1.61it/s]                                                       6%|▌         | 711/12776 [05:26<2:04:53,  1.61it/s]  6%|▌         | 712/12776 [05:27<1:57:18,  1.71it/s]                                                       6%|▌         | 712/12776 [05:27<1:57:18,  1.71it/s]  6%|▌         | 713/12776 [05:27<1:53:40,  1.77it/s]                                                       6%|▌         | 713/12776 [05:27<1:53:40,  1.77it/s]  6%|▌         | 714/12776 [05:27<1:45:49,  1.90it/s]                                                       6%|▌         | 714/12776 [05:27<1:45:49,  1.90it/s]  6%|▌         | 715/12776 [05:28<1:47:00,  1.88it/s]                                                       6%|▌         | 715/12776 [05:28<1:47:00,  1.88it/s]  6%|▌         | 716/12776 [05:28<1:39:16,  2.02it/s]                                                       6%|▌         | 716/12776 [05:28<1:39:16,  2.02it/s]  6%|▌         | 717/12776 [05:29<1:33:11,  2.16it/s]                                                       6%|▌         | 717/12776 [05:29<1:33:11,  2.16it/s]  6%|▌         | 718/12776 [05:29<1:36:44,  2.08it/s]                                                       6%|▌         | 718/12776 [05:29<1:36:44,  2.08it/s]  6%|▌         | 719/12776 [05:30<1:29:07,  2.25it/s]                                                       6%|▌         | 719/12776 [05:30<1:29:07,  2.25it/s]  6%|▌         | 720/12776 [05:30<1:23:10,  2.42it/s]                                                       6%|▌         | 720/12776 [05:30<1:23:10,  2.42it/s]  6%|▌         | 721/12776 [05:30<1:24:12,  2.39it/s]                                                       6%|▌         | 721/12776 [05:30<1:24:12,  2.39it/s]  6%|▌         | 722/12776 [05:31<1:18:59,  2.54it/s]                                                       6%|▌         | 722/12776 [05:31<1:18:59,  2.54it/s]  6%|▌         | 723/12776 [05:31<1:14:53,  2.68it/s]                                                       6%|▌         | 723/12776 [05:31<1:14:53,  2.68it/s]  6%|▌         | 724/12776 [05:31<1:12:53,  2.76it/s]                                                       6%|▌         | 724/12776 [05:31<1:12:53,  2.76it/s]  6%|▌         | 725/12776 [05:32<1:09:31,  2.89it/s]                                                       6%|▌         | 725/12776 [05:32<1:09:31,  2.89it/s]  6%|▌         | 726/12776 [05:32<1:05:41,  3.06it/s]                                                       6%|▌         | 726/12776 [05:32<1:05:41,  3.06it/s]  6%|▌         | 727/12776 [05:32<1:02:51,  3.19it/s]                                                       6%|▌         | 727/12776 [05:32<1:02:51,  3.19it/s]  6%|▌         | 728/12776 [05:33<1:08:31,  2.93it/s]                                                       6%|▌         | 728/12776 [05:33<1:08:31,  2.93it/s]  6%|▌         | 729/12776 [05:33<1:04:08,  3.13it/s]                                                       6%|▌         | 729/12776 [05:33<1:04:08,  3.13it/s]  6%|▌         | 730/12776 [05:33<1:00:03,  3.34it/s]                                                     {'loss': 1.0688, 'grad_norm': 1.3749841451644897, 'learning_rate': 0.0002964076246334311, 'epoch': 0.1}
+{'loss': 1.0538, 'grad_norm': 1.0187915563583374, 'learning_rate': 0.0002963831867057673, 'epoch': 0.1}
+{'loss': 0.894, 'grad_norm': 0.9093784689903259, 'learning_rate': 0.0002963587487781036, 'epoch': 0.1}
+{'loss': 1.0852, 'grad_norm': 0.7980338931083679, 'learning_rate': 0.0002963343108504399, 'epoch': 0.1}
+{'loss': 0.9796, 'grad_norm': 1.0717920064926147, 'learning_rate': 0.00029630987292277613, 'epoch': 0.1}
+{'loss': 0.9234, 'grad_norm': 1.0452539920806885, 'learning_rate': 0.0002962854349951124, 'epoch': 0.1}
+{'loss': 1.0287, 'grad_norm': 1.031282901763916, 'learning_rate': 0.00029626099706744864, 'epoch': 0.1}
+{'loss': 0.8954, 'grad_norm': 1.1783666610717773, 'learning_rate': 0.00029623655913978494, 'epoch': 0.1}
+{'loss': 0.7738, 'grad_norm': 0.8439245223999023, 'learning_rate': 0.0002962121212121212, 'epoch': 0.1}
+{'loss': 0.8843, 'grad_norm': 0.914787769317627, 'learning_rate': 0.00029618768328445745, 'epoch': 0.1}
+{'loss': 1.0231, 'grad_norm': 1.0592864751815796, 'learning_rate': 0.00029616324535679375, 'epoch': 0.1}
+{'loss': 1.096, 'grad_norm': 1.1694962978363037, 'learning_rate': 0.00029613880742913, 'epoch': 0.1}
+{'loss': 0.9233, 'grad_norm': 2.1501312255859375, 'learning_rate': 0.00029611436950146625, 'epoch': 0.1}
+{'loss': 0.9509, 'grad_norm': 0.7547603249549866, 'learning_rate': 0.0002960899315738025, 'epoch': 0.1}
+{'loss': 0.865, 'grad_norm': 1.070226788520813, 'learning_rate': 0.00029606549364613876, 'epoch': 0.1}
+{'loss': 1.893, 'grad_norm': 3.9022319316864014, 'learning_rate': 0.00029604105571847506, 'epoch': 0.1}
+{'loss': 1.1844, 'grad_norm': 1.9020326137542725, 'learning_rate': 0.0002960166177908113, 'epoch': 0.1}
+{'loss': 1.0922, 'grad_norm': 1.318651795387268, 'learning_rate': 0.00029599217986314756, 'epoch': 0.1}
+{'loss': 1.0611, 'grad_norm': 1.523789405822754, 'learning_rate': 0.00029596774193548387, 'epoch': 0.1}
+{'loss': 1.0302, 'grad_norm': 1.3359650373458862, 'learning_rate': 0.0002959433040078201, 'epoch': 0.11}
+{'loss': 1.0434, 'grad_norm': 1.3721380233764648, 'learning_rate': 0.00029591886608015637, 'epoch': 0.11}
+{'loss': 1.4901, 'grad_norm': 1.5583863258361816, 'learning_rate': 0.0002958944281524926, 'epoch': 0.11}
+{'loss': 1.4394, 'grad_norm': 3.990788221359253, 'learning_rate': 0.00029586999022482893, 'epoch': 0.11}
+{'loss': 1.4246, 'grad_norm': 2.45947003364563, 'learning_rate': 0.0002958455522971652, 'epoch': 0.11}
+{'loss': 1.3557, 'grad_norm': 2.0882105827331543, 'learning_rate': 0.00029582111436950143, 'epoch': 0.11}
+{'loss': 0.9934, 'grad_norm': 1.8049339056015015, 'learning_rate': 0.00029579667644183774, 'epoch': 0.11}
+{'loss': 1.2822, 'grad_norm': 2.7573509216308594, 'learning_rate': 0.000295772238514174, 'epoch': 0.11}
+{'loss': 1.2641, 'grad_norm': 1.7646522521972656, 'learning_rate': 0.00029574780058651024, 'epoch': 0.11}
+{'loss': 1.6484, 'grad_norm': 3.6935081481933594, 'learning_rate': 0.00029572336265884654, 'epoch': 0.11}
+{'loss': 1.738, 'grad_norm': 3.90690541267395, 'learning_rate': 0.00029569892473118274, 'epoch': 0.11}
+{'loss': 1.2565, 'grad_norm': 3.629829168319702, 'learning_rate': 0.00029567448680351905, 'epoch': 0.11}
+{'loss': 1.3839, 'grad_norm': 2.447295904159546, 'learning_rate': 0.0002956500488758553, 'epoch': 0.11}
+{'loss': 1.2876, 'grad_norm': 3.1187667846679688, 'learning_rate': 0.00029562561094819155, 'epoch': 0.11}
+{'loss': 1.8006, 'grad_norm': 3.4593164920806885, 'learning_rate': 0.00029560117302052785, 'epoch': 0.11}
+{'loss': 1.7238, 'grad_norm': 2.576993465423584, 'learning_rate': 0.0002955767350928641, 'epoch': 0.11}
+{'loss': 1.8431, 'grad_norm': 4.6910319328308105, 'learning_rate': 0.00029555229716520036, 'epoch': 0.11}
+{'loss': 1.9383, 'grad_norm': 2.633889675140381, 'learning_rate': 0.0002955278592375366, 'epoch': 0.11}
+{'loss': 2.1373, 'grad_norm': 6.47677755355835, 'learning_rate': 0.0002955034213098729, 'epoch': 0.11}
+{'loss': 1.8526, 'grad_norm': 1.8168855905532837, 'learning_rate': 0.00029547898338220917, 'epoch': 0.11}
+{'loss': 1.632, 'grad_norm': 2.5182387828826904, 'learning_rate': 0.0002954545454545454, 'epoch': 0.11}
+{'loss': 2.1292, 'grad_norm': 2.240593671798706, 'learning_rate': 0.0002954301075268817, 'epoch': 0.11}
+{'loss': 1.9179, 'grad_norm': 2.15621018409729, 'learning_rate': 0.000295405669599218, 'epoch': 0.11}
+{'loss': 1.8236, 'grad_norm': 1.9122724533081055, 'learning_rate': 0.0002953812316715542, 'epoch': 0.11}
+{'loss': 2.0443, 'grad_norm': 2.539159059524536, 'learning_rate': 0.00029535679374389053, 'epoch': 0.11}
+{'loss': 1.0128, 'grad_norm': 1.3785215616226196, 'learning_rate': 0.0002953323558162267, 'epoch': 0.11}
+{'loss': 1.38, 'grad_norm': 4.100296974182129, 'learning_rate': 0.00029530791788856303, 'epoch': 0.11}
+{'loss': 1.4838, 'grad_norm': 3.4671149253845215, 'learning_rate': 0.0002952834799608993, 'epoch': 0.11}
+{'loss': 1.5185, 'grad_norm': 2.7299959659576416, 'learning_rate': 0.00029525904203323553, 'epoch': 0.11}
+{'loss': 1.625, 'grad_norm': 3.1657235622406006, 'learning_rate': 0.00029523460410557184, 'epoch': 0.11}
+{'loss': 1.2705, 'grad_norm': 1.4762587547302246, 'learning_rate': 0.0002952101661779081, 'epoch': 0.11}
+{'loss': 1.068, 'grad_norm': 1.150429368019104, 'learning_rate': 0.00029518572825024434, 'epoch': 0.11}
+{'loss': 1.1242, 'grad_norm': 1.3497153520584106, 'learning_rate': 0.00029516129032258065, 'epoch': 0.11}
+{'loss': 0.9952, 'grad_norm': 0.7703993320465088, 'learning_rate': 0.0002951368523949169, 'epoch': 0.11}
+{'loss': 0.9605, 'grad_norm': 1.7000598907470703, 'learning_rate': 0.00029511241446725315, 'epoch': 0.11}
+{'loss': 1.1336, 'grad_norm': 0.7951465845108032, 'learning_rate': 0.0002950879765395894, 'epoch': 0.11}
+{'loss': 0.7303, 'grad_norm': 0.787284791469574, 'learning_rate': 0.0002950635386119257, 'epoch': 0.11}
+{'loss': 0.8081, 'grad_norm': 1.1658583879470825, 'learning_rate': 0.00029503910068426196, 'epoch': 0.11}
+{'loss': 1.0007, 'grad_norm': 1.4762721061706543, 'learning_rate': 0.0002950146627565982, 'epoch': 0.11}
+{'loss': 0.799, 'grad_norm': 0.9633269309997559, 'learning_rate': 0.0002949902248289345, 'epoch': 0.11}
+{'loss': 1.012, 'grad_norm': 1.539566159248352, 'learning_rate': 0.00029496578690127077, 'epoch': 0.11}
+{'loss': 0.8838, 'grad_norm': 1.1010069847106934, 'learning_rate': 0.000294941348973607, 'epoch': 0.11}
+{'loss': 1.1716, 'grad_norm': 1.7431788444519043, 'learning_rate': 0.00029491691104594327, 'epoch': 0.11}
+{'loss': 1.2777, 'grad_norm': 1.6751221418380737, 'learning_rate': 0.0002948924731182795, 'epoch': 0.11}
+{'loss': 0.887, 'grad_norm': 1.2621047496795654, 'learning_rate': 0.0002948680351906158, 'epoch': 0.11}
+{'loss': 1.3467, 'grad_norm': 2.204878091812134, 'learning_rate': 0.0002948435972629521, 'epoch': 0.11}
+{'loss': 1.2575, 'grad_norm': 1.4875999689102173, 'learning_rate': 0.00029481915933528833, 'epoch': 0.11}
+{'loss': 1.4872, 'grad_norm': 3.0924177169799805, 'learning_rate': 0.00029479472140762463, 'epoch': 0.11}
+{'loss': 1.063, 'grad_norm': 2.1464388370513916, 'learning_rate': 0.0002947702834799609, 'epoch': 0.11}
+{'loss': 1.0376, 'grad_norm': 2.1253466606140137, 'learning_rate': 0.00029474584555229714, 'epoch': 0.11}
+{'loss': 1.3224, 'grad_norm': 1.9266527891159058, 'learning_rate': 0.0002947214076246334, 'epoch': 0.11}
+{'loss': 1.186, 'grad_norm': 1.8208385705947876, 'learning_rate': 0.0002946969696969697, 'epoch': 0.11}
+{'loss': 1.2369, 'grad_norm': 1.7924067974090576, 'learning_rate': 0.00029467253176930594, 'epoch': 0.11}
+{'loss': 1.1493, 'grad_norm': 1.3854628801345825, 'learning_rate': 0.0002946480938416422, 'epoch': 0.11}
+{'loss': 1.3814, 'grad_norm': 2.6812336444854736, 'learning_rate': 0.0002946236559139785, 'epoch': 0.11}
+{'loss': 1.117, 'grad_norm': 1.894299864768982, 'learning_rate': 0.00029459921798631475, 'epoch': 0.11}
+{'loss': 1.6541, 'grad_norm': 4.2252726554870605, 'learning_rate': 0.000294574780058651, 'epoch': 0.11}
+{'loss': 1.3064, 'grad_norm': 1.660989761352539, 'learning_rate': 0.0002945503421309873, 'epoch': 0.11}
+{'loss': 1.2461, 'grad_norm': 1.3125436305999756, 'learning_rate': 0.0002945259042033235, 'epoch': 0.11}
+  6%|▌         | 730/12776 [05:33<1:00:03,  3.34it/s]  6%|▌         | 731/12776 [05:34<57:26,  3.50it/s]                                                       6%|▌         | 731/12776 [05:34<57:26,  3.50it/s]  6%|▌         | 732/12776 [05:34<1:02:07,  3.23it/s]                                                       6%|▌         | 732/12776 [05:34<1:02:07,  3.23it/s]  6%|▌         | 733/12776 [05:34<57:54,  3.47it/s]                                                       6%|▌         | 733/12776 [05:34<57:54,  3.47it/s]  6%|▌         | 734/12776 [05:34<54:25,  3.69it/s]                                                     6%|▌         | 734/12776 [05:34<54:25,  3.69it/s]  6%|▌         | 735/12776 [05:35<51:49,  3.87it/s]                                                     6%|▌         | 735/12776 [05:35<51:49,  3.87it/s]  6%|▌         | 736/12776 [05:35<53:46,  3.73it/s]                                                     6%|▌         | 736/12776 [05:35<53:46,  3.73it/s]  6%|▌         | 737/12776 [05:35<50:38,  3.96it/s]                                                     6%|▌         | 737/12776 [05:35<50:38,  3.96it/s]  6%|▌         | 738/12776 [05:35<48:09,  4.17it/s]                                                     6%|▌         | 738/12776 [05:35<48:09,  4.17it/s]  6%|▌         | 739/12776 [05:36<46:11,  4.34it/s]                                                     6%|▌         | 739/12776 [05:36<46:11,  4.34it/s]  6%|▌         | 740/12776 [05:36<45:00,  4.46it/s]                                                     6%|▌         | 740/12776 [05:36<45:00,  4.46it/s]  6%|▌         | 741/12776 [05:36<50:11,  4.00it/s]                                                     6%|▌         | 741/12776 [05:36<50:11,  4.00it/s]  6%|▌         | 742/12776 [05:36<47:18,  4.24it/s]                                                     6%|▌         | 742/12776 [05:36<47:18,  4.24it/s]  6%|▌         | 743/12776 [05:36<45:14,  4.43it/s]                                                     6%|▌         | 743/12776 [05:36<45:14,  4.43it/s]  6%|▌         | 744/12776 [05:37<43:33,  4.60it/s]                                                     6%|▌         | 744/12776 [05:37<43:33,  4.60it/s]  6%|▌         | 745/12776 [05:37<42:10,  4.76it/s]                                                     6%|▌         | 745/12776 [05:37<42:10,  4.76it/s]  6%|▌         | 746/12776 [05:37<41:10,  4.87it/s]                                                     6%|▌         | 746/12776 [05:37<41:10,  4.87it/s]  6%|▌         | 747/12776 [05:37<44:27,  4.51it/s]                                                     6%|▌         | 747/12776 [05:37<44:27,  4.51it/s]  6%|▌         | 748/12776 [05:37<42:11,  4.75it/s]                                                     6%|▌         | 748/12776 [05:37<42:11,  4.75it/s]  6%|▌         | 749/12776 [05:38<40:37,  4.93it/s]                                                     6%|▌         | 749/12776 [05:38<40:37,  4.93it/s]  6%|▌         | 750/12776 [05:38<1:13:25,  2.73it/s]                                                       6%|▌         | 750/12776 [05:38<1:13:25,  2.73it/s]  6%|▌         | 751/12776 [05:40<2:18:33,  1.45it/s]                                                       6%|▌         | 751/12776 [05:40<2:18:33,  1.45it/s]  6%|▌         | 752/12776 [05:41<2:37:32,  1.27it/s]                                                       6%|▌         | 752/12776 [05:41<2:37:32,  1.27it/s]  6%|▌         | 753/12776 [05:42<2:45:27,  1.21it/s]                                                       6%|▌         | 753/12776 [05:42<2:45:27,  1.21it/s]  6%|▌         | 754/12776 [05:43<2:44:25,  1.22it/s]                                                       6%|▌         | 754/12776 [05:43<2:44:25,  1.22it/s]  6%|▌         | 755/12776 [05:43<2:41:21,  1.24it/s]                                                       6%|▌         | 755/12776 [05:43<2:41:21,  1.24it/s]  6%|▌         | 756/12776 [05:44<2:36:29,  1.28it/s]                                                       6%|▌         | 756/12776 [05:44<2:36:29,  1.28it/s]  6%|▌         | 757/12776 [05:45<2:30:06,  1.33it/s]                                                       6%|▌         | 757/12776 [05:45<2:30:06,  1.33it/s]  6%|▌         | 758/12776 [05:46<2:32:44,  1.31it/s]                                                       6%|▌         | 758/12776 [05:46<2:32:44,  1.31it/s]  6%|▌         | 759/12776 [05:46<2:23:43,  1.39it/s]                                                       6%|▌         | 759/12776 [05:46<2:23:43,  1.39it/s]  6%|▌         | 760/12776 [05:47<2:17:47,  1.45it/s]                                                       6%|▌         | 760/12776 [05:47<2:17:47,  1.45it/s]  6%|▌         | 761/12776 [05:47<2:09:16,  1.55it/s]                                                       6%|▌         | 761/12776 [05:47<2:09:16,  1.55it/s]  6%|▌         | 762/12776 [05:48<2:05:15,  1.60it/s]                                                       6%|▌         | 762/12776 [05:48<2:05:15,  1.60it/s]  6%|▌         | 763/12776 [05:48<1:58:58,  1.68it/s]                                                       6%|▌         | 763/12776 [05:48<1:58:58,  1.68it/s]  6%|▌         | 764/12776 [05:49<1:54:12,  1.75it/s]                                                       6%|▌         | 764/12776 [05:49<1:54:12,  1.75it/s]  6%|▌         | 765/12776 [05:49<1:46:37,  1.88it/s]                                                       6%|▌         | 765/12776 [05:49<1:46:37,  1.88it/s]  6%|▌         | 766/12776 [05:50<1:44:56,  1.91it/s]                                                       6%|▌         | 766/12776 [05:50<1:44:56,  1.91it/s]  6%|▌         | 767/12776 [05:50<1:37:39,  2.05it/s]                                                       6%|▌         | 767/12776 [05:50<1:37:39,  2.05it/s]  6%|▌         | 768/12776 [05:51<1:31:41,  2.18it/s]                                                       6%|▌         | 768/12776 [05:51<1:31:41,  2.18it/s]  6%|▌         | 769/12776 [05:51<1:35:10,  2.10it/s]                                                       6%|▌         | 769/12776 [05:51<1:35:10,  2.10it/s]  6%|▌         | 770/12776 [05:52<1:28:31,  2.26it/s]                                                       6%|▌         | 770/12776 [05:52<1:28:31,  2.26it/s]  6%|▌         | 771/12776 [05:52<1:22:37,  2.42it/s]                                                       6%|▌         | 771/12776 [05:52<1:22:37,  2.42it/s]  6%|▌         | 772/12776 [05:52<1:23:13,  2.40it/s]                                                       6%|▌         | 772/12776 [05:52<1:23:13,  2.40it/s]  6%|▌         | 773/12776 [05:53<1:18:29,  2.55it/s]                                                       6%|▌         | 773/12776 [05:53<1:18:29,  2.55it/s]  6%|▌         | 774/12776 [05:53<1:14:47,  2.67it/s]                                                       6%|▌         | 774/12776 [05:53<1:14:47,  2.67it/s]  6%|▌         | 775/12776 [05:53<1:11:39,  2.79it/s]                                                       6%|▌         | 775/12776 [05:53<1:11:39,  2.79it/s]  6%|▌         | 776/12776 [05:54<1:07:44,  2.95it/s]                                                       6%|▌         | 776/12776 [05:54<1:07:44,  2.95it/s]  6%|▌         | 777/12776 [05:54<1:04:44,  3.09it/s]                                                       6%|▌         | 777/12776 [05:54<1:04:44,  3.09it/s]  6%|▌         | 778/12776 [05:54<1:02:19,  3.21it/s]                                                       6%|▌         | 778/12776 [05:54<1:02:19,  3.21it/s]  6%|▌         | 779/12776 [05:55<1:06:38,  3.00it/s]                                                       6%|▌         | 779/12776 [05:55<1:06:38,  3.00it/s]  6%|▌         | 780/12776 [05:55<1:02:34,  3.20it/s]                                                       6%|▌         | 780/12776 [05:55<1:02:34,  3.20it/s]  6%|▌         | 781/12776 [05:55<59:19,  3.37it/s]                                                       6%|▌         | 781/12776 [05:55<59:19,  3.37it/s]  6%|▌         | 782/12776 [05:55<56:28,  3.54it/s]                                                     6%|▌         | 782/12776 [05:55<56:28,  3.54it/s]  6%|▌         | 783/12776 [05:56<1:01:34,  3.25it/s]                                                       6%|▌         | 783/12776 [05:56<1:01:34,  3.25it/s]  6%|▌         | 784/12776 [05:56<57:27,  3.48it/s]                                                       6%|▌         | 784/12776 [05:56<57:27,  3.48it/s]  6%|▌         | 785/12776 [05:56<53:57,  3.70it/s]                                                     6%|▌         | 785/12776 [05:56<53:57,  3.70it/s]  6%|▌         | 786/12776 [05:56<51:04,  3.91it/s]                                                     6%|▌         | 786/12776 [05:56<51:04,  3.91it/s]  6%|▌         | 787/12776 [05:57<54:23,  3.67it/s]                                                     6%|▌         | 787/12776 [05:57<54:23,  3.67it/s]  6%|▌         | 788/12776 [05:57<50:46,  3.93it/s]                                                     6%|▌         | 788/12776 [05:57<50:46,  3.93it/s]  6%|▌         | 789/12776 [05:57<48:06,  4.15it/s]                                                     6%|▌         | 789/12776 [05:57<48:06,  4.15it/s]  6%|▌         | 790/12776 [05:57<46:07,  4.33it/s]                                                     6%|▌         | 790/12776 [05:57<46:07,  4.33it/s]  6%|▌         | 791/12776 [05:58<44:44,  4.47it/s]                                                     6%|▌         | 791/12776 [05:58<44:44,  4.47it/s]  6%|▌         | 792/12776 [05:58<48:14,  4.14it/s]                                                     6%|▌         | 792/12776 [05:58<48:14,  4.14it/s]  6%|▌         | 793/12776 [05:58<45:36,  4.38it/s]                                                     6%|▌         | 793/12776 [05:58<45:36,  4.38it/s]  6%|▌         | 794/12776 [05:58<43:34,  4.58it/s]                                                     6%|▌         | 794/12776 [05:58<43:34,  4.58it/s]  6%|▌         | 795/12776 [05:58<42:04,  4.75it/s]                                                     6%|▌         | 795/12776 [05:58<42:04,  4.75it/s]  6%|▌         | 796/12776 [05:59<40:57,  4.87it/s]                                                     6%|▌         | 796/12776 [05:59<40:57,  4.87it/s]  6%|▌         | 797/12776 [05:59<39:56,  5.00it/s]                                                     6%|▌         | 797/12776 [05:59<39:56,  5.00it/s]  6%|▌         | 798/12776 [05:59<44:59,  4.44it/s]                                                     6%|▌         | 798/12776 [05:59<44:59,  4.44it/s]  6%|▋         | 799/12776 [05:59<42:15,  4.72it/s]                                                     6%|▋         | 799/12776 [05:59<42:15,  4.72it/s]  6%|▋         | 800/12776 [06:00<1:19:01,  2.53it/s]                                                       6%|▋         | 800/12776 [06:00<1:19:01,  2.53it/s]Saving model checkpoint to ./checkpoint-800
+Configuration saved in ./checkpoint-800/config.json
+Model weights saved in ./checkpoint-800/model.safetensors
+Feature extractor saved in ./checkpoint-800/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-800/tokenizer_config.json
+Special tokens file saved in ./checkpoint-800/special_tokens_map.json
+added tokens file saved in ./checkpoint-800/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+  6%|▋         | 801/12776 [06:07<7:26:14,  2.24s/it]                                                       6%|▋         | 801/12776 [06:07<7:26:14,  2.24s/it]  6%|▋         | 802/12776 [06:08<6:12:39,  1.87s/it]                                                       6%|▋         | 802/12776 [06:08<6:12:39,  1.87s/it]  6%|▋         | 803/12776 [06:09<5:16:09,  1.58s/it]                                                       6%|▋         | 803/12776 [06:09<5:16:09,  1.58s/it]  6%|▋         | 804/12776 [06:10<4:37:47,  1.39s/it]                                                       6%|▋         | 804/12776 [06:10<4:37:47,  1.39s/it]  6%|▋         | 805/12776 [06:10<4:03:00,  1.22s/it]                                                       6%|▋         | 805/12776 [06:10<4:03:00,  1.22s/it]  6%|▋         | 806/12776 [06:11<3:30:41,  1.06s/it]                                                       6%|▋         | 806/12776 [06:11<3:30:41,  1.06s/it]  6%|▋         | 807/12776 [06:12<3:11:16,  1.04it/s]                                                       6%|▋         | 807/12776 [06:12<3:11:16,  1.04it/s]  6%|▋         | 808/12776 [06:12<2:50:26,  1.17it/s]                                                     {'loss': 1.4757, 'grad_norm': 1.556378722190857, 'learning_rate': 0.0002945014662756598, 'epoch': 0.11}
+{'loss': 1.472, 'grad_norm': 1.8206431865692139, 'learning_rate': 0.00029447702834799606, 'epoch': 0.11}
+{'loss': 1.6409, 'grad_norm': 2.837712287902832, 'learning_rate': 0.0002944525904203323, 'epoch': 0.11}
+{'loss': 1.618, 'grad_norm': 2.3481979370117188, 'learning_rate': 0.0002944281524926686, 'epoch': 0.11}
+{'loss': 1.1409, 'grad_norm': 2.627960443496704, 'learning_rate': 0.00029440371456500487, 'epoch': 0.11}
+{'loss': 1.4743, 'grad_norm': 1.7479578256607056, 'learning_rate': 0.0002943792766373411, 'epoch': 0.12}
+{'loss': 1.8053, 'grad_norm': 2.662626028060913, 'learning_rate': 0.0002943548387096774, 'epoch': 0.12}
+{'loss': 1.8668, 'grad_norm': 3.780766487121582, 'learning_rate': 0.0002943304007820137, 'epoch': 0.12}
+{'loss': 1.9487, 'grad_norm': 3.494009017944336, 'learning_rate': 0.00029430596285434993, 'epoch': 0.12}
+{'loss': 1.8814, 'grad_norm': 2.3752057552337646, 'learning_rate': 0.0002942815249266862, 'epoch': 0.12}
+{'loss': 1.9308, 'grad_norm': 3.852254629135132, 'learning_rate': 0.0002942570869990225, 'epoch': 0.12}
+{'loss': 1.4905, 'grad_norm': 1.938531756401062, 'learning_rate': 0.00029423264907135874, 'epoch': 0.12}
+{'loss': 1.3597, 'grad_norm': 3.246478319168091, 'learning_rate': 0.000294208211143695, 'epoch': 0.12}
+{'loss': 1.1944, 'grad_norm': 2.4071877002716064, 'learning_rate': 0.0002941837732160313, 'epoch': 0.12}
+{'loss': 1.3349, 'grad_norm': 1.2850818634033203, 'learning_rate': 0.0002941593352883675, 'epoch': 0.12}
+{'loss': 0.904, 'grad_norm': 2.346933126449585, 'learning_rate': 0.0002941348973607038, 'epoch': 0.12}
+{'loss': 2.2503, 'grad_norm': 3.941329002380371, 'learning_rate': 0.00029411045943304005, 'epoch': 0.12}
+{'loss': 1.5694, 'grad_norm': 2.6362099647521973, 'learning_rate': 0.0002940860215053763, 'epoch': 0.12}
+{'loss': 1.9898, 'grad_norm': 4.092523097991943, 'learning_rate': 0.0002940615835777126, 'epoch': 0.12}
+{'loss': 1.589, 'grad_norm': 3.4004411697387695, 'learning_rate': 0.00029403714565004886, 'epoch': 0.12}
+{'loss': 1.6423, 'grad_norm': 3.5993781089782715, 'learning_rate': 0.0002940127077223851, 'epoch': 0.12}
+{'loss': 1.0251, 'grad_norm': 0.811040461063385, 'learning_rate': 0.0002939882697947214, 'epoch': 0.12}
+{'loss': 0.9569, 'grad_norm': 0.8086991310119629, 'learning_rate': 0.00029396383186705766, 'epoch': 0.12}
+{'loss': 0.8385, 'grad_norm': 0.6266233325004578, 'learning_rate': 0.0002939393939393939, 'epoch': 0.12}
+{'loss': 1.0404, 'grad_norm': 2.2221486568450928, 'learning_rate': 0.00029391495601173017, 'epoch': 0.12}
+{'loss': 0.8026, 'grad_norm': 0.7100171446800232, 'learning_rate': 0.00029389051808406647, 'epoch': 0.12}
+{'loss': 0.8826, 'grad_norm': 1.3321337699890137, 'learning_rate': 0.0002938660801564027, 'epoch': 0.12}
+{'loss': 0.7613, 'grad_norm': 0.7835633754730225, 'learning_rate': 0.000293841642228739, 'epoch': 0.12}
+{'loss': 0.6241, 'grad_norm': 0.6961237788200378, 'learning_rate': 0.0002938172043010753, 'epoch': 0.12}
+{'loss': 0.8578, 'grad_norm': 1.128588080406189, 'learning_rate': 0.0002937927663734115, 'epoch': 0.12}
+{'loss': 0.9736, 'grad_norm': 0.9707612991333008, 'learning_rate': 0.0002937683284457478, 'epoch': 0.12}
+{'loss': 0.8492, 'grad_norm': 0.9797849655151367, 'learning_rate': 0.00029374389051808403, 'epoch': 0.12}
+{'loss': 0.7293, 'grad_norm': 1.5107133388519287, 'learning_rate': 0.0002937194525904203, 'epoch': 0.12}
+{'loss': 0.7209, 'grad_norm': 1.1908730268478394, 'learning_rate': 0.0002936950146627566, 'epoch': 0.12}
+{'loss': 0.8672, 'grad_norm': 1.0134694576263428, 'learning_rate': 0.00029367057673509284, 'epoch': 0.12}
+{'loss': 0.8697, 'grad_norm': 1.0121101140975952, 'learning_rate': 0.0002936461388074291, 'epoch': 0.12}
+{'loss': 1.0135, 'grad_norm': 0.948444128036499, 'learning_rate': 0.0002936217008797654, 'epoch': 0.12}
+{'loss': 0.7778, 'grad_norm': 1.0948662757873535, 'learning_rate': 0.0002935972629521016, 'epoch': 0.12}
+{'loss': 1.1036, 'grad_norm': 2.0492300987243652, 'learning_rate': 0.0002935728250244379, 'epoch': 0.12}
+{'loss': 0.9884, 'grad_norm': 1.7745238542556763, 'learning_rate': 0.00029354838709677415, 'epoch': 0.12}
+{'loss': 1.0637, 'grad_norm': 1.6761794090270996, 'learning_rate': 0.0002935239491691104, 'epoch': 0.12}
+{'loss': 0.8567, 'grad_norm': 1.1787947416305542, 'learning_rate': 0.0002934995112414467, 'epoch': 0.12}
+{'loss': 1.1482, 'grad_norm': 2.7158119678497314, 'learning_rate': 0.00029347507331378296, 'epoch': 0.12}
+{'loss': 1.0865, 'grad_norm': 1.5714401006698608, 'learning_rate': 0.0002934506353861192, 'epoch': 0.12}
+{'loss': 1.1607, 'grad_norm': 1.6347661018371582, 'learning_rate': 0.0002934261974584555, 'epoch': 0.12}
+{'loss': 1.1489, 'grad_norm': 2.3367502689361572, 'learning_rate': 0.00029340175953079177, 'epoch': 0.12}
+{'loss': 1.0683, 'grad_norm': 2.6965532302856445, 'learning_rate': 0.000293377321603128, 'epoch': 0.12}
+{'loss': 1.4028, 'grad_norm': 1.9645942449569702, 'learning_rate': 0.00029335288367546427, 'epoch': 0.12}
+{'loss': 1.2004, 'grad_norm': 2.6310694217681885, 'learning_rate': 0.0002933284457478006, 'epoch': 0.12}
+{'loss': 1.4612, 'grad_norm': 1.9831539392471313, 'learning_rate': 0.00029330400782013683, 'epoch': 0.12}
+{'loss': 1.4779, 'grad_norm': 2.4186670780181885, 'learning_rate': 0.0002932795698924731, 'epoch': 0.12}
+{'loss': 1.713, 'grad_norm': 4.081188201904297, 'learning_rate': 0.0002932551319648094, 'epoch': 0.12}
+{'loss': 2.0171, 'grad_norm': 2.4842689037323, 'learning_rate': 0.00029323069403714564, 'epoch': 0.12}
+{'loss': 1.212, 'grad_norm': 2.6195974349975586, 'learning_rate': 0.0002932062561094819, 'epoch': 0.12}
+{'loss': 1.7433, 'grad_norm': 2.697697162628174, 'learning_rate': 0.00029318181818181814, 'epoch': 0.12}
+{'loss': 1.8923, 'grad_norm': 2.484013319015503, 'learning_rate': 0.0002931573802541544, 'epoch': 0.12}
+{'loss': 2.1281, 'grad_norm': 4.01732873916626, 'learning_rate': 0.0002931329423264907, 'epoch': 0.12}
+{'loss': 2.1824, 'grad_norm': 2.3246097564697266, 'learning_rate': 0.00029310850439882695, 'epoch': 0.12}
+{'loss': 1.5211, 'grad_norm': 3.7635695934295654, 'learning_rate': 0.0002930840664711632, 'epoch': 0.12}
+{'loss': 1.4717, 'grad_norm': 2.600083827972412, 'learning_rate': 0.0002930596285434995, 'epoch': 0.12}
+{'loss': 1.7544, 'grad_norm': 4.064560890197754, 'learning_rate': 0.00029303519061583575, 'epoch': 0.12}
+{'loss': 1.9283, 'grad_norm': 4.859133720397949, 'learning_rate': 0.000293010752688172, 'epoch': 0.12}
+{'loss': 1.0601, 'grad_norm': 2.1018145084381104, 'learning_rate': 0.00029298631476050826, 'epoch': 0.12}
+{'loss': 1.8295, 'grad_norm': 2.4449431896209717, 'learning_rate': 0.00029296187683284456, 'epoch': 0.12}
+{'loss': 1.2816, 'grad_norm': 2.4376046657562256, 'learning_rate': 0.0002929374389051808, 'epoch': 0.12}
+{'loss': 1.9171, 'grad_norm': 3.415478229522705, 'learning_rate': 0.00029291300097751706, 'epoch': 0.12}
+{'loss': 1.2723, 'grad_norm': 3.3648712635040283, 'learning_rate': 0.00029288856304985337, 'epoch': 0.12}
+{'loss': 1.2449, 'grad_norm': 3.2523117065429688, 'learning_rate': 0.0002928641251221896, 'epoch': 0.12}
+{'loss': 1.2996, 'grad_norm': 2.2647125720977783, 'learning_rate': 0.00029283968719452587, 'epoch': 0.12}
+{'loss': 1.4211, 'grad_norm': 2.5591723918914795, 'learning_rate': 0.0002928152492668622, 'epoch': 0.13}
+{'loss': 1.3324, 'grad_norm': 1.7322767972946167, 'learning_rate': 0.0002927908113391984, 'epoch': 0.13}
+{'loss': 1.0354, 'grad_norm': 1.5625592470169067, 'learning_rate': 0.0002927663734115347, 'epoch': 0.13}
+{'loss': 0.7138, 'grad_norm': 1.2151803970336914, 'learning_rate': 0.00029274193548387093, 'epoch': 0.13}
+{'loss': 0.6718, 'grad_norm': 0.8814300894737244, 'learning_rate': 0.0002927174975562072, 'epoch': 0.13}
+{'loss': 0.645, 'grad_norm': 0.789391279220581, 'learning_rate': 0.0002926930596285435, 'epoch': 0.13}
+{'loss': 0.6388, 'grad_norm': 0.6633252501487732, 'learning_rate': 0.00029266862170087974, 'epoch': 0.13}
+{'loss': 0.8446, 'grad_norm': 0.9132680892944336, 'learning_rate': 0.000292644183773216, 'epoch': 0.13}
+{'loss': 0.6193, 'grad_norm': 0.9453927278518677, 'learning_rate': 0.00029261974584555224, 'epoch': 0.13}
+  6%|▋         | 808/12776 [06:12<2:50:26,  1.17it/s]  6%|▋         | 809/12776 [06:13<2:35:30,  1.28it/s]                                                       6%|▋         | 809/12776 [06:13<2:35:30,  1.28it/s]  6%|▋         | 810/12776 [06:13<2:20:49,  1.42it/s]                                                       6%|▋         | 810/12776 [06:13<2:20:49,  1.42it/s]  6%|▋         | 811/12776 [06:14<2:11:43,  1.51it/s]                                                       6%|▋         | 811/12776 [06:14<2:11:43,  1.51it/s]  6%|▋         | 812/12776 [06:15<2:01:59,  1.63it/s]                                                       6%|▋         | 812/12776 [06:15<2:01:59,  1.63it/s]  6%|▋         | 813/12776 [06:15<1:58:45,  1.68it/s]                                                       6%|▋         | 813/12776 [06:15<1:58:45,  1.68it/s]  6%|▋         | 814/12776 [06:16<1:48:30,  1.84it/s]                                                       6%|▋         | 814/12776 [06:16<1:48:30,  1.84it/s]  6%|▋         | 815/12776 [06:16<1:45:31,  1.89it/s]                                                       6%|▋         | 815/12776 [06:16<1:45:31,  1.89it/s]  6%|▋         | 816/12776 [06:16<1:38:26,  2.02it/s]                                                       6%|▋         | 816/12776 [06:16<1:38:26,  2.02it/s]  6%|▋         | 817/12776 [06:17<1:31:40,  2.17it/s]                                                       6%|▋         | 817/12776 [06:17<1:31:40,  2.17it/s]  6%|▋         | 818/12776 [06:17<1:34:24,  2.11it/s]                                                       6%|▋         | 818/12776 [06:17<1:34:24,  2.11it/s]  6%|▋         | 819/12776 [06:18<1:26:57,  2.29it/s]                                                       6%|▋         | 819/12776 [06:18<1:26:57,  2.29it/s]  6%|▋         | 820/12776 [06:18<1:20:34,  2.47it/s]                                                       6%|▋         | 820/12776 [06:18<1:20:34,  2.47it/s]  6%|▋         | 821/12776 [06:18<1:23:34,  2.38it/s]                                                       6%|▋         | 821/12776 [06:18<1:23:34,  2.38it/s]  6%|▋         | 822/12776 [06:19<1:17:16,  2.58it/s]                                                       6%|▋         | 822/12776 [06:19<1:17:16,  2.58it/s]  6%|▋         | 823/12776 [06:19<1:12:26,  2.75it/s]                                                       6%|▋         | 823/12776 [06:19<1:12:26,  2.75it/s]  6%|▋         | 824/12776 [06:19<1:13:17,  2.72it/s]                                                       6%|▋         | 824/12776 [06:19<1:13:17,  2.72it/s]  6%|▋         | 825/12776 [06:20<1:07:57,  2.93it/s]                                                       6%|▋         | 825/12776 [06:20<1:07:57,  2.93it/s]  6%|▋         | 826/12776 [06:20<1:03:45,  3.12it/s]                                                       6%|▋         | 826/12776 [06:20<1:03:45,  3.12it/s]  6%|▋         | 827/12776 [06:20<1:00:23,  3.30it/s]                                                       6%|▋         | 827/12776 [06:20<1:00:23,  3.30it/s]  6%|▋         | 828/12776 [06:21<1:01:13,  3.25it/s]                                                       6%|▋         | 828/12776 [06:21<1:01:13,  3.25it/s]  6%|▋         | 829/12776 [06:21<57:48,  3.44it/s]                                                       6%|▋         | 829/12776 [06:21<57:48,  3.44it/s]  6%|▋         | 830/12776 [06:21<54:59,  3.62it/s]                                                     6%|▋         | 830/12776 [06:21<54:59,  3.62it/s]  7%|▋         | 831/12776 [06:21<52:12,  3.81it/s]                                                     7%|▋         | 831/12776 [06:21<52:12,  3.81it/s]  7%|▋         | 832/12776 [06:22<50:00,  3.98it/s]                                                     7%|▋         | 832/12776 [06:22<50:00,  3.98it/s]  7%|▋         | 833/12776 [06:22<53:28,  3.72it/s]                                                     7%|▋         | 833/12776 [06:22<53:28,  3.72it/s]  7%|▋         | 834/12776 [06:22<50:15,  3.96it/s]                                                     7%|▋         | 834/12776 [06:22<50:15,  3.96it/s]  7%|▋         | 835/12776 [06:22<47:39,  4.18it/s]                                                     7%|▋         | 835/12776 [06:22<47:39,  4.18it/s]  7%|▋         | 836/12776 [06:22<45:32,  4.37it/s]                                                     7%|▋         | 836/12776 [06:22<45:32,  4.37it/s]  7%|▋         | 837/12776 [06:23<43:50,  4.54it/s]                                                     7%|▋         | 837/12776 [06:23<43:50,  4.54it/s]  7%|▋         | 838/12776 [06:23<47:53,  4.15it/s]                                                     7%|▋         | 838/12776 [06:23<47:53,  4.15it/s]  7%|▋         | 839/12776 [06:23<44:53,  4.43it/s]                                                     7%|▋         | 839/12776 [06:23<44:53,  4.43it/s]  7%|▋         | 840/12776 [06:23<42:47,  4.65it/s]                                                     7%|▋         | 840/12776 [06:23<42:47,  4.65it/s]  7%|▋         | 841/12776 [06:24<41:03,  4.85it/s]                                                     7%|▋         | 841/12776 [06:24<41:03,  4.85it/s]  7%|▋         | 842/12776 [06:24<39:40,  5.01it/s]                                                     7%|▋         | 842/12776 [06:24<39:40,  5.01it/s]  7%|▋         | 843/12776 [06:24<45:07,  4.41it/s]                                                     7%|▋         | 843/12776 [06:24<45:07,  4.41it/s]  7%|▋         | 844/12776 [06:24<42:12,  4.71it/s]                                                     7%|▋         | 844/12776 [06:24<42:12,  4.71it/s]  7%|▋         | 845/12776 [06:24<39:59,  4.97it/s]                                                     7%|▋         | 845/12776 [06:24<39:59,  4.97it/s]  7%|▋         | 846/12776 [06:25<38:22,  5.18it/s]                                                     7%|▋         | 846/12776 [06:25<38:22,  5.18it/s]  7%|▋         | 847/12776 [06:25<37:00,  5.37it/s]                                                     7%|▋         | 847/12776 [06:25<37:00,  5.37it/s]  7%|▋         | 848/12776 [06:25<35:42,  5.57it/s]                                                     7%|▋         | 848/12776 [06:25<35:42,  5.57it/s]  7%|▋         | 849/12776 [06:25<40:22,  4.92it/s]                                                     7%|▋         | 849/12776 [06:25<40:22,  4.92it/s]  7%|▋         | 850/12776 [06:26<1:10:46,  2.81it/s]                                                       7%|▋         | 850/12776 [06:26<1:10:46,  2.81it/s]  7%|▋         | 851/12776 [06:27<2:25:46,  1.36it/s]                                                       7%|▋         | 851/12776 [06:27<2:25:46,  1.36it/s]  7%|▋         | 852/12776 [06:29<2:46:12,  1.20it/s]                                                       7%|▋         | 852/12776 [06:29<2:46:12,  1.20it/s]  7%|▋         | 853/12776 [06:29<2:47:33,  1.19it/s]                                                       7%|▋         | 853/12776 [06:29<2:47:33,  1.19it/s]  7%|▋         | 854/12776 [06:30<2:45:28,  1.20it/s]                                                       7%|▋         | 854/12776 [06:30<2:45:28,  1.20it/s]  7%|▋         | 855/12776 [06:31<2:49:33,  1.17it/s]                                                       7%|▋         | 855/12776 [06:31<2:49:33,  1.17it/s]  7%|▋         | 856/12776 [06:32<2:40:14,  1.24it/s]                                                       7%|▋         | 856/12776 [06:32<2:40:14,  1.24it/s]  7%|▋         | 857/12776 [06:32<2:30:27,  1.32it/s]                                                       7%|▋         | 857/12776 [06:32<2:30:27,  1.32it/s]  7%|▋         | 858/12776 [06:33<2:24:31,  1.37it/s]                                                       7%|▋         | 858/12776 [06:33<2:24:31,  1.37it/s]  7%|▋         | 859/12776 [06:34<2:16:23,  1.46it/s]                                                       7%|▋         | 859/12776 [06:34<2:16:23,  1.46it/s]  7%|▋         | 860/12776 [06:34<2:09:26,  1.53it/s]                                                       7%|▋         | 860/12776 [06:34<2:09:26,  1.53it/s]  7%|▋         | 861/12776 [06:35<2:02:19,  1.62it/s]                                                       7%|▋         | 861/12776 [06:35<2:02:19,  1.62it/s]  7%|▋         | 862/12776 [06:35<2:02:44,  1.62it/s]                                                       7%|▋         | 862/12776 [06:35<2:02:44,  1.62it/s]  7%|▋         | 863/12776 [06:36<1:55:07,  1.72it/s]                                                       7%|▋         | 863/12776 [06:36<1:55:07,  1.72it/s]  7%|▋         | 864/12776 [06:36<1:48:11,  1.84it/s]                                                       7%|▋         | 864/12776 [06:36<1:48:11,  1.84it/s]  7%|▋         | 865/12776 [06:37<1:42:05,  1.94it/s]                                                       7%|▋         | 865/12776 [06:37<1:42:05,  1.94it/s]  7%|▋         | 866/12776 [06:37<1:35:01,  2.09it/s]                                                       7%|▋         | 866/12776 [06:37<1:35:01,  2.09it/s]  7%|▋         | 867/12776 [06:38<1:39:00,  2.00it/s]                                                       7%|▋         | 867/12776 [06:38<1:39:00,  2.00it/s]  7%|▋         | 868/12776 [06:38<1:32:19,  2.15it/s]                                                       7%|▋         | 868/12776 [06:38<1:32:19,  2.15it/s]  7%|▋         | 869/12776 [06:38<1:27:05,  2.28it/s]                                                       7%|▋         | 869/12776 [06:38<1:27:05,  2.28it/s]  7%|▋         | 870/12776 [06:39<1:24:13,  2.36it/s]                                                       7%|▋         | 870/12776 [06:39<1:24:13,  2.36it/s]  7%|▋         | 871/12776 [06:39<1:19:54,  2.48it/s]                                                       7%|▋         | 871/12776 [06:39<1:19:54,  2.48it/s]  7%|▋         | 872/12776 [06:40<1:16:34,  2.59it/s]                                                       7%|▋         | 872/12776 [06:40<1:16:34,  2.59it/s]  7%|▋         | 873/12776 [06:40<1:20:02,  2.48it/s]                                                       7%|▋         | 873/12776 [06:40<1:20:02,  2.48it/s]  7%|▋         | 874/12776 [06:40<1:15:21,  2.63it/s]                                                       7%|▋         | 874/12776 [06:40<1:15:21,  2.63it/s]  7%|▋         | 875/12776 [06:41<1:11:11,  2.79it/s]                                                       7%|▋         | 875/12776 [06:41<1:11:11,  2.79it/s]  7%|▋         | 876/12776 [06:41<1:07:14,  2.95it/s]                                                       7%|▋         | 876/12776 [06:41<1:07:14,  2.95it/s]  7%|▋         | 877/12776 [06:41<1:09:00,  2.87it/s]                                                       7%|▋         | 877/12776 [06:41<1:09:00,  2.87it/s]  7%|▋         | 878/12776 [06:42<1:05:01,  3.05it/s]                                                       7%|▋         | 878/12776 [06:42<1:05:01,  3.05it/s]  7%|▋         | 879/12776 [06:42<1:01:22,  3.23it/s]                                                       7%|▋         | 879/12776 [06:42<1:01:22,  3.23it/s]  7%|▋         | 880/12776 [06:42<58:38,  3.38it/s]                                                       7%|▋         | 880/12776 [06:42<58:38,  3.38it/s]  7%|▋         | 881/12776 [06:42<1:02:41,  3.16it/s]                                                       7%|▋         | 881/12776 [06:42<1:02:41,  3.16it/s]  7%|▋         | 882/12776 [06:43<59:03,  3.36it/s]                                                       7%|▋         | 882/12776 [06:43<59:03,  3.36it/s]  7%|▋         | 883/12776 [06:43<56:00,  3.54it/s]                                                     7%|▋         | 883/12776 [06:43<56:00,  3.54it/s]  7%|▋         | 884/12776 [06:43<54:33,  3.63it/s]                                                     7%|▋         | 884/12776 [06:43<54:33,  3.63it/s]  7%|▋         | 885/12776 [06:44<57:44,  3.43it/s]                                                   {'loss': 0.8318, 'grad_norm': 1.8710074424743652, 'learning_rate': 0.00029259530791788855, 'epoch': 0.13}
+{'loss': 0.6593, 'grad_norm': 1.0109128952026367, 'learning_rate': 0.0002925708699902248, 'epoch': 0.13}
+{'loss': 0.7262, 'grad_norm': 1.1044448614120483, 'learning_rate': 0.00029254643206256105, 'epoch': 0.13}
+{'loss': 0.9756, 'grad_norm': 1.3992559909820557, 'learning_rate': 0.00029252199413489736, 'epoch': 0.13}
+{'loss': 0.7762, 'grad_norm': 1.5335423946380615, 'learning_rate': 0.0002924975562072336, 'epoch': 0.13}
+{'loss': 0.8961, 'grad_norm': 1.3663071393966675, 'learning_rate': 0.00029247311827956986, 'epoch': 0.13}
+{'loss': 0.8597, 'grad_norm': 1.9370208978652954, 'learning_rate': 0.00029244868035190616, 'epoch': 0.13}
+{'loss': 1.0157, 'grad_norm': 1.6557672023773193, 'learning_rate': 0.00029242424242424236, 'epoch': 0.13}
+{'loss': 0.9113, 'grad_norm': 1.4072177410125732, 'learning_rate': 0.00029239980449657867, 'epoch': 0.13}
+{'loss': 0.8022, 'grad_norm': 1.654747724533081, 'learning_rate': 0.0002923753665689149, 'epoch': 0.13}
+{'loss': 0.7312, 'grad_norm': 1.677295207977295, 'learning_rate': 0.00029235092864125117, 'epoch': 0.13}
+{'loss': 0.9197, 'grad_norm': 1.2632160186767578, 'learning_rate': 0.0002923264907135875, 'epoch': 0.13}
+{'loss': 1.268, 'grad_norm': 1.6950252056121826, 'learning_rate': 0.0002923020527859237, 'epoch': 0.13}
+{'loss': 0.9591, 'grad_norm': 1.9522795677185059, 'learning_rate': 0.00029227761485826, 'epoch': 0.13}
+{'loss': 1.0732, 'grad_norm': 2.316823959350586, 'learning_rate': 0.0002922531769305963, 'epoch': 0.13}
+{'loss': 1.3269, 'grad_norm': 2.60040283203125, 'learning_rate': 0.00029222873900293253, 'epoch': 0.13}
+{'loss': 1.3893, 'grad_norm': 2.1569325923919678, 'learning_rate': 0.0002922043010752688, 'epoch': 0.13}
+{'loss': 1.3369, 'grad_norm': 3.817329168319702, 'learning_rate': 0.00029217986314760504, 'epoch': 0.13}
+{'loss': 0.9969, 'grad_norm': 1.4851608276367188, 'learning_rate': 0.00029215542521994134, 'epoch': 0.13}
+{'loss': 1.6128, 'grad_norm': 2.9426980018615723, 'learning_rate': 0.0002921309872922776, 'epoch': 0.13}
+{'loss': 0.8915, 'grad_norm': 1.6474707126617432, 'learning_rate': 0.00029210654936461384, 'epoch': 0.13}
+{'loss': 1.0587, 'grad_norm': 2.1298272609710693, 'learning_rate': 0.00029208211143695015, 'epoch': 0.13}
+{'loss': 1.544, 'grad_norm': 2.768442153930664, 'learning_rate': 0.00029205767350928635, 'epoch': 0.13}
+{'loss': 1.0509, 'grad_norm': 2.516650676727295, 'learning_rate': 0.00029203323558162265, 'epoch': 0.13}
+{'loss': 1.4115, 'grad_norm': 2.4558215141296387, 'learning_rate': 0.0002920087976539589, 'epoch': 0.13}
+{'loss': 1.4077, 'grad_norm': 2.536071300506592, 'learning_rate': 0.00029198435972629515, 'epoch': 0.13}
+{'loss': 1.7084, 'grad_norm': 3.4631125926971436, 'learning_rate': 0.00029195992179863146, 'epoch': 0.13}
+{'loss': 1.1893, 'grad_norm': 2.3752031326293945, 'learning_rate': 0.0002919354838709677, 'epoch': 0.13}
+{'loss': 1.0723, 'grad_norm': 1.6237187385559082, 'learning_rate': 0.00029191104594330396, 'epoch': 0.13}
+{'loss': 1.6326, 'grad_norm': 2.3249778747558594, 'learning_rate': 0.00029188660801564027, 'epoch': 0.13}
+{'loss': 1.4018, 'grad_norm': 2.8457751274108887, 'learning_rate': 0.0002918621700879765, 'epoch': 0.13}
+{'loss': 2.1144, 'grad_norm': 3.0899088382720947, 'learning_rate': 0.00029183773216031277, 'epoch': 0.13}
+{'loss': 1.9739, 'grad_norm': 4.969150543212891, 'learning_rate': 0.000291813294232649, 'epoch': 0.13}
+{'loss': 1.7328, 'grad_norm': 2.7233245372772217, 'learning_rate': 0.0002917888563049853, 'epoch': 0.13}
+{'loss': 1.5687, 'grad_norm': 2.1063811779022217, 'learning_rate': 0.0002917644183773216, 'epoch': 0.13}
+{'loss': 1.8963, 'grad_norm': 1.7449232339859009, 'learning_rate': 0.00029173998044965783, 'epoch': 0.13}
+{'loss': 1.5947, 'grad_norm': 2.4782533645629883, 'learning_rate': 0.00029171554252199413, 'epoch': 0.13}
+{'loss': 2.6498, 'grad_norm': 2.6746630668640137, 'learning_rate': 0.0002916911045943304, 'epoch': 0.13}
+{'loss': 1.5459, 'grad_norm': 2.479112148284912, 'learning_rate': 0.00029166666666666664, 'epoch': 0.13}
+{'loss': 0.6981, 'grad_norm': 1.7579686641693115, 'learning_rate': 0.0002916422287390029, 'epoch': 0.13}
+{'loss': 1.307, 'grad_norm': 1.6483287811279297, 'learning_rate': 0.00029161779081133914, 'epoch': 0.13}
+{'loss': 1.0251, 'grad_norm': 2.1981475353240967, 'learning_rate': 0.00029159335288367544, 'epoch': 0.13}
+{'loss': 1.0604, 'grad_norm': 2.7301342487335205, 'learning_rate': 0.0002915689149560117, 'epoch': 0.13}
+{'loss': 1.1503, 'grad_norm': 1.2275153398513794, 'learning_rate': 0.00029154447702834795, 'epoch': 0.13}
+{'loss': 0.8434, 'grad_norm': 0.9788094758987427, 'learning_rate': 0.00029152003910068425, 'epoch': 0.13}
+{'loss': 0.6953, 'grad_norm': 0.7427651286125183, 'learning_rate': 0.0002914956011730205, 'epoch': 0.13}
+{'loss': 0.6854, 'grad_norm': 0.8753380179405212, 'learning_rate': 0.00029147116324535676, 'epoch': 0.13}
+{'loss': 0.5981, 'grad_norm': 1.1522316932678223, 'learning_rate': 0.000291446725317693, 'epoch': 0.13}
+{'loss': 0.534, 'grad_norm': 0.9101697206497192, 'learning_rate': 0.0002914222873900293, 'epoch': 0.13}
+{'loss': 0.6715, 'grad_norm': 1.2821955680847168, 'learning_rate': 0.00029139784946236556, 'epoch': 0.13}
+{'loss': 0.8797, 'grad_norm': 1.4305757284164429, 'learning_rate': 0.0002913734115347018, 'epoch': 0.13}
+{'loss': 0.8174, 'grad_norm': 1.9941664934158325, 'learning_rate': 0.0002913489736070381, 'epoch': 0.13}
+{'loss': 0.6398, 'grad_norm': 1.482197880744934, 'learning_rate': 0.00029132453567937437, 'epoch': 0.13}
+{'loss': 0.6547, 'grad_norm': 1.1646844148635864, 'learning_rate': 0.0002913000977517106, 'epoch': 0.13}
+{'loss': 0.7803, 'grad_norm': 1.6788074970245361, 'learning_rate': 0.00029127565982404693, 'epoch': 0.13}
+{'loss': 1.1557, 'grad_norm': 2.2960433959960938, 'learning_rate': 0.0002912512218963831, 'epoch': 0.14}
+{'loss': 0.9451, 'grad_norm': 2.2098381519317627, 'learning_rate': 0.00029122678396871943, 'epoch': 0.14}
+{'loss': 0.9505, 'grad_norm': 1.51131272315979, 'learning_rate': 0.0002912023460410557, 'epoch': 0.14}
+{'loss': 0.7682, 'grad_norm': 0.9991070032119751, 'learning_rate': 0.00029117790811339193, 'epoch': 0.14}
+{'loss': 1.3066, 'grad_norm': 2.670018196105957, 'learning_rate': 0.00029115347018572824, 'epoch': 0.14}
+{'loss': 0.9315, 'grad_norm': 1.4845349788665771, 'learning_rate': 0.0002911290322580645, 'epoch': 0.14}
+{'loss': 0.6735, 'grad_norm': 1.2157962322235107, 'learning_rate': 0.00029110459433040074, 'epoch': 0.14}
+{'loss': 1.0187, 'grad_norm': 1.6585222482681274, 'learning_rate': 0.00029108015640273705, 'epoch': 0.14}
+{'loss': 1.0663, 'grad_norm': 1.76893949508667, 'learning_rate': 0.0002910557184750733, 'epoch': 0.14}
+{'loss': 0.7532, 'grad_norm': 0.9581948518753052, 'learning_rate': 0.00029103128054740955, 'epoch': 0.14}
+{'loss': 0.8671, 'grad_norm': 2.227740526199341, 'learning_rate': 0.0002910068426197458, 'epoch': 0.14}
+{'loss': 1.1649, 'grad_norm': 3.511131525039673, 'learning_rate': 0.0002909824046920821, 'epoch': 0.14}
+{'loss': 1.1585, 'grad_norm': 2.581437826156616, 'learning_rate': 0.00029095796676441836, 'epoch': 0.14}
+{'loss': 1.295, 'grad_norm': 1.5962125062942505, 'learning_rate': 0.0002909335288367546, 'epoch': 0.14}
+{'loss': 1.5455, 'grad_norm': 2.135364055633545, 'learning_rate': 0.0002909090909090909, 'epoch': 0.14}
+{'loss': 1.6572, 'grad_norm': 2.586113452911377, 'learning_rate': 0.0002908846529814271, 'epoch': 0.14}
+{'loss': 1.721, 'grad_norm': 3.2886767387390137, 'learning_rate': 0.0002908602150537634, 'epoch': 0.14}
+{'loss': 0.9838, 'grad_norm': 2.5917325019836426, 'learning_rate': 0.00029083577712609967, 'epoch': 0.14}
+{'loss': 1.3576, 'grad_norm': 2.5062735080718994, 'learning_rate': 0.0002908113391984359, 'epoch': 0.14}
+{'loss': 1.2969, 'grad_norm': 3.9728665351867676, 'learning_rate': 0.0002907869012707722, 'epoch': 0.14}
+{'loss': 1.8066, 'grad_norm': 2.223118305206299, 'learning_rate': 0.0002907624633431085, 'epoch': 0.14}
+{'loss': 1.1451, 'grad_norm': 3.254514694213867, 'learning_rate': 0.0002907380254154447, 'epoch': 0.14}
+  7%|▋         | 885/12776 [06:44<57:44,  3.43it/s]  7%|▋         | 886/12776 [06:44<53:53,  3.68it/s]                                                     7%|▋         | 886/12776 [06:44<53:53,  3.68it/s]  7%|▋         | 887/12776 [06:44<50:44,  3.90it/s]                                                     7%|▋         | 887/12776 [06:44<50:44,  3.90it/s]  7%|▋         | 888/12776 [06:44<48:11,  4.11it/s]                                                     7%|▋         | 888/12776 [06:44<48:11,  4.11it/s]  7%|▋         | 889/12776 [06:44<46:10,  4.29it/s]                                                     7%|▋         | 889/12776 [06:44<46:10,  4.29it/s]  7%|▋         | 890/12776 [06:45<51:20,  3.86it/s]                                                     7%|▋         | 890/12776 [06:45<51:20,  3.86it/s]  7%|▋         | 891/12776 [06:45<48:13,  4.11it/s]                                                     7%|▋         | 891/12776 [06:45<48:13,  4.11it/s]  7%|▋         | 892/12776 [06:45<45:52,  4.32it/s]                                                     7%|▋         | 892/12776 [06:45<45:52,  4.32it/s]  7%|▋         | 893/12776 [06:45<45:15,  4.38it/s]                                                     7%|▋         | 893/12776 [06:45<45:15,  4.38it/s]  7%|▋         | 894/12776 [06:46<43:35,  4.54it/s]                                                     7%|▋         | 894/12776 [06:46<43:35,  4.54it/s]  7%|▋         | 895/12776 [06:46<45:10,  4.38it/s]                                                     7%|▋         | 895/12776 [06:46<45:10,  4.38it/s]  7%|▋         | 896/12776 [06:46<43:25,  4.56it/s]                                                     7%|▋         | 896/12776 [06:46<43:25,  4.56it/s]  7%|▋         | 897/12776 [06:46<41:44,  4.74it/s]                                                     7%|▋         | 897/12776 [06:46<41:44,  4.74it/s]  7%|▋         | 898/12776 [06:46<40:15,  4.92it/s]                                                     7%|▋         | 898/12776 [06:46<40:15,  4.92it/s]  7%|▋         | 899/12776 [06:47<39:01,  5.07it/s]                                                     7%|▋         | 899/12776 [06:47<39:01,  5.07it/s]  7%|▋         | 900/12776 [06:47<1:10:38,  2.80it/s]                                                       7%|▋         | 900/12776 [06:47<1:10:38,  2.80it/s]  7%|▋         | 901/12776 [06:49<2:14:27,  1.47it/s]                                                       7%|▋         | 901/12776 [06:49<2:14:27,  1.47it/s]  7%|▋         | 902/12776 [06:50<2:30:54,  1.31it/s]                                                       7%|▋         | 902/12776 [06:50<2:30:54,  1.31it/s]  7%|▋         | 903/12776 [06:51<2:38:41,  1.25it/s]                                                       7%|▋         | 903/12776 [06:51<2:38:41,  1.25it/s]  7%|▋         | 904/12776 [06:51<2:41:38,  1.22it/s]                                                       7%|▋         | 904/12776 [06:51<2:41:38,  1.22it/s]  7%|▋         | 905/12776 [06:52<2:39:42,  1.24it/s]                                                       7%|▋         | 905/12776 [06:52<2:39:42,  1.24it/s]  7%|▋         | 906/12776 [06:53<2:36:00,  1.27it/s]                                                       7%|▋         | 906/12776 [06:53<2:36:00,  1.27it/s]  7%|▋         | 907/12776 [06:54<2:32:26,  1.30it/s]                                                       7%|▋         | 907/12776 [06:54<2:32:26,  1.30it/s]  7%|▋         | 908/12776 [06:54<2:24:46,  1.37it/s]                                                       7%|▋         | 908/12776 [06:54<2:24:46,  1.37it/s]  7%|▋         | 909/12776 [06:55<2:25:31,  1.36it/s]                                                       7%|▋         | 909/12776 [06:55<2:25:31,  1.36it/s]  7%|▋         | 910/12776 [06:56<2:16:39,  1.45it/s]                                                       7%|▋         | 910/12776 [06:56<2:16:39,  1.45it/s]  7%|▋         | 911/12776 [06:56<2:11:43,  1.50it/s]                                                       7%|▋         | 911/12776 [06:56<2:11:43,  1.50it/s]  7%|▋         | 912/12776 [06:57<2:04:32,  1.59it/s]                                                       7%|▋         | 912/12776 [06:57<2:04:32,  1.59it/s]  7%|▋         | 913/12776 [06:57<2:00:42,  1.64it/s]                                                       7%|▋         | 913/12776 [06:57<2:00:42,  1.64it/s]  7%|▋         | 914/12776 [06:58<1:53:09,  1.75it/s]                                                       7%|▋         | 914/12776 [06:58<1:53:09,  1.75it/s]  7%|▋         | 915/12776 [06:58<1:49:24,  1.81it/s]                                                       7%|▋         | 915/12776 [06:58<1:49:24,  1.81it/s]  7%|▋         | 916/12776 [06:59<1:42:33,  1.93it/s]                                                       7%|▋         | 916/12776 [06:59<1:42:33,  1.93it/s]  7%|▋         | 917/12776 [06:59<1:45:24,  1.87it/s]                                                       7%|▋         | 917/12776 [06:59<1:45:24,  1.87it/s]  7%|▋         | 918/12776 [07:00<1:37:40,  2.02it/s]                                                       7%|▋         | 918/12776 [07:00<1:37:40,  2.02it/s]  7%|▋         | 919/12776 [07:00<1:32:01,  2.15it/s]                                                       7%|▋         | 919/12776 [07:00<1:32:01,  2.15it/s]  7%|▋         | 920/12776 [07:01<1:36:01,  2.06it/s]                                                       7%|▋         | 920/12776 [07:01<1:36:01,  2.06it/s]  7%|▋         | 921/12776 [07:01<1:27:50,  2.25it/s]                                                       7%|▋         | 921/12776 [07:01<1:27:50,  2.25it/s]  7%|▋         | 922/12776 [07:01<1:21:53,  2.41it/s]                                                       7%|▋         | 922/12776 [07:01<1:21:53,  2.41it/s]  7%|▋         | 923/12776 [07:02<1:23:38,  2.36it/s]                                                       7%|▋         | 923/12776 [07:02<1:23:38,  2.36it/s]  7%|▋         | 924/12776 [07:02<1:17:14,  2.56it/s]                                                       7%|▋         | 924/12776 [07:02<1:17:14,  2.56it/s]  7%|▋         | 925/12776 [07:03<1:11:49,  2.75it/s]                                                       7%|▋         | 925/12776 [07:03<1:11:49,  2.75it/s]  7%|▋         | 926/12776 [07:03<1:11:55,  2.75it/s]                                                       7%|▋         | 926/12776 [07:03<1:11:55,  2.75it/s]  7%|▋         | 927/12776 [07:03<1:07:03,  2.94it/s]                                                       7%|▋         | 927/12776 [07:03<1:07:03,  2.94it/s]  7%|▋         | 928/12776 [07:03<1:03:09,  3.13it/s]                                                       7%|▋         | 928/12776 [07:03<1:03:09,  3.13it/s]  7%|▋         | 929/12776 [07:04<59:52,  3.30it/s]                                                       7%|▋         | 929/12776 [07:04<59:52,  3.30it/s]  7%|▋         | 930/12776 [07:04<1:02:16,  3.17it/s]                                                       7%|▋         | 930/12776 [07:04<1:02:16,  3.17it/s]  7%|▋         | 931/12776 [07:04<58:30,  3.37it/s]                                                       7%|▋         | 931/12776 [07:04<58:30,  3.37it/s]  7%|▋         | 932/12776 [07:05<55:29,  3.56it/s]                                                     7%|▋         | 932/12776 [07:05<55:29,  3.56it/s]  7%|▋         | 933/12776 [07:05<53:53,  3.66it/s]                                                     7%|▋         | 933/12776 [07:05<53:53,  3.66it/s]  7%|▋         | 934/12776 [07:05<51:29,  3.83it/s]                                                     7%|▋         | 934/12776 [07:05<51:29,  3.83it/s]  7%|▋         | 935/12776 [07:05<51:32,  3.83it/s]                                                     7%|▋         | 935/12776 [07:05<51:32,  3.83it/s]  7%|▋         | 936/12776 [07:06<49:08,  4.01it/s]                                                     7%|▋         | 936/12776 [07:06<49:08,  4.01it/s]  7%|▋         | 937/12776 [07:06<47:33,  4.15it/s]                                                     7%|▋         | 937/12776 [07:06<47:33,  4.15it/s]  7%|▋         | 938/12776 [07:06<46:02,  4.28it/s]                                                     7%|▋         | 938/12776 [07:06<46:02,  4.28it/s]  7%|▋         | 939/12776 [07:06<44:33,  4.43it/s]                                                     7%|▋         | 939/12776 [07:06<44:33,  4.43it/s]  7%|▋         | 940/12776 [07:06<48:18,  4.08it/s]                                                     7%|▋         | 940/12776 [07:06<48:18,  4.08it/s]  7%|▋         | 941/12776 [07:07<46:03,  4.28it/s]                                                     7%|▋         | 941/12776 [07:07<46:03,  4.28it/s]  7%|▋         | 942/12776 [07:07<44:18,  4.45it/s]                                                     7%|▋         | 942/12776 [07:07<44:18,  4.45it/s]  7%|▋         | 943/12776 [07:07<42:58,  4.59it/s]                                                     7%|▋         | 943/12776 [07:07<42:58,  4.59it/s]  7%|▋         | 944/12776 [07:07<41:55,  4.70it/s]                                                     7%|▋         | 944/12776 [07:07<41:55,  4.70it/s]  7%|▋         | 945/12776 [07:08<45:41,  4.32it/s]                                                     7%|▋         | 945/12776 [07:08<45:41,  4.32it/s]  7%|▋         | 946/12776 [07:08<43:29,  4.53it/s]                                                     7%|▋         | 946/12776 [07:08<43:29,  4.53it/s]  7%|▋         | 947/12776 [07:08<41:50,  4.71it/s]                                                     7%|▋         | 947/12776 [07:08<41:50,  4.71it/s]  7%|▋         | 948/12776 [07:08<40:21,  4.89it/s]                                                     7%|▋         | 948/12776 [07:08<40:21,  4.89it/s]  7%|▋         | 949/12776 [07:08<39:16,  5.02it/s]                                                     7%|▋         | 949/12776 [07:08<39:16,  5.02it/s]  7%|▋         | 950/12776 [07:09<1:09:29,  2.84it/s]                                                       7%|▋         | 950/12776 [07:09<1:09:29,  2.84it/s]  7%|▋         | 951/12776 [07:10<2:04:36,  1.58it/s]                                                       7%|▋         | 951/12776 [07:10<2:04:36,  1.58it/s]  7%|▋         | 952/12776 [07:11<2:28:49,  1.32it/s]                                                       7%|▋         | 952/12776 [07:11<2:28:49,  1.32it/s]  7%|▋         | 953/12776 [07:12<2:36:10,  1.26it/s]                                                       7%|▋         | 953/12776 [07:12<2:36:10,  1.26it/s]  7%|▋         | 954/12776 [07:13<2:38:15,  1.24it/s]                                                       7%|▋         | 954/12776 [07:13<2:38:15,  1.24it/s]  7%|▋         | 955/12776 [07:14<2:34:10,  1.28it/s]                                                       7%|▋         | 955/12776 [07:14<2:34:10,  1.28it/s]  7%|▋         | 956/12776 [07:14<2:27:17,  1.34it/s]                                                       7%|▋         | 956/12776 [07:14<2:27:17,  1.34it/s]  7%|▋         | 957/12776 [07:15<2:21:03,  1.40it/s]                                                       7%|▋         | 957/12776 [07:15<2:21:03,  1.40it/s]  7%|▋         | 958/12776 [07:16<2:24:08,  1.37it/s]                                                       7%|▋         | 958/12776 [07:16<2:24:08,  1.37it/s]  8%|▊         | 959/12776 [07:16<2:15:36,  1.45it/s]                                                       8%|▊         | 959/12776 [07:16<2:15:36,  1.45it/s]  8%|▊         | 960/12776 [07:17<2:10:08,  1.51it/s]                                                       8%|▊         | 960/12776 [07:17<2:10:08,  1.51it/s]  8%|▊         | 961/12776 [07:18<2:02:44,  1.60it/s]                                                       8%|▊         | 961/12776 [07:18<2:02:44,  1.60it/s]  8%|▊         | 962/12776 [07:18<1:59:00,  1.65it/s]                                                       8%|▊         | 962/12776 [07:18<1:59:00,  1.65it/s]  8%|▊         | 963/12776 [07:19<1:53:14,  1.74it/s]                                                     {'loss': 1.7801, 'grad_norm': 7.31671667098999, 'learning_rate': 0.00029071358748778103, 'epoch': 0.14}
+{'loss': 2.0279, 'grad_norm': 3.2049777507781982, 'learning_rate': 0.0002906891495601173, 'epoch': 0.14}
+{'loss': 1.5702, 'grad_norm': 3.6071975231170654, 'learning_rate': 0.00029066471163245353, 'epoch': 0.14}
+{'loss': 1.576, 'grad_norm': 2.798166275024414, 'learning_rate': 0.0002906402737047898, 'epoch': 0.14}
+{'loss': 1.8335, 'grad_norm': 6.099799633026123, 'learning_rate': 0.0002906158357771261, 'epoch': 0.14}
+{'loss': 1.548, 'grad_norm': 2.965550184249878, 'learning_rate': 0.00029059139784946234, 'epoch': 0.14}
+{'loss': 1.9851, 'grad_norm': 2.5119330883026123, 'learning_rate': 0.0002905669599217986, 'epoch': 0.14}
+{'loss': 1.4258, 'grad_norm': 5.480480194091797, 'learning_rate': 0.0002905425219941349, 'epoch': 0.14}
+{'loss': 1.541, 'grad_norm': 3.603321075439453, 'learning_rate': 0.00029051808406647115, 'epoch': 0.14}
+{'loss': 2.0883, 'grad_norm': 4.580733776092529, 'learning_rate': 0.0002904936461388074, 'epoch': 0.14}
+{'loss': 1.5889, 'grad_norm': 2.0983352661132812, 'learning_rate': 0.00029046920821114365, 'epoch': 0.14}
+{'loss': 1.6588, 'grad_norm': 2.2917943000793457, 'learning_rate': 0.0002904447702834799, 'epoch': 0.14}
+{'loss': 1.3828, 'grad_norm': 3.498607873916626, 'learning_rate': 0.0002904203323558162, 'epoch': 0.14}
+{'loss': 1.1676, 'grad_norm': 2.6161184310913086, 'learning_rate': 0.00029039589442815246, 'epoch': 0.14}
+{'loss': 1.2942, 'grad_norm': 3.8199074268341064, 'learning_rate': 0.0002903714565004887, 'epoch': 0.14}
+{'loss': 1.5531, 'grad_norm': 1.9904534816741943, 'learning_rate': 0.000290347018572825, 'epoch': 0.14}
+{'loss': 0.8606, 'grad_norm': 0.8975843191146851, 'learning_rate': 0.00029032258064516127, 'epoch': 0.14}
+{'loss': 0.7228, 'grad_norm': 0.7283688187599182, 'learning_rate': 0.0002902981427174975, 'epoch': 0.14}
+{'loss': 0.7472, 'grad_norm': 0.6623715162277222, 'learning_rate': 0.00029027370478983377, 'epoch': 0.14}
+{'loss': 0.5521, 'grad_norm': 0.7772536873817444, 'learning_rate': 0.0002902492668621701, 'epoch': 0.14}
+{'loss': 0.7454, 'grad_norm': 0.9219233989715576, 'learning_rate': 0.00029022482893450633, 'epoch': 0.14}
+{'loss': 0.7557, 'grad_norm': 0.8416048884391785, 'learning_rate': 0.0002902003910068426, 'epoch': 0.14}
+{'loss': 0.4868, 'grad_norm': 0.8040184378623962, 'learning_rate': 0.0002901759530791789, 'epoch': 0.14}
+{'loss': 0.6294, 'grad_norm': 0.9409207105636597, 'learning_rate': 0.00029015151515151514, 'epoch': 0.14}
+{'loss': 0.8302, 'grad_norm': 1.9797828197479248, 'learning_rate': 0.0002901270772238514, 'epoch': 0.14}
+{'loss': 1.0439, 'grad_norm': 1.6101166009902954, 'learning_rate': 0.0002901026392961877, 'epoch': 0.14}
+{'loss': 0.6344, 'grad_norm': 1.9140416383743286, 'learning_rate': 0.0002900782013685239, 'epoch': 0.14}
+{'loss': 0.6403, 'grad_norm': 0.9907447695732117, 'learning_rate': 0.0002900537634408602, 'epoch': 0.14}
+{'loss': 0.7338, 'grad_norm': 2.3162434101104736, 'learning_rate': 0.00029002932551319645, 'epoch': 0.14}
+{'loss': 0.6497, 'grad_norm': 1.7492724657058716, 'learning_rate': 0.0002900048875855327, 'epoch': 0.14}
+{'loss': 0.7873, 'grad_norm': 1.4608025550842285, 'learning_rate': 0.000289980449657869, 'epoch': 0.14}
+{'loss': 0.836, 'grad_norm': 2.1269257068634033, 'learning_rate': 0.00028995601173020525, 'epoch': 0.14}
+{'loss': 1.3075, 'grad_norm': 3.4742913246154785, 'learning_rate': 0.0002899315738025415, 'epoch': 0.14}
+{'loss': 0.5859, 'grad_norm': 2.038989305496216, 'learning_rate': 0.00028990713587487776, 'epoch': 0.14}
+{'loss': 0.925, 'grad_norm': 2.4943230152130127, 'learning_rate': 0.00028988269794721406, 'epoch': 0.14}
+{'loss': 1.0819, 'grad_norm': 1.818659782409668, 'learning_rate': 0.0002898582600195503, 'epoch': 0.14}
+{'loss': 0.7397, 'grad_norm': 2.3081328868865967, 'learning_rate': 0.00028983382209188656, 'epoch': 0.14}
+{'loss': 0.7526, 'grad_norm': 2.66398024559021, 'learning_rate': 0.00028980938416422287, 'epoch': 0.14}
+{'loss': 1.3631, 'grad_norm': 1.6944574117660522, 'learning_rate': 0.0002897849462365591, 'epoch': 0.14}
+{'loss': 1.302, 'grad_norm': 3.4251976013183594, 'learning_rate': 0.00028976050830889537, 'epoch': 0.14}
+{'loss': 0.9843, 'grad_norm': 6.1071062088012695, 'learning_rate': 0.0002897360703812317, 'epoch': 0.14}
+{'loss': 1.802, 'grad_norm': 4.849330902099609, 'learning_rate': 0.0002897116324535679, 'epoch': 0.14}
+{'loss': 1.2331, 'grad_norm': 3.4698123931884766, 'learning_rate': 0.0002896871945259042, 'epoch': 0.15}
+{'loss': 1.5486, 'grad_norm': 2.4112536907196045, 'learning_rate': 0.00028966275659824043, 'epoch': 0.15}
+{'loss': 1.6323, 'grad_norm': 3.808506965637207, 'learning_rate': 0.0002896383186705767, 'epoch': 0.15}
+{'loss': 1.4004, 'grad_norm': 2.8173084259033203, 'learning_rate': 0.000289613880742913, 'epoch': 0.15}
+{'loss': 1.2148, 'grad_norm': 2.575488805770874, 'learning_rate': 0.00028958944281524924, 'epoch': 0.15}
+{'loss': 1.6757, 'grad_norm': 2.422727108001709, 'learning_rate': 0.0002895650048875855, 'epoch': 0.15}
+{'loss': 0.7642, 'grad_norm': 1.202463984489441, 'learning_rate': 0.0002895405669599218, 'epoch': 0.15}
+{'loss': 1.628, 'grad_norm': 2.1168575286865234, 'learning_rate': 0.00028951612903225805, 'epoch': 0.15}
+{'loss': 1.873, 'grad_norm': 5.3358917236328125, 'learning_rate': 0.0002894916911045943, 'epoch': 0.15}
+{'loss': 1.5203, 'grad_norm': 4.962363243103027, 'learning_rate': 0.00028946725317693055, 'epoch': 0.15}
+{'loss': 1.3402, 'grad_norm': 2.7353806495666504, 'learning_rate': 0.00028944281524926686, 'epoch': 0.15}
+{'loss': 1.7889, 'grad_norm': 2.9888954162597656, 'learning_rate': 0.0002894183773216031, 'epoch': 0.15}
+{'loss': 1.7595, 'grad_norm': 2.7625157833099365, 'learning_rate': 0.00028939393939393936, 'epoch': 0.15}
+{'loss': 1.1726, 'grad_norm': 1.8934247493743896, 'learning_rate': 0.00028936950146627566, 'epoch': 0.15}
+{'loss': 1.0545, 'grad_norm': 2.089092969894409, 'learning_rate': 0.0002893450635386119, 'epoch': 0.15}
+{'loss': 1.4845, 'grad_norm': 2.7847442626953125, 'learning_rate': 0.00028932062561094817, 'epoch': 0.15}
+{'loss': 1.5035, 'grad_norm': 5.1051740646362305, 'learning_rate': 0.0002892961876832844, 'epoch': 0.15}
+{'loss': 1.6538, 'grad_norm': 4.7735161781311035, 'learning_rate': 0.00028927174975562067, 'epoch': 0.15}
+{'loss': 1.2094, 'grad_norm': 2.4726431369781494, 'learning_rate': 0.000289247311827957, 'epoch': 0.15}
+{'loss': 1.1057, 'grad_norm': 3.6386454105377197, 'learning_rate': 0.0002892228739002932, 'epoch': 0.15}
+{'loss': 0.9869, 'grad_norm': 2.5107581615448, 'learning_rate': 0.0002891984359726295, 'epoch': 0.15}
+{'loss': 1.413, 'grad_norm': 2.7597451210021973, 'learning_rate': 0.0002891739980449658, 'epoch': 0.15}
+{'loss': 1.6703, 'grad_norm': 5.333527565002441, 'learning_rate': 0.00028914956011730203, 'epoch': 0.15}
+{'loss': 1.8701, 'grad_norm': 2.9326138496398926, 'learning_rate': 0.0002891251221896383, 'epoch': 0.15}
+{'loss': 0.9318, 'grad_norm': 1.1659024953842163, 'learning_rate': 0.00028910068426197454, 'epoch': 0.15}
+{'loss': 0.824, 'grad_norm': 0.989399790763855, 'learning_rate': 0.00028907624633431084, 'epoch': 0.15}
+{'loss': 0.9731, 'grad_norm': 0.8183377385139465, 'learning_rate': 0.0002890518084066471, 'epoch': 0.15}
+{'loss': 0.6404, 'grad_norm': 0.9016216993331909, 'learning_rate': 0.00028902737047898334, 'epoch': 0.15}
+{'loss': 0.7309, 'grad_norm': 0.7788622975349426, 'learning_rate': 0.00028900293255131965, 'epoch': 0.15}
+{'loss': 0.8296, 'grad_norm': 0.9964921474456787, 'learning_rate': 0.0002889784946236559, 'epoch': 0.15}
+{'loss': 0.7825, 'grad_norm': 1.0400006771087646, 'learning_rate': 0.00028895405669599215, 'epoch': 0.15}
+{'loss': 0.6284, 'grad_norm': 1.093723177909851, 'learning_rate': 0.00028892961876832846, 'epoch': 0.15}
+{'loss': 0.606, 'grad_norm': 0.8602253198623657, 'learning_rate': 0.00028890518084066465, 'epoch': 0.15}
+{'loss': 0.6316, 'grad_norm': 1.0207772254943848, 'learning_rate': 0.00028888074291300096, 'epoch': 0.15}
+{'loss': 0.9523, 'grad_norm': 1.8470277786254883, 'learning_rate': 0.0002888563049853372, 'epoch': 0.15}
+{'loss': 0.7024, 'grad_norm': 1.5962973833084106, 'learning_rate': 0.00028883186705767346, 'epoch': 0.15}
+  8%|▊         | 963/12776 [07:19<1:53:14,  1.74it/s]  8%|▊         | 964/12776 [07:19<1:49:48,  1.79it/s]                                                       8%|▊         | 964/12776 [07:19<1:49:48,  1.79it/s]  8%|▊         | 965/12776 [07:20<1:43:33,  1.90it/s]                                                       8%|▊         | 965/12776 [07:20<1:43:33,  1.90it/s]  8%|▊         | 966/12776 [07:20<1:41:39,  1.94it/s]                                                       8%|▊         | 966/12776 [07:20<1:41:39,  1.94it/s]  8%|▊         | 967/12776 [07:21<1:35:23,  2.06it/s]                                                       8%|▊         | 967/12776 [07:21<1:35:23,  2.06it/s]  8%|▊         | 968/12776 [07:21<1:30:01,  2.19it/s]                                                       8%|▊         | 968/12776 [07:21<1:30:01,  2.19it/s]  8%|▊         | 969/12776 [07:21<1:32:16,  2.13it/s]                                                       8%|▊         | 969/12776 [07:21<1:32:16,  2.13it/s]  8%|▊         | 970/12776 [07:22<1:25:35,  2.30it/s]                                                       8%|▊         | 970/12776 [07:22<1:25:35,  2.30it/s]  8%|▊         | 971/12776 [07:22<1:20:18,  2.45it/s]                                                       8%|▊         | 971/12776 [07:22<1:20:18,  2.45it/s]  8%|▊         | 972/12776 [07:23<1:22:11,  2.39it/s]                                                       8%|▊         | 972/12776 [07:23<1:22:11,  2.39it/s]  8%|▊         | 973/12776 [07:23<1:16:43,  2.56it/s]                                                       8%|▊         | 973/12776 [07:23<1:16:43,  2.56it/s]  8%|▊         | 974/12776 [07:23<1:12:40,  2.71it/s]                                                       8%|▊         | 974/12776 [07:23<1:12:40,  2.71it/s]  8%|▊         | 975/12776 [07:24<1:11:11,  2.76it/s]                                                       8%|▊         | 975/12776 [07:24<1:11:11,  2.76it/s]  8%|▊         | 976/12776 [07:24<1:07:17,  2.92it/s]                                                       8%|▊         | 976/12776 [07:24<1:07:17,  2.92it/s]  8%|▊         | 977/12776 [07:24<1:04:08,  3.07it/s]                                                       8%|▊         | 977/12776 [07:24<1:04:08,  3.07it/s]  8%|▊         | 978/12776 [07:24<1:01:27,  3.20it/s]                                                       8%|▊         | 978/12776 [07:24<1:01:27,  3.20it/s]  8%|▊         | 979/12776 [07:25<1:07:37,  2.91it/s]                                                       8%|▊         | 979/12776 [07:25<1:07:37,  2.91it/s]  8%|▊         | 980/12776 [07:25<1:03:14,  3.11it/s]                                                       8%|▊         | 980/12776 [07:25<1:03:14,  3.11it/s]  8%|▊         | 981/12776 [07:25<58:59,  3.33it/s]                                                       8%|▊         | 981/12776 [07:25<58:59,  3.33it/s]  8%|▊         | 982/12776 [07:26<56:09,  3.50it/s]                                                     8%|▊         | 982/12776 [07:26<56:09,  3.50it/s]  8%|▊         | 983/12776 [07:26<58:33,  3.36it/s]                                                     8%|▊         | 983/12776 [07:26<58:33,  3.36it/s]  8%|▊         | 984/12776 [07:26<55:33,  3.54it/s]                                                     8%|▊         | 984/12776 [07:26<55:33,  3.54it/s]  8%|▊         | 985/12776 [07:26<53:06,  3.70it/s]                                                     8%|▊         | 985/12776 [07:26<53:06,  3.70it/s]  8%|▊         | 986/12776 [07:27<51:03,  3.85it/s]                                                     8%|▊         | 986/12776 [07:27<51:03,  3.85it/s]  8%|▊         | 987/12776 [07:27<52:14,  3.76it/s]                                                     8%|▊         | 987/12776 [07:27<52:14,  3.76it/s]  8%|▊         | 988/12776 [07:27<50:26,  3.89it/s]                                                     8%|▊         | 988/12776 [07:27<50:26,  3.89it/s]  8%|▊         | 989/12776 [07:27<47:53,  4.10it/s]                                                     8%|▊         | 989/12776 [07:27<47:53,  4.10it/s]  8%|▊         | 990/12776 [07:28<45:51,  4.28it/s]                                                     8%|▊         | 990/12776 [07:28<45:51,  4.28it/s]  8%|▊         | 991/12776 [07:28<44:27,  4.42it/s]                                                     8%|▊         | 991/12776 [07:28<44:27,  4.42it/s]  8%|▊         | 992/12776 [07:28<46:33,  4.22it/s]                                                     8%|▊         | 992/12776 [07:28<46:33,  4.22it/s]  8%|▊         | 993/12776 [07:28<44:31,  4.41it/s]                                                     8%|▊         | 993/12776 [07:28<44:31,  4.41it/s]  8%|▊         | 994/12776 [07:28<42:54,  4.58it/s]                                                     8%|▊         | 994/12776 [07:28<42:54,  4.58it/s]  8%|▊         | 995/12776 [07:29<41:33,  4.72it/s]                                                     8%|▊         | 995/12776 [07:29<41:33,  4.72it/s]  8%|▊         | 996/12776 [07:29<40:27,  4.85it/s]                                                     8%|▊         | 996/12776 [07:29<40:27,  4.85it/s]  8%|▊         | 997/12776 [07:29<39:34,  4.96it/s]                                                     8%|▊         | 997/12776 [07:29<39:34,  4.96it/s]  8%|▊         | 998/12776 [07:29<44:58,  4.37it/s]                                                     8%|▊         | 998/12776 [07:29<44:58,  4.37it/s]  8%|▊         | 999/12776 [07:30<42:18,  4.64it/s]                                                     8%|▊         | 999/12776 [07:30<42:18,  4.64it/s]  8%|▊         | 1000/12776 [07:30<1:09:48,  2.81it/s]                                                        8%|▊         | 1000/12776 [07:30<1:09:48,  2.81it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 1.0965, 'grad_norm': 1.2844789028167725, 'learning_rate': 0.00028880742913000977, 'epoch': 0.15}
+{'loss': 0.8194, 'grad_norm': 1.1468544006347656, 'learning_rate': 0.000288782991202346, 'epoch': 0.15}
+{'loss': 0.5367, 'grad_norm': 1.3954484462738037, 'learning_rate': 0.00028875855327468227, 'epoch': 0.15}
+{'loss': 0.7351, 'grad_norm': 1.3916863203048706, 'learning_rate': 0.0002887341153470185, 'epoch': 0.15}
+{'loss': 1.0045, 'grad_norm': 1.7426979541778564, 'learning_rate': 0.0002887096774193548, 'epoch': 0.15}
+{'loss': 0.889, 'grad_norm': 1.407542109489441, 'learning_rate': 0.0002886852394916911, 'epoch': 0.15}
+{'loss': 0.9749, 'grad_norm': 1.1983486413955688, 'learning_rate': 0.00028866080156402733, 'epoch': 0.15}
+{'loss': 0.8094, 'grad_norm': 2.5818674564361572, 'learning_rate': 0.00028863636363636363, 'epoch': 0.15}
+{'loss': 1.3122, 'grad_norm': 1.2199925184249878, 'learning_rate': 0.0002886119257086999, 'epoch': 0.15}
+{'loss': 0.9118, 'grad_norm': 1.924811840057373, 'learning_rate': 0.00028858748778103614, 'epoch': 0.15}
+{'loss': 0.9576, 'grad_norm': 1.6180088520050049, 'learning_rate': 0.00028856304985337244, 'epoch': 0.15}
+{'loss': 1.1137, 'grad_norm': 2.2538881301879883, 'learning_rate': 0.00028853861192570864, 'epoch': 0.15}
+{'loss': 1.6183, 'grad_norm': 3.2477221488952637, 'learning_rate': 0.00028851417399804494, 'epoch': 0.15}
+{'loss': 0.978, 'grad_norm': 2.37235426902771, 'learning_rate': 0.0002884897360703812, 'epoch': 0.15}
+{'loss': 1.0761, 'grad_norm': 2.178849697113037, 'learning_rate': 0.00028846529814271745, 'epoch': 0.15}
+{'loss': 1.2112, 'grad_norm': 2.8794000148773193, 'learning_rate': 0.00028844086021505375, 'epoch': 0.15}
+{'loss': 1.4875, 'grad_norm': 2.8864026069641113, 'learning_rate': 0.00028841642228739, 'epoch': 0.15}
+{'loss': 1.2043, 'grad_norm': 2.0675783157348633, 'learning_rate': 0.00028839198435972626, 'epoch': 0.15}
+{'loss': 1.2546, 'grad_norm': 1.339497685432434, 'learning_rate': 0.00028836754643206256, 'epoch': 0.15}
+{'loss': 1.3646, 'grad_norm': 2.3659565448760986, 'learning_rate': 0.0002883431085043988, 'epoch': 0.15}
+{'loss': 1.3132, 'grad_norm': 2.711576461791992, 'learning_rate': 0.00028831867057673506, 'epoch': 0.15}
+{'loss': 1.5859, 'grad_norm': 2.318260431289673, 'learning_rate': 0.0002882942326490713, 'epoch': 0.15}
+{'loss': 1.1586, 'grad_norm': 1.689608097076416, 'learning_rate': 0.0002882697947214076, 'epoch': 0.15}
+{'loss': 1.1056, 'grad_norm': 2.5589444637298584, 'learning_rate': 0.00028824535679374387, 'epoch': 0.15}
+{'loss': 1.7521, 'grad_norm': 2.314589262008667, 'learning_rate': 0.0002882209188660801, 'epoch': 0.15}
+{'loss': 1.7522, 'grad_norm': 3.8083949089050293, 'learning_rate': 0.00028819648093841643, 'epoch': 0.15}
+{'loss': 1.1393, 'grad_norm': 2.452573537826538, 'learning_rate': 0.0002881720430107526, 'epoch': 0.15}
+{'loss': 2.0492, 'grad_norm': 2.592909336090088, 'learning_rate': 0.00028814760508308893, 'epoch': 0.15}
+{'loss': 1.3064, 'grad_norm': 2.1113009452819824, 'learning_rate': 0.0002881231671554252, 'epoch': 0.16}
+{'loss': 1.1432, 'grad_norm': 2.0039310455322266, 'learning_rate': 0.00028809872922776143, 'epoch': 0.16}
+{'loss': 1.5518, 'grad_norm': 3.2623660564422607, 'learning_rate': 0.00028807429130009774, 'epoch': 0.16}
+{'loss': 1.2908, 'grad_norm': 1.7344200611114502, 'learning_rate': 0.000288049853372434, 'epoch': 0.16}
+{'loss': 1.8197, 'grad_norm': 3.0457141399383545, 'learning_rate': 0.00028802541544477024, 'epoch': 0.16}
+{'loss': 1.2153, 'grad_norm': 3.5465786457061768, 'learning_rate': 0.00028800097751710655, 'epoch': 0.16}
+{'loss': 1.3038, 'grad_norm': 2.4094431400299072, 'learning_rate': 0.0002879765395894428, 'epoch': 0.16}
+{'loss': 1.4192, 'grad_norm': 4.36583948135376, 'learning_rate': 0.00028795210166177905, 'epoch': 0.16}
+{'loss': 1.0493, 'grad_norm': 2.3406410217285156, 'learning_rate': 0.0002879276637341153, 'epoch': 0.16}
+{'loss': 1.0919, 'grad_norm': 1.9417458772659302, 'learning_rate': 0.0002879032258064516, 'epoch': 0.16}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:07,  6.04it/s][A
+  0%|          | 3/774 [00:00<02:50,  4.51it/s][A
+  1%|          | 4/774 [00:00<03:19,  3.87it/s][A
+  1%|          | 5/774 [00:01<03:21,  3.82it/s][A
+  1%|          | 6/774 [00:01<03:33,  3.59it/s][A
+  1%|          | 7/774 [00:01<03:29,  3.65it/s][A
+  1%|          | 8/774 [00:02<03:30,  3.64it/s][A
+  1%|          | 9/774 [00:02<03:20,  3.82it/s][A
+  1%|▏         | 10/774 [00:02<03:22,  3.78it/s][A
+  1%|▏         | 11/774 [00:02<03:34,  3.55it/s][A
+  2%|▏         | 12/774 [00:03<03:22,  3.77it/s][A
+  2%|▏         | 13/774 [00:03<03:14,  3.92it/s][A
+  2%|▏         | 14/774 [00:03<03:25,  3.69it/s][A
+  2%|▏         | 15/774 [00:04<03:46,  3.36it/s][A
+  2%|▏         | 16/774 [00:04<03:43,  3.39it/s][A
+  2%|▏         | 17/774 [00:04<03:19,  3.79it/s][A
+  2%|▏         | 18/774 [00:04<03:11,  3.94it/s][A
+  2%|▏         | 19/774 [00:05<03:21,  3.75it/s][A
+  3%|▎         | 20/774 [00:05<03:19,  3.79it/s][A
+  3%|▎         | 21/774 [00:05<03:20,  3.75it/s][A
+  3%|▎         | 22/774 [00:05<03:24,  3.67it/s][A
+  3%|▎         | 23/774 [00:06<03:37,  3.46it/s][A
+  3%|▎         | 24/774 [00:06<03:33,  3.52it/s][A
+  3%|▎         | 25/774 [00:06<03:35,  3.48it/s][A
+  3%|▎         | 26/774 [00:07<03:34,  3.49it/s][A
+  3%|▎         | 27/774 [00:07<03:32,  3.52it/s][A
+  4%|▎         | 28/774 [00:07<03:39,  3.40it/s][A
+  4%|▎         | 29/774 [00:07<03:41,  3.36it/s][A
+  4%|▍         | 30/774 [00:08<03:29,  3.56it/s][A
+  4%|▍         | 31/774 [00:08<03:29,  3.54it/s][A
+  4%|▍         | 32/774 [00:08<04:00,  3.09it/s][A
+  4%|▍         | 33/774 [00:09<03:47,  3.25it/s][A
+  4%|▍         | 34/774 [00:09<03:33,  3.46it/s][A
+  5%|▍         | 35/774 [00:09<03:44,  3.29it/s][A
+  5%|▍         | 36/774 [00:10<03:46,  3.26it/s][A
+  5%|▍         | 37/774 [00:10<03:45,  3.26it/s][A
+  5%|▍         | 38/774 [00:10<03:37,  3.38it/s][A
+  5%|▌         | 39/774 [00:10<03:24,  3.59it/s][A
+  5%|▌         | 40/774 [00:11<03:29,  3.51it/s][A
+  5%|▌         | 41/774 [00:11<03:25,  3.57it/s][A
+  5%|▌         | 42/774 [00:11<03:12,  3.80it/s][A
+  6%|▌         | 43/774 [00:11<03:25,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:26,  3.53it/s][A
+  6%|▌         | 45/774 [00:12<03:15,  3.72it/s][A
+  6%|▌         | 46/774 [00:12<03:03,  3.98it/s][A
+  6%|▌         | 47/774 [00:12<02:49,  4.29it/s][A
+  6%|▌         | 48/774 [00:13<02:49,  4.28it/s][A
+  6%|▋         | 49/774 [00:13<02:49,  4.27it/s][A
+  6%|▋         | 50/774 [00:13<02:51,  4.22it/s][A
+  7%|▋         | 51/774 [00:13<02:54,  4.15it/s][A
+  7%|▋         | 52/774 [00:14<02:54,  4.13it/s][A
+  7%|▋         | 53/774 [00:14<03:05,  3.89it/s][A
+  7%|▋         | 54/774 [00:14<03:08,  3.82it/s][A
+  7%|▋         | 55/774 [00:14<03:15,  3.67it/s][A
+  7%|▋         | 56/774 [00:15<03:17,  3.63it/s][A
+  7%|▋         | 57/774 [00:15<03:24,  3.50it/s][A
+  7%|▋         | 58/774 [00:15<03:25,  3.49it/s][A
+  8%|▊         | 59/774 [00:16<03:06,  3.83it/s][A
+  8%|▊         | 60/774 [00:16<02:52,  4.14it/s][A
+  8%|▊         | 61/774 [00:16<02:30,  4.72it/s][A
+  8%|▊         | 62/774 [00:16<02:28,  4.80it/s][A
+  8%|▊         | 63/774 [00:16<02:55,  4.04it/s][A
+  8%|▊         | 64/774 [00:17<02:45,  4.28it/s][A
+  8%|▊         | 65/774 [00:17<02:45,  4.28it/s][A
+  9%|▊         | 66/774 [00:17<02:44,  4.31it/s][A
+  9%|▊         | 67/774 [00:17<02:37,  4.48it/s][A
+  9%|▉         | 68/774 [00:18<02:35,  4.54it/s][A
+  9%|▉         | 69/774 [00:18<02:26,  4.83it/s][A
+  9%|▉         | 70/774 [00:18<02:33,  4.60it/s][A
+  9%|▉         | 71/774 [00:18<02:28,  4.75it/s][A
+  9%|▉         | 72/774 [00:18<02:37,  4.46it/s][A
+  9%|▉         | 73/774 [00:19<02:50,  4.12it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.93it/s][A
+ 10%|▉         | 75/774 [00:19<03:03,  3.80it/s][A
+ 10%|▉         | 76/774 [00:19<02:58,  3.91it/s][A
+ 10%|▉         | 77/774 [00:20<03:14,  3.59it/s][A
+ 10%|█         | 78/774 [00:20<02:56,  3.95it/s][A
+ 10%|█         | 79/774 [00:20<02:43,  4.25it/s][A
+ 10%|█         | 80/774 [00:20<02:38,  4.38it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.03it/s][A
+ 11%|█         | 82/774 [00:21<02:18,  4.99it/s][A
+ 11%|█         | 83/774 [00:21<02:21,  4.89it/s][A
+ 11%|█         | 84/774 [00:21<02:28,  4.65it/s][A
+ 11%|█         | 85/774 [00:21<02:36,  4.40it/s][A
+ 11%|█         | 86/774 [00:22<02:41,  4.26it/s][A
+ 11%|█         | 87/774 [00:22<02:41,  4.24it/s][A
+ 11%|█▏        | 88/774 [00:22<02:30,  4.54it/s][A
+ 11%|█▏        | 89/774 [00:22<02:24,  4.74it/s][A
+ 12%|█▏        | 90/774 [00:23<02:30,  4.54it/s][A
+ 12%|█▏        | 91/774 [00:23<02:44,  4.16it/s][A
+ 12%|█▏        | 92/774 [00:23<03:00,  3.78it/s][A
+ 12%|█▏        | 93/774 [00:23<02:55,  3.87it/s][A
+ 12%|█▏        | 94/774 [00:24<03:00,  3.76it/s][A
+ 12%|█▏        | 95/774 [00:24<02:58,  3.81it/s][A
+ 12%|█▏        | 96/774 [00:24<02:54,  3.89it/s][A
+ 13%|█▎        | 97/774 [00:24<02:40,  4.23it/s][A
+ 13%|█▎        | 98/774 [00:25<02:32,  4.43it/s][A
+ 13%|█▎        | 99/774 [00:25<02:44,  4.09it/s][A
+ 13%|█▎        | 100/774 [00:25<02:55,  3.85it/s][A
+ 13%|█▎        | 101/774 [00:25<02:59,  3.76it/s][A
+ 13%|█▎        | 102/774 [00:26<03:11,  3.51it/s][A
+ 13%|█▎        | 103/774 [00:26<03:12,  3.49it/s][A
+ 13%|█▎        | 104/774 [00:26<03:10,  3.51it/s][A
+ 14%|█▎        | 105/774 [00:27<03:11,  3.49it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.11it/s][A
+ 14%|█▍        | 107/774 [00:27<03:46,  2.95it/s][A
+ 14%|█▍        | 108/774 [00:28<03:38,  3.05it/s][A
+ 14%|█▍        | 109/774 [00:28<03:35,  3.08it/s][A
+ 14%|█▍        | 110/774 [00:28<03:24,  3.24it/s][A
+ 14%|█▍        | 111/774 [00:29<03:24,  3.24it/s][A
+ 14%|█▍        | 112/774 [00:29<03:14,  3.41it/s][A
+ 15%|█▍        | 113/774 [00:29<03:16,  3.36it/s][A
+ 15%|█▍        | 114/774 [00:29<03:22,  3.27it/s][A
+ 15%|█▍        | 115/774 [00:30<03:16,  3.35it/s][A
+ 15%|█▍        | 116/774 [00:30<03:01,  3.63it/s][A
+ 15%|█▌        | 117/774 [00:30<03:05,  3.54it/s][A
+ 15%|█▌        | 118/774 [00:31<03:05,  3.53it/s][A
+ 15%|█▌        | 119/774 [00:31<02:57,  3.69it/s][A
+ 16%|█▌        | 120/774 [00:31<03:13,  3.38it/s][A
+ 16%|█▌        | 121/774 [00:31<03:07,  3.47it/s][A
+ 16%|█▌        | 122/774 [00:32<03:08,  3.47it/s][A
+ 16%|█▌        | 123/774 [00:32<02:57,  3.68it/s][A
+ 16%|█▌        | 124/774 [00:32<02:57,  3.67it/s][A
+ 16%|█▌        | 125/774 [00:33<03:01,  3.57it/s][A
+ 16%|█▋        | 126/774 [00:33<03:10,  3.40it/s][A
+ 16%|█▋        | 127/774 [00:33<03:20,  3.23it/s][A
+ 17%|█▋        | 128/774 [00:33<03:12,  3.35it/s][A
+ 17%|█▋        | 129/774 [00:34<03:15,  3.30it/s][A
+ 17%|█▋        | 130/774 [00:34<03:23,  3.17it/s][A
+ 17%|█▋        | 131/774 [00:34<03:11,  3.36it/s][A
+ 17%|█▋        | 132/774 [00:35<03:12,  3.34it/s][A
+ 17%|█▋        | 133/774 [00:35<03:09,  3.38it/s][A
+ 17%|█▋        | 134/774 [00:35<03:08,  3.40it/s][A
+ 17%|█▋        | 135/774 [00:36<03:25,  3.11it/s][A
+ 18%|█▊        | 136/774 [00:36<03:33,  2.99it/s][A
+ 18%|█▊        | 137/774 [00:36<03:33,  2.99it/s][A
+ 18%|█▊        | 138/774 [00:37<03:28,  3.04it/s][A
+ 18%|█▊        | 139/774 [00:37<03:28,  3.05it/s][A
+ 18%|█▊        | 140/774 [00:37<03:22,  3.14it/s][A
+ 18%|█▊        | 141/774 [00:38<03:12,  3.28it/s][A
+ 18%|█▊        | 142/774 [00:38<03:23,  3.10it/s][A
+ 18%|█▊        | 143/774 [00:38<03:18,  3.17it/s][A
+ 19%|█▊        | 144/774 [00:39<03:09,  3.33it/s][A
+ 19%|█▊        | 145/774 [00:39<03:00,  3.49it/s][A
+ 19%|█▉        | 146/774 [00:39<02:50,  3.69it/s][A
+ 19%|█▉        | 147/774 [00:39<02:41,  3.87it/s][A
+ 19%|█▉        | 148/774 [00:40<02:50,  3.67it/s][A
+ 19%|█▉        | 149/774 [00:40<03:04,  3.38it/s][A
+ 19%|█▉        | 150/774 [00:40<03:07,  3.33it/s][A
+ 20%|█▉        | 151/774 [00:40<02:55,  3.54it/s][A
+ 20%|█▉        | 152/774 [00:41<02:48,  3.69it/s][A
+ 20%|█▉        | 153/774 [00:41<02:55,  3.53it/s][A
+ 20%|█▉        | 154/774 [00:41<02:48,  3.67it/s][A
+ 20%|██        | 155/774 [00:41<02:45,  3.73it/s][A
+ 20%|██        | 156/774 [00:42<02:42,  3.79it/s][A
+ 20%|██        | 157/774 [00:42<02:35,  3.97it/s][A
+ 20%|██        | 158/774 [00:42<02:44,  3.75it/s][A
+ 21%|██        | 159/774 [00:43<02:47,  3.66it/s][A
+ 21%|██        | 160/774 [00:43<02:40,  3.82it/s][A
+ 21%|██        | 161/774 [00:43<02:48,  3.64it/s][A
+ 21%|██        | 162/774 [00:43<02:56,  3.47it/s][A
+ 21%|██        | 163/774 [00:44<02:54,  3.49it/s][A
+ 21%|██        | 164/774 [00:44<02:50,  3.57it/s][A
+ 21%|██▏       | 165/774 [00:44<02:45,  3.69it/s][A
+ 21%|██▏       | 166/774 [00:45<02:47,  3.63it/s][A
+ 22%|██▏       | 167/774 [00:45<02:48,  3.60it/s][A
+ 22%|██▏       | 168/774 [00:45<02:39,  3.80it/s][A
+ 22%|██▏       | 169/774 [00:45<02:36,  3.86it/s][A
+ 22%|██▏       | 170/774 [00:46<02:41,  3.73it/s][A
+ 22%|██▏       | 171/774 [00:46<02:49,  3.56it/s][A
+ 22%|██▏       | 172/774 [00:46<02:56,  3.40it/s][A
+ 22%|██▏       | 173/774 [00:46<02:50,  3.51it/s][A
+ 22%|██▏       | 174/774 [00:47<02:41,  3.71it/s][A
+ 23%|██▎       | 175/774 [00:47<02:43,  3.67it/s][A
+ 23%|██▎       | 176/774 [00:47<02:34,  3.87it/s][A
+ 23%|██▎       | 177/774 [00:48<02:49,  3.53it/s][A
+ 23%|██▎       | 178/774 [00:48<02:33,  3.89it/s][A
+ 23%|██▎       | 179/774 [00:48<02:20,  4.25it/s][A
+ 23%|██▎       | 180/774 [00:48<02:14,  4.42it/s][A
+ 23%|██▎       | 181/774 [00:48<02:18,  4.29it/s][A
+ 24%|██▎       | 182/774 [00:49<02:20,  4.22it/s][A
+ 24%|██▎       | 183/774 [00:49<02:20,  4.20it/s][A
+ 24%|██▍       | 184/774 [00:49<02:31,  3.88it/s][A
+ 24%|██▍       | 185/774 [00:49<02:39,  3.70it/s][A
+ 24%|██▍       | 186/774 [00:50<02:36,  3.76it/s][A
+ 24%|██▍       | 187/774 [00:50<02:31,  3.87it/s][A
+ 24%|██▍       | 188/774 [00:50<02:30,  3.91it/s][A
+ 24%|██▍       | 189/774 [00:50<02:29,  3.91it/s][A
+ 25%|██▍       | 190/774 [00:51<02:24,  4.04it/s][A
+ 25%|██▍       | 191/774 [00:51<02:29,  3.91it/s][A
+ 25%|██▍       | 192/774 [00:51<02:35,  3.74it/s][A
+ 25%|██▍       | 193/774 [00:52<02:37,  3.69it/s][A
+ 25%|██▌       | 194/774 [00:52<02:49,  3.41it/s][A
+ 25%|██▌       | 195/774 [00:52<02:59,  3.23it/s][A
+ 25%|██▌       | 196/774 [00:53<03:00,  3.20it/s][A
+ 25%|██▌       | 197/774 [00:53<02:58,  3.24it/s][A
+ 26%|██▌       | 198/774 [00:53<02:49,  3.39it/s][A
+ 26%|██▌       | 199/774 [00:53<02:50,  3.36it/s][A
+ 26%|██▌       | 200/774 [00:54<02:44,  3.50it/s][A
+ 26%|██▌       | 201/774 [00:54<02:39,  3.59it/s][A
+ 26%|██▌       | 202/774 [00:54<02:36,  3.66it/s][A
+ 26%|██▌       | 203/774 [00:54<02:27,  3.86it/s][A
+ 26%|██▋       | 204/774 [00:55<02:30,  3.78it/s][A
+ 26%|██▋       | 205/774 [00:55<02:40,  3.54it/s][A
+ 27%|██▋       | 206/774 [00:55<02:35,  3.65it/s][A
+ 27%|██▋       | 207/774 [00:56<02:35,  3.65it/s][A
+ 27%|██▋       | 208/774 [00:56<02:35,  3.64it/s][A
+ 27%|██▋       | 209/774 [00:56<02:32,  3.69it/s][A
+ 27%|██▋       | 210/774 [00:56<02:31,  3.73it/s][A
+ 27%|██▋       | 211/774 [00:57<02:27,  3.81it/s][A
+ 27%|██▋       | 212/774 [00:57<02:17,  4.08it/s][A
+ 28%|██▊       | 213/774 [00:57<02:02,  4.60it/s][A
+ 28%|██▊       | 214/774 [00:57<02:05,  4.47it/s][A
+ 28%|██▊       | 215/774 [00:57<02:04,  4.49it/s][A
+ 28%|██▊       | 216/774 [00:58<02:01,  4.58it/s][A
+ 28%|██▊       | 217/774 [00:58<02:06,  4.39it/s][A
+ 28%|██▊       | 218/774 [00:58<02:11,  4.22it/s][A
+ 28%|██▊       | 219/774 [00:58<02:21,  3.92it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.99it/s][A
+ 29%|██▊       | 221/774 [00:59<02:24,  3.82it/s][A
+ 29%|██▊       | 222/774 [00:59<02:34,  3.57it/s][A
+ 29%|██▉       | 223/774 [01:00<02:52,  3.20it/s][A
+ 29%|██▉       | 224/774 [01:00<03:00,  3.04it/s][A
+ 29%|██▉       | 225/774 [01:00<03:13,  2.83it/s][A
+ 29%|██▉       | 226/774 [01:01<03:18,  2.76it/s][A
+ 29%|██▉       | 227/774 [01:01<03:17,  2.77it/s][A
+ 29%|██▉       | 228/774 [01:02<03:08,  2.89it/s][A
+ 30%|██▉       | 229/774 [01:02<03:23,  2.68it/s][A
+ 30%|██▉       | 230/774 [01:02<03:06,  2.92it/s][A
+ 30%|██▉       | 231/774 [01:03<03:04,  2.95it/s][A
+ 30%|██▉       | 232/774 [01:03<02:55,  3.10it/s][A
+ 30%|███       | 233/774 [01:03<03:17,  2.74it/s][A
+ 30%|███       | 234/774 [01:04<03:23,  2.65it/s][A
+ 30%|███       | 235/774 [01:04<03:21,  2.68it/s][A
+ 30%|███       | 236/774 [01:04<03:24,  2.63it/s][A
+ 31%|███       | 237/774 [01:05<03:20,  2.68it/s][A
+ 31%|███       | 238/774 [01:05<03:06,  2.87it/s][A
+ 31%|███       | 239/774 [01:05<03:06,  2.86it/s][A
+ 31%|███       | 240/774 [01:06<03:05,  2.89it/s][A
+ 31%|███       | 241/774 [01:06<03:06,  2.86it/s][A
+ 31%|███▏      | 242/774 [01:07<03:15,  2.72it/s][A
+ 31%|███▏      | 243/774 [01:07<03:29,  2.54it/s][A
+ 32%|███▏      | 244/774 [01:07<03:21,  2.64it/s][A
+ 32%|███▏      | 245/774 [01:08<03:09,  2.80it/s][A
+ 32%|███▏      | 246/774 [01:08<03:04,  2.86it/s][A
+ 32%|███▏      | 247/774 [01:09<03:45,  2.34it/s][A
+ 32%|███▏      | 248/774 [01:09<03:51,  2.27it/s][A
+ 32%|███▏      | 249/774 [01:09<03:28,  2.52it/s][A
+ 32%|███▏      | 250/774 [01:10<03:21,  2.60it/s][A
+ 32%|███▏      | 251/774 [01:10<03:19,  2.62it/s][A
+ 33%|███▎      | 252/774 [01:10<03:13,  2.69it/s][A
+ 33%|███▎      | 253/774 [01:11<03:13,  2.70it/s][A
+ 33%|███▎      | 254/774 [01:11<03:07,  2.77it/s][A
+ 33%|███▎      | 255/774 [01:11<03:01,  2.86it/s][A
+ 33%|███▎      | 256/774 [01:12<02:56,  2.93it/s][A
+ 33%|███▎      | 257/774 [01:12<02:56,  2.93it/s][A
+ 33%|███▎      | 258/774 [01:12<02:40,  3.22it/s][A
+ 33%|███▎      | 259/774 [01:13<02:22,  3.63it/s][A
+ 34%|███▎      | 260/774 [01:13<02:21,  3.64it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.46it/s][A
+ 34%|███▍      | 262/774 [01:13<02:14,  3.81it/s][A
+ 34%|███▍      | 263/774 [01:14<02:06,  4.04it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.77it/s][A
+ 34%|███▍      | 265/774 [01:14<02:10,  3.91it/s][A
+ 34%|███▍      | 266/774 [01:14<02:02,  4.14it/s][A
+ 34%|███▍      | 267/774 [01:15<01:59,  4.24it/s][A
+ 35%|███▍      | 268/774 [01:15<02:05,  4.04it/s][A
+ 35%|███▍      | 269/774 [01:15<02:11,  3.84it/s][A
+ 35%|███▍      | 270/774 [01:15<02:16,  3.68it/s][A
+ 35%|███▌      | 271/774 [01:16<02:14,  3.73it/s][A
+ 35%|███▌      | 272/774 [01:16<02:04,  4.04it/s][A
+ 35%|███▌      | 273/774 [01:16<01:59,  4.18it/s][A
+ 35%|███▌      | 274/774 [01:16<02:03,  4.04it/s][A
+ 36%|███▌      | 275/774 [01:17<01:56,  4.28it/s][A
+ 36%|███▌      | 276/774 [01:17<01:50,  4.50it/s][A
+ 36%|███▌      | 277/774 [01:17<01:53,  4.38it/s][A
+ 36%|███▌      | 278/774 [01:17<01:54,  4.34it/s][A
+ 36%|███▌      | 279/774 [01:17<01:49,  4.51it/s][A
+ 36%|███▌      | 280/774 [01:18<01:51,  4.43it/s][A
+ 36%|███▋      | 281/774 [01:18<02:03,  4.00it/s][A
+ 36%|███▋      | 282/774 [01:18<02:15,  3.64it/s][A
+ 37%|███▋      | 283/774 [01:19<02:10,  3.76it/s][A
+ 37%|███▋      | 284/774 [01:19<02:11,  3.73it/s][A
+ 37%|███▋      | 285/774 [01:19<02:05,  3.89it/s][A
+ 37%|███▋      | 286/774 [01:19<02:01,  4.02it/s][A
+ 37%|███▋      | 287/774 [01:20<02:13,  3.66it/s][A
+ 37%|███▋      | 288/774 [01:20<02:16,  3.57it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.63it/s][A
+ 37%|███▋      | 290/774 [01:20<02:08,  3.76it/s][A
+ 38%|███▊      | 291/774 [01:21<02:08,  3.76it/s][A
+ 38%|███▊      | 292/774 [01:21<02:04,  3.88it/s][A
+ 38%|███▊      | 293/774 [01:21<01:53,  4.25it/s][A
+ 38%|███▊      | 294/774 [01:21<01:49,  4.38it/s][A
+ 38%|███▊      | 295/774 [01:22<01:47,  4.45it/s][A
+ 38%|███▊      | 296/774 [01:22<01:43,  4.64it/s][A
+ 38%|███▊      | 297/774 [01:22<01:37,  4.90it/s][A
+ 39%|███▊      | 298/774 [01:22<01:42,  4.64it/s][A
+ 39%|███▊      | 299/774 [01:22<01:46,  4.47it/s][A
+ 39%|███▉      | 300/774 [01:23<01:53,  4.16it/s][A
+ 39%|███▉      | 301/774 [01:23<01:47,  4.40it/s][A
+ 39%|███▉      | 302/774 [01:23<01:41,  4.64it/s][A
+ 39%|███▉      | 303/774 [01:23<01:38,  4.76it/s][A
+ 39%|███▉      | 304/774 [01:23<01:27,  5.39it/s][A
+ 39%|███▉      | 305/774 [01:24<01:25,  5.47it/s][A
+ 40%|███▉      | 306/774 [01:24<01:35,  4.89it/s][A
+ 40%|███▉      | 307/774 [01:24<01:43,  4.51it/s][A
+ 40%|███▉      | 308/774 [01:24<01:37,  4.76it/s][A
+ 40%|███▉      | 309/774 [01:24<01:38,  4.70it/s][A
+ 40%|████      | 310/774 [01:25<01:43,  4.49it/s][A
+ 40%|████      | 311/774 [01:25<01:41,  4.57it/s][A
+ 40%|████      | 312/774 [01:25<01:38,  4.71it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.63it/s][A
+ 41%|████      | 314/774 [01:26<01:39,  4.61it/s][A
+ 41%|████      | 315/774 [01:26<01:50,  4.15it/s][A
+ 41%|████      | 316/774 [01:26<01:41,  4.53it/s][A
+ 41%|████      | 317/774 [01:26<01:34,  4.83it/s][A
+ 41%|████      | 318/774 [01:26<01:37,  4.69it/s][A
+ 41%|████      | 319/774 [01:27<01:39,  4.57it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.54it/s][A
+ 41%|████▏     | 321/774 [01:27<01:31,  4.94it/s][A
+ 42%|████▏     | 322/774 [01:27<01:26,  5.23it/s][A
+ 42%|████▏     | 323/774 [01:27<01:17,  5.81it/s][A
+ 42%|████▏     | 324/774 [01:28<01:24,  5.33it/s][A
+ 42%|████▏     | 325/774 [01:28<01:28,  5.06it/s][A
+ 42%|████▏     | 326/774 [01:28<01:25,  5.26it/s][A
+ 42%|████▏     | 327/774 [01:28<01:28,  5.02it/s][A
+ 42%|████▏     | 328/774 [01:28<01:27,  5.11it/s][A
+ 43%|████▎     | 329/774 [01:29<01:37,  4.54it/s][A
+ 43%|████▎     | 330/774 [01:29<01:32,  4.78it/s][A
+ 43%|████▎     | 331/774 [01:29<01:23,  5.32it/s][A
+ 43%|████▎     | 332/774 [01:29<01:21,  5.43it/s][A
+ 43%|████▎     | 333/774 [01:29<01:24,  5.22it/s][A
+ 43%|████▎     | 334/774 [01:30<01:27,  5.01it/s][A
+ 43%|████▎     | 335/774 [01:30<01:28,  4.96it/s][A
+ 43%|████▎     | 336/774 [01:30<01:27,  5.02it/s][A
+ 44%|████▎     | 337/774 [01:30<01:21,  5.39it/s][A
+ 44%|████▎     | 338/774 [01:30<01:15,  5.77it/s][A
+ 44%|████▍     | 339/774 [01:30<01:11,  6.12it/s][A
+ 44%|████▍     | 340/774 [01:31<01:10,  6.15it/s][A
+ 44%|████▍     | 341/774 [01:31<01:28,  4.91it/s][A
+ 44%|████▍     | 342/774 [01:31<01:36,  4.46it/s][A
+ 44%|████▍     | 343/774 [01:31<01:37,  4.42it/s][A
+ 44%|████▍     | 344/774 [01:32<01:41,  4.23it/s][A
+ 45%|████▍     | 345/774 [01:32<01:44,  4.09it/s][A
+ 45%|████▍     | 346/774 [01:32<01:46,  4.02it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.11it/s][A
+ 45%|████▍     | 348/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 349/774 [01:33<01:35,  4.44it/s][A
+ 45%|████▌     | 350/774 [01:33<01:37,  4.35it/s][A
+ 45%|████▌     | 351/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 352/774 [01:34<01:34,  4.47it/s][A
+ 46%|████▌     | 353/774 [01:34<01:33,  4.51it/s][A
+ 46%|████▌     | 354/774 [01:34<01:32,  4.54it/s][A
+ 46%|████▌     | 355/774 [01:34<01:37,  4.28it/s][A
+ 46%|████▌     | 356/774 [01:35<01:46,  3.94it/s][A
+ 46%|████▌     | 357/774 [01:35<02:02,  3.41it/s][A
+ 46%|████▋     | 358/774 [01:35<02:06,  3.29it/s][A
+ 46%|████▋     | 359/774 [01:36<02:05,  3.31it/s][A
+ 47%|████▋     | 360/774 [01:36<02:05,  3.31it/s][A
+ 47%|████▋     | 361/774 [01:36<01:56,  3.53it/s][A
+ 47%|████▋     | 362/774 [01:36<02:04,  3.31it/s][A
+ 47%|████▋     | 363/774 [01:37<02:02,  3.36it/s][A
+ 47%|████▋     | 364/774 [01:37<02:04,  3.29it/s][A
+ 47%|████▋     | 365/774 [01:37<02:00,  3.40it/s][A
+ 47%|████▋     | 366/774 [01:38<01:51,  3.64it/s][A
+ 47%|████▋     | 367/774 [01:38<01:47,  3.78it/s][A
+ 48%|████▊     | 368/774 [01:38<01:44,  3.90it/s][A
+ 48%|████▊     | 369/774 [01:38<01:50,  3.66it/s][A
+ 48%|████▊     | 370/774 [01:39<02:06,  3.20it/s][A
+ 48%|████▊     | 371/774 [01:39<01:57,  3.44it/s][A
+ 48%|████▊     | 372/774 [01:39<01:57,  3.42it/s][A
+ 48%|████▊     | 373/774 [01:40<01:56,  3.46it/s][A
+ 48%|████▊     | 374/774 [01:40<01:52,  3.57it/s][A
+ 48%|████▊     | 375/774 [01:40<01:53,  3.53it/s][A
+ 49%|████▊     | 376/774 [01:40<01:56,  3.43it/s][A
+ 49%|████▊     | 377/774 [01:41<02:09,  3.08it/s][A
+ 49%|████▉     | 378/774 [01:41<02:10,  3.04it/s][A
+ 49%|████▉     | 379/774 [01:41<02:01,  3.25it/s][A
+ 49%|████▉     | 380/774 [01:42<01:50,  3.55it/s][A
+ 49%|████▉     | 381/774 [01:42<01:42,  3.83it/s][A
+ 49%|████▉     | 382/774 [01:42<01:39,  3.95it/s][A
+ 49%|████▉     | 383/774 [01:42<01:37,  4.01it/s][A
+ 50%|████▉     | 384/774 [01:43<01:43,  3.75it/s][A
+ 50%|████▉     | 385/774 [01:43<01:53,  3.43it/s][A
+ 50%|████▉     | 386/774 [01:43<01:45,  3.68it/s][A
+ 50%|█████     | 387/774 [01:43<01:39,  3.88it/s][A
+ 50%|█████     | 388/774 [01:44<01:45,  3.67it/s][A
+ 50%|█████     | 389/774 [01:44<01:41,  3.80it/s][A
+ 50%|█████     | 390/774 [01:44<01:54,  3.37it/s][A
+ 51%|█████     | 391/774 [01:45<01:56,  3.30it/s][A
+ 51%|█████     | 392/774 [01:45<01:46,  3.58it/s][A
+ 51%|█████     | 393/774 [01:45<01:38,  3.89it/s][A
+ 51%|█████     | 394/774 [01:45<01:37,  3.91it/s][A
+ 51%|█████     | 395/774 [01:46<01:45,  3.59it/s][A
+ 51%|█████     | 396/774 [01:46<01:43,  3.67it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:44,  3.60it/s][A
+ 51%|█████▏    | 398/774 [01:46<01:41,  3.72it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:39,  3.77it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:31,  4.09it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:27,  4.26it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:27,  4.27it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:31,  4.05it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:38,  3.77it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:35,  3.88it/s][A
+ 52%|█████▏    | 406/774 [01:48<01:38,  3.72it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:44,  3.52it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:38,  3.72it/s][A
+ 53%|█████▎    | 409/774 [01:49<01:34,  3.86it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:36,  3.75it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:37,  3.73it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:38,  3.68it/s][A
+ 53%|█████▎    | 413/774 [01:50<01:35,  3.78it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:34,  3.81it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:24,  4.27it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:24,  4.24it/s][A
+ 54%|█████▍    | 417/774 [01:51<01:22,  4.34it/s][A
+ 54%|█████▍    | 418/774 [01:51<01:16,  4.66it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:31,  3.87it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:35,  3.72it/s][A
+ 54%|█████▍    | 421/774 [01:52<01:33,  3.76it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:32,  3.80it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:33,  3.77it/s][A
+ 55%|█████▍    | 424/774 [01:53<01:31,  3.82it/s][A
+ 55%|█████▍    | 425/774 [01:53<01:21,  4.28it/s][A
+ 55%|█████▌    | 426/774 [01:53<01:16,  4.54it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:12,  4.78it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:14,  4.63it/s][A
+ 55%|█████▌    | 429/774 [01:54<01:16,  4.53it/s][A
+ 56%|█████▌    | 430/774 [01:54<01:19,  4.32it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:33,  3.67it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:32,  3.68it/s][A
+ 56%|█████▌    | 433/774 [01:55<01:25,  3.99it/s][A
+ 56%|█████▌    | 434/774 [01:55<01:20,  4.22it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:19,  4.26it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:22,  4.12it/s][A
+ 56%|█████▋    | 437/774 [01:56<01:18,  4.32it/s][A
+ 57%|█████▋    | 438/774 [01:56<01:14,  4.50it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:17,  4.32it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:20,  4.14it/s][A
+ 57%|█████▋    | 441/774 [01:57<01:25,  3.91it/s][A
+ 57%|█████▋    | 442/774 [01:57<01:27,  3.80it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:24,  3.90it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:24,  3.91it/s][A
+ 57%|█████▋    | 445/774 [01:58<01:24,  3.91it/s][A
+ 58%|█████▊    | 446/774 [01:58<01:21,  4.01it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:20,  4.05it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:13,  4.41it/s][A
+ 58%|█████▊    | 449/774 [01:59<01:14,  4.34it/s][A
+ 58%|█████▊    | 450/774 [01:59<01:16,  4.21it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:14,  4.33it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:10,  4.55it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:10,  4.57it/s][A
+ 59%|█████▊    | 454/774 [02:00<01:16,  4.18it/s][A
+ 59%|█████▉    | 455/774 [02:00<01:20,  3.96it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:24,  3.75it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:18,  4.03it/s][A
+ 59%|█████▉    | 458/774 [02:01<01:18,  4.04it/s][A
+ 59%|█████▉    | 459/774 [02:01<01:16,  4.13it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:21,  3.87it/s][A
+ 60%|█████▉    | 461/774 [02:02<01:29,  3.52it/s][A
+ 60%|█████▉    | 462/774 [02:02<01:26,  3.59it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:23,  3.73it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:23,  3.70it/s][A
+ 60%|██████    | 465/774 [02:03<01:15,  4.10it/s][A
+ 60%|██████    | 466/774 [02:03<01:11,  4.28it/s][A
+ 60%|██████    | 467/774 [02:03<01:08,  4.50it/s][A
+ 60%|██████    | 468/774 [02:04<01:08,  4.48it/s][A
+ 61%|██████    | 469/774 [02:04<01:02,  4.89it/s][A
+ 61%|██████    | 470/774 [02:04<00:59,  5.10it/s][A
+ 61%|██████    | 471/774 [02:04<01:01,  4.92it/s][A
+ 61%|██████    | 472/774 [02:05<01:06,  4.55it/s][A
+ 61%|██████    | 473/774 [02:05<01:09,  4.31it/s][A
+ 61%|██████    | 474/774 [02:05<01:09,  4.35it/s][A
+ 61%|██████▏   | 475/774 [02:05<01:09,  4.30it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.83it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:32,  3.22it/s][A
+ 62%|██████▏   | 478/774 [02:06<01:33,  3.17it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:30,  3.28it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:25,  3.43it/s][A
+ 62%|██████▏   | 481/774 [02:07<01:27,  3.34it/s][A
+ 62%|██████▏   | 482/774 [02:07<01:26,  3.38it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:23,  3.48it/s][A
+ 63%|██████▎   | 484/774 [02:08<01:24,  3.41it/s][A
+ 63%|██████▎   | 485/774 [02:08<01:28,  3.28it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:24,  3.43it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:25,  3.35it/s][A
+ 63%|██████▎   | 488/774 [02:09<01:22,  3.46it/s][A
+ 63%|██████▎   | 489/774 [02:09<01:17,  3.68it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:16,  3.70it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:15,  3.76it/s][A
+ 64%|██████▎   | 492/774 [02:10<01:16,  3.66it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:17,  3.63it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:15,  3.69it/s][A
+ 64%|██████▍   | 495/774 [02:11<01:16,  3.65it/s][A
+ 64%|██████▍   | 496/774 [02:11<01:21,  3.41it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:22,  3.35it/s][A
+ 64%|██████▍   | 498/774 [02:12<01:20,  3.41it/s][A
+ 64%|██████▍   | 499/774 [02:12<01:19,  3.46it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:16,  3.56it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:14,  3.68it/s][A
+ 65%|██████▍   | 502/774 [02:13<01:14,  3.68it/s][A
+ 65%|██████▍   | 503/774 [02:13<01:21,  3.34it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:22,  3.28it/s][A
+ 65%|██████▌   | 505/774 [02:14<01:19,  3.36it/s][A
+ 65%|██████▌   | 506/774 [02:14<01:19,  3.37it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:24,  3.15it/s][A
+ 66%|██████▌   | 508/774 [02:15<01:22,  3.24it/s][A
+ 66%|██████▌   | 509/774 [02:15<01:20,  3.31it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:18,  3.38it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:13,  3.57it/s][A
+ 66%|██████▌   | 512/774 [02:16<01:12,  3.62it/s][A
+ 66%|██████▋   | 513/774 [02:16<01:14,  3.49it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:17<01:22,  3.14it/s][A
+ 67%|██████▋   | 516/774 [02:17<01:15,  3.41it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.67it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:07,  3.79it/s][A
+ 67%|██████▋   | 519/774 [02:18<01:09,  3.65it/s][A
+ 67%|██████▋   | 520/774 [02:18<01:09,  3.66it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.77it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:03,  3.99it/s][A
+ 68%|██████▊   | 523/774 [02:19<01:02,  3.99it/s][A
+ 68%|██████▊   | 524/774 [02:19<01:06,  3.74it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:07,  3.69it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:10,  3.53it/s][A
+ 68%|██████▊   | 527/774 [02:20<01:12,  3.42it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:11,  3.46it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:06,  3.68it/s][A
+ 68%|██████▊   | 530/774 [02:21<01:05,  3.74it/s][A
+ 69%|██████▊   | 531/774 [02:21<01:04,  3.78it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:02,  3.89it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.08it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:56,  4.27it/s][A
+ 69%|██████▉   | 535/774 [02:22<00:58,  4.08it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.90it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:01,  3.84it/s][A
+ 70%|██████▉   | 538/774 [02:23<01:05,  3.61it/s][A
+ 70%|██████▉   | 539/774 [02:23<01:04,  3.64it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.64it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:01,  3.77it/s][A
+ 70%|███████   | 542/774 [02:24<01:02,  3.70it/s][A
+ 70%|███████   | 543/774 [02:24<01:03,  3.62it/s][A
+ 70%|███████   | 544/774 [02:25<01:04,  3.57it/s][A
+ 70%|███████   | 545/774 [02:25<01:01,  3.72it/s][A
+ 71%|███████   | 546/774 [02:25<00:57,  3.94it/s][A
+ 71%|███████   | 547/774 [02:25<00:54,  4.16it/s][A
+ 71%|███████   | 548/774 [02:26<00:53,  4.21it/s][A
+ 71%|███████   | 549/774 [02:26<00:55,  4.06it/s][A
+ 71%|███████   | 550/774 [02:26<00:58,  3.83it/s][A
+ 71%|███████   | 551/774 [02:27<01:01,  3.64it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:03,  3.49it/s][A
+ 71%|███████▏  | 553/774 [02:27<01:08,  3.24it/s][A
+ 72%|███████▏  | 554/774 [02:27<01:07,  3.28it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:06,  3.32it/s][A
+ 72%|███████▏  | 556/774 [02:28<01:02,  3.49it/s][A
+ 72%|███████▏  | 557/774 [02:28<01:07,  3.24it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:01,  3.54it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:57,  3.76it/s][A
+ 72%|███████▏  | 560/774 [02:29<01:01,  3.48it/s][A
+ 72%|███████▏  | 561/774 [02:29<00:57,  3.71it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.02it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:50,  4.15it/s][A
+ 73%|███████▎  | 564/774 [02:30<00:52,  4.02it/s][A
+ 73%|███████▎  | 565/774 [02:30<00:53,  3.89it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:49,  4.18it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:45,  4.52it/s][A
+ 73%|███████▎  | 568/774 [02:31<00:48,  4.29it/s][A
+ 74%|███████▎  | 569/774 [02:31<00:48,  4.26it/s][A
+ 74%|███████▎  | 570/774 [02:31<00:48,  4.23it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:51,  3.92it/s][A
+ 74%|███████▍  | 572/774 [02:32<00:54,  3.74it/s][A
+ 74%|███████▍  | 573/774 [02:32<00:53,  3.74it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:51,  3.88it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:50,  3.94it/s][A
+ 74%|███████▍  | 576/774 [02:33<00:56,  3.51it/s][A
+ 75%|███████▍  | 577/774 [02:33<00:54,  3.60it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:52,  3.70it/s][A
+ 75%|███████▍  | 579/774 [02:34<00:55,  3.54it/s][A
+ 75%|███████▍  | 580/774 [02:34<00:54,  3.59it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:53,  3.58it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:51,  3.74it/s][A
+ 75%|███████▌  | 583/774 [02:35<00:49,  3.88it/s][A
+ 75%|███████▌  | 584/774 [02:35<00:48,  3.89it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:51,  3.70it/s][A
+ 76%|███████▌  | 586/774 [02:36<00:51,  3.69it/s][A
+ 76%|███████▌  | 587/774 [02:36<00:50,  3.70it/s][A
+ 76%|███████▌  | 588/774 [02:36<00:49,  3.77it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:48,  3.81it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.07it/s][A
+ 76%|███████▋  | 591/774 [02:37<00:45,  4.02it/s][A
+ 76%|███████▋  | 592/774 [02:37<00:48,  3.75it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:48,  3.70it/s][A
+ 77%|███████▋  | 594/774 [02:38<00:49,  3.66it/s][A
+ 77%|███████▋  | 595/774 [02:38<00:52,  3.43it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:55,  3.23it/s][A
+ 77%|███████▋  | 597/774 [02:39<00:55,  3.18it/s][A
+ 77%|███████▋  | 598/774 [02:39<00:56,  3.10it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:57,  3.03it/s][A
+ 78%|███████▊  | 600/774 [02:40<00:57,  3.02it/s][A
+ 78%|███████▊  | 601/774 [02:40<00:57,  3.01it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:56,  3.02it/s][A
+ 78%|███████▊  | 603/774 [02:41<00:55,  3.06it/s][A
+ 78%|███████▊  | 604/774 [02:41<00:56,  3.01it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:54,  3.08it/s][A
+ 78%|███████▊  | 606/774 [02:42<00:56,  2.99it/s][A
+ 78%|███████▊  | 607/774 [02:42<00:55,  2.99it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.95it/s][A
+ 79%|███████▊  | 609/774 [02:43<00:53,  3.08it/s][A
+ 79%|███████▉  | 610/774 [02:43<00:54,  3.02it/s][A
+ 79%|███████▉  | 611/774 [02:44<00:58,  2.79it/s][A
+ 79%|███████▉  | 612/774 [02:44<01:00,  2.66it/s][A
+ 79%|███████▉  | 613/774 [02:44<00:55,  2.89it/s][A
+ 79%|███████▉  | 614/774 [02:45<00:54,  2.94it/s][A
+ 79%|███████▉  | 615/774 [02:45<00:51,  3.09it/s][A
+ 80%|███████▉  | 616/774 [02:45<00:49,  3.16it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:49,  3.15it/s][A
+ 80%|███████▉  | 618/774 [02:46<00:47,  3.27it/s][A
+ 80%|███████▉  | 619/774 [02:46<00:45,  3.42it/s][A
+ 80%|████████  | 620/774 [02:46<00:44,  3.48it/s][A
+ 80%|████████  | 621/774 [02:47<00:40,  3.78it/s][A
+ 80%|████████  | 622/774 [02:47<00:37,  4.04it/s][A
+ 80%|████████  | 623/774 [02:47<00:37,  4.03it/s][A
+ 81%|████████  | 624/774 [02:47<00:41,  3.66it/s][A
+ 81%|████████  | 625/774 [02:48<00:40,  3.64it/s][A
+ 81%|████████  | 626/774 [02:48<00:44,  3.34it/s][A
+ 81%|████████  | 627/774 [02:48<00:44,  3.28it/s][A
+ 81%|████████  | 628/774 [02:49<00:45,  3.24it/s][A
+ 81%|████████▏ | 629/774 [02:49<00:43,  3.33it/s][A
+ 81%|████████▏ | 630/774 [02:49<00:40,  3.59it/s][A
+ 82%|████████▏ | 631/774 [02:49<00:37,  3.80it/s][A
+ 82%|████████▏ | 632/774 [02:50<00:37,  3.83it/s][A
+ 82%|████████▏ | 633/774 [02:50<00:39,  3.58it/s][A
+ 82%|████████▏ | 634/774 [02:50<00:41,  3.41it/s][A
+ 82%|████████▏ | 635/774 [02:51<00:39,  3.50it/s][A
+ 82%|████████▏ | 636/774 [02:51<00:39,  3.45it/s][A
+ 82%|████████▏ | 637/774 [02:51<00:38,  3.57it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:38,  3.53it/s][A
+ 83%|████████▎ | 639/774 [02:52<00:42,  3.15it/s][A
+ 83%|████████▎ | 640/774 [02:52<00:49,  2.70it/s][A
+ 83%|████████▎ | 641/774 [02:53<00:48,  2.72it/s][A
+ 83%|████████▎ | 642/774 [02:53<00:45,  2.90it/s][A
+ 83%|████████▎ | 643/774 [02:53<00:44,  2.93it/s][A
+ 83%|████████▎ | 644/774 [02:54<00:40,  3.19it/s][A
+ 83%|████████▎ | 645/774 [02:54<00:37,  3.47it/s][A
+ 83%|████████▎ | 646/774 [02:54<00:34,  3.67it/s][A
+ 84%|████████▎ | 647/774 [02:54<00:32,  3.95it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:30,  4.11it/s][A
+ 84%|████████▍ | 649/774 [02:55<00:30,  4.13it/s][A
+ 84%|████████▍ | 650/774 [02:55<00:28,  4.35it/s][A
+ 84%|████████▍ | 651/774 [02:55<00:28,  4.35it/s][A
+ 84%|████████▍ | 652/774 [02:55<00:29,  4.12it/s][A
+ 84%|████████▍ | 653/774 [02:56<00:31,  3.86it/s][A
+ 84%|████████▍ | 654/774 [02:56<00:28,  4.15it/s][A
+ 85%|████████▍ | 655/774 [02:56<00:26,  4.49it/s][A
+ 85%|████████▍ | 656/774 [02:56<00:27,  4.29it/s][A
+ 85%|████████▍ | 657/774 [02:57<00:26,  4.50it/s][A
+ 85%|████████▌ | 658/774 [02:57<00:26,  4.33it/s][A
+ 85%|████████▌ | 659/774 [02:57<00:28,  4.02it/s][A
+ 85%|████████▌ | 660/774 [02:57<00:30,  3.73it/s][A
+ 85%|████████▌ | 661/774 [02:58<00:31,  3.64it/s][A
+ 86%|████████▌ | 662/774 [02:58<00:29,  3.83it/s][A
+ 86%|████████▌ | 663/774 [02:58<00:30,  3.65it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.61it/s][A
+ 86%|████████▌ | 665/774 [02:59<00:27,  3.91it/s][A
+ 86%|████████▌ | 666/774 [02:59<00:25,  4.21it/s][A
+ 86%|████████▌ | 667/774 [02:59<00:23,  4.48it/s][A
+ 86%|████████▋ | 668/774 [02:59<00:23,  4.44it/s][A
+ 86%|████████▋ | 669/774 [03:00<00:25,  4.13it/s][A
+ 87%|████████▋ | 670/774 [03:00<00:24,  4.23it/s][A
+ 87%|████████▋ | 671/774 [03:00<00:27,  3.77it/s][A
+ 87%|████████▋ | 672/774 [03:00<00:26,  3.90it/s][A
+ 87%|████████▋ | 673/774 [03:01<00:24,  4.07it/s][A
+ 87%|████████▋ | 674/774 [03:01<00:25,  3.97it/s][A
+ 87%|████████▋ | 675/774 [03:01<00:23,  4.14it/s][A
+ 87%|████████▋ | 676/774 [03:01<00:22,  4.38it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:22,  4.36it/s][A
+ 88%|████████▊ | 678/774 [03:02<00:21,  4.42it/s][A
+ 88%|████████▊ | 679/774 [03:02<00:22,  4.18it/s][A
+ 88%|████████▊ | 680/774 [03:02<00:23,  4.07it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:21,  4.34it/s][A
+ 88%|████████▊ | 682/774 [03:03<00:20,  4.41it/s][A
+ 88%|████████▊ | 683/774 [03:03<00:22,  4.06it/s][A
+ 88%|████████▊ | 684/774 [03:03<00:23,  3.79it/s][A
+ 89%|████████▊ | 685/774 [03:04<00:24,  3.65it/s][A
+ 89%|████████▊ | 686/774 [03:04<00:23,  3.78it/s][A
+ 89%|████████▉ | 687/774 [03:04<00:21,  4.06it/s][A
+ 89%|████████▉ | 688/774 [03:04<00:21,  4.08it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:19,  4.27it/s][A
+ 89%|████████▉ | 690/774 [03:05<00:19,  4.40it/s][A
+ 89%|████████▉ | 691/774 [03:05<00:18,  4.52it/s][A
+ 89%|████████▉ | 692/774 [03:05<00:17,  4.57it/s][A
+ 90%|████████▉ | 693/774 [03:05<00:17,  4.53it/s][A
+ 90%|████████▉ | 694/774 [03:06<00:18,  4.23it/s][A
+ 90%|████████▉ | 695/774 [03:06<00:20,  3.78it/s][A
+ 90%|████████▉ | 696/774 [03:06<00:20,  3.89it/s][A
+ 90%|█████████ | 697/774 [03:06<00:19,  3.98it/s][A
+ 90%|█████████ | 698/774 [03:07<00:17,  4.40it/s][A
+ 90%|█████████ | 699/774 [03:07<00:15,  4.73it/s][A
+ 90%|█████████ | 700/774 [03:07<00:16,  4.36it/s][A
+ 91%|█████████ | 701/774 [03:07<00:16,  4.45it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.48it/s][A
+ 91%|█████████ | 703/774 [03:08<00:16,  4.41it/s][A
+ 91%|█████████ | 704/774 [03:08<00:16,  4.18it/s][A
+ 91%|█████████ | 705/774 [03:08<00:15,  4.55it/s][A
+ 91%|█████████ | 706/774 [03:08<00:14,  4.70it/s][A
+ 91%|█████████▏| 707/774 [03:09<00:14,  4.63it/s][A
+ 91%|█████████▏| 708/774 [03:09<00:13,  4.97it/s][A
+ 92%|█████████▏| 709/774 [03:09<00:13,  4.85it/s][A
+ 92%|█████████▏| 710/774 [03:09<00:13,  4.80it/s][A
+ 92%|█████████▏| 711/774 [03:09<00:12,  4.96it/s][A
+ 92%|█████████▏| 712/774 [03:10<00:12,  5.14it/s][A
+ 92%|█████████▏| 713/774 [03:10<00:12,  4.93it/s][A
+ 92%|█████████▏| 714/774 [03:10<00:13,  4.60it/s][A
+ 92%|█████████▏| 715/774 [03:10<00:12,  4.71it/s][A
+ 93%|█████████▎| 716/774 [03:10<00:11,  5.02it/s][A
+ 93%|█████████▎| 717/774 [03:11<00:11,  5.08it/s][A
+ 93%|█████████▎| 718/774 [03:11<00:12,  4.56it/s][A
+ 93%|█████████▎| 719/774 [03:11<00:12,  4.53it/s][A
+ 93%|█████████▎| 720/774 [03:11<00:11,  4.88it/s][A
+ 93%|█████████▎| 721/774 [03:11<00:10,  5.11it/s][A
+ 93%|█████████▎| 722/774 [03:12<00:09,  5.52it/s][A
+ 93%|█████████▎| 723/774 [03:12<00:09,  5.31it/s][A
+ 94%|█████████▎| 724/774 [03:12<00:09,  5.22it/s][A
+ 94%|█████████▎| 725/774 [03:12<00:09,  5.40it/s][A
+ 94%|█████████▍| 726/774 [03:12<00:08,  5.40it/s][A
+ 94%|█████████▍| 727/774 [03:13<00:08,  5.25it/s][A
+ 94%|█████████▍| 728/774 [03:13<00:09,  4.65it/s][A
+ 94%|█████████▍| 729/774 [03:13<00:09,  4.92it/s][A
+ 94%|█████████▍| 730/774 [03:13<00:08,  5.23it/s][A
+ 94%|█████████▍| 731/774 [03:13<00:08,  5.27it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.39it/s][A
+ 95%|█████████▍| 733/774 [03:14<00:07,  5.33it/s][A
+ 95%|█████████▍| 734/774 [03:14<00:07,  5.42it/s][A
+ 95%|█████████▍| 735/774 [03:14<00:07,  5.52it/s][A
+ 95%|█████████▌| 736/774 [03:14<00:06,  5.61it/s][A
+ 95%|█████████▌| 737/774 [03:14<00:06,  5.47it/s][A
+ 95%|█████████▌| 738/774 [03:15<00:06,  5.40it/s][A
+ 95%|█████████▌| 739/774 [03:15<00:06,  5.31it/s][A
+ 96%|█████████▌| 740/774 [03:15<00:06,  5.23it/s][A
+ 96%|█████████▌| 741/774 [03:15<00:06,  4.79it/s][A
+ 96%|█████████▌| 742/774 [03:15<00:06,  5.03it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.34it/s][A
+ 96%|█████████▌| 744/774 [03:16<00:05,  5.19it/s][A
+ 96%|█████████▋| 745/774 [03:16<00:06,  4.37it/s][A
+ 96%|█████████▋| 746/774 [03:16<00:07,  3.77it/s][A
+ 97%|█████████▋| 747/774 [03:17<00:06,  3.94it/s][A
+ 97%|█████████▋| 748/774 [03:17<00:06,  4.20it/s][A
+ 97%|█████████▋| 749/774 [03:17<00:05,  4.53it/s][A
+ 97%|█████████▋| 750/774 [03:17<00:05,  4.25it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.43it/s][A
+ 97%|█████████▋| 752/774 [03:18<00:05,  4.36it/s][A
+ 97%|█████████▋| 753/774 [03:18<00:04,  4.71it/s][A
+ 97%|█████████▋| 754/774 [03:18<00:03,  5.28it/s][A
+ 98%|█████████▊| 755/774 [03:18<00:03,  5.59it/s][A
+ 98%|█████████▊| 756/774 [03:18<00:03,  5.47it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.19it/s][A
+ 98%|█████████▊| 758/774 [03:19<00:03,  5.15it/s][A
+ 98%|█████████▊| 759/774 [03:19<00:02,  5.41it/s][A
+ 98%|█████████▊| 760/774 [03:19<00:02,  5.44it/s][A
+ 98%|█████████▊| 761/774 [03:19<00:02,  5.91it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:01,  6.04it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.21it/s][A
+ 99%|█████████▊| 764/774 [03:20<00:01,  6.36it/s][A
+ 99%|█████████▉| 765/774 [03:20<00:01,  6.29it/s][A
+ 99%|█████████▉| 766/774 [03:20<00:01,  5.45it/s][A
+ 99%|█████████▉| 767/774 [03:20<00:01,  5.62it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.57it/s][A
+ 99%|█████████▉| 769/774 [03:21<00:00,  5.25it/s][A
+ 99%|█████████▉| 770/774 [03:21<00:00,  5.10it/s][A
+100%|█████████▉| 771/774 [03:21<00:00,  5.27it/s][A
+100%|█████████▉| 772/774 [03:21<00:00,  5.03it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.96it/s][A
+100%|██████████| 774/774 [03:22<00:00,  5.78it/s][A                                                      
+                                                 [A  8%|▊         | 1000/12776 [10:55<1:09:48,  2.81it/s]
+100%|██████████| 774/774 [03:24<00:00,  5.78it/s][A
+                                                 [A  8%|▊         | 1001/12776 [10:57<203:25:49, 62.20s/it]                                                          8%|▊         | 1001/12776 [10:57<203:25:49, 62.20s/it]  8%|▊         | 1002/12776 [10:58<143:23:56, 43.85s/it]                                                          8%|▊         | 1002/12776 [10:58<143:23:56, 43.85s/it]  8%|▊         | 1003/12776 [10:59<101:12:16, 30.95s/it]                                                          8%|▊         | 1003/12776 [10:59<101:12:16, 30.95s/it]  8%|▊         | 1004/12776 [10:59<71:35:29, 21.89s/it]                                                          8%|▊         | 1004/12776 [10:59<71:35:29, 21.89s/it]  8%|▊         | 1005/12776 [11:00<50:49:48, 15.55s/it]                                                         8%|▊         | 1005/12776 [11:00<50:49:48, 15.55s/it]  8%|▊         | 1006/12776 [11:01<36:13:24, 11.08s/it]                                                         8%|▊         | 1006/12776 [11:01<36:13:24, 11.08s/it]  8%|▊         | 1007/12776 [11:01<25:59:19,  7.95s/it]                                                         8%|▊         | 1007/12776 [11:01<25:59:19,  7.95s/it]  8%|▊         | 1008/12776 [11:02<18:52:27,  5.77s/it]                                                         8%|▊         | 1008/12776 [11:02<18:52:27,  5.77s/it]  8%|▊         | 1009/12776 [11:03<13:46:37,  4.21s/it]                                                         8%|▊         | 1009/12776 [11:03<13:46:37,  4.21s/it]  8%|▊         | 1010/12776 [11:03<10:15:12,  3.14s/it]                                                         8%|▊         | 1010/12776 [11:03<10:15:12,  3.14s/it]  8%|▊         | 1011/12776 [11:04<7:41:29,  2.35s/it]                                                         8%|▊         | 1011/12776 [11:04<7:41:29,  2.35s/it]  8%|▊         | 1012/12776 [11:04<5:57:40,  1.82s/it]                                                        8%|▊         | 1012/12776 [11:04<5:57:40,  1.82s/it]  8%|▊         | 1013/12776 [11:05<4:37:31,  1.42s/it]                                                        8%|▊         | 1013/12776 [11:05<4:37:31,  1.42s/it]  8%|▊         | 1014/12776 [11:05<3:49:19,  1.17s/it]                                                        8%|▊         | 1014/12776 [11:05<3:49:19,  1.17s/it]  8%|▊         | 1015/12776 [11:06<3:06:01,  1.05it/s]                                                        8%|▊         | 1015/12776 [11:06<3:06:01,  1.05it/s]  8%|▊         | 1016/12776 [11:06<2:39:50,  1.23it/s]                                                        8%|▊         | 1016/12776 [11:06<2:39:50,  1.23it/s]  8%|▊         | 1017/12776 [11:07<2:15:41,  1.44it/s]                                                        8%|▊         | 1017/12776 [11:07<2:15:41,  1.44it/s]  8%|▊         | 1018/12776 [11:07<1:57:51,  1.66it/s]                                                        8%|▊         | 1018/12776 [11:07<1:57:51,  1.66it/s]  8%|▊         | 1019/12776 [11:08<1:51:38,  1.76it/s]                                                        8%|▊         | 1019/12776 [11:08<1:51:38,  1.76it/s]  8%|▊         | 1020/12776 [11:08<1:38:50,  1.98it/s]                                                        8%|▊         | 1020/12776 [11:08<1:38:50,  1.98it/s]  8%|▊         | 1021/12776 [11:08<1:29:21,  2.19it/s]                                                        8%|▊         | 1021/12776 [11:08<1:29:21,  2.19it/s]  8%|▊         | 1022/12776 [11:09<1:28:22,  2.22it/s]                                                        8%|▊         | 1022/12776 [11:09<1:28:22,  2.22it/s]  8%|▊         | 1023/12776 [11:09<1:21:31,  2.40it/s]                                                        8%|▊         | 1023/12776 [11:09<1:21:31,  2.40it/s]  8%|▊         | 1024/12776 [11:09<1:15:45,  2.59it/s]                                                        8%|▊         | 1024/12776 [11:09<1:15:45,  2.59it/s]  8%|▊         | 1025/12776 [11:10<1:13:16,  2.67it/s]                                                        8%|▊         | 1025/12776 [11:10<1:13:16,  2.67it/s]  8%|▊         | 1026/12776 [11:10<1:08:37,  2.85it/s]                                                        8%|▊         | 1026/12776 [11:10<1:08:37,  2.85it/s]  8%|▊         | 1027/12776 [11:10<1:04:57,  3.01it/s]                                                        8%|▊         | 1027/12776 [11:10<1:04:57,  3.01it/s]  8%|▊         | 1028/12776 [11:11<1:01:56,  3.16it/s]                                                        8%|▊         | 1028/12776 [11:11<1:01:56,  3.16it/s]  8%|▊         | 1029/12776 [11:11<1:06:12,  2.96it/s]                                                        8%|▊         | 1029/12776 [11:11<1:06:12,  2.96it/s]  8%|▊         | 1030/12776 [11:11<1:01:47,  3.17it/s]                                                        8%|▊         | 1030/12776 [11:11<1:01:47,  3.17it/s]  8%|▊         | 1031/12776 [11:12<57:45,  3.39it/s]                                                        8%|▊         | 1031/12776 [11:12<57:45,  3.39it/s]  8%|▊         | 1032/12776 [11:12<55:06,  3.55it/s]                                                      8%|▊         | 1032/12776 [11:12<55:06,  3.55it/s]  8%|▊         | 1033/12776 [11:12<1:00:47,  3.22it/s]                                                        8%|▊         | 1033/12776 [11:12<1:00:47,  3.22it/s]  8%|▊         | 1034/12776 [11:12<56:39,  3.45it/s]                                                        8%|▊         | 1034/12776 [11:12<56:39,  3.45it/s]  8%|▊         | 1035/12776 [11:13<53:17,  3.67it/s]                                                      8%|▊         | 1035/12776 [11:13<53:17,  3.67it/s]  8%|▊         | 1036/12776 [11:13<50:42,  3.86it/s]                                                      8%|▊         | 1036/12776 [11:13<50:42,  3.86it/s]  8%|▊         | 1037/12776 [11:13<54:04,  3.62it/s]                                                      8%|▊         | 1037/12776 [11:13<54:04,  3.62it/s]  8%|▊         | 1038/12776 [11:13<50:30,  3.87it/s]                                                      8%|▊         | 1038/12776 [11:13<50:30,  3.87it/s]  8%|▊         | 1039/12776 [11:14<47:47,  4.09it/s]                                                      8%|▊         | 1039/12776 [11:14<47:47,  4.09it/s]  8%|▊         | 1040/12776 [11:14<45:45,  4.28it/s]                                                      8%|▊         | 1040/12776 [11:14<45:45,  4.28it/s]  8%|▊         | 1041/12776 [11:14<44:19,  4.41it/s]                                                      8%|▊         | 1041/12776 [11:14<44:19,  4.41it/s]  8%|▊         | 1042/12776 [11:14<47:36,  4.11it/s]                                                      8%|▊         | 1042/12776 [11:14<47:36,  4.11it/s]  8%|▊         | 1043/12776 [11:15<45:20,  4.31it/s]                                                      8%|▊         | 1043/12776 [11:15<45:20,  4.31it/s]  8%|▊         | 1044/12776 [11:15<43:24,  4.50it/s]                                                      8%|▊         | 1044/12776 [11:15<43:24,  4.50it/s]  8%|▊         | 1045/12776 [11:15<41:49,  4.67it/s]                                                      8%|▊         | 1045/12776 [11:15<41:49,  4.67it/s]  8%|▊         | 1046/12776 [11:15<40:37,  4.81it/s]                                                      8%|▊         | 1046/12776 [11:15<40:37,  4.81it/s]  8%|▊         | 1047/12776 [11:15<39:40,  4.93it/s]                                                      8%|▊         | 1047/12776 [11:15<39:40,  4.93it/s]  8%|▊         | 1048/12776 [11:16<43:05,  4.54it/s]                                                      8%|▊         | 1048/12776 [11:16<43:05,  4.54it/s]  8%|▊         | 1049/12776 [11:16<41:15,  4.74it/s]                                                      8%|▊         | 1049/12776 [11:16<41:15,  4.74it/s]  8%|▊         | 1050/12776 [11:17<1:16:33,  2.55it/s]                                                        8%|▊         | 1050/12776 [11:17<1:16:33,  2.55it/s]  8%|▊         | 1051/12776 [11:18<2:20:06,  1.39it/s]                                                        8%|▊         | 1051/12776 [11:18<2:20:06,  1.39it/s]  8%|▊         | 1052/12776 [11:19<2:35:42,  1.25it/s]                                                        8%|▊         | 1052/12776 [11:19<2:35:42,  1.25it/s]  8%|▊         | 1053/12776 [11:20<2:44:15,  1.19it/s]                                                        8%|▊         | 1053/12776 [11:20<2:44:15,  1.19it/s]  8%|▊         | 1054/12776 [11:21<2:42:12,  1.20it/s]                                                        8%|▊         | 1054/12776 [11:21<2:42:12,  1.20it/s]  8%|▊         | 1055/12776 [11:22<2:38:35,  1.23it/s]                                                        8%|▊         | 1055/12776 [11:22<2:38:35,  1.23it/s]  8%|▊         | 1056/12776 [11:22<2:33:50,  1.27it/s]                                                        8%|▊         | 1056/12776 [11:22<2:33:50,  1.27it/s]  8%|▊         | 1057/12776 [11:23<2:28:29,  1.32it/s]                                                        8%|▊         | 1057/12776 [11:23<2:28:29,  1.32it/s]  8%|▊         | 1058/12776 [11:24<2:26:27,  1.33it/s]                                                        8%|▊         | 1058/12776 [11:24<2:26:27,  1.33it/s]  8%|▊         | 1059/12776 [11:24<2:19:03,  1.40it/s]                                                        8%|▊         | 1059/12776 [11:24<2:19:03,  1.40it/s]  8%|▊         | 1060/12776 [11:25<2:15:24,  1.44it/s]                                                        8%|▊         | 1060/12776 [11:25<2:15:24,  1.44it/s]  8%|▊         | 1061/12776 [11:26<2:07:48,  1.53it/s]                                                        8%|▊         | 1061/12776 [11:26<2:07:48,  1.53it/s]  8%|▊         | 1062/12776 [11:26<2:03:24,  1.58it/s]                                                        8%|▊         | 1062/12776 [11:26<2:03:24,  1.58it/s]  8%|▊         | 1063/12776 [11:27<1:57:15,  1.66it/s]                                                        8%|▊         | 1063/12776 [11:27<1:57:15,  1.66it/s]  8%|▊         | 1064/12776 [11:27<1:56:45,  1.67it/s]                                                        8%|▊         | 1064/12776 [11:27<1:56:45,  1.67it/s]  8%|▊         | 1065/12776 [11:28<1:50:33,  1.77it/s]                                                        8%|▊         | 1065/12776 [11:28<1:50:33,  1.77it/s]  8%|▊         | 1066/12776 [11:28<1:48:28,  1.80it/s]                                                        8%|▊         | 1066/12776 [11:28<1:48:28,  1.80it/s]  8%|▊         | 1067/12776 [11:29<1:41:08,  1.93it/s]                                                        8%|▊         | 1067/12776 [11:29<1:41:08,  1.93it/s]  8%|▊         | 1068/12776 [11:29<1:43:31,  1.88it/s]                                                        8%|▊         | 1068/12776 [11:29<1:43:31,  1.88it/s]  8%|▊         | 1069/12776 [11:30<1:35:16,  2.05it/s]                                                        8%|▊         | 1069/12776 [11:30<1:35:16,  2.05it/s]  8%|▊         | 1070/12776 [11:30<1:28:06,  2.21it/s]                                                        8%|▊         | 1070/12776 [11:30<1:28:06,  2.21it/s]  8%|▊         | 1071/12776 [11:30<1:24:44,  2.30it/s]                                                        8%|▊         | 1071/12776 [11:30<1:24:44,  2.30it/s]  8%|▊         | 1072/12776 [11:31<1:19:38,  2.45it/s]                                                        8%|▊         | 1072/12776 [11:31<1:19:38,  2.45it/s]  8%|▊         | 1073/12776 [11:31<1:15:22,  2.59it/s]                                                        8%|▊         | 1073/12776 [11:31<1:15:22,  2.59it/s]  8%|▊         | 1074/12776 [11:32<1:19:44,  2.45it/s]                                                        8%|▊         | 1074/12776 [11:32<1:19:44,  2.45it/s]  8%|▊         | 1075/12776 [11:32<1:13:32,  2.65it/s]                                                        8%|▊         | 1075/12776 [11:32<1:13:32,  2.65it/s]  8%|▊         | 1076/12776 [11:32<1:08:53,  2.83it/s]                                                        8%|▊         | 1076/12776 [11:32<1:08:53,  2.83it/s]  8%|▊         | 1077/12776 [11:32<1:05:03,  3.00it/s]                                                      {'eval_loss': 1.01686429977417, 'eval_wer': 0.7063561212997004, 'eval_runtime': 205.1557, 'eval_samples_per_second': 60.359, 'eval_steps_per_second': 3.773, 'epoch': 0.16}
+{'loss': 0.8153, 'grad_norm': 1.0563576221466064, 'learning_rate': 0.00028787878787878786, 'epoch': 0.16}
+{'loss': 0.6999, 'grad_norm': 0.817237913608551, 'learning_rate': 0.0002878543499511241, 'epoch': 0.16}
+{'loss': 0.5458, 'grad_norm': 0.6916754841804504, 'learning_rate': 0.0002878299120234604, 'epoch': 0.16}
+{'loss': 0.677, 'grad_norm': 0.7108484506607056, 'learning_rate': 0.00028780547409579666, 'epoch': 0.16}
+{'loss': 0.7012, 'grad_norm': 1.1377077102661133, 'learning_rate': 0.0002877810361681329, 'epoch': 0.16}
+{'loss': 0.6656, 'grad_norm': 1.456892967224121, 'learning_rate': 0.00028775659824046917, 'epoch': 0.16}
+{'loss': 0.6829, 'grad_norm': 1.1236454248428345, 'learning_rate': 0.0002877321603128054, 'epoch': 0.16}
+{'loss': 1.0528, 'grad_norm': 2.5703699588775635, 'learning_rate': 0.0002877077223851417, 'epoch': 0.16}
+{'loss': 0.6504, 'grad_norm': 1.1044304370880127, 'learning_rate': 0.000287683284457478, 'epoch': 0.16}
+{'loss': 0.654, 'grad_norm': 1.558677315711975, 'learning_rate': 0.0002876588465298142, 'epoch': 0.16}
+{'loss': 0.7489, 'grad_norm': 1.6468641757965088, 'learning_rate': 0.00028763440860215053, 'epoch': 0.16}
+{'loss': 0.8098, 'grad_norm': 1.1569092273712158, 'learning_rate': 0.0002876099706744868, 'epoch': 0.16}
+{'loss': 0.8457, 'grad_norm': 1.2596278190612793, 'learning_rate': 0.00028758553274682303, 'epoch': 0.16}
+{'loss': 0.6101, 'grad_norm': 0.9945486187934875, 'learning_rate': 0.0002875610948191593, 'epoch': 0.16}
+{'loss': 0.7989, 'grad_norm': 1.4740206003189087, 'learning_rate': 0.0002875366568914956, 'epoch': 0.16}
+{'loss': 0.9334, 'grad_norm': 4.350058555603027, 'learning_rate': 0.00028751221896383184, 'epoch': 0.16}
+{'loss': 0.6877, 'grad_norm': 1.1375359296798706, 'learning_rate': 0.0002874877810361681, 'epoch': 0.16}
+{'loss': 0.8379, 'grad_norm': 2.1333682537078857, 'learning_rate': 0.0002874633431085044, 'epoch': 0.16}
+{'loss': 1.0991, 'grad_norm': 1.3724780082702637, 'learning_rate': 0.00028743890518084065, 'epoch': 0.16}
+{'loss': 1.4866, 'grad_norm': 4.463011741638184, 'learning_rate': 0.0002874144672531769, 'epoch': 0.16}
+{'loss': 0.7986, 'grad_norm': 2.178107738494873, 'learning_rate': 0.0002873900293255132, 'epoch': 0.16}
+{'loss': 0.8131, 'grad_norm': 1.3757675886154175, 'learning_rate': 0.0002873655913978494, 'epoch': 0.16}
+{'loss': 1.0131, 'grad_norm': 1.6082704067230225, 'learning_rate': 0.0002873411534701857, 'epoch': 0.16}
+{'loss': 0.9901, 'grad_norm': 1.9035452604293823, 'learning_rate': 0.00028731671554252196, 'epoch': 0.16}
+{'loss': 1.0808, 'grad_norm': 2.227452039718628, 'learning_rate': 0.0002872922776148582, 'epoch': 0.16}
+{'loss': 1.4922, 'grad_norm': 2.5133438110351562, 'learning_rate': 0.0002872678396871945, 'epoch': 0.16}
+{'loss': 1.2555, 'grad_norm': 3.3356120586395264, 'learning_rate': 0.00028724340175953077, 'epoch': 0.16}
+{'loss': 1.3604, 'grad_norm': 2.868877649307251, 'learning_rate': 0.000287218963831867, 'epoch': 0.16}
+{'loss': 1.4672, 'grad_norm': 2.3197638988494873, 'learning_rate': 0.0002871945259042033, 'epoch': 0.16}
+{'loss': 0.7615, 'grad_norm': 1.5867116451263428, 'learning_rate': 0.0002871700879765396, 'epoch': 0.16}
+{'loss': 1.4227, 'grad_norm': 1.823372721672058, 'learning_rate': 0.00028714565004887583, 'epoch': 0.16}
+{'loss': 1.9678, 'grad_norm': 3.756685972213745, 'learning_rate': 0.0002871212121212121, 'epoch': 0.16}
+{'loss': 0.9197, 'grad_norm': 2.313270330429077, 'learning_rate': 0.0002870967741935484, 'epoch': 0.16}
+{'loss': 1.4155, 'grad_norm': 1.9607908725738525, 'learning_rate': 0.00028707233626588464, 'epoch': 0.16}
+{'loss': 1.2159, 'grad_norm': 2.0244407653808594, 'learning_rate': 0.0002870478983382209, 'epoch': 0.16}
+{'loss': 1.8048, 'grad_norm': 2.4551143646240234, 'learning_rate': 0.0002870234604105572, 'epoch': 0.16}
+{'loss': 1.2969, 'grad_norm': 2.5346062183380127, 'learning_rate': 0.0002869990224828934, 'epoch': 0.16}
+{'loss': 2.0716, 'grad_norm': 3.634019374847412, 'learning_rate': 0.0002869745845552297, 'epoch': 0.16}
+{'loss': 0.8884, 'grad_norm': 2.7607200145721436, 'learning_rate': 0.00028695014662756595, 'epoch': 0.16}
+{'loss': 1.5306, 'grad_norm': 2.8792600631713867, 'learning_rate': 0.0002869257086999022, 'epoch': 0.16}
+{'loss': 1.4916, 'grad_norm': 5.755788803100586, 'learning_rate': 0.0002869012707722385, 'epoch': 0.16}
+{'loss': 1.3055, 'grad_norm': 2.4009017944335938, 'learning_rate': 0.00028687683284457475, 'epoch': 0.16}
+{'loss': 1.3189, 'grad_norm': 2.3000261783599854, 'learning_rate': 0.000286852394916911, 'epoch': 0.16}
+{'loss': 1.5442, 'grad_norm': 2.4022369384765625, 'learning_rate': 0.0002868279569892473, 'epoch': 0.16}
+{'loss': 1.7154, 'grad_norm': 2.216566801071167, 'learning_rate': 0.00028680351906158356, 'epoch': 0.16}
+{'loss': 1.1951, 'grad_norm': 2.9098117351531982, 'learning_rate': 0.0002867790811339198, 'epoch': 0.16}
+{'loss': 1.6244, 'grad_norm': 4.8952178955078125, 'learning_rate': 0.00028675464320625606, 'epoch': 0.16}
+{'loss': 1.3874, 'grad_norm': 4.096086502075195, 'learning_rate': 0.00028673020527859237, 'epoch': 0.16}
+{'loss': 1.345, 'grad_norm': 3.0238194465637207, 'learning_rate': 0.0002867057673509286, 'epoch': 0.16}
+{'loss': 1.2903, 'grad_norm': 2.8986921310424805, 'learning_rate': 0.00028668132942326487, 'epoch': 0.16}
+{'loss': 0.6808, 'grad_norm': 0.9072627425193787, 'learning_rate': 0.0002866568914956012, 'epoch': 0.16}
+{'loss': 0.6256, 'grad_norm': 0.8426551222801208, 'learning_rate': 0.00028663245356793743, 'epoch': 0.16}
+{'loss': 0.5074, 'grad_norm': 0.7460830807685852, 'learning_rate': 0.0002866080156402737, 'epoch': 0.16}
+{'loss': 0.6061, 'grad_norm': 0.606028139591217, 'learning_rate': 0.00028658357771260993, 'epoch': 0.16}
+{'loss': 0.5883, 'grad_norm': 0.833991527557373, 'learning_rate': 0.0002865591397849462, 'epoch': 0.17}
+{'loss': 0.7446, 'grad_norm': 1.9847928285598755, 'learning_rate': 0.0002865347018572825, 'epoch': 0.17}
+{'loss': 0.5827, 'grad_norm': 0.9032483100891113, 'learning_rate': 0.00028651026392961874, 'epoch': 0.17}
+{'loss': 0.6579, 'grad_norm': 0.7685295343399048, 'learning_rate': 0.000286485826001955, 'epoch': 0.17}
+{'loss': 0.5019, 'grad_norm': 0.6518795490264893, 'learning_rate': 0.0002864613880742913, 'epoch': 0.17}
+{'loss': 0.4796, 'grad_norm': 1.02455735206604, 'learning_rate': 0.00028643695014662755, 'epoch': 0.17}
+{'loss': 0.5102, 'grad_norm': 1.0418224334716797, 'learning_rate': 0.0002864125122189638, 'epoch': 0.17}
+{'loss': 0.7181, 'grad_norm': 2.0449554920196533, 'learning_rate': 0.00028638807429130005, 'epoch': 0.17}
+{'loss': 0.7923, 'grad_norm': 1.0994571447372437, 'learning_rate': 0.00028636363636363636, 'epoch': 0.17}
+{'loss': 0.5434, 'grad_norm': 0.9208387136459351, 'learning_rate': 0.0002863391984359726, 'epoch': 0.17}
+{'loss': 1.0223, 'grad_norm': 1.2651475667953491, 'learning_rate': 0.00028631476050830886, 'epoch': 0.17}
+{'loss': 0.8762, 'grad_norm': 1.9309346675872803, 'learning_rate': 0.00028629032258064516, 'epoch': 0.17}
+{'loss': 0.9494, 'grad_norm': 2.4084434509277344, 'learning_rate': 0.0002862658846529814, 'epoch': 0.17}
+{'loss': 0.7541, 'grad_norm': 1.4075591564178467, 'learning_rate': 0.00028624144672531767, 'epoch': 0.17}
+{'loss': 0.8695, 'grad_norm': 2.102005958557129, 'learning_rate': 0.00028621700879765397, 'epoch': 0.17}
+{'loss': 0.7304, 'grad_norm': 2.49064040184021, 'learning_rate': 0.00028619257086999017, 'epoch': 0.17}
+{'loss': 0.6588, 'grad_norm': 1.221191167831421, 'learning_rate': 0.0002861681329423265, 'epoch': 0.17}
+{'loss': 1.1985, 'grad_norm': 1.5001965761184692, 'learning_rate': 0.0002861436950146627, 'epoch': 0.17}
+{'loss': 0.9565, 'grad_norm': 1.4838846921920776, 'learning_rate': 0.000286119257086999, 'epoch': 0.17}
+{'loss': 0.8253, 'grad_norm': 1.5427438020706177, 'learning_rate': 0.0002860948191593353, 'epoch': 0.17}
+{'loss': 0.6618, 'grad_norm': 1.4111217260360718, 'learning_rate': 0.00028607038123167153, 'epoch': 0.17}
+{'loss': 0.9508, 'grad_norm': 1.6827608346939087, 'learning_rate': 0.0002860459433040078, 'epoch': 0.17}
+  8%|▊         | 1077/12776 [11:32<1:05:03,  3.00it/s]  8%|▊         | 1078/12776 [11:33<1:06:40,  2.92it/s]                                                        8%|▊         | 1078/12776 [11:33<1:06:40,  2.92it/s]  8%|▊         | 1079/12776 [11:33<1:02:29,  3.12it/s]                                                        8%|▊         | 1079/12776 [11:33<1:02:29,  3.12it/s]  8%|▊         | 1080/12776 [11:33<59:13,  3.29it/s]                                                        8%|▊         | 1080/12776 [11:33<59:13,  3.29it/s]  8%|▊         | 1081/12776 [11:34<56:06,  3.47it/s]                                                      8%|▊         | 1081/12776 [11:34<56:06,  3.47it/s]  8%|▊         | 1082/12776 [11:34<58:15,  3.35it/s]                                                      8%|▊         | 1082/12776 [11:34<58:15,  3.35it/s]  8%|▊         | 1083/12776 [11:34<55:02,  3.54it/s]                                                      8%|▊         | 1083/12776 [11:34<55:02,  3.54it/s]  8%|▊         | 1084/12776 [11:34<52:28,  3.71it/s]                                                      8%|▊         | 1084/12776 [11:34<52:28,  3.71it/s]  8%|▊         | 1085/12776 [11:35<50:39,  3.85it/s]                                                      8%|▊         | 1085/12776 [11:35<50:39,  3.85it/s]  9%|▊         | 1086/12776 [11:35<51:12,  3.80it/s]                                                      9%|▊         | 1086/12776 [11:35<51:12,  3.80it/s]  9%|▊         | 1087/12776 [11:35<49:09,  3.96it/s]                                                      9%|▊         | 1087/12776 [11:35<49:09,  3.96it/s]  9%|▊         | 1088/12776 [11:35<47:16,  4.12it/s]                                                      9%|▊         | 1088/12776 [11:35<47:16,  4.12it/s]  9%|▊         | 1089/12776 [11:36<45:35,  4.27it/s]                                                      9%|▊         | 1089/12776 [11:36<45:35,  4.27it/s]  9%|▊         | 1090/12776 [11:36<44:05,  4.42it/s]                                                      9%|▊         | 1090/12776 [11:36<44:05,  4.42it/s]  9%|▊         | 1091/12776 [11:36<46:49,  4.16it/s]                                                      9%|▊         | 1091/12776 [11:36<46:49,  4.16it/s]  9%|▊         | 1092/12776 [11:36<44:41,  4.36it/s]                                                      9%|▊         | 1092/12776 [11:36<44:41,  4.36it/s]  9%|▊         | 1093/12776 [11:36<43:00,  4.53it/s]                                                      9%|▊         | 1093/12776 [11:36<43:00,  4.53it/s]  9%|▊         | 1094/12776 [11:37<41:39,  4.67it/s]                                                      9%|▊         | 1094/12776 [11:37<41:39,  4.67it/s]  9%|▊         | 1095/12776 [11:37<40:36,  4.79it/s]                                                      9%|▊         | 1095/12776 [11:37<40:36,  4.79it/s]  9%|▊         | 1096/12776 [11:37<39:48,  4.89it/s]                                                      9%|▊         | 1096/12776 [11:37<39:48,  4.89it/s]  9%|▊         | 1097/12776 [11:37<43:25,  4.48it/s]                                                      9%|▊         | 1097/12776 [11:37<43:25,  4.48it/s]  9%|▊         | 1098/12776 [11:37<41:31,  4.69it/s]                                                      9%|▊         | 1098/12776 [11:38<41:31,  4.69it/s]  9%|▊         | 1099/12776 [11:38<39:47,  4.89it/s]                                                      9%|▊         | 1099/12776 [11:38<39:47,  4.89it/s]  9%|▊         | 1100/12776 [11:38<1:12:04,  2.70it/s]                                                        9%|▊         | 1100/12776 [11:38<1:12:04,  2.70it/s]  9%|▊         | 1101/12776 [11:40<2:19:29,  1.39it/s]                                                        9%|▊         | 1101/12776 [11:40<2:19:29,  1.39it/s]  9%|▊         | 1102/12776 [11:41<2:40:40,  1.21it/s]                                                        9%|▊         | 1102/12776 [11:41<2:40:40,  1.21it/s]  9%|▊         | 1103/12776 [11:42<2:42:23,  1.20it/s]                                                        9%|▊         | 1103/12776 [11:42<2:42:23,  1.20it/s]  9%|▊         | 1104/12776 [11:43<2:39:19,  1.22it/s]                                                        9%|▊         | 1104/12776 [11:43<2:39:19,  1.22it/s]  9%|▊         | 1105/12776 [11:43<2:34:58,  1.26it/s]                                                        9%|▊         | 1105/12776 [11:43<2:34:58,  1.26it/s]  9%|▊         | 1106/12776 [11:44<2:31:06,  1.29it/s]                                                        9%|▊         | 1106/12776 [11:44<2:31:06,  1.29it/s]  9%|▊         | 1107/12776 [11:45<2:24:47,  1.34it/s]                                                        9%|▊         | 1107/12776 [11:45<2:24:47,  1.34it/s]  9%|▊         | 1108/12776 [11:46<2:24:54,  1.34it/s]                                                        9%|▊         | 1108/12776 [11:46<2:24:54,  1.34it/s]  9%|▊         | 1109/12776 [11:46<2:16:06,  1.43it/s]                                                        9%|▊         | 1109/12776 [11:46<2:16:06,  1.43it/s]  9%|▊         | 1110/12776 [11:47<2:09:19,  1.50it/s]                                                        9%|▊         | 1110/12776 [11:47<2:09:19,  1.50it/s]  9%|▊         | 1111/12776 [11:47<2:01:18,  1.60it/s]                                                        9%|▊         | 1111/12776 [11:47<2:01:18,  1.60it/s]  9%|▊         | 1112/12776 [11:48<1:59:02,  1.63it/s]                                                        9%|▊         | 1112/12776 [11:48<1:59:02,  1.63it/s]  9%|▊         | 1113/12776 [11:48<1:51:21,  1.75it/s]                                                        9%|▊         | 1113/12776 [11:48<1:51:21,  1.75it/s]  9%|▊         | 1114/12776 [11:49<1:50:24,  1.76it/s]                                                        9%|▊         | 1114/12776 [11:49<1:50:24,  1.76it/s]  9%|▊         | 1115/12776 [11:49<1:42:39,  1.89it/s]                                                        9%|▊         | 1115/12776 [11:49<1:42:39,  1.89it/s]  9%|▊         | 1116/12776 [11:50<1:41:46,  1.91it/s]                                                        9%|▊         | 1116/12776 [11:50<1:41:46,  1.91it/s]  9%|▊         | 1117/12776 [11:50<1:34:05,  2.07it/s]                                                        9%|▊         | 1117/12776 [11:50<1:34:05,  2.07it/s]  9%|▉         | 1118/12776 [11:51<1:27:50,  2.21it/s]                                                        9%|▉         | 1118/12776 [11:51<1:27:50,  2.21it/s]  9%|▉         | 1119/12776 [11:51<1:24:53,  2.29it/s]                                                        9%|▉         | 1119/12776 [11:51<1:24:53,  2.29it/s]  9%|▉         | 1120/12776 [11:51<1:19:57,  2.43it/s]                                                        9%|▉         | 1120/12776 [11:51<1:19:57,  2.43it/s]  9%|▉         | 1121/12776 [11:52<1:16:14,  2.55it/s]                                                        9%|▉         | 1121/12776 [11:52<1:16:14,  2.55it/s]  9%|▉         | 1122/12776 [11:52<1:18:40,  2.47it/s]                                                        9%|▉         | 1122/12776 [11:52<1:18:40,  2.47it/s]  9%|▉         | 1123/12776 [11:52<1:14:11,  2.62it/s]                                                        9%|▉         | 1123/12776 [11:52<1:14:11,  2.62it/s]  9%|▉         | 1124/12776 [11:53<1:09:49,  2.78it/s]                                                        9%|▉         | 1124/12776 [11:53<1:09:49,  2.78it/s]  9%|▉         | 1125/12776 [11:53<1:06:28,  2.92it/s]                                                        9%|▉         | 1125/12776 [11:53<1:06:28,  2.92it/s]  9%|▉         | 1126/12776 [11:53<1:05:58,  2.94it/s]                                                        9%|▉         | 1126/12776 [11:53<1:05:58,  2.94it/s]  9%|▉         | 1127/12776 [11:54<1:03:01,  3.08it/s]                                                        9%|▉         | 1127/12776 [11:54<1:03:01,  3.08it/s]  9%|▉         | 1128/12776 [11:54<1:00:18,  3.22it/s]                                                        9%|▉         | 1128/12776 [11:54<1:00:18,  3.22it/s]  9%|▉         | 1129/12776 [11:54<58:00,  3.35it/s]                                                        9%|▉         | 1129/12776 [11:54<58:00,  3.35it/s]  9%|▉         | 1130/12776 [11:55<56:34,  3.43it/s]                                                      9%|▉         | 1130/12776 [11:55<56:34,  3.43it/s]  9%|▉         | 1131/12776 [11:55<54:41,  3.55it/s]                                                      9%|▉         | 1131/12776 [11:55<54:41,  3.55it/s]  9%|▉         | 1132/12776 [11:55<52:52,  3.67it/s]                                                      9%|▉         | 1132/12776 [11:55<52:52,  3.67it/s]  9%|▉         | 1133/12776 [11:55<51:19,  3.78it/s]                                                      9%|▉         | 1133/12776 [11:55<51:19,  3.78it/s]  9%|▉         | 1134/12776 [11:56<56:01,  3.46it/s]                                                      9%|▉         | 1134/12776 [11:56<56:01,  3.46it/s]  9%|▉         | 1135/12776 [11:56<52:48,  3.67it/s]                                                      9%|▉         | 1135/12776 [11:56<52:48,  3.67it/s]  9%|▉         | 1136/12776 [11:56<50:09,  3.87it/s]                                                      9%|▉         | 1136/12776 [11:56<50:09,  3.87it/s]  9%|▉         | 1137/12776 [11:56<47:58,  4.04it/s]                                                      9%|▉         | 1137/12776 [11:56<47:58,  4.04it/s]  9%|▉         | 1138/12776 [11:57<52:13,  3.71it/s]                                                      9%|▉         | 1138/12776 [11:57<52:13,  3.71it/s]  9%|▉         | 1139/12776 [11:57<48:50,  3.97it/s]                                                      9%|▉         | 1139/12776 [11:57<48:50,  3.97it/s]  9%|▉         | 1140/12776 [11:57<46:09,  4.20it/s]                                                      9%|▉         | 1140/12776 [11:57<46:09,  4.20it/s]  9%|▉         | 1141/12776 [11:57<44:22,  4.37it/s]                                                      9%|▉         | 1141/12776 [11:57<44:22,  4.37it/s]  9%|▉         | 1142/12776 [11:57<42:47,  4.53it/s]                                                      9%|▉         | 1142/12776 [11:57<42:47,  4.53it/s]  9%|▉         | 1143/12776 [11:58<47:45,  4.06it/s]                                                      9%|▉         | 1143/12776 [11:58<47:45,  4.06it/s]  9%|▉         | 1144/12776 [11:58<44:50,  4.32it/s]                                                      9%|▉         | 1144/12776 [11:58<44:50,  4.32it/s]  9%|▉         | 1145/12776 [11:58<42:48,  4.53it/s]                                                      9%|▉         | 1145/12776 [11:58<42:48,  4.53it/s]  9%|▉         | 1146/12776 [11:58<41:14,  4.70it/s]                                                      9%|▉         | 1146/12776 [11:58<41:14,  4.70it/s]  9%|▉         | 1147/12776 [11:59<39:59,  4.85it/s]                                                      9%|▉         | 1147/12776 [11:59<39:59,  4.85it/s]  9%|▉         | 1148/12776 [11:59<38:48,  4.99it/s]                                                      9%|▉         | 1148/12776 [11:59<38:48,  4.99it/s]  9%|▉         | 1149/12776 [11:59<45:02,  4.30it/s]                                                      9%|▉         | 1149/12776 [11:59<45:02,  4.30it/s]  9%|▉         | 1150/12776 [12:00<1:10:44,  2.74it/s]                                                        9%|▉         | 1150/12776 [12:00<1:10:44,  2.74it/s]  9%|▉         | 1151/12776 [12:01<2:05:34,  1.54it/s]                                                        9%|▉         | 1151/12776 [12:01<2:05:34,  1.54it/s]  9%|▉         | 1152/12776 [12:02<2:22:44,  1.36it/s]                                                        9%|▉         | 1152/12776 [12:02<2:22:44,  1.36it/s]  9%|▉         | 1153/12776 [12:03<2:31:40,  1.28it/s]                                                        9%|▉         | 1153/12776 [12:03<2:31:40,  1.28it/s]  9%|▉         | 1154/12776 [12:04<2:29:48,  1.29it/s]                                                        9%|▉         | 1154/12776 [12:04<2:29:48,  1.29it/s]  9%|▉         | 1155/12776 [12:04<2:26:43,  1.32it/s]                                                      {'loss': 1.5639, 'grad_norm': 1.9233096837997437, 'learning_rate': 0.00028602150537634404, 'epoch': 0.17}
+{'loss': 0.9704, 'grad_norm': 1.4510252475738525, 'learning_rate': 0.00028599706744868034, 'epoch': 0.17}
+{'loss': 0.979, 'grad_norm': 3.087716817855835, 'learning_rate': 0.0002859726295210166, 'epoch': 0.17}
+{'loss': 0.6662, 'grad_norm': 3.0797057151794434, 'learning_rate': 0.00028594819159335284, 'epoch': 0.17}
+{'loss': 1.4854, 'grad_norm': 2.5428311824798584, 'learning_rate': 0.00028592375366568915, 'epoch': 0.17}
+{'loss': 1.3866, 'grad_norm': 1.8000260591506958, 'learning_rate': 0.0002858993157380254, 'epoch': 0.17}
+{'loss': 1.1556, 'grad_norm': 3.2034716606140137, 'learning_rate': 0.00028587487781036165, 'epoch': 0.17}
+{'loss': 1.0316, 'grad_norm': 1.544206142425537, 'learning_rate': 0.00028585043988269796, 'epoch': 0.17}
+{'loss': 1.1934, 'grad_norm': 2.489361047744751, 'learning_rate': 0.00028582600195503415, 'epoch': 0.17}
+{'loss': 1.4824, 'grad_norm': 2.156745672225952, 'learning_rate': 0.00028580156402737046, 'epoch': 0.17}
+{'loss': 1.266, 'grad_norm': 2.317676305770874, 'learning_rate': 0.0002857771260997067, 'epoch': 0.17}
+{'loss': 1.0838, 'grad_norm': 1.6004157066345215, 'learning_rate': 0.00028575268817204296, 'epoch': 0.17}
+{'loss': 1.274, 'grad_norm': 1.9848181009292603, 'learning_rate': 0.00028572825024437927, 'epoch': 0.17}
+{'loss': 1.2818, 'grad_norm': 2.925771474838257, 'learning_rate': 0.0002857038123167155, 'epoch': 0.17}
+{'loss': 1.0246, 'grad_norm': 1.8362958431243896, 'learning_rate': 0.00028567937438905177, 'epoch': 0.17}
+{'loss': 2.1207, 'grad_norm': 4.179997444152832, 'learning_rate': 0.0002856549364613881, 'epoch': 0.17}
+{'loss': 2.0854, 'grad_norm': 4.3508429527282715, 'learning_rate': 0.0002856304985337243, 'epoch': 0.17}
+{'loss': 1.654, 'grad_norm': 6.442362308502197, 'learning_rate': 0.0002856060606060606, 'epoch': 0.17}
+{'loss': 1.2959, 'grad_norm': 1.9675512313842773, 'learning_rate': 0.00028558162267839683, 'epoch': 0.17}
+{'loss': 1.4101, 'grad_norm': 2.1139070987701416, 'learning_rate': 0.00028555718475073313, 'epoch': 0.17}
+{'loss': 0.6606, 'grad_norm': 1.998644232749939, 'learning_rate': 0.0002855327468230694, 'epoch': 0.17}
+{'loss': 0.8759, 'grad_norm': 3.3093795776367188, 'learning_rate': 0.00028550830889540564, 'epoch': 0.17}
+{'loss': 0.5971, 'grad_norm': 3.0865354537963867, 'learning_rate': 0.00028548387096774194, 'epoch': 0.17}
+{'loss': 0.8417, 'grad_norm': 1.6972962617874146, 'learning_rate': 0.0002854594330400782, 'epoch': 0.17}
+{'loss': 0.6272, 'grad_norm': 0.6339183449745178, 'learning_rate': 0.00028543499511241445, 'epoch': 0.17}
+{'loss': 0.6742, 'grad_norm': 0.7550747990608215, 'learning_rate': 0.0002854105571847507, 'epoch': 0.17}
+{'loss': 0.4962, 'grad_norm': 0.7940789461135864, 'learning_rate': 0.00028538611925708695, 'epoch': 0.17}
+{'loss': 0.591, 'grad_norm': 0.7967172265052795, 'learning_rate': 0.00028536168132942325, 'epoch': 0.17}
+{'loss': 0.5865, 'grad_norm': 0.9077056050300598, 'learning_rate': 0.0002853372434017595, 'epoch': 0.17}
+{'loss': 0.5431, 'grad_norm': 0.7266988158226013, 'learning_rate': 0.00028531280547409576, 'epoch': 0.17}
+{'loss': 0.68, 'grad_norm': 1.1024484634399414, 'learning_rate': 0.00028528836754643206, 'epoch': 0.17}
+{'loss': 0.6728, 'grad_norm': 1.058496117591858, 'learning_rate': 0.0002852639296187683, 'epoch': 0.17}
+{'loss': 0.6104, 'grad_norm': 1.3563542366027832, 'learning_rate': 0.00028523949169110456, 'epoch': 0.17}
+{'loss': 0.4989, 'grad_norm': 0.9580490589141846, 'learning_rate': 0.0002852150537634408, 'epoch': 0.17}
+{'loss': 0.7394, 'grad_norm': 1.4669703245162964, 'learning_rate': 0.0002851906158357771, 'epoch': 0.17}
+{'loss': 0.5999, 'grad_norm': 1.063644289970398, 'learning_rate': 0.00028516617790811337, 'epoch': 0.17}
+{'loss': 0.8043, 'grad_norm': 1.5729597806930542, 'learning_rate': 0.0002851417399804496, 'epoch': 0.17}
+{'loss': 0.5842, 'grad_norm': 0.8515031933784485, 'learning_rate': 0.00028511730205278593, 'epoch': 0.17}
+{'loss': 1.1475, 'grad_norm': 2.940446138381958, 'learning_rate': 0.0002850928641251222, 'epoch': 0.17}
+{'loss': 1.125, 'grad_norm': 1.5992013216018677, 'learning_rate': 0.00028506842619745843, 'epoch': 0.17}
+{'loss': 0.8755, 'grad_norm': 1.8026546239852905, 'learning_rate': 0.00028504398826979474, 'epoch': 0.17}
+{'loss': 0.5423, 'grad_norm': 0.9795113801956177, 'learning_rate': 0.00028501955034213093, 'epoch': 0.18}
+{'loss': 0.907, 'grad_norm': 2.1793277263641357, 'learning_rate': 0.00028499511241446724, 'epoch': 0.18}
+{'loss': 1.1782, 'grad_norm': 6.160722255706787, 'learning_rate': 0.0002849706744868035, 'epoch': 0.18}
+{'loss': 1.0413, 'grad_norm': 3.347403049468994, 'learning_rate': 0.00028494623655913974, 'epoch': 0.18}
+{'loss': 0.8199, 'grad_norm': 2.783139705657959, 'learning_rate': 0.00028492179863147605, 'epoch': 0.18}
+{'loss': 0.7946, 'grad_norm': 1.6603683233261108, 'learning_rate': 0.0002848973607038123, 'epoch': 0.18}
+{'loss': 1.1803, 'grad_norm': 2.4885950088500977, 'learning_rate': 0.00028487292277614855, 'epoch': 0.18}
+{'loss': 1.2259, 'grad_norm': 2.89567494392395, 'learning_rate': 0.0002848484848484848, 'epoch': 0.18}
+{'loss': 1.0174, 'grad_norm': 1.2964775562286377, 'learning_rate': 0.0002848240469208211, 'epoch': 0.18}
+{'loss': 0.9492, 'grad_norm': 1.6365723609924316, 'learning_rate': 0.00028479960899315736, 'epoch': 0.18}
+{'loss': 0.7886, 'grad_norm': 1.2834248542785645, 'learning_rate': 0.0002847751710654936, 'epoch': 0.18}
+{'loss': 1.1145, 'grad_norm': 1.8275222778320312, 'learning_rate': 0.0002847507331378299, 'epoch': 0.18}
+{'loss': 0.9815, 'grad_norm': 2.4772050380706787, 'learning_rate': 0.00028472629521016617, 'epoch': 0.18}
+{'loss': 1.1874, 'grad_norm': 5.258149147033691, 'learning_rate': 0.0002847018572825024, 'epoch': 0.18}
+{'loss': 1.308, 'grad_norm': 2.3318662643432617, 'learning_rate': 0.0002846774193548387, 'epoch': 0.18}
+{'loss': 1.0855, 'grad_norm': 3.756080389022827, 'learning_rate': 0.0002846529814271749, 'epoch': 0.18}
+{'loss': 1.395, 'grad_norm': 2.8273825645446777, 'learning_rate': 0.0002846285434995112, 'epoch': 0.18}
+{'loss': 1.2854, 'grad_norm': 1.8789290189743042, 'learning_rate': 0.0002846041055718475, 'epoch': 0.18}
+{'loss': 1.3411, 'grad_norm': 5.230047225952148, 'learning_rate': 0.0002845796676441837, 'epoch': 0.18}
+{'loss': 1.972, 'grad_norm': 3.2810747623443604, 'learning_rate': 0.00028455522971652003, 'epoch': 0.18}
+{'loss': 1.4195, 'grad_norm': 3.59177565574646, 'learning_rate': 0.0002845307917888563, 'epoch': 0.18}
+{'loss': 1.0904, 'grad_norm': 3.564577341079712, 'learning_rate': 0.00028450635386119253, 'epoch': 0.18}
+{'loss': 1.4299, 'grad_norm': 2.349719762802124, 'learning_rate': 0.00028448191593352884, 'epoch': 0.18}
+{'loss': 1.9473, 'grad_norm': 2.8419268131256104, 'learning_rate': 0.0002844574780058651, 'epoch': 0.18}
+{'loss': 1.7942, 'grad_norm': 5.5093159675598145, 'learning_rate': 0.00028443304007820134, 'epoch': 0.18}
+{'loss': 1.3642, 'grad_norm': 2.4165680408477783, 'learning_rate': 0.0002844086021505376, 'epoch': 0.18}
+{'loss': 1.649, 'grad_norm': 3.020798444747925, 'learning_rate': 0.0002843841642228739, 'epoch': 0.18}
+{'loss': 1.4339, 'grad_norm': 2.0584144592285156, 'learning_rate': 0.00028435972629521015, 'epoch': 0.18}
+{'loss': 1.143, 'grad_norm': 3.8811330795288086, 'learning_rate': 0.0002843352883675464, 'epoch': 0.18}
+{'loss': 1.3439, 'grad_norm': 3.469479560852051, 'learning_rate': 0.0002843108504398827, 'epoch': 0.18}
+{'loss': 1.253, 'grad_norm': 2.772313117980957, 'learning_rate': 0.0002842864125122189, 'epoch': 0.18}
+{'loss': 1.6485, 'grad_norm': 2.618361473083496, 'learning_rate': 0.0002842619745845552, 'epoch': 0.18}
+{'loss': 1.4382, 'grad_norm': 1.9248026609420776, 'learning_rate': 0.00028423753665689146, 'epoch': 0.18}
+{'loss': 0.6461, 'grad_norm': 1.16201913356781, 'learning_rate': 0.0002842130987292277, 'epoch': 0.18}
+{'loss': 0.6146, 'grad_norm': 0.9263600707054138, 'learning_rate': 0.000284188660801564, 'epoch': 0.18}
+{'loss': 0.5212, 'grad_norm': 0.6873730421066284, 'learning_rate': 0.00028416422287390027, 'epoch': 0.18}
+{'loss': 0.478, 'grad_norm': 0.7068181037902832, 'learning_rate': 0.0002841397849462365, 'epoch': 0.18}
+  9%|▉         | 1155/12776 [12:04<2:26:43,  1.32it/s]  9%|▉         | 1156/12776 [12:05<2:22:07,  1.36it/s]                                                        9%|▉         | 1156/12776 [12:05<2:22:07,  1.36it/s]  9%|▉         | 1157/12776 [12:06<2:14:49,  1.44it/s]                                                        9%|▉         | 1157/12776 [12:06<2:14:49,  1.44it/s]  9%|▉         | 1158/12776 [12:06<2:08:05,  1.51it/s]                                                        9%|▉         | 1158/12776 [12:06<2:08:05,  1.51it/s]  9%|▉         | 1159/12776 [12:07<2:11:54,  1.47it/s]                                                        9%|▉         | 1159/12776 [12:07<2:11:54,  1.47it/s]  9%|▉         | 1160/12776 [12:07<2:04:18,  1.56it/s]                                                        9%|▉         | 1160/12776 [12:07<2:04:18,  1.56it/s]  9%|▉         | 1161/12776 [12:08<2:01:49,  1.59it/s]                                                        9%|▉         | 1161/12776 [12:08<2:01:49,  1.59it/s]  9%|▉         | 1162/12776 [12:09<1:54:04,  1.70it/s]                                                        9%|▉         | 1162/12776 [12:09<1:54:04,  1.70it/s]  9%|▉         | 1163/12776 [12:09<1:46:04,  1.82it/s]                                                        9%|▉         | 1163/12776 [12:09<1:46:04,  1.82it/s]  9%|▉         | 1164/12776 [12:09<1:41:12,  1.91it/s]                                                        9%|▉         | 1164/12776 [12:09<1:41:12,  1.91it/s]  9%|▉         | 1165/12776 [12:10<1:35:46,  2.02it/s]                                                        9%|▉         | 1165/12776 [12:10<1:35:46,  2.02it/s]  9%|▉         | 1166/12776 [12:10<1:34:15,  2.05it/s]                                                        9%|▉         | 1166/12776 [12:10<1:34:15,  2.05it/s]  9%|▉         | 1167/12776 [12:11<1:29:16,  2.17it/s]                                                        9%|▉         | 1167/12776 [12:11<1:29:16,  2.17it/s]  9%|▉         | 1168/12776 [12:11<1:24:58,  2.28it/s]                                                        9%|▉         | 1168/12776 [12:11<1:24:58,  2.28it/s]  9%|▉         | 1169/12776 [12:12<1:31:13,  2.12it/s]                                                        9%|▉         | 1169/12776 [12:12<1:31:13,  2.12it/s]  9%|▉         | 1170/12776 [12:12<1:24:30,  2.29it/s]                                                        9%|▉         | 1170/12776 [12:12<1:24:30,  2.29it/s]  9%|▉         | 1171/12776 [12:12<1:19:17,  2.44it/s]                                                        9%|▉         | 1171/12776 [12:12<1:19:17,  2.44it/s]  9%|▉         | 1172/12776 [12:13<1:19:53,  2.42it/s]                                                        9%|▉         | 1172/12776 [12:13<1:19:53,  2.42it/s]  9%|▉         | 1173/12776 [12:13<1:14:50,  2.58it/s]                                                        9%|▉         | 1173/12776 [12:13<1:14:50,  2.58it/s]  9%|▉         | 1174/12776 [12:13<1:09:54,  2.77it/s]                                                        9%|▉         | 1174/12776 [12:13<1:09:54,  2.77it/s]  9%|▉         | 1175/12776 [12:14<1:09:59,  2.76it/s]                                                        9%|▉         | 1175/12776 [12:14<1:09:59,  2.76it/s]  9%|▉         | 1176/12776 [12:14<1:05:52,  2.93it/s]                                                        9%|▉         | 1176/12776 [12:14<1:05:52,  2.93it/s]  9%|▉         | 1177/12776 [12:14<1:02:23,  3.10it/s]                                                        9%|▉         | 1177/12776 [12:14<1:02:23,  3.10it/s]  9%|▉         | 1178/12776 [12:15<59:33,  3.25it/s]                                                        9%|▉         | 1178/12776 [12:15<59:33,  3.25it/s]  9%|▉         | 1179/12776 [12:15<59:22,  3.26it/s]                                                      9%|▉         | 1179/12776 [12:15<59:22,  3.26it/s]  9%|▉         | 1180/12776 [12:15<56:01,  3.45it/s]                                                      9%|▉         | 1180/12776 [12:15<56:01,  3.45it/s]  9%|▉         | 1181/12776 [12:15<53:43,  3.60it/s]                                                      9%|▉         | 1181/12776 [12:15<53:43,  3.60it/s]  9%|▉         | 1182/12776 [12:16<52:00,  3.72it/s]                                                      9%|▉         | 1182/12776 [12:16<52:00,  3.72it/s]  9%|▉         | 1183/12776 [12:16<50:32,  3.82it/s]                                                      9%|▉         | 1183/12776 [12:16<50:32,  3.82it/s]  9%|▉         | 1184/12776 [12:16<50:20,  3.84it/s]                                                      9%|▉         | 1184/12776 [12:16<50:20,  3.84it/s]  9%|▉         | 1185/12776 [12:16<48:37,  3.97it/s]                                                      9%|▉         | 1185/12776 [12:16<48:37,  3.97it/s]  9%|▉         | 1186/12776 [12:17<47:05,  4.10it/s]                                                      9%|▉         | 1186/12776 [12:17<47:05,  4.10it/s]  9%|▉         | 1187/12776 [12:17<45:29,  4.25it/s]                                                      9%|▉         | 1187/12776 [12:17<45:29,  4.25it/s]  9%|▉         | 1188/12776 [12:17<51:58,  3.72it/s]                                                      9%|▉         | 1188/12776 [12:17<51:58,  3.72it/s]  9%|▉         | 1189/12776 [12:17<48:27,  3.99it/s]                                                      9%|▉         | 1189/12776 [12:17<48:27,  3.99it/s]  9%|▉         | 1190/12776 [12:18<46:05,  4.19it/s]                                                      9%|▉         | 1190/12776 [12:18<46:05,  4.19it/s]  9%|▉         | 1191/12776 [12:18<44:07,  4.38it/s]                                                      9%|▉         | 1191/12776 [12:18<44:07,  4.38it/s]  9%|▉         | 1192/12776 [12:18<42:35,  4.53it/s]                                                      9%|▉         | 1192/12776 [12:18<42:35,  4.53it/s]  9%|▉         | 1193/12776 [12:18<47:06,  4.10it/s]                                                      9%|▉         | 1193/12776 [12:18<47:06,  4.10it/s]  9%|▉         | 1194/12776 [12:19<44:25,  4.34it/s]                                                      9%|▉         | 1194/12776 [12:19<44:25,  4.34it/s]  9%|▉         | 1195/12776 [12:19<42:28,  4.54it/s]                                                      9%|▉         | 1195/12776 [12:19<42:28,  4.54it/s]  9%|▉         | 1196/12776 [12:19<41:15,  4.68it/s]                                                      9%|▉         | 1196/12776 [12:19<41:15,  4.68it/s]  9%|▉         | 1197/12776 [12:19<39:50,  4.84it/s]                                                      9%|▉         | 1197/12776 [12:19<39:50,  4.84it/s]  9%|▉         | 1198/12776 [12:19<38:32,  5.01it/s]                                                      9%|▉         | 1198/12776 [12:19<38:32,  5.01it/s]  9%|▉         | 1199/12776 [12:20<43:35,  4.43it/s]                                                      9%|▉         | 1199/12776 [12:20<43:35,  4.43it/s]  9%|▉         | 1200/12776 [12:20<1:09:48,  2.76it/s]                                                        9%|▉         | 1200/12776 [12:20<1:09:48,  2.76it/s]Saving model checkpoint to ./checkpoint-1200
+Configuration saved in ./checkpoint-1200/config.json
+Model weights saved in ./checkpoint-1200/model.safetensors
+Feature extractor saved in ./checkpoint-1200/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-1200/tokenizer_config.json
+Special tokens file saved in ./checkpoint-1200/special_tokens_map.json
+added tokens file saved in ./checkpoint-1200/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+  9%|▉         | 1201/12776 [12:27<7:09:48,  2.23s/it]                                                        9%|▉         | 1201/12776 [12:27<7:09:48,  2.23s/it]  9%|▉         | 1202/12776 [12:28<5:57:13,  1.85s/it]                                                        9%|▉         | 1202/12776 [12:28<5:57:13,  1.85s/it]  9%|▉         | 1203/12776 [12:29<4:58:22,  1.55s/it]                                                        9%|▉         | 1203/12776 [12:29<4:58:22,  1.55s/it]  9%|▉         | 1204/12776 [12:29<4:12:09,  1.31s/it]                                                        9%|▉         | 1204/12776 [12:29<4:12:09,  1.31s/it]  9%|▉         | 1205/12776 [12:30<3:42:30,  1.15s/it]                                                        9%|▉         | 1205/12776 [12:30<3:42:30,  1.15s/it]  9%|▉         | 1206/12776 [12:31<3:21:26,  1.04s/it]                                                        9%|▉         | 1206/12776 [12:31<3:21:26,  1.04s/it]  9%|▉         | 1207/12776 [12:32<2:56:48,  1.09it/s]                                                        9%|▉         | 1207/12776 [12:32<2:56:48,  1.09it/s]  9%|▉         | 1208/12776 [12:32<2:38:03,  1.22it/s]                                                        9%|▉         | 1208/12776 [12:32<2:38:03,  1.22it/s]  9%|▉         | 1209/12776 [12:33<2:22:48,  1.35it/s]                                                        9%|▉         | 1209/12776 [12:33<2:22:48,  1.35it/s]  9%|▉         | 1210/12776 [12:33<2:13:27,  1.44it/s]                                                        9%|▉         | 1210/12776 [12:33<2:13:27,  1.44it/s]  9%|▉         | 1211/12776 [12:34<2:02:17,  1.58it/s]                                                        9%|▉         | 1211/12776 [12:34<2:02:17,  1.58it/s]  9%|▉         | 1212/12776 [12:34<1:53:16,  1.70it/s]                                                        9%|▉         | 1212/12776 [12:34<1:53:16,  1.70it/s]  9%|▉         | 1213/12776 [12:35<1:45:40,  1.82it/s]                                                        9%|▉         | 1213/12776 [12:35<1:45:40,  1.82it/s] 10%|▉         | 1214/12776 [12:35<1:38:25,  1.96it/s]                                                       10%|▉         | 1214/12776 [12:35<1:38:25,  1.96it/s] 10%|▉         | 1215/12776 [12:36<1:34:45,  2.03it/s]                                                       10%|▉         | 1215/12776 [12:36<1:34:45,  2.03it/s] 10%|▉         | 1216/12776 [12:36<1:29:16,  2.16it/s]                                                       10%|▉         | 1216/12776 [12:36<1:29:16,  2.16it/s] 10%|▉         | 1217/12776 [12:36<1:24:54,  2.27it/s]                                                       10%|▉         | 1217/12776 [12:36<1:24:54,  2.27it/s] 10%|▉         | 1218/12776 [12:37<1:21:38,  2.36it/s]                                                       10%|▉         | 1218/12776 [12:37<1:21:38,  2.36it/s] 10%|▉         | 1219/12776 [12:37<1:17:24,  2.49it/s]                                                       10%|▉         | 1219/12776 [12:37<1:17:24,  2.49it/s] 10%|▉         | 1220/12776 [12:38<1:13:28,  2.62it/s]                                                       10%|▉         | 1220/12776 [12:38<1:13:28,  2.62it/s] 10%|▉         | 1221/12776 [12:38<1:17:59,  2.47it/s]                                                       10%|▉         | 1221/12776 [12:38<1:17:59,  2.47it/s] 10%|▉         | 1222/12776 [12:38<1:12:44,  2.65it/s]                                                       10%|▉         | 1222/12776 [12:38<1:12:44,  2.65it/s] 10%|▉         | 1223/12776 [12:39<1:08:36,  2.81it/s]                                                       10%|▉         | 1223/12776 [12:39<1:08:36,  2.81it/s] 10%|▉         | 1224/12776 [12:39<1:04:40,  2.98it/s]                                                       10%|▉         | 1224/12776 [12:39<1:04:40,  2.98it/s] 10%|▉         | 1225/12776 [12:39<1:05:41,  2.93it/s]                                                       10%|▉         | 1225/12776 [12:39<1:05:41,  2.93it/s] 10%|▉         | 1226/12776 [12:40<1:01:17,  3.14it/s]                                                       10%|▉         | 1226/12776 [12:40<1:01:17,  3.14it/s] 10%|▉         | 1227/12776 [12:40<57:49,  3.33it/s]                                                       10%|▉         | 1227/12776 [12:40<57:49,  3.33it/s] 10%|▉         | 1228/12776 [12:40<55:01,  3.50it/s]                                                     10%|▉         | 1228/12776 [12:40<55:01,  3.50it/s] 10%|▉         | 1229/12776 [12:40<59:22,  3.24it/s]                                                     10%|▉         | 1229/12776 [12:40<59:22,  3.24it/s] 10%|▉         | 1230/12776 [12:41<55:55,  3.44it/s]                                                     10%|▉         | 1230/12776 [12:41<55:55,  3.44it/s] 10%|▉         | 1231/12776 [12:41<52:33,  3.66it/s]                                                     10%|▉         | 1231/12776 [12:41<52:33,  3.66it/s] 10%|▉         | 1232/12776 [12:41<49:46,  3.87it/s]                                                    {'loss': 0.7398, 'grad_norm': 0.824680745601654, 'learning_rate': 0.0002841153470185728, 'epoch': 0.18}
+{'loss': 0.4756, 'grad_norm': 0.7223101854324341, 'learning_rate': 0.0002840909090909091, 'epoch': 0.18}
+{'loss': 0.518, 'grad_norm': 1.0076377391815186, 'learning_rate': 0.00028406647116324533, 'epoch': 0.18}
+{'loss': 0.604, 'grad_norm': 1.0383641719818115, 'learning_rate': 0.0002840420332355816, 'epoch': 0.18}
+{'loss': 0.6473, 'grad_norm': 1.6516761779785156, 'learning_rate': 0.0002840175953079179, 'epoch': 0.18}
+{'loss': 0.8136, 'grad_norm': 1.3219435214996338, 'learning_rate': 0.00028399315738025414, 'epoch': 0.18}
+{'loss': 0.5458, 'grad_norm': 1.0588476657867432, 'learning_rate': 0.0002839687194525904, 'epoch': 0.18}
+{'loss': 0.6775, 'grad_norm': 1.2911880016326904, 'learning_rate': 0.0002839442815249267, 'epoch': 0.18}
+{'loss': 0.7164, 'grad_norm': 1.0801639556884766, 'learning_rate': 0.00028391984359726294, 'epoch': 0.18}
+{'loss': 0.7186, 'grad_norm': 1.2428066730499268, 'learning_rate': 0.0002838954056695992, 'epoch': 0.18}
+{'loss': 1.0134, 'grad_norm': 2.018033504486084, 'learning_rate': 0.00028387096774193545, 'epoch': 0.18}
+{'loss': 0.6296, 'grad_norm': 1.2658600807189941, 'learning_rate': 0.0002838465298142717, 'epoch': 0.18}
+{'loss': 0.8566, 'grad_norm': 1.356482744216919, 'learning_rate': 0.000283822091886608, 'epoch': 0.18}
+{'loss': 0.9361, 'grad_norm': 1.4970476627349854, 'learning_rate': 0.00028379765395894425, 'epoch': 0.18}
+{'loss': 1.3125, 'grad_norm': 2.531439781188965, 'learning_rate': 0.0002837732160312805, 'epoch': 0.18}
+{'loss': 1.1296, 'grad_norm': 1.9374449253082275, 'learning_rate': 0.0002837487781036168, 'epoch': 0.18}
+{'loss': 0.9607, 'grad_norm': 1.2566038370132446, 'learning_rate': 0.00028372434017595306, 'epoch': 0.18}
+{'loss': 1.2914, 'grad_norm': 1.5955857038497925, 'learning_rate': 0.0002836999022482893, 'epoch': 0.18}
+{'loss': 0.8891, 'grad_norm': 1.207210898399353, 'learning_rate': 0.00028367546432062557, 'epoch': 0.18}
+{'loss': 0.8335, 'grad_norm': 1.754010796546936, 'learning_rate': 0.00028365102639296187, 'epoch': 0.18}
+{'loss': 1.0153, 'grad_norm': 1.5274769067764282, 'learning_rate': 0.0002836265884652981, 'epoch': 0.18}
+{'loss': 0.7795, 'grad_norm': 1.2438735961914062, 'learning_rate': 0.0002836021505376344, 'epoch': 0.18}
+{'loss': 1.1812, 'grad_norm': 1.854723572731018, 'learning_rate': 0.0002835777126099707, 'epoch': 0.18}
+{'loss': 1.0775, 'grad_norm': 1.8179014921188354, 'learning_rate': 0.00028355327468230693, 'epoch': 0.18}
+{'loss': 0.7761, 'grad_norm': 1.427675724029541, 'learning_rate': 0.0002835288367546432, 'epoch': 0.18}
+{'loss': 1.3916, 'grad_norm': 2.1459126472473145, 'learning_rate': 0.0002835043988269795, 'epoch': 0.18}
+{'loss': 1.0506, 'grad_norm': 1.9748454093933105, 'learning_rate': 0.0002834799608993157, 'epoch': 0.18}
+{'loss': 1.7368, 'grad_norm': 2.326005220413208, 'learning_rate': 0.000283455522971652, 'epoch': 0.19}
+{'loss': 1.5369, 'grad_norm': 3.0131938457489014, 'learning_rate': 0.00028343108504398824, 'epoch': 0.19}
+{'loss': 1.1345, 'grad_norm': 2.223701000213623, 'learning_rate': 0.0002834066471163245, 'epoch': 0.19}
+{'loss': 1.7771, 'grad_norm': 3.37308669090271, 'learning_rate': 0.0002833822091886608, 'epoch': 0.19}
+{'loss': 1.2859, 'grad_norm': 2.0770747661590576, 'learning_rate': 0.00028335777126099705, 'epoch': 0.19}
+{'loss': 1.3574, 'grad_norm': 2.7626192569732666, 'learning_rate': 0.0002833333333333333, 'epoch': 0.19}
+{'loss': 1.9453, 'grad_norm': 2.9744863510131836, 'learning_rate': 0.0002833088954056696, 'epoch': 0.19}
+{'loss': 1.4961, 'grad_norm': 2.143836736679077, 'learning_rate': 0.00028328445747800586, 'epoch': 0.19}
+{'loss': 1.7137, 'grad_norm': 1.8865716457366943, 'learning_rate': 0.0002832600195503421, 'epoch': 0.19}
+{'loss': 1.4174, 'grad_norm': 2.325896978378296, 'learning_rate': 0.00028323558162267836, 'epoch': 0.19}
+{'loss': 1.7222, 'grad_norm': 2.0641846656799316, 'learning_rate': 0.00028321114369501466, 'epoch': 0.19}
+{'loss': 1.4698, 'grad_norm': 1.5752524137496948, 'learning_rate': 0.0002831867057673509, 'epoch': 0.19}
+{'loss': 1.1898, 'grad_norm': 1.7678941488265991, 'learning_rate': 0.00028316226783968717, 'epoch': 0.19}
+{'loss': 1.2828, 'grad_norm': 2.59869384765625, 'learning_rate': 0.00028313782991202347, 'epoch': 0.19}
+{'loss': 0.9471, 'grad_norm': 1.4962000846862793, 'learning_rate': 0.00028311339198435967, 'epoch': 0.19}
+{'loss': 1.1181, 'grad_norm': 2.0714030265808105, 'learning_rate': 0.000283088954056696, 'epoch': 0.19}
+{'loss': 1.0763, 'grad_norm': 1.3907320499420166, 'learning_rate': 0.0002830645161290322, 'epoch': 0.19}
+{'loss': 1.3243, 'grad_norm': 2.7507219314575195, 'learning_rate': 0.0002830400782013685, 'epoch': 0.19}
+{'loss': 1.1937, 'grad_norm': 2.2302870750427246, 'learning_rate': 0.0002830156402737048, 'epoch': 0.19}
+{'loss': 0.6451, 'grad_norm': 0.9416796565055847, 'learning_rate': 0.00028299120234604103, 'epoch': 0.19}
+{'loss': 0.483, 'grad_norm': 0.6642339825630188, 'learning_rate': 0.0002829667644183773, 'epoch': 0.19}
+{'loss': 0.9268, 'grad_norm': 0.9438838958740234, 'learning_rate': 0.0002829423264907136, 'epoch': 0.19}
+{'loss': 0.6353, 'grad_norm': 0.6609278917312622, 'learning_rate': 0.00028291788856304984, 'epoch': 0.19}
+{'loss': 0.6421, 'grad_norm': 0.7672062516212463, 'learning_rate': 0.0002828934506353861, 'epoch': 0.19}
+{'loss': 0.579, 'grad_norm': 0.7625839114189148, 'learning_rate': 0.00028286901270772234, 'epoch': 0.19}
+{'loss': 0.7153, 'grad_norm': 1.4474471807479858, 'learning_rate': 0.00028284457478005865, 'epoch': 0.19}
+{'loss': 0.5436, 'grad_norm': 1.2392566204071045, 'learning_rate': 0.0002828201368523949, 'epoch': 0.19}
+{'loss': 0.5473, 'grad_norm': 1.2932655811309814, 'learning_rate': 0.00028279569892473115, 'epoch': 0.19}
+{'loss': 0.5555, 'grad_norm': 1.3752779960632324, 'learning_rate': 0.00028277126099706746, 'epoch': 0.19}
+{'loss': 0.4844, 'grad_norm': 0.6900332570075989, 'learning_rate': 0.0002827468230694037, 'epoch': 0.19}
+{'loss': 1.2104, 'grad_norm': 1.8527002334594727, 'learning_rate': 0.00028272238514173996, 'epoch': 0.19}
+{'loss': 0.7639, 'grad_norm': 1.6955235004425049, 'learning_rate': 0.0002826979472140762, 'epoch': 0.19}
+{'loss': 0.6054, 'grad_norm': 1.2351977825164795, 'learning_rate': 0.00028267350928641246, 'epoch': 0.19}
+{'loss': 0.6811, 'grad_norm': 1.1425282955169678, 'learning_rate': 0.00028264907135874877, 'epoch': 0.19}
+{'loss': 0.7065, 'grad_norm': 1.043776273727417, 'learning_rate': 0.000282624633431085, 'epoch': 0.19}
+{'loss': 0.8076, 'grad_norm': 1.4897061586380005, 'learning_rate': 0.00028260019550342127, 'epoch': 0.19}
+{'loss': 1.0173, 'grad_norm': 1.984785556793213, 'learning_rate': 0.0002825757575757576, 'epoch': 0.19}
+{'loss': 1.0036, 'grad_norm': 1.847044587135315, 'learning_rate': 0.00028255131964809383, 'epoch': 0.19}
+{'loss': 0.7437, 'grad_norm': 1.3059883117675781, 'learning_rate': 0.0002825268817204301, 'epoch': 0.19}
+{'loss': 0.785, 'grad_norm': 1.8383262157440186, 'learning_rate': 0.00028250244379276633, 'epoch': 0.19}
+{'loss': 1.0545, 'grad_norm': 1.516766905784607, 'learning_rate': 0.00028247800586510264, 'epoch': 0.19}
+{'loss': 0.7535, 'grad_norm': 1.1955336332321167, 'learning_rate': 0.0002824535679374389, 'epoch': 0.19}
+{'loss': 0.7899, 'grad_norm': 1.8770792484283447, 'learning_rate': 0.00028242913000977514, 'epoch': 0.19}
+{'loss': 0.903, 'grad_norm': 2.0006120204925537, 'learning_rate': 0.00028240469208211144, 'epoch': 0.19}
+{'loss': 1.5116, 'grad_norm': 1.9902926683425903, 'learning_rate': 0.0002823802541544477, 'epoch': 0.19}
+{'loss': 0.917, 'grad_norm': 1.5198619365692139, 'learning_rate': 0.00028235581622678395, 'epoch': 0.19}
+{'loss': 1.1258, 'grad_norm': 1.8800374269485474, 'learning_rate': 0.00028233137829912025, 'epoch': 0.19}
+{'loss': 1.0894, 'grad_norm': 1.7788026332855225, 'learning_rate': 0.00028230694037145645, 'epoch': 0.19}
+{'loss': 0.812, 'grad_norm': 1.4421257972717285, 'learning_rate': 0.00028228250244379275, 'epoch': 0.19}
+{'loss': 1.2619, 'grad_norm': 2.770679235458374, 'learning_rate': 0.000282258064516129, 'epoch': 0.19}
+{'loss': 0.9904, 'grad_norm': 1.2351292371749878, 'learning_rate': 0.00028223362658846526, 'epoch': 0.19} 10%|▉         | 1232/12776 [12:41<49:46,  3.87it/s] 10%|▉         | 1233/12776 [12:41<53:02,  3.63it/s]                                                     10%|▉         | 1233/12776 [12:41<53:02,  3.63it/s] 10%|▉         | 1234/12776 [12:42<49:36,  3.88it/s]                                                     10%|▉         | 1234/12776 [12:42<49:36,  3.88it/s] 10%|▉         | 1235/12776 [12:42<46:49,  4.11it/s]                                                     10%|▉         | 1235/12776 [12:42<46:49,  4.11it/s] 10%|▉         | 1236/12776 [12:42<44:40,  4.31it/s]                                                     10%|▉         | 1236/12776 [12:42<44:40,  4.31it/s] 10%|▉         | 1237/12776 [12:42<42:54,  4.48it/s]                                                     10%|▉         | 1237/12776 [12:42<42:54,  4.48it/s] 10%|▉         | 1238/12776 [12:43<46:19,  4.15it/s]                                                     10%|▉         | 1238/12776 [12:43<46:19,  4.15it/s] 10%|▉         | 1239/12776 [12:43<43:30,  4.42it/s]                                                     10%|▉         | 1239/12776 [12:43<43:30,  4.42it/s] 10%|▉         | 1240/12776 [12:43<41:17,  4.66it/s]                                                     10%|▉         | 1240/12776 [12:43<41:17,  4.66it/s] 10%|▉         | 1241/12776 [12:43<39:26,  4.87it/s]                                                     10%|▉         | 1241/12776 [12:43<39:26,  4.87it/s] 10%|▉         | 1242/12776 [12:43<38:11,  5.03it/s]                                                     10%|▉         | 1242/12776 [12:43<38:11,  5.03it/s] 10%|▉         | 1243/12776 [12:43<36:56,  5.20it/s]                                                     10%|▉         | 1243/12776 [12:43<36:56,  5.20it/s] 10%|▉         | 1244/12776 [12:44<42:13,  4.55it/s]                                                     10%|▉         | 1244/12776 [12:44<42:13,  4.55it/s] 10%|▉         | 1245/12776 [12:44<39:32,  4.86it/s]                                                     10%|▉         | 1245/12776 [12:44<39:32,  4.86it/s] 10%|▉         | 1246/12776 [12:44<37:28,  5.13it/s]                                                     10%|▉         | 1246/12776 [12:44<37:28,  5.13it/s] 10%|▉         | 1247/12776 [12:44<35:41,  5.38it/s]                                                     10%|▉         | 1247/12776 [12:44<35:41,  5.38it/s] 10%|▉         | 1248/12776 [12:44<34:24,  5.58it/s]                                                     10%|▉         | 1248/12776 [12:44<34:24,  5.58it/s] 10%|▉         | 1249/12776 [12:45<33:29,  5.74it/s]                                                     10%|▉         | 1249/12776 [12:45<33:29,  5.74it/s] 10%|▉         | 1250/12776 [12:45<59:32,  3.23it/s]                                                     10%|▉         | 1250/12776 [12:45<59:32,  3.23it/s] 10%|▉         | 1251/12776 [12:46<1:50:21,  1.74it/s]                                                       10%|▉         | 1251/12776 [12:46<1:50:21,  1.74it/s] 10%|▉         | 1252/12776 [12:47<2:15:10,  1.42it/s]                                                       10%|▉         | 1252/12776 [12:47<2:15:10,  1.42it/s] 10%|▉         | 1253/12776 [12:48<2:22:54,  1.34it/s]                                                       10%|▉         | 1253/12776 [12:48<2:22:54,  1.34it/s] 10%|▉         | 1254/12776 [12:49<2:24:22,  1.33it/s]                                                       10%|▉         | 1254/12776 [12:49<2:24:22,  1.33it/s] 10%|▉         | 1255/12776 [12:50<2:25:28,  1.32it/s]                                                       10%|▉         | 1255/12776 [12:50<2:25:28,  1.32it/s] 10%|▉         | 1256/12776 [12:51<2:24:42,  1.33it/s]                                                       10%|▉         | 1256/12776 [12:51<2:24:42,  1.33it/s] 10%|▉         | 1257/12776 [12:51<2:19:04,  1.38it/s]                                                       10%|▉         | 1257/12776 [12:51<2:19:04,  1.38it/s] 10%|▉         | 1258/12776 [12:52<2:19:34,  1.38it/s]                                                       10%|▉         | 1258/12776 [12:52<2:19:34,  1.38it/s] 10%|▉         | 1259/12776 [12:53<2:11:39,  1.46it/s]                                                       10%|▉         | 1259/12776 [12:53<2:11:39,  1.46it/s] 10%|▉         | 1260/12776 [12:53<2:06:06,  1.52it/s]                                                       10%|▉         | 1260/12776 [12:53<2:06:06,  1.52it/s] 10%|▉         | 1261/12776 [12:54<1:58:26,  1.62it/s]                                                       10%|▉         | 1261/12776 [12:54<1:58:26,  1.62it/s] 10%|▉         | 1262/12776 [12:54<1:55:13,  1.67it/s]                                                       10%|▉         | 1262/12776 [12:54<1:55:13,  1.67it/s] 10%|▉         | 1263/12776 [12:55<1:48:36,  1.77it/s]                                                       10%|▉         | 1263/12776 [12:55<1:48:36,  1.77it/s] 10%|▉         | 1264/12776 [12:55<1:48:42,  1.76it/s]                                                       10%|▉         | 1264/12776 [12:55<1:48:42,  1.76it/s] 10%|▉         | 1265/12776 [12:56<1:40:46,  1.90it/s]                                                       10%|▉         | 1265/12776 [12:56<1:40:46,  1.90it/s] 10%|▉         | 1266/12776 [12:56<1:40:28,  1.91it/s]                                                       10%|▉         | 1266/12776 [12:56<1:40:28,  1.91it/s] 10%|▉         | 1267/12776 [12:57<1:33:04,  2.06it/s]                                                       10%|▉         | 1267/12776 [12:57<1:33:04,  2.06it/s] 10%|▉         | 1268/12776 [12:57<1:26:39,  2.21it/s]                                                       10%|▉         | 1268/12776 [12:57<1:26:39,  2.21it/s] 10%|▉         | 1269/12776 [12:57<1:23:45,  2.29it/s]                                                       10%|▉         | 1269/12776 [12:57<1:23:45,  2.29it/s] 10%|▉         | 1270/12776 [12:58<1:18:02,  2.46it/s]                                                       10%|▉         | 1270/12776 [12:58<1:18:02,  2.46it/s] 10%|▉         | 1271/12776 [12:58<1:13:18,  2.62it/s]                                                       10%|▉         | 1271/12776 [12:58<1:13:18,  2.62it/s] 10%|▉         | 1272/12776 [12:58<1:10:00,  2.74it/s]                                                       10%|▉         | 1272/12776 [12:58<1:10:00,  2.74it/s] 10%|▉         | 1273/12776 [12:59<1:12:28,  2.65it/s]                                                       10%|▉         | 1273/12776 [12:59<1:12:28,  2.65it/s] 10%|▉         | 1274/12776 [12:59<1:09:53,  2.74it/s]                                                       10%|▉         | 1274/12776 [12:59<1:09:53,  2.74it/s] 10%|▉         | 1275/12776 [12:59<1:07:18,  2.85it/s]                                                       10%|▉         | 1275/12776 [12:59<1:07:18,  2.85it/s] 10%|▉         | 1276/12776 [13:00<1:11:13,  2.69it/s]                                                       10%|▉         | 1276/12776 [13:00<1:11:13,  2.69it/s] 10%|▉         | 1277/12776 [13:00<1:06:57,  2.86it/s]                                                       10%|▉         | 1277/12776 [13:00<1:06:57,  2.86it/s] 10%|█         | 1278/12776 [13:00<1:03:41,  3.01it/s]                                                       10%|█         | 1278/12776 [13:00<1:03:41,  3.01it/s] 10%|█         | 1279/12776 [13:01<1:06:34,  2.88it/s]                                                       10%|█         | 1279/12776 [13:01<1:06:34,  2.88it/s] 10%|█         | 1280/12776 [13:01<1:02:11,  3.08it/s]                                                       10%|█         | 1280/12776 [13:01<1:02:11,  3.08it/s] 10%|█         | 1281/12776 [13:01<58:37,  3.27it/s]                                                       10%|█         | 1281/12776 [13:01<58:37,  3.27it/s] 10%|█         | 1282/12776 [13:02<55:39,  3.44it/s]                                                     10%|█         | 1282/12776 [13:02<55:39,  3.44it/s] 10%|█         | 1283/12776 [13:02<59:59,  3.19it/s]                                                     10%|█         | 1283/12776 [13:02<59:59,  3.19it/s] 10%|█         | 1284/12776 [13:02<56:27,  3.39it/s]                                                     10%|█         | 1284/12776 [13:02<56:27,  3.39it/s] 10%|█         | 1285/12776 [13:02<53:28,  3.58it/s]                                                     10%|█         | 1285/12776 [13:02<53:28,  3.58it/s] 10%|█         | 1286/12776 [13:03<51:01,  3.75it/s]                                                     10%|█         | 1286/12776 [13:03<51:01,  3.75it/s] 10%|█         | 1287/12776 [13:03<48:56,  3.91it/s]                                                     10%|█         | 1287/12776 [13:03<48:56,  3.91it/s] 10%|█         | 1288/12776 [13:03<50:35,  3.78it/s]                                                     10%|█         | 1288/12776 [13:03<50:35,  3.78it/s] 10%|█         | 1289/12776 [13:03<47:36,  4.02it/s]                                                     10%|█         | 1289/12776 [13:03<47:36,  4.02it/s] 10%|█         | 1290/12776 [13:04<46:34,  4.11it/s]                                                     10%|█         | 1290/12776 [13:04<46:34,  4.11it/s] 10%|█         | 1291/12776 [13:04<44:43,  4.28it/s]                                                     10%|█         | 1291/12776 [13:04<44:43,  4.28it/s] 10%|█         | 1292/12776 [13:04<42:59,  4.45it/s]                                                     10%|█         | 1292/12776 [13:04<42:59,  4.45it/s] 10%|█         | 1293/12776 [13:04<44:27,  4.30it/s]                                                     10%|█         | 1293/12776 [13:04<44:27,  4.30it/s] 10%|█         | 1294/12776 [13:05<42:30,  4.50it/s]                                                     10%|█         | 1294/12776 [13:05<42:30,  4.50it/s] 10%|█         | 1295/12776 [13:05<41:01,  4.66it/s]                                                     10%|█         | 1295/12776 [13:05<41:01,  4.66it/s] 10%|█         | 1296/12776 [13:05<39:52,  4.80it/s]                                                     10%|█         | 1296/12776 [13:05<39:52,  4.80it/s] 10%|█         | 1297/12776 [13:05<38:58,  4.91it/s]                                                     10%|█         | 1297/12776 [13:05<38:58,  4.91it/s] 10%|█         | 1298/12776 [13:05<40:17,  4.75it/s]                                                     10%|█         | 1298/12776 [13:05<40:17,  4.75it/s] 10%|█         | 1299/12776 [13:06<38:56,  4.91it/s]                                                     10%|█         | 1299/12776 [13:06<38:56,  4.91it/s] 10%|█         | 1300/12776 [13:06<1:05:45,  2.91it/s]                                                       10%|█         | 1300/12776 [13:06<1:05:45,  2.91it/s] 10%|█         | 1301/12776 [13:08<2:07:00,  1.51it/s]                                                       10%|█         | 1301/12776 [13:08<2:07:00,  1.51it/s] 10%|█         | 1302/12776 [13:09<2:22:12,  1.34it/s]                                                       10%|█         | 1302/12776 [13:09<2:22:12,  1.34it/s] 10%|█         | 1303/12776 [13:09<2:28:04,  1.29it/s]                                                       10%|█         | 1303/12776 [13:09<2:28:04,  1.29it/s] 10%|█         | 1304/12776 [13:10<2:34:20,  1.24it/s]                                                       10%|█         | 1304/12776 [13:10<2:34:20,  1.24it/s] 10%|█         | 1305/12776 [13:11<2:36:57,  1.22it/s]                                                       10%|█         | 1305/12776 [13:11<2:36:57,  1.22it/s] 10%|█         | 1306/12776 [13:12<2:28:49,  1.28it/s]                                                       10%|█         | 1306/12776 [13:12<2:28:49,  1.28it/s] 10%|█         | 1307/12776 [13:13<2:26:38,  1.30it/s]                                                       10%|█         | 1307/12776 [13:13<2:26:38,  1.30it/s] 10%|█         | 1308/12776 [13:13<2:18:55,  1.38it/s]                                                       10%|█         | 1308/12776 [13:13<2:18:55,  1.38it/s] 10%|█         | 1309/12776 [13:14<2:10:05,  1.47it/s]                                                       10%|█         | 1309/12776 [13:14<2:10:05,  1.47it/s] 10%|█         | 1310/12776 [13:14<2:02:55,  1.55it/s]                                                       10%|█         | 1310/12776 [13:14<2:02:55,  1.55it/s] 10%|█         | 1311/12776 [13:15<1:59:43,  1.60it/s]                                                      
+{'loss': 1.2112, 'grad_norm': 1.4025189876556396, 'learning_rate': 0.00028220918866080156, 'epoch': 0.19}
+{'loss': 1.3794, 'grad_norm': 1.9613655805587769, 'learning_rate': 0.0002821847507331378, 'epoch': 0.19}
+{'loss': 1.0788, 'grad_norm': 1.3618839979171753, 'learning_rate': 0.00028216031280547406, 'epoch': 0.19}
+{'loss': 1.62, 'grad_norm': 2.786076068878174, 'learning_rate': 0.0002821358748778103, 'epoch': 0.19}
+{'loss': 1.5648, 'grad_norm': 2.0850629806518555, 'learning_rate': 0.0002821114369501466, 'epoch': 0.19}
+{'loss': 1.2221, 'grad_norm': 2.4790842533111572, 'learning_rate': 0.00028208699902248287, 'epoch': 0.19}
+{'loss': 1.0456, 'grad_norm': 2.154008388519287, 'learning_rate': 0.0002820625610948191, 'epoch': 0.19}
+{'loss': 1.4998, 'grad_norm': 4.071086883544922, 'learning_rate': 0.00028203812316715543, 'epoch': 0.19}
+{'loss': 2.0477, 'grad_norm': 2.761547803878784, 'learning_rate': 0.0002820136852394917, 'epoch': 0.19}
+{'loss': 1.1553, 'grad_norm': 3.3864057064056396, 'learning_rate': 0.00028198924731182793, 'epoch': 0.19}
+{'loss': 1.2943, 'grad_norm': 2.1232552528381348, 'learning_rate': 0.00028196480938416424, 'epoch': 0.19}
+{'loss': 1.0515, 'grad_norm': 4.488090991973877, 'learning_rate': 0.00028194037145650043, 'epoch': 0.19}
+{'loss': 0.3637, 'grad_norm': 2.625746726989746, 'learning_rate': 0.00028191593352883674, 'epoch': 0.19}
+{'loss': 0.7695, 'grad_norm': 3.204906940460205, 'learning_rate': 0.000281891495601173, 'epoch': 0.2}
+{'loss': 0.871, 'grad_norm': 2.906982660293579, 'learning_rate': 0.00028186705767350924, 'epoch': 0.2}
+{'loss': 1.6993, 'grad_norm': 3.8746962547302246, 'learning_rate': 0.00028184261974584555, 'epoch': 0.2}
+{'loss': 1.3819, 'grad_norm': 5.038626194000244, 'learning_rate': 0.0002818181818181818, 'epoch': 0.2}
+{'loss': 0.9957, 'grad_norm': 2.480462074279785, 'learning_rate': 0.00028179374389051805, 'epoch': 0.2}
+{'loss': 0.5353, 'grad_norm': 0.802400529384613, 'learning_rate': 0.00028176930596285436, 'epoch': 0.2}
+{'loss': 0.7606, 'grad_norm': 0.9732524752616882, 'learning_rate': 0.0002817448680351906, 'epoch': 0.2}
+{'loss': 0.7711, 'grad_norm': 1.2732630968093872, 'learning_rate': 0.00028172043010752686, 'epoch': 0.2}
+{'loss': 0.5173, 'grad_norm': 0.6224183440208435, 'learning_rate': 0.0002816959921798631, 'epoch': 0.2}
+{'loss': 0.5248, 'grad_norm': 0.9690751433372498, 'learning_rate': 0.0002816715542521994, 'epoch': 0.2}
+{'loss': 0.4887, 'grad_norm': 1.045830249786377, 'learning_rate': 0.00028164711632453567, 'epoch': 0.2}
+{'loss': 0.5496, 'grad_norm': 0.9513697624206543, 'learning_rate': 0.0002816226783968719, 'epoch': 0.2}
+{'loss': 0.4298, 'grad_norm': 0.6548385620117188, 'learning_rate': 0.0002815982404692082, 'epoch': 0.2}
+{'loss': 0.603, 'grad_norm': 0.9877871870994568, 'learning_rate': 0.0002815738025415445, 'epoch': 0.2}
+{'loss': 0.5035, 'grad_norm': 0.9719914197921753, 'learning_rate': 0.0002815493646138807, 'epoch': 0.2}
+{'loss': 0.3911, 'grad_norm': 0.5327731966972351, 'learning_rate': 0.000281524926686217, 'epoch': 0.2}
+{'loss': 0.7128, 'grad_norm': 0.874661386013031, 'learning_rate': 0.00028150048875855323, 'epoch': 0.2}
+{'loss': 0.6342, 'grad_norm': 1.2676417827606201, 'learning_rate': 0.00028147605083088953, 'epoch': 0.2}
+{'loss': 0.7849, 'grad_norm': 1.1647354364395142, 'learning_rate': 0.0002814516129032258, 'epoch': 0.2}
+{'loss': 0.4386, 'grad_norm': 2.0960235595703125, 'learning_rate': 0.00028142717497556204, 'epoch': 0.2}
+{'loss': 0.5409, 'grad_norm': 0.8338558673858643, 'learning_rate': 0.00028140273704789834, 'epoch': 0.2}
+{'loss': 0.7232, 'grad_norm': 1.7632123231887817, 'learning_rate': 0.0002813782991202346, 'epoch': 0.2}
+{'loss': 0.6558, 'grad_norm': 2.810450792312622, 'learning_rate': 0.00028135386119257084, 'epoch': 0.2}
+{'loss': 0.84, 'grad_norm': 1.2503775358200073, 'learning_rate': 0.0002813294232649071, 'epoch': 0.2}
+{'loss': 0.8859, 'grad_norm': 1.8836698532104492, 'learning_rate': 0.0002813049853372434, 'epoch': 0.2}
+{'loss': 0.7314, 'grad_norm': 1.3264356851577759, 'learning_rate': 0.00028128054740957965, 'epoch': 0.2}
+{'loss': 1.4003, 'grad_norm': 4.775301933288574, 'learning_rate': 0.0002812561094819159, 'epoch': 0.2}
+{'loss': 0.7736, 'grad_norm': 0.8449872136116028, 'learning_rate': 0.00028123167155425215, 'epoch': 0.2}
+{'loss': 1.187, 'grad_norm': 1.8365225791931152, 'learning_rate': 0.00028120723362658846, 'epoch': 0.2}
+{'loss': 0.9421, 'grad_norm': 1.0617682933807373, 'learning_rate': 0.0002811827956989247, 'epoch': 0.2}
+{'loss': 1.0415, 'grad_norm': 1.617153525352478, 'learning_rate': 0.00028115835777126096, 'epoch': 0.2}
+{'loss': 0.984, 'grad_norm': 1.6880416870117188, 'learning_rate': 0.0002811339198435972, 'epoch': 0.2}
+{'loss': 1.0823, 'grad_norm': 1.4345347881317139, 'learning_rate': 0.0002811094819159335, 'epoch': 0.2}
+{'loss': 1.1963, 'grad_norm': 1.5855309963226318, 'learning_rate': 0.00028108504398826977, 'epoch': 0.2}
+{'loss': 1.0804, 'grad_norm': 1.754294991493225, 'learning_rate': 0.000281060606060606, 'epoch': 0.2}
+{'loss': 0.9677, 'grad_norm': 2.036597967147827, 'learning_rate': 0.0002810361681329423, 'epoch': 0.2}
+{'loss': 1.2978, 'grad_norm': 3.59908390045166, 'learning_rate': 0.0002810117302052786, 'epoch': 0.2}
+{'loss': 1.0612, 'grad_norm': 3.524487257003784, 'learning_rate': 0.00028098729227761483, 'epoch': 0.2}
+{'loss': 1.1174, 'grad_norm': 2.481109857559204, 'learning_rate': 0.0002809628543499511, 'epoch': 0.2}
+{'loss': 1.2175, 'grad_norm': 1.8362330198287964, 'learning_rate': 0.00028093841642228733, 'epoch': 0.2}
+{'loss': 1.3735, 'grad_norm': 2.3617823123931885, 'learning_rate': 0.00028091397849462364, 'epoch': 0.2}
+{'loss': 1.0858, 'grad_norm': 2.753002405166626, 'learning_rate': 0.0002808895405669599, 'epoch': 0.2}
+{'loss': 0.93, 'grad_norm': 2.5607731342315674, 'learning_rate': 0.00028086510263929614, 'epoch': 0.2}
+{'loss': 1.6911, 'grad_norm': 2.9476540088653564, 'learning_rate': 0.00028084066471163244, 'epoch': 0.2}
+{'loss': 1.402, 'grad_norm': 3.2327563762664795, 'learning_rate': 0.0002808162267839687, 'epoch': 0.2}
+{'loss': 1.945, 'grad_norm': 3.618028163909912, 'learning_rate': 0.00028079178885630495, 'epoch': 0.2}
+{'loss': 1.8992, 'grad_norm': 2.141831159591675, 'learning_rate': 0.0002807673509286412, 'epoch': 0.2}
+{'loss': 1.0292, 'grad_norm': 2.144073963165283, 'learning_rate': 0.0002807429130009775, 'epoch': 0.2}
+{'loss': 1.18, 'grad_norm': 2.449118137359619, 'learning_rate': 0.00028071847507331376, 'epoch': 0.2}
+{'loss': 1.7014, 'grad_norm': 2.5604381561279297, 'learning_rate': 0.00028069403714565, 'epoch': 0.2}
+{'loss': 0.9777, 'grad_norm': 1.5793492794036865, 'learning_rate': 0.0002806695992179863, 'epoch': 0.2}
+{'loss': 1.0947, 'grad_norm': 2.9324707984924316, 'learning_rate': 0.00028064516129032256, 'epoch': 0.2}
+{'loss': 0.8496, 'grad_norm': 1.8581868410110474, 'learning_rate': 0.0002806207233626588, 'epoch': 0.2}
+{'loss': 1.2021, 'grad_norm': 2.117748975753784, 'learning_rate': 0.0002805962854349951, 'epoch': 0.2}
+{'loss': 1.5356, 'grad_norm': 3.1243643760681152, 'learning_rate': 0.0002805718475073313, 'epoch': 0.2}
+{'loss': 0.745, 'grad_norm': 1.2260499000549316, 'learning_rate': 0.0002805474095796676, 'epoch': 0.2}
+{'loss': 0.5955, 'grad_norm': 0.8840999007225037, 'learning_rate': 0.0002805229716520039, 'epoch': 0.2}
+{'loss': 0.8156, 'grad_norm': 0.9450700879096985, 'learning_rate': 0.0002804985337243401, 'epoch': 0.2}
+{'loss': 0.6363, 'grad_norm': 0.9631698727607727, 'learning_rate': 0.00028047409579667643, 'epoch': 0.2}
+{'loss': 0.6415, 'grad_norm': 0.7808576822280884, 'learning_rate': 0.0002804496578690127, 'epoch': 0.2}
+{'loss': 0.5683, 'grad_norm': 0.8394602537155151, 'learning_rate': 0.00028042521994134893, 'epoch': 0.2}
+{'loss': 0.6006, 'grad_norm': 0.9493279457092285, 'learning_rate': 0.0002804007820136852, 'epoch': 0.2}
+{'loss': 0.5773, 'grad_norm': 1.2991026639938354, 'learning_rate': 0.0002803763440860215, 'epoch': 0.2}
+{'loss': 0.6043, 'grad_norm': 0.9779866337776184, 'learning_rate': 0.00028035190615835774, 'epoch': 0.2}
+{'loss': 0.5852, 'grad_norm': 0.8985393047332764, 'learning_rate': 0.000280327468230694, 'epoch': 0.21}
+ 10%|█         | 1311/12776 [13:15<1:59:43,  1.60it/s] 10%|█         | 1312/12776 [13:15<1:52:06,  1.70it/s]                                                       10%|█         | 1312/12776 [13:15<1:52:06,  1.70it/s] 10%|█         | 1313/12776 [13:16<1:47:24,  1.78it/s]                                                       10%|█         | 1313/12776 [13:16<1:47:24,  1.78it/s] 10%|█         | 1314/12776 [13:16<1:40:12,  1.91it/s]                                                       10%|█         | 1314/12776 [13:16<1:40:12,  1.91it/s] 10%|█         | 1315/12776 [13:17<1:38:22,  1.94it/s]                                                       10%|█         | 1315/12776 [13:17<1:38:22,  1.94it/s] 10%|█         | 1316/12776 [13:17<1:32:19,  2.07it/s]                                                       10%|█         | 1316/12776 [13:17<1:32:19,  2.07it/s] 10%|█         | 1317/12776 [13:18<1:27:09,  2.19it/s]                                                       10%|█         | 1317/12776 [13:18<1:27:09,  2.19it/s] 10%|█         | 1318/12776 [13:18<1:29:42,  2.13it/s]                                                       10%|█         | 1318/12776 [13:18<1:29:42,  2.13it/s] 10%|█         | 1319/12776 [13:18<1:23:12,  2.29it/s]                                                       10%|█         | 1319/12776 [13:18<1:23:12,  2.29it/s] 10%|█         | 1320/12776 [13:19<1:18:06,  2.44it/s]                                                       10%|█         | 1320/12776 [13:19<1:18:06,  2.44it/s] 10%|█         | 1321/12776 [13:19<1:20:00,  2.39it/s]                                                       10%|█         | 1321/12776 [13:19<1:20:00,  2.39it/s] 10%|█         | 1322/12776 [13:20<1:14:49,  2.55it/s]                                                       10%|█         | 1322/12776 [13:20<1:14:49,  2.55it/s] 10%|█         | 1323/12776 [13:20<1:12:11,  2.64it/s]                                                       10%|█         | 1323/12776 [13:20<1:12:11,  2.64it/s] 10%|█         | 1324/12776 [13:20<1:12:25,  2.64it/s]                                                       10%|█         | 1324/12776 [13:20<1:12:25,  2.64it/s] 10%|█         | 1325/12776 [13:21<1:09:03,  2.76it/s]                                                       10%|█         | 1325/12776 [13:21<1:09:03,  2.76it/s] 10%|█         | 1326/12776 [13:21<1:05:39,  2.91it/s]                                                       10%|█         | 1326/12776 [13:21<1:05:39,  2.91it/s] 10%|█         | 1327/12776 [13:21<1:06:12,  2.88it/s]                                                       10%|█         | 1327/12776 [13:21<1:06:12,  2.88it/s] 10%|█         | 1328/12776 [13:22<1:02:23,  3.06it/s]                                                       10%|█         | 1328/12776 [13:22<1:02:23,  3.06it/s] 10%|█         | 1329/12776 [13:22<59:00,  3.23it/s]                                                       10%|█         | 1329/12776 [13:22<59:00,  3.23it/s] 10%|█         | 1330/12776 [13:22<55:53,  3.41it/s]                                                     10%|█         | 1330/12776 [13:22<55:53,  3.41it/s] 10%|█         | 1331/12776 [13:22<57:09,  3.34it/s]                                                     10%|█         | 1331/12776 [13:22<57:09,  3.34it/s] 10%|█         | 1332/12776 [13:23<53:59,  3.53it/s]                                                     10%|█         | 1332/12776 [13:23<53:59,  3.53it/s] 10%|█         | 1333/12776 [13:23<51:40,  3.69it/s]                                                     10%|█         | 1333/12776 [13:23<51:40,  3.69it/s] 10%|█         | 1334/12776 [13:23<49:51,  3.82it/s]                                                     10%|█         | 1334/12776 [13:23<49:51,  3.82it/s] 10%|█         | 1335/12776 [13:23<47:58,  3.97it/s]                                                     10%|█         | 1335/12776 [13:23<47:58,  3.97it/s] 10%|█         | 1336/12776 [13:24<50:41,  3.76it/s]                                                     10%|█         | 1336/12776 [13:24<50:41,  3.76it/s] 10%|█         | 1337/12776 [13:24<47:38,  4.00it/s]                                                     10%|█         | 1337/12776 [13:24<47:38,  4.00it/s] 10%|█         | 1338/12776 [13:24<45:16,  4.21it/s]                                                     10%|█         | 1338/12776 [13:24<45:16,  4.21it/s] 10%|█         | 1339/12776 [13:24<43:34,  4.37it/s]                                                     10%|█         | 1339/12776 [13:24<43:34,  4.37it/s] 10%|█         | 1340/12776 [13:24<42:03,  4.53it/s]                                                     10%|█         | 1340/12776 [13:24<42:03,  4.53it/s] 10%|█         | 1341/12776 [13:25<46:49,  4.07it/s]                                                     10%|█         | 1341/12776 [13:25<46:49,  4.07it/s] 11%|█         | 1342/12776 [13:25<44:00,  4.33it/s]                                                     11%|█         | 1342/12776 [13:25<44:00,  4.33it/s] 11%|█         | 1343/12776 [13:25<42:29,  4.48it/s]                                                     11%|█         | 1343/12776 [13:25<42:29,  4.48it/s] 11%|█         | 1344/12776 [13:25<40:51,  4.66it/s]                                                     11%|█         | 1344/12776 [13:25<40:51,  4.66it/s] 11%|█         | 1345/12776 [13:26<39:34,  4.82it/s]                                                     11%|█         | 1345/12776 [13:26<39:34,  4.82it/s] 11%|█         | 1346/12776 [13:26<41:18,  4.61it/s]                                                     11%|█         | 1346/12776 [13:26<41:18,  4.61it/s] 11%|█         | 1347/12776 [13:26<39:22,  4.84it/s]                                                     11%|█         | 1347/12776 [13:26<39:22,  4.84it/s] 11%|█         | 1348/12776 [13:26<37:54,  5.02it/s]                                                     11%|█         | 1348/12776 [13:26<37:54,  5.02it/s] 11%|█         | 1349/12776 [13:26<36:51,  5.17it/s]                                                     11%|█         | 1349/12776 [13:26<36:51,  5.17it/s] 11%|█         | 1350/12776 [13:27<1:08:50,  2.77it/s]                                                       11%|█         | 1350/12776 [13:27<1:08:50,  2.77it/s] 11%|█         | 1351/12776 [13:29<2:23:55,  1.32it/s]                                                       11%|█         | 1351/12776 [13:29<2:23:55,  1.32it/s] 11%|█         | 1352/12776 [13:30<2:36:54,  1.21it/s]                                                       11%|█         | 1352/12776 [13:30<2:36:54,  1.21it/s] 11%|█         | 1353/12776 [13:31<2:42:32,  1.17it/s]                                                       11%|█         | 1353/12776 [13:31<2:42:32,  1.17it/s] 11%|█         | 1354/12776 [13:32<2:41:32,  1.18it/s]                                                       11%|█         | 1354/12776 [13:32<2:41:32,  1.18it/s] 11%|█         | 1355/12776 [13:32<2:36:44,  1.21it/s]                                                       11%|█         | 1355/12776 [13:32<2:36:44,  1.21it/s] 11%|█         | 1356/12776 [13:33<2:33:54,  1.24it/s]                                                       11%|█         | 1356/12776 [13:33<2:33:54,  1.24it/s] 11%|█         | 1357/12776 [13:34<2:29:10,  1.28it/s]                                                       11%|█         | 1357/12776 [13:34<2:29:10,  1.28it/s] 11%|█         | 1358/12776 [13:34<2:21:02,  1.35it/s]                                                       11%|█         | 1358/12776 [13:34<2:21:02,  1.35it/s] 11%|█         | 1359/12776 [13:35<2:20:19,  1.36it/s]                                                       11%|█         | 1359/12776 [13:35<2:20:19,  1.36it/s] 11%|█         | 1360/12776 [13:36<2:11:19,  1.45it/s]                                                       11%|█         | 1360/12776 [13:36<2:11:19,  1.45it/s] 11%|█         | 1361/12776 [13:36<2:07:36,  1.49it/s]                                                       11%|█         | 1361/12776 [13:36<2:07:36,  1.49it/s] 11%|█         | 1362/12776 [13:37<1:59:46,  1.59it/s]                                                       11%|█         | 1362/12776 [13:37<1:59:46,  1.59it/s] 11%|█         | 1363/12776 [13:38<1:58:24,  1.61it/s]                                                       11%|█         | 1363/12776 [13:38<1:58:24,  1.61it/s] 11%|█         | 1364/12776 [13:38<1:50:22,  1.72it/s]                                                       11%|█         | 1364/12776 [13:38<1:50:22,  1.72it/s] 11%|█         | 1365/12776 [13:39<1:48:18,  1.76it/s]                                                       11%|█         | 1365/12776 [13:39<1:48:18,  1.76it/s] 11%|█         | 1366/12776 [13:39<1:41:27,  1.87it/s]                                                       11%|█         | 1366/12776 [13:39<1:41:27,  1.87it/s] 11%|█         | 1367/12776 [13:39<1:38:42,  1.93it/s]                                                       11%|█         | 1367/12776 [13:39<1:38:42,  1.93it/s] 11%|█         | 1368/12776 [13:40<1:32:38,  2.05it/s]                                                       11%|█         | 1368/12776 [13:40<1:32:38,  2.05it/s] 11%|█         | 1369/12776 [13:40<1:28:37,  2.15it/s]                                                       11%|█         | 1369/12776 [13:40<1:28:37,  2.15it/s] 11%|█         | 1370/12776 [13:41<1:29:04,  2.13it/s]                                                       11%|█         | 1370/12776 [13:41<1:29:04,  2.13it/s] 11%|█         | 1371/12776 [13:41<1:22:34,  2.30it/s]                                                       11%|█         | 1371/12776 [13:41<1:22:34,  2.30it/s] 11%|█         | 1372/12776 [13:41<1:17:36,  2.45it/s]                                                       11%|█         | 1372/12776 [13:41<1:17:36,  2.45it/s] 11%|█         | 1373/12776 [13:42<1:16:28,  2.49it/s]                                                       11%|█         | 1373/12776 [13:42<1:16:28,  2.49it/s] 11%|█         | 1374/12776 [13:42<1:12:37,  2.62it/s]                                                       11%|█         | 1374/12776 [13:42<1:12:37,  2.62it/s] 11%|█         | 1375/12776 [13:43<1:09:23,  2.74it/s]                                                       11%|█         | 1375/12776 [13:43<1:09:23,  2.74it/s] 11%|█         | 1376/12776 [13:43<1:06:48,  2.84it/s]                                                       11%|█         | 1376/12776 [13:43<1:06:48,  2.84it/s] 11%|█         | 1377/12776 [13:43<1:03:46,  2.98it/s]                                                       11%|█         | 1377/12776 [13:43<1:03:46,  2.98it/s] 11%|█         | 1378/12776 [13:43<1:01:19,  3.10it/s]                                                       11%|█         | 1378/12776 [13:43<1:01:19,  3.10it/s] 11%|█         | 1379/12776 [13:44<1:00:23,  3.15it/s]                                                       11%|█         | 1379/12776 [13:44<1:00:23,  3.15it/s] 11%|█         | 1380/12776 [13:44<1:02:20,  3.05it/s]                                                       11%|█         | 1380/12776 [13:44<1:02:20,  3.05it/s] 11%|█         | 1381/12776 [13:44<59:27,  3.19it/s]                                                       11%|█         | 1381/12776 [13:44<59:27,  3.19it/s] 11%|█         | 1382/12776 [13:45<56:42,  3.35it/s]                                                     11%|█         | 1382/12776 [13:45<56:42,  3.35it/s] 11%|█         | 1383/12776 [13:45<54:31,  3.48it/s]                                                     11%|█         | 1383/12776 [13:45<54:31,  3.48it/s] 11%|█         | 1384/12776 [13:45<55:21,  3.43it/s]                                                     11%|█         | 1384/12776 [13:45<55:21,  3.43it/s] 11%|█         | 1385/12776 [13:45<52:39,  3.60it/s]                                                     11%|█         | 1385/12776 [13:45<52:39,  3.60it/s] 11%|█         | 1386/12776 [13:46<50:14,  3.78it/s]                                                     11%|█         | 1386/12776 [13:46<50:14,  3.78it/s] 11%|█         | 1387/12776 [13:46<48:21,  3.93it/s]                                                     11%|█         | 1387/12776 [13:46<48:21,  3.93it/s] 11%|█         | 1388/12776 [13:46<49:28,  3.84it/s]                                                     11%|█         | 1388/12776 [13:46<49:28,  3.84it/s] 11%|█         | 1389/12776 [13:46<46:55,  4.04it/s]                                                    {'loss': 1.5883, 'grad_norm': 6.3509039878845215, 'learning_rate': 0.0002803030303030303, 'epoch': 0.21}
+{'loss': 0.4863, 'grad_norm': 0.9153957366943359, 'learning_rate': 0.00028027859237536655, 'epoch': 0.21}
+{'loss': 0.7647, 'grad_norm': 1.6397264003753662, 'learning_rate': 0.0002802541544477028, 'epoch': 0.21}
+{'loss': 0.5911, 'grad_norm': 1.2118104696273804, 'learning_rate': 0.0002802297165200391, 'epoch': 0.21}
+{'loss': 0.9774, 'grad_norm': 1.76604425907135, 'learning_rate': 0.0002802052785923753, 'epoch': 0.21}
+{'loss': 0.6841, 'grad_norm': 1.2922953367233276, 'learning_rate': 0.0002801808406647116, 'epoch': 0.21}
+{'loss': 0.5421, 'grad_norm': 1.8913264274597168, 'learning_rate': 0.00028015640273704786, 'epoch': 0.21}
+{'loss': 0.8927, 'grad_norm': 4.230543613433838, 'learning_rate': 0.0002801319648093841, 'epoch': 0.21}
+{'loss': 0.8219, 'grad_norm': 1.4822667837142944, 'learning_rate': 0.0002801075268817204, 'epoch': 0.21}
+{'loss': 0.7132, 'grad_norm': 1.4265705347061157, 'learning_rate': 0.00028008308895405667, 'epoch': 0.21}
+{'loss': 0.6372, 'grad_norm': 1.4394387006759644, 'learning_rate': 0.0002800586510263929, 'epoch': 0.21}
+{'loss': 0.7006, 'grad_norm': 1.241612434387207, 'learning_rate': 0.0002800342130987292, 'epoch': 0.21}
+{'loss': 1.122, 'grad_norm': 1.5274121761322021, 'learning_rate': 0.0002800097751710655, 'epoch': 0.21}
+{'loss': 0.7326, 'grad_norm': 1.7099493741989136, 'learning_rate': 0.0002799853372434017, 'epoch': 0.21}
+{'loss': 0.7534, 'grad_norm': 2.0768227577209473, 'learning_rate': 0.000279960899315738, 'epoch': 0.21}
+{'loss': 0.6914, 'grad_norm': 0.9548947811126709, 'learning_rate': 0.0002799364613880743, 'epoch': 0.21}
+{'loss': 1.5067, 'grad_norm': 2.6987247467041016, 'learning_rate': 0.00027991202346041053, 'epoch': 0.21}
+{'loss': 0.8489, 'grad_norm': 1.8522530794143677, 'learning_rate': 0.0002798875855327468, 'epoch': 0.21}
+{'loss': 0.7161, 'grad_norm': 2.1194634437561035, 'learning_rate': 0.0002798631476050831, 'epoch': 0.21}
+{'loss': 1.1511, 'grad_norm': 1.979345440864563, 'learning_rate': 0.0002798387096774193, 'epoch': 0.21}
+{'loss': 1.0264, 'grad_norm': 2.7786378860473633, 'learning_rate': 0.0002798142717497556, 'epoch': 0.21}
+{'loss': 0.9509, 'grad_norm': 1.8285077810287476, 'learning_rate': 0.00027978983382209184, 'epoch': 0.21}
+{'loss': 1.1794, 'grad_norm': 1.5117707252502441, 'learning_rate': 0.0002797653958944281, 'epoch': 0.21}
+{'loss': 1.2072, 'grad_norm': 1.830258846282959, 'learning_rate': 0.0002797409579667644, 'epoch': 0.21}
+{'loss': 1.6122, 'grad_norm': 2.282845973968506, 'learning_rate': 0.00027971652003910065, 'epoch': 0.21}
+{'loss': 1.3019, 'grad_norm': 2.070908308029175, 'learning_rate': 0.0002796920821114369, 'epoch': 0.21}
+{'loss': 1.2159, 'grad_norm': 1.86088228225708, 'learning_rate': 0.0002796676441837732, 'epoch': 0.21}
+{'loss': 1.5197, 'grad_norm': 1.9150466918945312, 'learning_rate': 0.00027964320625610946, 'epoch': 0.21}
+{'loss': 1.5689, 'grad_norm': 2.406750202178955, 'learning_rate': 0.0002796187683284457, 'epoch': 0.21}
+{'loss': 1.4533, 'grad_norm': 3.363661289215088, 'learning_rate': 0.00027959433040078196, 'epoch': 0.21}
+{'loss': 1.4408, 'grad_norm': 2.635338306427002, 'learning_rate': 0.00027956989247311827, 'epoch': 0.21}
+{'loss': 1.3341, 'grad_norm': 2.8413336277008057, 'learning_rate': 0.0002795454545454545, 'epoch': 0.21}
+{'loss': 1.2058, 'grad_norm': 1.84250009059906, 'learning_rate': 0.00027952101661779077, 'epoch': 0.21}
+{'loss': 1.3711, 'grad_norm': 2.388916492462158, 'learning_rate': 0.0002794965786901271, 'epoch': 0.21}
+{'loss': 1.0988, 'grad_norm': 2.6313962936401367, 'learning_rate': 0.00027947214076246333, 'epoch': 0.21}
+{'loss': 1.0163, 'grad_norm': 1.7488915920257568, 'learning_rate': 0.0002794477028347996, 'epoch': 0.21}
+{'loss': 0.9676, 'grad_norm': 2.03892183303833, 'learning_rate': 0.0002794232649071359, 'epoch': 0.21}
+{'loss': 0.8602, 'grad_norm': 1.5612317323684692, 'learning_rate': 0.0002793988269794721, 'epoch': 0.21}
+{'loss': 0.6831, 'grad_norm': 2.5052318572998047, 'learning_rate': 0.0002793743890518084, 'epoch': 0.21}
+{'loss': 1.084, 'grad_norm': 2.1519994735717773, 'learning_rate': 0.00027934995112414464, 'epoch': 0.21}
+{'loss': 0.4423, 'grad_norm': 0.670945405960083, 'learning_rate': 0.0002793255131964809, 'epoch': 0.21}
+{'loss': 0.6916, 'grad_norm': 1.1006921529769897, 'learning_rate': 0.0002793010752688172, 'epoch': 0.21}
+{'loss': 0.4832, 'grad_norm': 0.8006137609481812, 'learning_rate': 0.00027927663734115345, 'epoch': 0.21}
+{'loss': 0.5289, 'grad_norm': 0.7051506042480469, 'learning_rate': 0.0002792521994134897, 'epoch': 0.21}
+{'loss': 0.423, 'grad_norm': 0.6033449769020081, 'learning_rate': 0.00027922776148582595, 'epoch': 0.21}
+{'loss': 0.5403, 'grad_norm': 1.0505086183547974, 'learning_rate': 0.00027920332355816225, 'epoch': 0.21}
+{'loss': 0.6797, 'grad_norm': 1.063795804977417, 'learning_rate': 0.0002791788856304985, 'epoch': 0.21}
+{'loss': 0.7163, 'grad_norm': 0.77169269323349, 'learning_rate': 0.00027915444770283476, 'epoch': 0.21}
+{'loss': 0.6437, 'grad_norm': 1.3137171268463135, 'learning_rate': 0.00027913000977517106, 'epoch': 0.21}
+{'loss': 0.5225, 'grad_norm': 1.344262719154358, 'learning_rate': 0.0002791055718475073, 'epoch': 0.21}
+{'loss': 0.6403, 'grad_norm': 0.8262065649032593, 'learning_rate': 0.00027908113391984356, 'epoch': 0.21}
+{'loss': 0.8737, 'grad_norm': 1.4010947942733765, 'learning_rate': 0.00027905669599217987, 'epoch': 0.21}
+{'loss': 0.7244, 'grad_norm': 1.177219033241272, 'learning_rate': 0.00027903225806451607, 'epoch': 0.21}
+{'loss': 0.4727, 'grad_norm': 1.3933367729187012, 'learning_rate': 0.00027900782013685237, 'epoch': 0.21}
+{'loss': 0.6863, 'grad_norm': 1.8369626998901367, 'learning_rate': 0.0002789833822091886, 'epoch': 0.21}
+{'loss': 0.667, 'grad_norm': 1.523794174194336, 'learning_rate': 0.0002789589442815249, 'epoch': 0.21}
+{'loss': 1.0209, 'grad_norm': 1.0985807180404663, 'learning_rate': 0.0002789345063538612, 'epoch': 0.21}
+{'loss': 0.6309, 'grad_norm': 1.0308488607406616, 'learning_rate': 0.00027891006842619743, 'epoch': 0.21}
+{'loss': 1.1807, 'grad_norm': 2.2205519676208496, 'learning_rate': 0.0002788856304985337, 'epoch': 0.21}
+{'loss': 0.9443, 'grad_norm': 1.8049101829528809, 'learning_rate': 0.00027886119257087, 'epoch': 0.21}
+{'loss': 0.7854, 'grad_norm': 1.7300370931625366, 'learning_rate': 0.00027883675464320624, 'epoch': 0.21}
+{'loss': 0.5499, 'grad_norm': 1.587482213973999, 'learning_rate': 0.0002788123167155425, 'epoch': 0.21}
+{'loss': 0.9599, 'grad_norm': 1.539663553237915, 'learning_rate': 0.00027878787878787874, 'epoch': 0.21}
+{'loss': 0.9099, 'grad_norm': 3.231137990951538, 'learning_rate': 0.00027876344086021505, 'epoch': 0.22}
+{'loss': 1.0147, 'grad_norm': 1.5350894927978516, 'learning_rate': 0.0002787390029325513, 'epoch': 0.22}
+{'loss': 0.7973, 'grad_norm': 2.6407129764556885, 'learning_rate': 0.00027871456500488755, 'epoch': 0.22}
+{'loss': 1.1, 'grad_norm': 2.9032108783721924, 'learning_rate': 0.00027869012707722386, 'epoch': 0.22}
+{'loss': 0.625, 'grad_norm': 2.3868675231933594, 'learning_rate': 0.00027866568914956005, 'epoch': 0.22}
+{'loss': 0.7701, 'grad_norm': 1.203199863433838, 'learning_rate': 0.00027864125122189636, 'epoch': 0.22}
+{'loss': 1.1647, 'grad_norm': 5.6415205001831055, 'learning_rate': 0.0002786168132942326, 'epoch': 0.22}
+{'loss': 1.0549, 'grad_norm': 1.5552973747253418, 'learning_rate': 0.00027859237536656886, 'epoch': 0.22}
+{'loss': 1.311, 'grad_norm': 4.893740653991699, 'learning_rate': 0.00027856793743890517, 'epoch': 0.22}
+{'loss': 1.5808, 'grad_norm': 5.882252216339111, 'learning_rate': 0.0002785434995112414, 'epoch': 0.22}
+{'loss': 1.5347, 'grad_norm': 2.043102502822876, 'learning_rate': 0.00027851906158357767, 'epoch': 0.22}
+{'loss': 1.37, 'grad_norm': 2.2881131172180176, 'learning_rate': 0.000278494623655914, 'epoch': 0.22}
+{'loss': 1.5788, 'grad_norm': 2.5173704624176025, 'learning_rate': 0.0002784701857282502, 'epoch': 0.22}
+{'loss': 1.5984, 'grad_norm': 2.1015539169311523, 'learning_rate': 0.0002784457478005865, 'epoch': 0.22}
+{'loss': 1.49, 'grad_norm': 2.4034738540649414, 'learning_rate': 0.00027842130987292273, 'epoch': 0.22}
+ 11%|█         | 1389/12776 [13:46<46:55,  4.04it/s] 11%|█         | 1390/12776 [13:47<44:52,  4.23it/s]                                                     11%|█         | 1390/12776 [13:47<44:52,  4.23it/s] 11%|█         | 1391/12776 [13:47<43:35,  4.35it/s]                                                     11%|█         | 1391/12776 [13:47<43:35,  4.35it/s] 11%|█         | 1392/12776 [13:47<42:17,  4.49it/s]                                                     11%|█         | 1392/12776 [13:47<42:17,  4.49it/s] 11%|█         | 1393/12776 [13:47<45:27,  4.17it/s]                                                     11%|█         | 1393/12776 [13:47<45:27,  4.17it/s] 11%|█         | 1394/12776 [13:48<43:20,  4.38it/s]                                                     11%|█         | 1394/12776 [13:48<43:20,  4.38it/s] 11%|█         | 1395/12776 [13:48<42:26,  4.47it/s]                                                     11%|█         | 1395/12776 [13:48<42:26,  4.47it/s] 11%|█         | 1396/12776 [13:48<41:53,  4.53it/s]                                                     11%|█         | 1396/12776 [13:48<41:53,  4.53it/s] 11%|█         | 1397/12776 [13:48<40:15,  4.71it/s]                                                     11%|█         | 1397/12776 [13:48<40:15,  4.71it/s] 11%|█         | 1398/12776 [13:48<44:19,  4.28it/s]                                                     11%|█         | 1398/12776 [13:48<44:19,  4.28it/s] 11%|█         | 1399/12776 [13:49<41:50,  4.53it/s]                                                     11%|█         | 1399/12776 [13:49<41:50,  4.53it/s] 11%|█         | 1400/12776 [13:49<1:10:35,  2.69it/s]                                                       11%|█         | 1400/12776 [13:49<1:10:35,  2.69it/s] 11%|█         | 1401/12776 [13:51<2:09:54,  1.46it/s]                                                       11%|█         | 1401/12776 [13:51<2:09:54,  1.46it/s] 11%|█         | 1402/12776 [13:52<2:34:06,  1.23it/s]                                                       11%|█         | 1402/12776 [13:52<2:34:06,  1.23it/s] 11%|█         | 1403/12776 [13:53<2:36:41,  1.21it/s]                                                       11%|█         | 1403/12776 [13:53<2:36:41,  1.21it/s] 11%|█         | 1404/12776 [13:54<2:35:03,  1.22it/s]                                                       11%|█         | 1404/12776 [13:54<2:35:03,  1.22it/s] 11%|█         | 1405/12776 [13:54<2:33:35,  1.23it/s]                                                       11%|█         | 1405/12776 [13:54<2:33:35,  1.23it/s] 11%|█         | 1406/12776 [13:55<2:28:27,  1.28it/s]                                                       11%|█         | 1406/12776 [13:55<2:28:27,  1.28it/s] 11%|█         | 1407/12776 [13:56<2:23:53,  1.32it/s]                                                       11%|█         | 1407/12776 [13:56<2:23:53,  1.32it/s] 11%|█         | 1408/12776 [13:56<2:21:32,  1.34it/s]                                                       11%|█         | 1408/12776 [13:56<2:21:32,  1.34it/s] 11%|█         | 1409/12776 [13:57<2:15:33,  1.40it/s]                                                       11%|█         | 1409/12776 [13:57<2:15:33,  1.40it/s] 11%|█         | 1410/12776 [13:58<2:08:27,  1.47it/s]                                                       11%|█         | 1410/12776 [13:58<2:08:27,  1.47it/s] 11%|█         | 1411/12776 [13:58<2:01:09,  1.56it/s]                                                       11%|█         | 1411/12776 [13:58<2:01:09,  1.56it/s] 11%|█         | 1412/12776 [13:59<1:55:15,  1.64it/s]                                                       11%|█         | 1412/12776 [13:59<1:55:15,  1.64it/s] 11%|█         | 1413/12776 [13:59<1:49:11,  1.73it/s]                                                       11%|█         | 1413/12776 [13:59<1:49:11,  1.73it/s] 11%|█         | 1414/12776 [14:00<1:51:35,  1.70it/s]                                                       11%|█         | 1414/12776 [14:00<1:51:35,  1.70it/s] 11%|█         | 1415/12776 [14:00<1:44:03,  1.82it/s]                                                       11%|█         | 1415/12776 [14:00<1:44:03,  1.82it/s] 11%|█         | 1416/12776 [14:01<1:38:09,  1.93it/s]                                                       11%|█         | 1416/12776 [14:01<1:38:09,  1.93it/s] 11%|█         | 1417/12776 [14:01<1:36:11,  1.97it/s]                                                       11%|█         | 1417/12776 [14:01<1:36:11,  1.97it/s] 11%|█         | 1418/12776 [14:02<1:30:33,  2.09it/s]                                                       11%|█         | 1418/12776 [14:02<1:30:33,  2.09it/s] 11%|█         | 1419/12776 [14:02<1:33:04,  2.03it/s]                                                       11%|█         | 1419/12776 [14:02<1:33:04,  2.03it/s] 11%|█         | 1420/12776 [14:03<1:26:43,  2.18it/s]                                                       11%|█         | 1420/12776 [14:03<1:26:43,  2.18it/s] 11%|█         | 1421/12776 [14:03<1:21:40,  2.32it/s]                                                       11%|█         | 1421/12776 [14:03<1:21:40,  2.32it/s] 11%|█         | 1422/12776 [14:03<1:19:31,  2.38it/s]                                                       11%|█         | 1422/12776 [14:03<1:19:31,  2.38it/s] 11%|█         | 1423/12776 [14:04<1:15:02,  2.52it/s]                                                       11%|█         | 1423/12776 [14:04<1:15:02,  2.52it/s] 11%|█         | 1424/12776 [14:04<1:11:20,  2.65it/s]                                                       11%|█         | 1424/12776 [14:04<1:11:20,  2.65it/s] 11%|█         | 1425/12776 [14:04<1:08:20,  2.77it/s]                                                       11%|█         | 1425/12776 [14:04<1:08:20,  2.77it/s] 11%|█         | 1426/12776 [14:05<1:05:24,  2.89it/s]                                                       11%|█         | 1426/12776 [14:05<1:05:24,  2.89it/s] 11%|█         | 1427/12776 [14:05<1:02:32,  3.02it/s]                                                       11%|█         | 1427/12776 [14:05<1:02:32,  3.02it/s] 11%|█         | 1428/12776 [14:05<1:00:20,  3.13it/s]                                                       11%|█         | 1428/12776 [14:05<1:00:20,  3.13it/s] 11%|█         | 1429/12776 [14:06<1:04:19,  2.94it/s]                                                       11%|█         | 1429/12776 [14:06<1:04:19,  2.94it/s] 11%|█         | 1430/12776 [14:06<1:00:43,  3.11it/s]                                                       11%|█         | 1430/12776 [14:06<1:00:43,  3.11it/s] 11%|█         | 1431/12776 [14:06<57:55,  3.26it/s]                                                       11%|█         | 1431/12776 [14:06<57:55,  3.26it/s] 11%|█         | 1432/12776 [14:06<55:28,  3.41it/s]                                                     11%|█         | 1432/12776 [14:06<55:28,  3.41it/s] 11%|█         | 1433/12776 [14:07<1:02:25,  3.03it/s]                                                       11%|█         | 1433/12776 [14:07<1:02:25,  3.03it/s] 11%|█         | 1434/12776 [14:07<58:23,  3.24it/s]                                                       11%|█         | 1434/12776 [14:07<58:23,  3.24it/s] 11%|█         | 1435/12776 [14:07<54:37,  3.46it/s]                                                     11%|█         | 1435/12776 [14:07<54:37,  3.46it/s] 11%|█         | 1436/12776 [14:08<52:32,  3.60it/s]                                                     11%|█         | 1436/12776 [14:08<52:32,  3.60it/s] 11%|█         | 1437/12776 [14:08<51:43,  3.65it/s]                                                     11%|█         | 1437/12776 [14:08<51:43,  3.65it/s] 11%|█▏        | 1438/12776 [14:08<49:19,  3.83it/s]                                                     11%|█▏        | 1438/12776 [14:08<49:19,  3.83it/s] 11%|█▏        | 1439/12776 [14:08<46:43,  4.04it/s]                                                     11%|█▏        | 1439/12776 [14:08<46:43,  4.04it/s] 11%|█▏        | 1440/12776 [14:09<44:34,  4.24it/s]                                                     11%|█▏        | 1440/12776 [14:09<44:34,  4.24it/s] 11%|█▏        | 1441/12776 [14:09<43:09,  4.38it/s]                                                     11%|█▏        | 1441/12776 [14:09<43:09,  4.38it/s] 11%|█▏        | 1442/12776 [14:09<44:35,  4.24it/s]                                                     11%|█▏        | 1442/12776 [14:09<44:35,  4.24it/s] 11%|█▏        | 1443/12776 [14:09<42:34,  4.44it/s]                                                     11%|█▏        | 1443/12776 [14:09<42:34,  4.44it/s] 11%|█▏        | 1444/12776 [14:09<41:02,  4.60it/s]                                                     11%|█▏        | 1444/12776 [14:09<41:02,  4.60it/s] 11%|█▏        | 1445/12776 [14:10<39:49,  4.74it/s]                                                     11%|█▏        | 1445/12776 [14:10<39:49,  4.74it/s] 11%|█▏        | 1446/12776 [14:10<38:47,  4.87it/s]                                                     11%|█▏        | 1446/12776 [14:10<38:47,  4.87it/s] 11%|█▏        | 1447/12776 [14:10<38:04,  4.96it/s]                                                     11%|█▏        | 1447/12776 [14:10<38:04,  4.96it/s] 11%|█▏        | 1448/12776 [14:10<40:42,  4.64it/s]                                                     11%|█▏        | 1448/12776 [14:10<40:42,  4.64it/s] 11%|█▏        | 1449/12776 [14:10<38:59,  4.84it/s]                                                     11%|█▏        | 1449/12776 [14:10<38:59,  4.84it/s] 11%|█▏        | 1450/12776 [14:11<1:14:31,  2.53it/s]                                                       11%|█▏        | 1450/12776 [14:11<1:14:31,  2.53it/s] 11%|█▏        | 1451/12776 [14:13<2:12:38,  1.42it/s]                                                       11%|█▏        | 1451/12776 [14:13<2:12:38,  1.42it/s] 11%|█▏        | 1452/12776 [14:14<2:29:18,  1.26it/s]                                                       11%|█▏        | 1452/12776 [14:14<2:29:18,  1.26it/s] 11%|█▏        | 1453/12776 [14:15<2:34:48,  1.22it/s]                                                       11%|█▏        | 1453/12776 [14:15<2:34:48,  1.22it/s] 11%|█▏        | 1454/12776 [14:15<2:33:33,  1.23it/s]                                                       11%|█▏        | 1454/12776 [14:15<2:33:33,  1.23it/s] 11%|█▏        | 1455/12776 [14:16<2:37:53,  1.20it/s]                                                       11%|█▏        | 1455/12776 [14:16<2:37:53,  1.20it/s] 11%|█▏        | 1456/12776 [14:17<2:38:44,  1.19it/s]                                                       11%|█▏        | 1456/12776 [14:17<2:38:44,  1.19it/s] 11%|█▏        | 1457/12776 [14:18<2:27:32,  1.28it/s]                                                       11%|█▏        | 1457/12776 [14:18<2:27:32,  1.28it/s] 11%|█▏        | 1458/12776 [14:19<2:26:59,  1.28it/s]                                                       11%|█▏        | 1458/12776 [14:19<2:26:59,  1.28it/s] 11%|█▏        | 1459/12776 [14:19<2:15:45,  1.39it/s]                                                       11%|█▏        | 1459/12776 [14:19<2:15:45,  1.39it/s] 11%|█▏        | 1460/12776 [14:20<2:08:43,  1.47it/s]                                                       11%|█▏        | 1460/12776 [14:20<2:08:43,  1.47it/s] 11%|█▏        | 1461/12776 [14:20<2:00:47,  1.56it/s]                                                       11%|█▏        | 1461/12776 [14:20<2:00:47,  1.56it/s] 11%|█▏        | 1462/12776 [14:21<1:56:32,  1.62it/s]                                                       11%|█▏        | 1462/12776 [14:21<1:56:32,  1.62it/s] 11%|█▏        | 1463/12776 [14:21<1:49:23,  1.72it/s]                                                       11%|█▏        | 1463/12776 [14:21<1:49:23,  1.72it/s] 11%|█▏        | 1464/12776 [14:22<1:46:29,  1.77it/s]                                                       11%|█▏        | 1464/12776 [14:22<1:46:29,  1.77it/s] 11%|█▏        | 1465/12776 [14:22<1:39:18,  1.90it/s]                                                       11%|█▏        | 1465/12776 [14:22<1:39:18,  1.90it/s] 11%|█▏        | 1466/12776 [14:23<1:37:59,  1.92it/s]                                                       11%|█▏        | 1466/12776 [14:23<1:37:59,  1.92it/s] 11%|█▏        | 1467/12776 [14:23<1:31:19,  2.06it/s]                                                      {'loss': 1.4753, 'grad_norm': 2.5903968811035156, 'learning_rate': 0.00027839687194525903, 'epoch': 0.22}
+{'loss': 1.6426, 'grad_norm': 3.2042367458343506, 'learning_rate': 0.0002783724340175953, 'epoch': 0.22}
+{'loss': 1.9565, 'grad_norm': 4.184605598449707, 'learning_rate': 0.00027834799608993154, 'epoch': 0.22}
+{'loss': 1.7027, 'grad_norm': 2.502986431121826, 'learning_rate': 0.00027832355816226784, 'epoch': 0.22}
+{'loss': 1.2844, 'grad_norm': 2.451610565185547, 'learning_rate': 0.0002782991202346041, 'epoch': 0.22}
+{'loss': 0.9655, 'grad_norm': 1.821632742881775, 'learning_rate': 0.00027827468230694034, 'epoch': 0.22}
+{'loss': 2.3643, 'grad_norm': 4.602065563201904, 'learning_rate': 0.0002782502443792766, 'epoch': 0.22}
+{'loss': 1.0854, 'grad_norm': 3.144266366958618, 'learning_rate': 0.00027822580645161285, 'epoch': 0.22}
+{'loss': 1.2546, 'grad_norm': 2.5353446006774902, 'learning_rate': 0.00027820136852394915, 'epoch': 0.22}
+{'loss': 1.4616, 'grad_norm': 2.2268052101135254, 'learning_rate': 0.0002781769305962854, 'epoch': 0.22}
+{'loss': 1.7655, 'grad_norm': 3.656038284301758, 'learning_rate': 0.00027815249266862165, 'epoch': 0.22}
+{'loss': 1.8593, 'grad_norm': 2.7248010635375977, 'learning_rate': 0.00027812805474095796, 'epoch': 0.22}
+{'loss': 0.5366, 'grad_norm': 0.8559685945510864, 'learning_rate': 0.0002781036168132942, 'epoch': 0.22}
+{'loss': 0.5512, 'grad_norm': 0.8044238090515137, 'learning_rate': 0.00027807917888563046, 'epoch': 0.22}
+{'loss': 0.4996, 'grad_norm': 0.5511070489883423, 'learning_rate': 0.0002780547409579667, 'epoch': 0.22}
+{'loss': 0.6064, 'grad_norm': 0.9777888655662537, 'learning_rate': 0.000278030303030303, 'epoch': 0.22}
+{'loss': 0.4713, 'grad_norm': 0.7745730876922607, 'learning_rate': 0.00027800586510263927, 'epoch': 0.22}
+{'loss': 0.6403, 'grad_norm': 1.0580544471740723, 'learning_rate': 0.0002779814271749755, 'epoch': 0.22}
+{'loss': 0.4613, 'grad_norm': 0.7956675291061401, 'learning_rate': 0.0002779569892473118, 'epoch': 0.22}
+{'loss': 0.3812, 'grad_norm': 0.678325891494751, 'learning_rate': 0.0002779325513196481, 'epoch': 0.22}
+{'loss': 0.3709, 'grad_norm': 0.7760977149009705, 'learning_rate': 0.00027790811339198433, 'epoch': 0.22}
+{'loss': 0.4915, 'grad_norm': 1.0391250848770142, 'learning_rate': 0.00027788367546432063, 'epoch': 0.22}
+{'loss': 0.5706, 'grad_norm': 0.9946457743644714, 'learning_rate': 0.00027785923753665683, 'epoch': 0.22}
+{'loss': 0.6768, 'grad_norm': 1.5022060871124268, 'learning_rate': 0.00027783479960899314, 'epoch': 0.22}
+{'loss': 0.6929, 'grad_norm': 1.4307317733764648, 'learning_rate': 0.0002778103616813294, 'epoch': 0.22}
+{'loss': 0.4171, 'grad_norm': 0.9843283891677856, 'learning_rate': 0.00027778592375366564, 'epoch': 0.22}
+{'loss': 0.5666, 'grad_norm': 1.1952210664749146, 'learning_rate': 0.00027776148582600195, 'epoch': 0.22}
+{'loss': 0.8112, 'grad_norm': 1.7626519203186035, 'learning_rate': 0.0002777370478983382, 'epoch': 0.22}
+{'loss': 1.1599, 'grad_norm': 3.326972246170044, 'learning_rate': 0.00027771260997067445, 'epoch': 0.22}
+{'loss': 0.6991, 'grad_norm': 0.9253565669059753, 'learning_rate': 0.00027768817204301075, 'epoch': 0.22}
+{'loss': 0.5801, 'grad_norm': 1.0200566053390503, 'learning_rate': 0.000277663734115347, 'epoch': 0.22}
+{'loss': 0.4883, 'grad_norm': 1.5075781345367432, 'learning_rate': 0.00027763929618768326, 'epoch': 0.22}
+{'loss': 0.7034, 'grad_norm': 1.3740901947021484, 'learning_rate': 0.0002776148582600195, 'epoch': 0.22}
+{'loss': 0.7774, 'grad_norm': 2.125110149383545, 'learning_rate': 0.0002775904203323558, 'epoch': 0.22}
+{'loss': 0.7952, 'grad_norm': 2.414090633392334, 'learning_rate': 0.00027756598240469206, 'epoch': 0.22}
+{'loss': 0.9317, 'grad_norm': 4.026956081390381, 'learning_rate': 0.0002775415444770283, 'epoch': 0.22}
+{'loss': 1.0914, 'grad_norm': 1.9889763593673706, 'learning_rate': 0.0002775171065493646, 'epoch': 0.22}
+{'loss': 0.7606, 'grad_norm': 1.7664798498153687, 'learning_rate': 0.0002774926686217008, 'epoch': 0.22}
+{'loss': 1.0761, 'grad_norm': 1.880422830581665, 'learning_rate': 0.0002774682306940371, 'epoch': 0.22}
+{'loss': 0.9571, 'grad_norm': 2.686260223388672, 'learning_rate': 0.0002774437927663734, 'epoch': 0.22}
+{'loss': 1.0255, 'grad_norm': 1.939051628112793, 'learning_rate': 0.0002774193548387096, 'epoch': 0.22}
+{'loss': 1.0628, 'grad_norm': 2.358743190765381, 'learning_rate': 0.00027739491691104593, 'epoch': 0.22}
+{'loss': 1.2966, 'grad_norm': 2.4641621112823486, 'learning_rate': 0.0002773704789833822, 'epoch': 0.22}
+{'loss': 1.1513, 'grad_norm': 1.8473012447357178, 'learning_rate': 0.00027734604105571843, 'epoch': 0.22}
+{'loss': 1.306, 'grad_norm': 1.4949876070022583, 'learning_rate': 0.00027732160312805474, 'epoch': 0.22}
+{'loss': 1.6001, 'grad_norm': 1.902312159538269, 'learning_rate': 0.000277297165200391, 'epoch': 0.22}
+{'loss': 1.3238, 'grad_norm': 1.7652263641357422, 'learning_rate': 0.00027727272727272724, 'epoch': 0.22}
+{'loss': 1.3513, 'grad_norm': 2.4492361545562744, 'learning_rate': 0.0002772482893450635, 'epoch': 0.22}
+{'loss': 1.626, 'grad_norm': 2.270918369293213, 'learning_rate': 0.0002772238514173998, 'epoch': 0.22}
+{'loss': 1.3422, 'grad_norm': 2.2354772090911865, 'learning_rate': 0.00027719941348973605, 'epoch': 0.23}
+{'loss': 1.5828, 'grad_norm': 1.9211229085922241, 'learning_rate': 0.0002771749755620723, 'epoch': 0.23}
+{'loss': 1.2256, 'grad_norm': 1.6848493814468384, 'learning_rate': 0.0002771505376344086, 'epoch': 0.23}
+{'loss': 1.1445, 'grad_norm': 2.3783888816833496, 'learning_rate': 0.00027712609970674486, 'epoch': 0.23}
+{'loss': 1.2094, 'grad_norm': 3.050572156906128, 'learning_rate': 0.0002771016617790811, 'epoch': 0.23}
+{'loss': 1.415, 'grad_norm': 2.4172866344451904, 'learning_rate': 0.00027707722385141736, 'epoch': 0.23}
+{'loss': 1.7239, 'grad_norm': 2.319094181060791, 'learning_rate': 0.0002770527859237536, 'epoch': 0.23}
+{'loss': 1.2432, 'grad_norm': 2.561666965484619, 'learning_rate': 0.0002770283479960899, 'epoch': 0.23}
+{'loss': 0.9331, 'grad_norm': 2.667259454727173, 'learning_rate': 0.00027700391006842617, 'epoch': 0.23}
+{'loss': 1.0571, 'grad_norm': 2.7518651485443115, 'learning_rate': 0.0002769794721407624, 'epoch': 0.23}
+{'loss': 1.1156, 'grad_norm': 2.4335968494415283, 'learning_rate': 0.0002769550342130987, 'epoch': 0.23}
+{'loss': 0.6606, 'grad_norm': 5.546756744384766, 'learning_rate': 0.000276930596285435, 'epoch': 0.23}
+{'loss': 1.3594, 'grad_norm': 2.5599935054779053, 'learning_rate': 0.0002769061583577712, 'epoch': 0.23}
+{'loss': 0.533, 'grad_norm': 0.9069414734840393, 'learning_rate': 0.0002768817204301075, 'epoch': 0.23}
+{'loss': 0.5733, 'grad_norm': 0.9586019515991211, 'learning_rate': 0.0002768572825024438, 'epoch': 0.23}
+{'loss': 0.6321, 'grad_norm': 1.2516900300979614, 'learning_rate': 0.00027683284457478003, 'epoch': 0.23}
+{'loss': 0.534, 'grad_norm': 0.7019465565681458, 'learning_rate': 0.0002768084066471163, 'epoch': 0.23}
+{'loss': 0.4528, 'grad_norm': 0.9529553651809692, 'learning_rate': 0.0002767839687194526, 'epoch': 0.23}
+{'loss': 0.5788, 'grad_norm': 2.888197660446167, 'learning_rate': 0.00027675953079178884, 'epoch': 0.23}
+{'loss': 0.7626, 'grad_norm': 0.9398375153541565, 'learning_rate': 0.0002767350928641251, 'epoch': 0.23}
+{'loss': 0.4823, 'grad_norm': 1.111086368560791, 'learning_rate': 0.0002767106549364614, 'epoch': 0.23}
+{'loss': 0.5664, 'grad_norm': 0.8638045191764832, 'learning_rate': 0.0002766862170087976, 'epoch': 0.23}
+{'loss': 0.5838, 'grad_norm': 1.1331050395965576, 'learning_rate': 0.0002766617790811339, 'epoch': 0.23}
+{'loss': 0.5199, 'grad_norm': 1.5367311239242554, 'learning_rate': 0.00027663734115347015, 'epoch': 0.23}
+{'loss': 0.6322, 'grad_norm': 1.0833733081817627, 'learning_rate': 0.0002766129032258064, 'epoch': 0.23}
+{'loss': 0.8835, 'grad_norm': 1.2636278867721558, 'learning_rate': 0.0002765884652981427, 'epoch': 0.23}
+{'loss': 0.5407, 'grad_norm': 0.8570249080657959, 'learning_rate': 0.00027656402737047896, 'epoch': 0.23}
+{'loss': 0.6971, 'grad_norm': 1.7874391078948975, 'learning_rate': 0.0002765395894428152, 'epoch': 0.23}
+{'loss': 0.6228, 'grad_norm': 1.138791799545288, 'learning_rate': 0.00027651515151515146, 'epoch': 0.23}
+ 11%|█▏        | 1467/12776 [14:23<1:31:19,  2.06it/s] 11%|█▏        | 1468/12776 [14:24<1:25:39,  2.20it/s]                                                       11%|█▏        | 1468/12776 [14:24<1:25:39,  2.20it/s] 11%|█▏        | 1469/12776 [14:24<1:21:56,  2.30it/s]                                                       11%|█▏        | 1469/12776 [14:24<1:21:56,  2.30it/s] 12%|█▏        | 1470/12776 [14:24<1:17:26,  2.43it/s]                                                       12%|█▏        | 1470/12776 [14:24<1:17:26,  2.43it/s] 12%|█▏        | 1471/12776 [14:25<1:14:45,  2.52it/s]                                                       12%|█▏        | 1471/12776 [14:25<1:14:45,  2.52it/s] 12%|█▏        | 1472/12776 [14:25<1:15:13,  2.50it/s]                                                       12%|█▏        | 1472/12776 [14:25<1:15:13,  2.50it/s] 12%|█▏        | 1473/12776 [14:25<1:12:38,  2.59it/s]                                                       12%|█▏        | 1473/12776 [14:25<1:12:38,  2.59it/s] 12%|█▏        | 1474/12776 [14:26<1:08:46,  2.74it/s]                                                       12%|█▏        | 1474/12776 [14:26<1:08:46,  2.74it/s] 12%|█▏        | 1475/12776 [14:26<1:06:17,  2.84it/s]                                                       12%|█▏        | 1475/12776 [14:26<1:06:17,  2.84it/s] 12%|█▏        | 1476/12776 [14:26<1:02:54,  2.99it/s]                                                       12%|█▏        | 1476/12776 [14:26<1:02:54,  2.99it/s] 12%|█▏        | 1477/12776 [14:27<1:00:27,  3.11it/s]                                                       12%|█▏        | 1477/12776 [14:27<1:00:27,  3.11it/s] 12%|█▏        | 1478/12776 [14:27<58:14,  3.23it/s]                                                       12%|█▏        | 1478/12776 [14:27<58:14,  3.23it/s] 12%|█▏        | 1479/12776 [14:27<1:01:48,  3.05it/s]                                                       12%|█▏        | 1479/12776 [14:27<1:01:48,  3.05it/s] 12%|█▏        | 1480/12776 [14:28<58:32,  3.22it/s]                                                       12%|█▏        | 1480/12776 [14:28<58:32,  3.22it/s] 12%|█▏        | 1481/12776 [14:28<55:27,  3.39it/s]                                                     12%|█▏        | 1481/12776 [14:28<55:27,  3.39it/s] 12%|█▏        | 1482/12776 [14:28<53:03,  3.55it/s]                                                     12%|█▏        | 1482/12776 [14:28<53:03,  3.55it/s] 12%|█▏        | 1483/12776 [14:28<56:26,  3.33it/s]                                                     12%|█▏        | 1483/12776 [14:28<56:26,  3.33it/s] 12%|█▏        | 1484/12776 [14:29<53:07,  3.54it/s]                                                     12%|█▏        | 1484/12776 [14:29<53:07,  3.54it/s] 12%|█▏        | 1485/12776 [14:29<50:22,  3.74it/s]                                                     12%|█▏        | 1485/12776 [14:29<50:22,  3.74it/s] 12%|█▏        | 1486/12776 [14:29<48:06,  3.91it/s]                                                     12%|█▏        | 1486/12776 [14:29<48:06,  3.91it/s] 12%|█▏        | 1487/12776 [14:29<46:03,  4.08it/s]                                                     12%|█▏        | 1487/12776 [14:29<46:03,  4.08it/s] 12%|█▏        | 1488/12776 [14:30<49:06,  3.83it/s]                                                     12%|█▏        | 1488/12776 [14:30<49:06,  3.83it/s] 12%|█▏        | 1489/12776 [14:30<46:27,  4.05it/s]                                                     12%|█▏        | 1489/12776 [14:30<46:27,  4.05it/s] 12%|█▏        | 1490/12776 [14:30<44:22,  4.24it/s]                                                     12%|█▏        | 1490/12776 [14:30<44:22,  4.24it/s] 12%|█▏        | 1491/12776 [14:30<42:56,  4.38it/s]                                                     12%|█▏        | 1491/12776 [14:30<42:56,  4.38it/s] 12%|█▏        | 1492/12776 [14:30<41:40,  4.51it/s]                                                     12%|█▏        | 1492/12776 [14:30<41:40,  4.51it/s] 12%|█▏        | 1493/12776 [14:31<44:40,  4.21it/s]                                                     12%|█▏        | 1493/12776 [14:31<44:40,  4.21it/s] 12%|█▏        | 1494/12776 [14:31<42:30,  4.42it/s]                                                     12%|█▏        | 1494/12776 [14:31<42:30,  4.42it/s] 12%|█▏        | 1495/12776 [14:31<40:52,  4.60it/s]                                                     12%|█▏        | 1495/12776 [14:31<40:52,  4.60it/s] 12%|█▏        | 1496/12776 [14:31<39:39,  4.74it/s]                                                     12%|█▏        | 1496/12776 [14:31<39:39,  4.74it/s] 12%|█▏        | 1497/12776 [14:32<38:38,  4.86it/s]                                                     12%|█▏        | 1497/12776 [14:32<38:38,  4.86it/s] 12%|█▏        | 1498/12776 [14:32<40:16,  4.67it/s]                                                     12%|█▏        | 1498/12776 [14:32<40:16,  4.67it/s] 12%|█▏        | 1499/12776 [14:32<38:32,  4.88it/s]                                                     12%|█▏        | 1499/12776 [14:32<38:32,  4.88it/s] 12%|█▏        | 1500/12776 [14:33<1:07:15,  2.79it/s]                                                       12%|█▏        | 1500/12776 [14:33<1:07:15,  2.79it/s] 12%|█▏        | 1501/12776 [14:34<2:16:03,  1.38it/s]                                                       12%|█▏        | 1501/12776 [14:34<2:16:03,  1.38it/s] 12%|█▏        | 1502/12776 [14:35<2:35:55,  1.21it/s]                                                       12%|█▏        | 1502/12776 [14:35<2:35:55,  1.21it/s] 12%|█▏        | 1503/12776 [14:36<2:36:27,  1.20it/s]                                                       12%|█▏        | 1503/12776 [14:36<2:36:27,  1.20it/s] 12%|█▏        | 1504/12776 [14:37<2:32:54,  1.23it/s]                                                       12%|█▏        | 1504/12776 [14:37<2:32:54,  1.23it/s] 12%|█▏        | 1505/12776 [14:38<2:27:55,  1.27it/s]                                                       12%|█▏        | 1505/12776 [14:38<2:27:55,  1.27it/s] 12%|█▏        | 1506/12776 [14:38<2:21:43,  1.33it/s]                                                       12%|█▏        | 1506/12776 [14:38<2:21:43,  1.33it/s] 12%|█▏        | 1507/12776 [14:39<2:16:30,  1.38it/s]                                                       12%|█▏        | 1507/12776 [14:39<2:16:30,  1.38it/s] 12%|█▏        | 1508/12776 [14:40<2:09:54,  1.45it/s]                                                       12%|█▏        | 1508/12776 [14:40<2:09:54,  1.45it/s] 12%|█▏        | 1509/12776 [14:40<2:03:54,  1.52it/s]                                                       12%|█▏        | 1509/12776 [14:40<2:03:54,  1.52it/s] 12%|█▏        | 1510/12776 [14:41<1:57:00,  1.60it/s]                                                       12%|█▏        | 1510/12776 [14:41<1:57:00,  1.60it/s] 12%|█▏        | 1511/12776 [14:41<1:56:11,  1.62it/s]                                                       12%|█▏        | 1511/12776 [14:41<1:56:11,  1.62it/s] 12%|█▏        | 1512/12776 [14:42<1:48:37,  1.73it/s]                                                       12%|█▏        | 1512/12776 [14:42<1:48:37,  1.73it/s] 12%|█▏        | 1513/12776 [14:42<1:47:05,  1.75it/s]                                                       12%|█▏        | 1513/12776 [14:42<1:47:05,  1.75it/s] 12%|█▏        | 1514/12776 [14:43<1:39:35,  1.88it/s]                                                       12%|█▏        | 1514/12776 [14:43<1:39:35,  1.88it/s] 12%|█▏        | 1515/12776 [14:43<1:40:00,  1.88it/s]                                                       12%|█▏        | 1515/12776 [14:43<1:40:00,  1.88it/s] 12%|█▏        | 1516/12776 [14:44<1:32:32,  2.03it/s]                                                       12%|█▏        | 1516/12776 [14:44<1:32:32,  2.03it/s] 12%|█▏        | 1517/12776 [14:44<1:27:21,  2.15it/s]                                                       12%|█▏        | 1517/12776 [14:44<1:27:21,  2.15it/s] 12%|█▏        | 1518/12776 [14:45<1:31:02,  2.06it/s]                                                       12%|█▏        | 1518/12776 [14:45<1:31:02,  2.06it/s] 12%|█▏        | 1519/12776 [14:45<1:24:06,  2.23it/s]                                                       12%|█▏        | 1519/12776 [14:45<1:24:06,  2.23it/s] 12%|█▏        | 1520/12776 [14:45<1:18:46,  2.38it/s]                                                       12%|█▏        | 1520/12776 [14:45<1:18:46,  2.38it/s] 12%|█▏        | 1521/12776 [14:46<1:17:26,  2.42it/s]                                                       12%|█▏        | 1521/12776 [14:46<1:17:26,  2.42it/s] 12%|█▏        | 1522/12776 [14:46<1:13:24,  2.56it/s]                                                       12%|█▏        | 1522/12776 [14:46<1:13:24,  2.56it/s] 12%|█▏        | 1523/12776 [14:46<1:10:04,  2.68it/s]                                                       12%|█▏        | 1523/12776 [14:46<1:10:04,  2.68it/s] 12%|█▏        | 1524/12776 [14:47<1:15:14,  2.49it/s]                                                       12%|█▏        | 1524/12776 [14:47<1:15:14,  2.49it/s] 12%|█▏        | 1525/12776 [14:47<1:10:08,  2.67it/s]                                                       12%|█▏        | 1525/12776 [14:47<1:10:08,  2.67it/s] 12%|█▏        | 1526/12776 [14:48<1:05:45,  2.85it/s]                                                       12%|█▏        | 1526/12776 [14:48<1:05:45,  2.85it/s] 12%|█▏        | 1527/12776 [14:48<1:06:17,  2.83it/s]                                                       12%|█▏        | 1527/12776 [14:48<1:06:17,  2.83it/s] 12%|█▏        | 1528/12776 [14:48<1:02:06,  3.02it/s]                                                       12%|█▏        | 1528/12776 [14:48<1:02:06,  3.02it/s] 12%|█▏        | 1529/12776 [14:48<58:41,  3.19it/s]                                                       12%|█▏        | 1529/12776 [14:48<58:41,  3.19it/s] 12%|█▏        | 1530/12776 [14:49<56:05,  3.34it/s]                                                     12%|█▏        | 1530/12776 [14:49<56:05,  3.34it/s] 12%|█▏        | 1531/12776 [14:49<57:50,  3.24it/s]                                                     12%|█▏        | 1531/12776 [14:49<57:50,  3.24it/s] 12%|█▏        | 1532/12776 [14:49<54:38,  3.43it/s]                                                     12%|█▏        | 1532/12776 [14:49<54:38,  3.43it/s] 12%|█▏        | 1533/12776 [14:50<52:01,  3.60it/s]                                                     12%|█▏        | 1533/12776 [14:50<52:01,  3.60it/s] 12%|█▏        | 1534/12776 [14:50<49:53,  3.76it/s]                                                     12%|█▏        | 1534/12776 [14:50<49:53,  3.76it/s] 12%|█▏        | 1535/12776 [14:50<47:51,  3.92it/s]                                                     12%|█▏        | 1535/12776 [14:50<47:51,  3.92it/s] 12%|█▏        | 1536/12776 [14:50<48:46,  3.84it/s]                                                     12%|█▏        | 1536/12776 [14:50<48:46,  3.84it/s] 12%|█▏        | 1537/12776 [14:51<46:22,  4.04it/s]                                                     12%|█▏        | 1537/12776 [14:51<46:22,  4.04it/s] 12%|█▏        | 1538/12776 [14:51<44:18,  4.23it/s]                                                     12%|█▏        | 1538/12776 [14:51<44:18,  4.23it/s] 12%|█▏        | 1539/12776 [14:51<42:40,  4.39it/s]                                                     12%|█▏        | 1539/12776 [14:51<42:40,  4.39it/s] 12%|█▏        | 1540/12776 [14:51<41:42,  4.49it/s]                                                     12%|█▏        | 1540/12776 [14:51<41:42,  4.49it/s] 12%|█▏        | 1541/12776 [14:51<45:40,  4.10it/s]                                                     12%|█▏        | 1541/12776 [14:51<45:40,  4.10it/s] 12%|█▏        | 1542/12776 [14:52<43:15,  4.33it/s]                                                     12%|█▏        | 1542/12776 [14:52<43:15,  4.33it/s] 12%|█▏        | 1543/12776 [14:52<41:29,  4.51it/s]                                                     12%|█▏        | 1543/12776 [14:52<41:29,  4.51it/s] 12%|█▏        | 1544/12776 [14:52<40:02,  4.68it/s]                                                    {'loss': 0.6939, 'grad_norm': 1.4818170070648193, 'learning_rate': 0.00027649071358748777, 'epoch': 0.23}
+{'loss': 0.6931, 'grad_norm': 1.402114987373352, 'learning_rate': 0.000276466275659824, 'epoch': 0.23}
+{'loss': 0.7895, 'grad_norm': 1.5063778162002563, 'learning_rate': 0.00027644183773216027, 'epoch': 0.23}
+{'loss': 0.6326, 'grad_norm': 2.0992255210876465, 'learning_rate': 0.0002764173998044966, 'epoch': 0.23}
+{'loss': 0.5391, 'grad_norm': 1.7399237155914307, 'learning_rate': 0.00027639296187683283, 'epoch': 0.23}
+{'loss': 0.7821, 'grad_norm': 2.003880023956299, 'learning_rate': 0.0002763685239491691, 'epoch': 0.23}
+{'loss': 0.7965, 'grad_norm': 1.794185996055603, 'learning_rate': 0.0002763440860215054, 'epoch': 0.23}
+{'loss': 0.6783, 'grad_norm': 1.893991470336914, 'learning_rate': 0.0002763196480938416, 'epoch': 0.23}
+{'loss': 0.9839, 'grad_norm': 2.7751851081848145, 'learning_rate': 0.0002762952101661779, 'epoch': 0.23}
+{'loss': 0.5881, 'grad_norm': 1.9507943391799927, 'learning_rate': 0.00027627077223851414, 'epoch': 0.23}
+{'loss': 0.6818, 'grad_norm': 2.9333903789520264, 'learning_rate': 0.0002762463343108504, 'epoch': 0.23}
+{'loss': 0.9943, 'grad_norm': 2.3272409439086914, 'learning_rate': 0.0002762218963831867, 'epoch': 0.23}
+{'loss': 1.225, 'grad_norm': 3.2229695320129395, 'learning_rate': 0.00027619745845552295, 'epoch': 0.23}
+{'loss': 1.034, 'grad_norm': 2.4953155517578125, 'learning_rate': 0.0002761730205278592, 'epoch': 0.23}
+{'loss': 1.2024, 'grad_norm': 2.438466787338257, 'learning_rate': 0.0002761485826001955, 'epoch': 0.23}
+{'loss': 1.0249, 'grad_norm': 4.884873390197754, 'learning_rate': 0.00027612414467253175, 'epoch': 0.23}
+{'loss': 1.2849, 'grad_norm': 2.4104747772216797, 'learning_rate': 0.000276099706744868, 'epoch': 0.23}
+{'loss': 0.8408, 'grad_norm': 3.2531588077545166, 'learning_rate': 0.00027607526881720426, 'epoch': 0.23}
+{'loss': 1.4731, 'grad_norm': 2.4069766998291016, 'learning_rate': 0.00027605083088954056, 'epoch': 0.23}
+{'loss': 1.1409, 'grad_norm': 2.5518875122070312, 'learning_rate': 0.0002760263929618768, 'epoch': 0.23}
+{'loss': 1.2555, 'grad_norm': 2.899477958679199, 'learning_rate': 0.00027600195503421306, 'epoch': 0.23}
+{'loss': 0.8074, 'grad_norm': 1.8513123989105225, 'learning_rate': 0.00027597751710654937, 'epoch': 0.23}
+{'loss': 1.7678, 'grad_norm': 2.3798718452453613, 'learning_rate': 0.00027595307917888557, 'epoch': 0.23}
+{'loss': 1.5967, 'grad_norm': 2.9262709617614746, 'learning_rate': 0.00027592864125122187, 'epoch': 0.23}
+{'loss': 1.9874, 'grad_norm': 2.1684608459472656, 'learning_rate': 0.0002759042033235581, 'epoch': 0.23}
+{'loss': 0.7234, 'grad_norm': 2.6140377521514893, 'learning_rate': 0.0002758797653958944, 'epoch': 0.23}
+{'loss': 1.0605, 'grad_norm': 2.1321074962615967, 'learning_rate': 0.0002758553274682307, 'epoch': 0.23}
+{'loss': 1.8914, 'grad_norm': 2.2342660427093506, 'learning_rate': 0.00027583088954056693, 'epoch': 0.23}
+{'loss': 1.8859, 'grad_norm': 3.242147445678711, 'learning_rate': 0.0002758064516129032, 'epoch': 0.23}
+{'loss': 1.3996, 'grad_norm': 4.4763946533203125, 'learning_rate': 0.0002757820136852395, 'epoch': 0.23}
+{'loss': 0.9282, 'grad_norm': 2.6398067474365234, 'learning_rate': 0.00027575757575757574, 'epoch': 0.23}
+{'loss': 0.9894, 'grad_norm': 1.8949030637741089, 'learning_rate': 0.000275733137829912, 'epoch': 0.23}
+{'loss': 0.7834, 'grad_norm': 1.6546046733856201, 'learning_rate': 0.00027570869990224824, 'epoch': 0.23}
+{'loss': 1.1045, 'grad_norm': 1.8807889223098755, 'learning_rate': 0.00027568426197458455, 'epoch': 0.23}
+{'loss': 0.5635, 'grad_norm': 0.7086061835289001, 'learning_rate': 0.0002756598240469208, 'epoch': 0.23}
+{'loss': 0.6135, 'grad_norm': 1.064099669456482, 'learning_rate': 0.00027563538611925705, 'epoch': 0.24}
+{'loss': 0.4156, 'grad_norm': 0.6339705586433411, 'learning_rate': 0.00027561094819159336, 'epoch': 0.24}
+{'loss': 0.5255, 'grad_norm': 0.6692638397216797, 'learning_rate': 0.0002755865102639296, 'epoch': 0.24}
+{'loss': 0.4684, 'grad_norm': 0.6374533772468567, 'learning_rate': 0.00027556207233626586, 'epoch': 0.24}
+{'loss': 0.4455, 'grad_norm': 0.621849000453949, 'learning_rate': 0.00027553763440860216, 'epoch': 0.24}
+{'loss': 0.5984, 'grad_norm': 1.2488616704940796, 'learning_rate': 0.00027551319648093836, 'epoch': 0.24}
+{'loss': 0.5569, 'grad_norm': 0.7494067549705505, 'learning_rate': 0.00027548875855327467, 'epoch': 0.24}
+{'loss': 1.5669, 'grad_norm': 5.239253520965576, 'learning_rate': 0.0002754643206256109, 'epoch': 0.24}
+{'loss': 0.4972, 'grad_norm': 1.0418435335159302, 'learning_rate': 0.00027543988269794717, 'epoch': 0.24}
+{'loss': 0.7008, 'grad_norm': 1.3810733556747437, 'learning_rate': 0.0002754154447702835, 'epoch': 0.24}
+{'loss': 0.598, 'grad_norm': 1.964684247970581, 'learning_rate': 0.0002753910068426197, 'epoch': 0.24}
+{'loss': 0.7735, 'grad_norm': 1.2020512819290161, 'learning_rate': 0.000275366568914956, 'epoch': 0.24}
+{'loss': 0.6185, 'grad_norm': 1.2619682550430298, 'learning_rate': 0.00027534213098729223, 'epoch': 0.24}
+{'loss': 0.6125, 'grad_norm': 1.121608018875122, 'learning_rate': 0.00027531769305962853, 'epoch': 0.24}
+{'loss': 0.5002, 'grad_norm': 1.6488595008850098, 'learning_rate': 0.0002752932551319648, 'epoch': 0.24}
+{'loss': 0.9657, 'grad_norm': 2.0960159301757812, 'learning_rate': 0.00027526881720430104, 'epoch': 0.24}
+{'loss': 0.6478, 'grad_norm': 1.6600353717803955, 'learning_rate': 0.00027524437927663734, 'epoch': 0.24}
+{'loss': 0.8878, 'grad_norm': 1.770453691482544, 'learning_rate': 0.0002752199413489736, 'epoch': 0.24}
+{'loss': 1.122, 'grad_norm': 1.5966682434082031, 'learning_rate': 0.00027519550342130984, 'epoch': 0.24}
+{'loss': 0.6308, 'grad_norm': 1.4263012409210205, 'learning_rate': 0.00027517106549364615, 'epoch': 0.24}
+{'loss': 0.8983, 'grad_norm': 3.0444369316101074, 'learning_rate': 0.00027514662756598235, 'epoch': 0.24}
+{'loss': 0.6754, 'grad_norm': 1.5377203226089478, 'learning_rate': 0.00027512218963831865, 'epoch': 0.24}
+{'loss': 0.8112, 'grad_norm': 1.37147855758667, 'learning_rate': 0.0002750977517106549, 'epoch': 0.24}
+{'loss': 1.0561, 'grad_norm': 1.6660315990447998, 'learning_rate': 0.00027507331378299115, 'epoch': 0.24}
+{'loss': 1.1573, 'grad_norm': 1.588474988937378, 'learning_rate': 0.00027504887585532746, 'epoch': 0.24}
+{'loss': 1.0771, 'grad_norm': 2.331662178039551, 'learning_rate': 0.0002750244379276637, 'epoch': 0.24}
+{'loss': 0.8527, 'grad_norm': 3.051088571548462, 'learning_rate': 0.00027499999999999996, 'epoch': 0.24}
+{'loss': 1.2319, 'grad_norm': 2.258211374282837, 'learning_rate': 0.00027497556207233627, 'epoch': 0.24}
+{'loss': 1.1426, 'grad_norm': 2.485002279281616, 'learning_rate': 0.0002749511241446725, 'epoch': 0.24}
+{'loss': 1.1472, 'grad_norm': 2.4680216312408447, 'learning_rate': 0.00027492668621700877, 'epoch': 0.24}
+{'loss': 1.0561, 'grad_norm': 1.421920895576477, 'learning_rate': 0.000274902248289345, 'epoch': 0.24}
+{'loss': 1.5544, 'grad_norm': 2.4300615787506104, 'learning_rate': 0.0002748778103616813, 'epoch': 0.24}
+{'loss': 0.8778, 'grad_norm': 3.0856986045837402, 'learning_rate': 0.0002748533724340176, 'epoch': 0.24}
+{'loss': 1.1826, 'grad_norm': 2.003622055053711, 'learning_rate': 0.00027482893450635383, 'epoch': 0.24}
+{'loss': 1.3446, 'grad_norm': 2.1644046306610107, 'learning_rate': 0.00027480449657869013, 'epoch': 0.24}
+{'loss': 1.2308, 'grad_norm': 2.3185083866119385, 'learning_rate': 0.00027478005865102633, 'epoch': 0.24}
+{'loss': 0.9243, 'grad_norm': 2.0810067653656006, 'learning_rate': 0.00027475562072336264, 'epoch': 0.24}
+{'loss': 1.7686, 'grad_norm': 2.293769359588623, 'learning_rate': 0.0002747311827956989, 'epoch': 0.24}
+{'loss': 1.3467, 'grad_norm': 2.4113895893096924, 'learning_rate': 0.00027470674486803514, 'epoch': 0.24}
+{'loss': 1.8965, 'grad_norm': 2.769763708114624, 'learning_rate': 0.00027468230694037145, 'epoch': 0.24}
+{'loss': 1.2388, 'grad_norm': 2.0843498706817627, 'learning_rate': 0.0002746578690127077, 'epoch': 0.24}
+{'loss': 1.3435, 'grad_norm': 3.0376205444335938, 'learning_rate': 0.00027463343108504395, 'epoch': 0.24}
+ 12%|█▏        | 1544/12776 [14:52<40:02,  4.68it/s] 12%|█▏        | 1545/12776 [14:52<38:59,  4.80it/s]                                                     12%|█▏        | 1545/12776 [14:52<38:59,  4.80it/s] 12%|█▏        | 1546/12776 [14:53<45:28,  4.12it/s]                                                     12%|█▏        | 1546/12776 [14:53<45:28,  4.12it/s] 12%|█▏        | 1547/12776 [14:53<42:36,  4.39it/s]                                                     12%|█▏        | 1547/12776 [14:53<42:36,  4.39it/s] 12%|█▏        | 1548/12776 [14:53<40:12,  4.65it/s]                                                     12%|█▏        | 1548/12776 [14:53<40:12,  4.65it/s] 12%|█▏        | 1549/12776 [14:53<38:24,  4.87it/s]                                                     12%|█▏        | 1549/12776 [14:53<38:24,  4.87it/s] 12%|█▏        | 1550/12776 [14:54<1:06:02,  2.83it/s]                                                       12%|█▏        | 1550/12776 [14:54<1:06:02,  2.83it/s] 12%|█▏        | 1551/12776 [14:55<2:20:29,  1.33it/s]                                                       12%|█▏        | 1551/12776 [14:55<2:20:29,  1.33it/s] 12%|█▏        | 1552/12776 [14:56<2:33:56,  1.22it/s]                                                       12%|█▏        | 1552/12776 [14:56<2:33:56,  1.22it/s] 12%|█▏        | 1553/12776 [14:57<2:40:19,  1.17it/s]                                                       12%|█▏        | 1553/12776 [14:57<2:40:19,  1.17it/s] 12%|█▏        | 1554/12776 [14:58<2:37:09,  1.19it/s]                                                       12%|█▏        | 1554/12776 [14:58<2:37:09,  1.19it/s] 12%|█▏        | 1555/12776 [14:59<2:31:29,  1.23it/s]                                                       12%|█▏        | 1555/12776 [14:59<2:31:29,  1.23it/s] 12%|█▏        | 1556/12776 [15:00<2:30:55,  1.24it/s]                                                       12%|█▏        | 1556/12776 [15:00<2:30:55,  1.24it/s] 12%|█▏        | 1557/12776 [15:01<2:28:33,  1.26it/s]                                                       12%|█▏        | 1557/12776 [15:01<2:28:33,  1.26it/s] 12%|█▏        | 1558/12776 [15:01<2:20:08,  1.33it/s]                                                       12%|█▏        | 1558/12776 [15:01<2:20:08,  1.33it/s] 12%|█▏        | 1559/12776 [15:02<2:19:34,  1.34it/s]                                                       12%|█▏        | 1559/12776 [15:02<2:19:34,  1.34it/s] 12%|█▏        | 1560/12776 [15:03<2:10:49,  1.43it/s]                                                       12%|█▏        | 1560/12776 [15:03<2:10:49,  1.43it/s] 12%|█▏        | 1561/12776 [15:03<2:06:41,  1.48it/s]                                                       12%|█▏        | 1561/12776 [15:03<2:06:41,  1.48it/s] 12%|█▏        | 1562/12776 [15:04<1:58:50,  1.57it/s]                                                       12%|█▏        | 1562/12776 [15:04<1:58:50,  1.57it/s] 12%|█▏        | 1563/12776 [15:04<1:56:26,  1.60it/s]                                                       12%|█▏        | 1563/12776 [15:04<1:56:26,  1.60it/s] 12%|█▏        | 1564/12776 [15:05<1:48:51,  1.72it/s]                                                       12%|█▏        | 1564/12776 [15:05<1:48:51,  1.72it/s] 12%|█▏        | 1565/12776 [15:05<1:45:15,  1.78it/s]                                                       12%|█▏        | 1565/12776 [15:05<1:45:15,  1.78it/s] 12%|█▏        | 1566/12776 [15:06<1:38:07,  1.90it/s]                                                       12%|█▏        | 1566/12776 [15:06<1:38:07,  1.90it/s] 12%|█▏        | 1567/12776 [15:06<1:36:47,  1.93it/s]                                                       12%|█▏        | 1567/12776 [15:06<1:36:47,  1.93it/s] 12%|█▏        | 1568/12776 [15:07<1:30:37,  2.06it/s]                                                       12%|█▏        | 1568/12776 [15:07<1:30:37,  2.06it/s] 12%|█▏        | 1569/12776 [15:07<1:25:47,  2.18it/s]                                                       12%|█▏        | 1569/12776 [15:07<1:25:47,  2.18it/s] 12%|█▏        | 1570/12776 [15:08<1:27:56,  2.12it/s]                                                       12%|█▏        | 1570/12776 [15:08<1:27:56,  2.12it/s] 12%|█▏        | 1571/12776 [15:08<1:22:16,  2.27it/s]                                                       12%|█▏        | 1571/12776 [15:08<1:22:16,  2.27it/s] 12%|█▏        | 1572/12776 [15:08<1:17:15,  2.42it/s]                                                       12%|█▏        | 1572/12776 [15:08<1:17:15,  2.42it/s] 12%|█▏        | 1573/12776 [15:09<1:18:55,  2.37it/s]                                                       12%|█▏        | 1573/12776 [15:09<1:18:55,  2.37it/s] 12%|█▏        | 1574/12776 [15:09<1:14:05,  2.52it/s]                                                       12%|█▏        | 1574/12776 [15:09<1:14:05,  2.52it/s] 12%|█▏        | 1575/12776 [15:09<1:10:00,  2.67it/s]                                                       12%|█▏        | 1575/12776 [15:09<1:10:00,  2.67it/s] 12%|█▏        | 1576/12776 [15:10<1:09:59,  2.67it/s]                                                       12%|█▏        | 1576/12776 [15:10<1:09:59,  2.67it/s] 12%|█▏        | 1577/12776 [15:10<1:05:34,  2.85it/s]                                                       12%|█▏        | 1577/12776 [15:10<1:05:34,  2.85it/s] 12%|█▏        | 1578/12776 [15:10<1:01:43,  3.02it/s]                                                       12%|█▏        | 1578/12776 [15:10<1:01:43,  3.02it/s] 12%|█▏        | 1579/12776 [15:11<1:03:53,  2.92it/s]                                                       12%|█▏        | 1579/12776 [15:11<1:03:53,  2.92it/s] 12%|█▏        | 1580/12776 [15:11<59:46,  3.12it/s]                                                       12%|█▏        | 1580/12776 [15:11<59:46,  3.12it/s] 12%|█▏        | 1581/12776 [15:11<55:54,  3.34it/s]                                                     12%|█▏        | 1581/12776 [15:11<55:54,  3.34it/s] 12%|█▏        | 1582/12776 [15:11<53:17,  3.50it/s]                                                     12%|█▏        | 1582/12776 [15:11<53:17,  3.50it/s] 12%|█▏        | 1583/12776 [15:12<56:15,  3.32it/s]                                                     12%|█▏        | 1583/12776 [15:12<56:15,  3.32it/s] 12%|█▏        | 1584/12776 [15:12<52:55,  3.52it/s]                                                     12%|█▏        | 1584/12776 [15:12<52:55,  3.52it/s] 12%|█▏        | 1585/12776 [15:12<50:12,  3.72it/s]                                                     12%|█▏        | 1585/12776 [15:12<50:12,  3.72it/s] 12%|█▏        | 1586/12776 [15:12<47:53,  3.89it/s]                                                     12%|█▏        | 1586/12776 [15:12<47:53,  3.89it/s] 12%|█▏        | 1587/12776 [15:13<45:58,  4.06it/s]                                                     12%|█▏        | 1587/12776 [15:13<45:58,  4.06it/s] 12%|█▏        | 1588/12776 [15:13<49:29,  3.77it/s]                                                     12%|█▏        | 1588/12776 [15:13<49:29,  3.77it/s] 12%|█▏        | 1589/12776 [15:13<46:17,  4.03it/s]                                                     12%|█▏        | 1589/12776 [15:13<46:17,  4.03it/s] 12%|█▏        | 1590/12776 [15:13<44:03,  4.23it/s]                                                     12%|█▏        | 1590/12776 [15:13<44:03,  4.23it/s] 12%|█▏        | 1591/12776 [15:14<42:18,  4.41it/s]                                                     12%|█▏        | 1591/12776 [15:14<42:18,  4.41it/s] 12%|█▏        | 1592/12776 [15:14<41:15,  4.52it/s]                                                     12%|█▏        | 1592/12776 [15:14<41:15,  4.52it/s] 12%|█▏        | 1593/12776 [15:14<47:26,  3.93it/s]                                                     12%|█▏        | 1593/12776 [15:14<47:26,  3.93it/s] 12%|█▏        | 1594/12776 [15:14<44:21,  4.20it/s]                                                     12%|█▏        | 1594/12776 [15:14<44:21,  4.20it/s] 12%|█▏        | 1595/12776 [15:15<42:02,  4.43it/s]                                                     12%|█▏        | 1595/12776 [15:15<42:02,  4.43it/s] 12%|█▏        | 1596/12776 [15:15<40:19,  4.62it/s]                                                     12%|█▏        | 1596/12776 [15:15<40:19,  4.62it/s] 12%|█▎        | 1597/12776 [15:15<38:53,  4.79it/s]                                                     12%|█▎        | 1597/12776 [15:15<38:53,  4.79it/s] 13%|█▎        | 1598/12776 [15:15<41:32,  4.48it/s]                                                     13%|█▎        | 1598/12776 [15:15<41:32,  4.48it/s] 13%|█▎        | 1599/12776 [15:15<39:21,  4.73it/s]                                                     13%|█▎        | 1599/12776 [15:15<39:21,  4.73it/s] 13%|█▎        | 1600/12776 [15:16<1:05:27,  2.85it/s]                                                       13%|█▎        | 1600/12776 [15:16<1:05:27,  2.85it/s]Saving model checkpoint to ./checkpoint-1600
+Configuration saved in ./checkpoint-1600/config.json
+Model weights saved in ./checkpoint-1600/model.safetensors
+Feature extractor saved in ./checkpoint-1600/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-1600/tokenizer_config.json
+Special tokens file saved in ./checkpoint-1600/special_tokens_map.json
+added tokens file saved in ./checkpoint-1600/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-400] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 13%|█▎        | 1601/12776 [15:23<6:50:08,  2.20s/it]                                                       13%|█▎        | 1601/12776 [15:23<6:50:08,  2.20s/it] 13%|█▎        | 1602/12776 [15:24<5:40:44,  1.83s/it]                                                       13%|█▎        | 1602/12776 [15:24<5:40:44,  1.83s/it] 13%|█▎        | 1603/12776 [15:24<4:44:40,  1.53s/it]                                                       13%|█▎        | 1603/12776 [15:24<4:44:40,  1.53s/it] 13%|█▎        | 1604/12776 [15:25<4:01:26,  1.30s/it]                                                       13%|█▎        | 1604/12776 [15:25<4:01:26,  1.30s/it] 13%|█▎        | 1605/12776 [15:26<3:33:14,  1.15s/it]                                                       13%|█▎        | 1605/12776 [15:26<3:33:14,  1.15s/it] 13%|█▎        | 1606/12776 [15:27<3:12:53,  1.04s/it]                                                       13%|█▎        | 1606/12776 [15:27<3:12:53,  1.04s/it] 13%|█▎        | 1607/12776 [15:27<2:50:04,  1.09it/s]                                                       13%|█▎        | 1607/12776 [15:27<2:50:04,  1.09it/s] 13%|█▎        | 1608/12776 [15:28<2:40:00,  1.16it/s]                                                       13%|█▎        | 1608/12776 [15:28<2:40:00,  1.16it/s] 13%|█▎        | 1609/12776 [15:29<2:23:52,  1.29it/s]                                                       13%|█▎        | 1609/12776 [15:29<2:23:52,  1.29it/s] 13%|█▎        | 1610/12776 [15:29<2:14:57,  1.38it/s]                                                       13%|█▎        | 1610/12776 [15:29<2:14:57,  1.38it/s] 13%|█▎        | 1611/12776 [15:30<2:02:42,  1.52it/s]                                                       13%|█▎        | 1611/12776 [15:30<2:02:42,  1.52it/s] 13%|█▎        | 1612/12776 [15:30<2:01:26,  1.53it/s]                                                       13%|█▎        | 1612/12776 [15:30<2:01:26,  1.53it/s] 13%|█▎        | 1613/12776 [15:31<1:50:38,  1.68it/s]                                                       13%|█▎        | 1613/12776 [15:31<1:50:38,  1.68it/s] 13%|█▎        | 1614/12776 [15:31<1:45:27,  1.76it/s]                                                       13%|█▎        | 1614/12776 [15:31<1:45:27,  1.76it/s] 13%|█▎        | 1615/12776 [15:32<1:36:37,  1.93it/s]                                                       13%|█▎        | 1615/12776 [15:32<1:36:37,  1.93it/s] 13%|█▎        | 1616/12776 [15:32<1:29:42,  2.07it/s]                                                       13%|█▎        | 1616/12776 [15:32<1:29:42,  2.07it/s] 13%|█▎        | 1617/12776 [15:33<1:31:30,  2.03it/s]                                                       13%|█▎        | 1617/12776 [15:33<1:31:30,  2.03it/s] 13%|█▎        | 1618/12776 [15:33<1:25:03,  2.19it/s]                                                       13%|█▎        | 1618/12776 [15:33<1:25:03,  2.19it/s] 13%|█▎        | 1619/12776 [15:33<1:19:19,  2.34it/s]                                                       13%|█▎        | 1619/12776 [15:33<1:19:19,  2.34it/s] 13%|█▎        | 1620/12776 [15:34<1:20:30,  2.31it/s]                                                       13%|█▎        | 1620/12776 [15:34<1:20:30,  2.31it/s] 13%|█▎        | 1621/12776 [15:34<1:14:35,  2.49it/s]                                                       13%|█▎        | 1621/12776 [15:34<1:14:35,  2.49it/s] 13%|█▎        | 1622/12776 [15:34<1:09:50,  2.66it/s]                                                      {'loss': 1.5941, 'grad_norm': 4.603314399719238, 'learning_rate': 0.00027460899315738025, 'epoch': 0.24}
+{'loss': 1.5791, 'grad_norm': 3.8465962409973145, 'learning_rate': 0.0002745845552297165, 'epoch': 0.24}
+{'loss': 1.0725, 'grad_norm': 1.3982967138290405, 'learning_rate': 0.00027456011730205276, 'epoch': 0.24}
+{'loss': 1.2196, 'grad_norm': 2.9826741218566895, 'learning_rate': 0.000274535679374389, 'epoch': 0.24}
+{'loss': 1.1231, 'grad_norm': 4.3426055908203125, 'learning_rate': 0.0002745112414467253, 'epoch': 0.24}
+{'loss': 1.4186, 'grad_norm': 2.9004628658294678, 'learning_rate': 0.00027448680351906156, 'epoch': 0.24}
+{'loss': 0.8112, 'grad_norm': 1.7319072484970093, 'learning_rate': 0.0002744623655913978, 'epoch': 0.24}
+{'loss': 0.463, 'grad_norm': 0.6421094536781311, 'learning_rate': 0.0002744379276637341, 'epoch': 0.24}
+{'loss': 0.4454, 'grad_norm': 0.7612909078598022, 'learning_rate': 0.00027441348973607037, 'epoch': 0.24}
+{'loss': 0.5241, 'grad_norm': 0.5956478118896484, 'learning_rate': 0.0002743890518084066, 'epoch': 0.24}
+{'loss': 0.481, 'grad_norm': 0.6601691246032715, 'learning_rate': 0.0002743646138807429, 'epoch': 0.24}
+{'loss': 0.565, 'grad_norm': 0.757583498954773, 'learning_rate': 0.0002743401759530791, 'epoch': 0.24}
+{'loss': 0.555, 'grad_norm': 1.0948114395141602, 'learning_rate': 0.00027431573802541543, 'epoch': 0.24}
+{'loss': 0.4812, 'grad_norm': 0.9005800485610962, 'learning_rate': 0.0002742913000977517, 'epoch': 0.24}
+{'loss': 0.5122, 'grad_norm': 1.3342481851577759, 'learning_rate': 0.00027426686217008793, 'epoch': 0.24}
+{'loss': 0.3781, 'grad_norm': 0.7070523500442505, 'learning_rate': 0.00027424242424242424, 'epoch': 0.24}
+{'loss': 0.5231, 'grad_norm': 0.6875860095024109, 'learning_rate': 0.0002742179863147605, 'epoch': 0.24}
+{'loss': 0.5012, 'grad_norm': 1.033418893814087, 'learning_rate': 0.00027419354838709674, 'epoch': 0.24}
+{'loss': 0.6205, 'grad_norm': 2.32181978225708, 'learning_rate': 0.000274169110459433, 'epoch': 0.24}
+{'loss': 0.5589, 'grad_norm': 1.1538715362548828, 'learning_rate': 0.0002741446725317693, 'epoch': 0.24}
+{'loss': 0.5474, 'grad_norm': 1.1103618144989014, 'learning_rate': 0.00027412023460410555, 'epoch': 0.24}
+{'loss': 0.5257, 'grad_norm': 2.351726531982422, 'learning_rate': 0.0002740957966764418, 'epoch': 0.24}
+{'loss': 0.6285, 'grad_norm': 1.7007243633270264, 'learning_rate': 0.0002740713587487781, 'epoch': 0.25}
+{'loss': 0.9199, 'grad_norm': 1.454067349433899, 'learning_rate': 0.00027404692082111436, 'epoch': 0.25}
+{'loss': 0.7621, 'grad_norm': 0.8819766640663147, 'learning_rate': 0.0002740224828934506, 'epoch': 0.25}
+{'loss': 1.0348, 'grad_norm': 1.5095868110656738, 'learning_rate': 0.0002739980449657869, 'epoch': 0.25}
+{'loss': 0.8568, 'grad_norm': 0.8652186393737793, 'learning_rate': 0.0002739736070381231, 'epoch': 0.25}
+{'loss': 0.5486, 'grad_norm': 1.0354799032211304, 'learning_rate': 0.0002739491691104594, 'epoch': 0.25}
+{'loss': 0.9729, 'grad_norm': 1.751391887664795, 'learning_rate': 0.00027392473118279567, 'epoch': 0.25}
+{'loss': 1.1064, 'grad_norm': 1.665823221206665, 'learning_rate': 0.0002739002932551319, 'epoch': 0.25}
+{'loss': 0.7137, 'grad_norm': 1.3109946250915527, 'learning_rate': 0.0002738758553274682, 'epoch': 0.25}
+{'loss': 1.0663, 'grad_norm': 3.6172003746032715, 'learning_rate': 0.0002738514173998045, 'epoch': 0.25}
+{'loss': 0.8352, 'grad_norm': 1.8106416463851929, 'learning_rate': 0.0002738269794721407, 'epoch': 0.25}
+{'loss': 1.143, 'grad_norm': 1.7313213348388672, 'learning_rate': 0.00027380254154447703, 'epoch': 0.25}
+{'loss': 0.8652, 'grad_norm': 2.3070340156555176, 'learning_rate': 0.0002737781036168133, 'epoch': 0.25}
+{'loss': 1.108, 'grad_norm': 1.9005930423736572, 'learning_rate': 0.00027375366568914953, 'epoch': 0.25}
+{'loss': 0.9733, 'grad_norm': 1.746368408203125, 'learning_rate': 0.0002737292277614858, 'epoch': 0.25}
+{'loss': 0.8982, 'grad_norm': 2.9574499130249023, 'learning_rate': 0.0002737047898338221, 'epoch': 0.25}
+{'loss': 1.5592, 'grad_norm': 2.634615659713745, 'learning_rate': 0.00027368035190615834, 'epoch': 0.25}
+{'loss': 1.8192, 'grad_norm': 2.2195236682891846, 'learning_rate': 0.0002736559139784946, 'epoch': 0.25}
+{'loss': 1.2147, 'grad_norm': 3.19441294670105, 'learning_rate': 0.0002736314760508309, 'epoch': 0.25}
+{'loss': 1.0827, 'grad_norm': 1.6516034603118896, 'learning_rate': 0.0002736070381231671, 'epoch': 0.25}
+{'loss': 1.8071, 'grad_norm': 3.0163512229919434, 'learning_rate': 0.0002735826001955034, 'epoch': 0.25}
+{'loss': 1.1725, 'grad_norm': 3.311854362487793, 'learning_rate': 0.00027355816226783965, 'epoch': 0.25}
+{'loss': 1.9088, 'grad_norm': 2.583314895629883, 'learning_rate': 0.0002735337243401759, 'epoch': 0.25}
+{'loss': 2.057, 'grad_norm': 3.644285202026367, 'learning_rate': 0.0002735092864125122, 'epoch': 0.25}
+{'loss': 1.0609, 'grad_norm': 2.473665475845337, 'learning_rate': 0.00027348484848484846, 'epoch': 0.25}
+{'loss': 1.9401, 'grad_norm': 2.75217342376709, 'learning_rate': 0.0002734604105571847, 'epoch': 0.25}
+{'loss': 1.9969, 'grad_norm': 4.07191801071167, 'learning_rate': 0.000273435972629521, 'epoch': 0.25}
+{'loss': 1.451, 'grad_norm': 2.538170337677002, 'learning_rate': 0.00027341153470185727, 'epoch': 0.25}
+{'loss': 1.7211, 'grad_norm': 1.8366115093231201, 'learning_rate': 0.0002733870967741935, 'epoch': 0.25}
+{'loss': 1.3786, 'grad_norm': 2.4829647541046143, 'learning_rate': 0.00027336265884652977, 'epoch': 0.25}
+{'loss': 1.4635, 'grad_norm': 2.2429263591766357, 'learning_rate': 0.0002733382209188661, 'epoch': 0.25}
+{'loss': 0.9664, 'grad_norm': 2.1900274753570557, 'learning_rate': 0.00027331378299120233, 'epoch': 0.25}
+{'loss': 1.2012, 'grad_norm': 1.736470341682434, 'learning_rate': 0.0002732893450635386, 'epoch': 0.25}
+{'loss': 0.5847, 'grad_norm': 2.750772714614868, 'learning_rate': 0.0002732649071358749, 'epoch': 0.25}
+{'loss': 1.3006, 'grad_norm': 3.358440637588501, 'learning_rate': 0.00027324046920821114, 'epoch': 0.25}
+{'loss': 0.474, 'grad_norm': 0.7446082830429077, 'learning_rate': 0.0002732160312805474, 'epoch': 0.25}
+{'loss': 0.4473, 'grad_norm': 0.5928311347961426, 'learning_rate': 0.00027319159335288364, 'epoch': 0.25}
+{'loss': 0.4695, 'grad_norm': 0.8611072897911072, 'learning_rate': 0.0002731671554252199, 'epoch': 0.25}
+{'loss': 0.5228, 'grad_norm': 0.6917060613632202, 'learning_rate': 0.0002731427174975562, 'epoch': 0.25}
+{'loss': 1.0444, 'grad_norm': 1.070335865020752, 'learning_rate': 0.00027311827956989245, 'epoch': 0.25}
+{'loss': 0.7059, 'grad_norm': 1.0247135162353516, 'learning_rate': 0.0002730938416422287, 'epoch': 0.25}
+{'loss': 0.5032, 'grad_norm': 1.059765100479126, 'learning_rate': 0.000273069403714565, 'epoch': 0.25}
+{'loss': 0.4397, 'grad_norm': 0.8535358905792236, 'learning_rate': 0.00027304496578690125, 'epoch': 0.25}
+{'loss': 0.6977, 'grad_norm': 1.4279924631118774, 'learning_rate': 0.0002730205278592375, 'epoch': 0.25}
+{'loss': 0.4416, 'grad_norm': 0.8265482187271118, 'learning_rate': 0.00027299608993157376, 'epoch': 0.25}
+{'loss': 0.4451, 'grad_norm': 1.0426527261734009, 'learning_rate': 0.00027297165200391006, 'epoch': 0.25}
+{'loss': 0.7127, 'grad_norm': 1.2460665702819824, 'learning_rate': 0.0002729472140762463, 'epoch': 0.25}
+{'loss': 0.5075, 'grad_norm': 1.153203010559082, 'learning_rate': 0.00027292277614858257, 'epoch': 0.25}
+{'loss': 0.6462, 'grad_norm': 1.497168779373169, 'learning_rate': 0.00027289833822091887, 'epoch': 0.25}
+{'loss': 0.5667, 'grad_norm': 0.8022387027740479, 'learning_rate': 0.0002728739002932551, 'epoch': 0.25}
+{'loss': 0.4302, 'grad_norm': 1.270932912826538, 'learning_rate': 0.0002728494623655914, 'epoch': 0.25}
+{'loss': 0.7754, 'grad_norm': 2.111393928527832, 'learning_rate': 0.0002728250244379277, 'epoch': 0.25}
+{'loss': 0.5654, 'grad_norm': 0.9829224348068237, 'learning_rate': 0.0002728005865102639, 'epoch': 0.25}
+{'loss': 0.4941, 'grad_norm': 1.0931804180145264, 'learning_rate': 0.0002727761485826002, 'epoch': 0.25}
+{'loss': 0.8934, 'grad_norm': 1.211100459098816, 'learning_rate': 0.00027275171065493643, 'epoch': 0.25}
+{'loss': 1.0193, 'grad_norm': 1.7143733501434326, 'learning_rate': 0.0002727272727272727, 'epoch': 0.25}
+ 13%|█▎        | 1622/12776 [15:34<1:09:50,  2.66it/s] 13%|█▎        | 1623/12776 [15:35<1:12:38,  2.56it/s]                                                       13%|█▎        | 1623/12776 [15:35<1:12:38,  2.56it/s] 13%|█▎        | 1624/12776 [15:35<1:07:57,  2.74it/s]                                                       13%|█▎        | 1624/12776 [15:35<1:07:57,  2.74it/s] 13%|█▎        | 1625/12776 [15:36<1:03:46,  2.91it/s]                                                       13%|█▎        | 1625/12776 [15:36<1:03:46,  2.91it/s] 13%|█▎        | 1626/12776 [15:36<1:06:20,  2.80it/s]                                                       13%|█▎        | 1626/12776 [15:36<1:06:20,  2.80it/s] 13%|█▎        | 1627/12776 [15:36<1:01:13,  3.04it/s]                                                       13%|█▎        | 1627/12776 [15:36<1:01:13,  3.04it/s] 13%|█▎        | 1628/12776 [15:36<57:24,  3.24it/s]                                                       13%|█▎        | 1628/12776 [15:36<57:24,  3.24it/s] 13%|█▎        | 1629/12776 [15:37<54:16,  3.42it/s]                                                     13%|█▎        | 1629/12776 [15:37<54:16,  3.42it/s] 13%|█▎        | 1630/12776 [15:37<59:41,  3.11it/s]                                                     13%|█▎        | 1630/12776 [15:37<59:41,  3.11it/s] 13%|█▎        | 1631/12776 [15:37<55:22,  3.35it/s]                                                     13%|█▎        | 1631/12776 [15:37<55:22,  3.35it/s] 13%|█▎        | 1632/12776 [15:38<51:41,  3.59it/s]                                                     13%|█▎        | 1632/12776 [15:38<51:41,  3.59it/s] 13%|█▎        | 1633/12776 [15:38<48:48,  3.80it/s]                                                     13%|█▎        | 1633/12776 [15:38<48:48,  3.80it/s] 13%|█▎        | 1634/12776 [15:38<46:21,  4.01it/s]                                                     13%|█▎        | 1634/12776 [15:38<46:21,  4.01it/s] 13%|█▎        | 1635/12776 [15:38<49:16,  3.77it/s]                                                     13%|█▎        | 1635/12776 [15:38<49:16,  3.77it/s] 13%|█▎        | 1636/12776 [15:39<46:06,  4.03it/s]                                                     13%|█▎        | 1636/12776 [15:39<46:06,  4.03it/s] 13%|█▎        | 1637/12776 [15:39<43:21,  4.28it/s]                                                     13%|█▎        | 1637/12776 [15:39<43:21,  4.28it/s] 13%|█▎        | 1638/12776 [15:39<41:06,  4.52it/s]                                                     13%|█▎        | 1638/12776 [15:39<41:06,  4.52it/s] 13%|█▎        | 1639/12776 [15:39<39:12,  4.73it/s]                                                     13%|█▎        | 1639/12776 [15:39<39:12,  4.73it/s] 13%|█▎        | 1640/12776 [15:39<43:58,  4.22it/s]                                                     13%|█▎        | 1640/12776 [15:39<43:58,  4.22it/s] 13%|█▎        | 1641/12776 [15:40<41:01,  4.52it/s]                                                     13%|█▎        | 1641/12776 [15:40<41:01,  4.52it/s] 13%|█▎        | 1642/12776 [15:40<38:52,  4.77it/s]                                                     13%|█▎        | 1642/12776 [15:40<38:52,  4.77it/s] 13%|█▎        | 1643/12776 [15:40<37:07,  5.00it/s]                                                     13%|█▎        | 1643/12776 [15:40<37:07,  5.00it/s] 13%|█▎        | 1644/12776 [15:40<35:52,  5.17it/s]                                                     13%|█▎        | 1644/12776 [15:40<35:52,  5.17it/s] 13%|█▎        | 1645/12776 [15:40<34:47,  5.33it/s]                                                     13%|█▎        | 1645/12776 [15:40<34:47,  5.33it/s] 13%|█▎        | 1646/12776 [15:41<37:33,  4.94it/s]                                                     13%|█▎        | 1646/12776 [15:41<37:33,  4.94it/s] 13%|█▎        | 1647/12776 [15:41<35:45,  5.19it/s]                                                     13%|█▎        | 1647/12776 [15:41<35:45,  5.19it/s] 13%|█▎        | 1648/12776 [15:41<34:08,  5.43it/s]                                                     13%|█▎        | 1648/12776 [15:41<34:08,  5.43it/s] 13%|█▎        | 1649/12776 [15:41<33:01,  5.62it/s]                                                     13%|█▎        | 1649/12776 [15:41<33:01,  5.62it/s] 13%|█▎        | 1650/12776 [15:42<1:00:26,  3.07it/s]                                                       13%|█▎        | 1650/12776 [15:42<1:00:26,  3.07it/s] 13%|█▎        | 1651/12776 [15:43<2:01:23,  1.53it/s]                                                       13%|█▎        | 1651/12776 [15:43<2:01:23,  1.53it/s] 13%|█▎        | 1652/12776 [15:44<2:14:21,  1.38it/s]                                                       13%|█▎        | 1652/12776 [15:44<2:14:21,  1.38it/s] 13%|█▎        | 1653/12776 [15:45<2:18:25,  1.34it/s]                                                       13%|█▎        | 1653/12776 [15:45<2:18:25,  1.34it/s] 13%|█▎        | 1654/12776 [15:46<2:16:25,  1.36it/s]                                                       13%|█▎        | 1654/12776 [15:46<2:16:25,  1.36it/s] 13%|█▎        | 1655/12776 [15:46<2:13:49,  1.38it/s]                                                       13%|█▎        | 1655/12776 [15:46<2:13:49,  1.38it/s] 13%|█▎        | 1656/12776 [15:47<2:09:14,  1.43it/s]                                                       13%|█▎        | 1656/12776 [15:47<2:09:14,  1.43it/s] 13%|█▎        | 1657/12776 [15:48<2:12:48,  1.40it/s]                                                       13%|█▎        | 1657/12776 [15:48<2:12:48,  1.40it/s] 13%|█▎        | 1658/12776 [15:48<2:07:17,  1.46it/s]                                                       13%|█▎        | 1658/12776 [15:48<2:07:17,  1.46it/s] 13%|█▎        | 1659/12776 [15:49<2:04:33,  1.49it/s]                                                       13%|█▎        | 1659/12776 [15:49<2:04:33,  1.49it/s] 13%|█▎        | 1660/12776 [15:49<1:58:11,  1.57it/s]                                                       13%|█▎        | 1660/12776 [15:49<1:58:11,  1.57it/s] 13%|█▎        | 1661/12776 [15:50<1:54:50,  1.61it/s]                                                       13%|█▎        | 1661/12776 [15:50<1:54:50,  1.61it/s] 13%|█▎        | 1662/12776 [15:51<1:48:37,  1.71it/s]                                                       13%|█▎        | 1662/12776 [15:51<1:48:37,  1.71it/s] 13%|█▎        | 1663/12776 [15:51<1:47:03,  1.73it/s]                                                       13%|█▎        | 1663/12776 [15:51<1:47:03,  1.73it/s] 13%|█▎        | 1664/12776 [15:51<1:38:19,  1.88it/s]                                                       13%|█▎        | 1664/12776 [15:51<1:38:19,  1.88it/s] 13%|█▎        | 1665/12776 [15:52<1:34:02,  1.97it/s]                                                       13%|█▎        | 1665/12776 [15:52<1:34:02,  1.97it/s] 13%|█▎        | 1666/12776 [15:52<1:26:50,  2.13it/s]                                                       13%|█▎        | 1666/12776 [15:52<1:26:50,  2.13it/s] 13%|█▎        | 1667/12776 [15:53<1:21:24,  2.27it/s]                                                       13%|█▎        | 1667/12776 [15:53<1:21:24,  2.27it/s] 13%|█▎        | 1668/12776 [15:53<1:21:15,  2.28it/s]                                                       13%|█▎        | 1668/12776 [15:53<1:21:15,  2.28it/s] 13%|█▎        | 1669/12776 [15:53<1:16:26,  2.42it/s]                                                       13%|█▎        | 1669/12776 [15:53<1:16:26,  2.42it/s] 13%|█▎        | 1670/12776 [15:54<1:12:12,  2.56it/s]                                                       13%|█▎        | 1670/12776 [15:54<1:12:12,  2.56it/s] 13%|█▎        | 1671/12776 [15:54<1:16:38,  2.41it/s]                                                       13%|█▎        | 1671/12776 [15:54<1:16:38,  2.41it/s] 13%|█▎        | 1672/12776 [15:55<1:11:13,  2.60it/s]                                                       13%|█▎        | 1672/12776 [15:55<1:11:13,  2.60it/s] 13%|█▎        | 1673/12776 [15:55<1:06:45,  2.77it/s]                                                       13%|█▎        | 1673/12776 [15:55<1:06:45,  2.77it/s] 13%|█▎        | 1674/12776 [15:55<1:03:37,  2.91it/s]                                                       13%|█▎        | 1674/12776 [15:55<1:03:37,  2.91it/s] 13%|█▎        | 1675/12776 [15:56<1:05:42,  2.82it/s]                                                       13%|█▎        | 1675/12776 [15:56<1:05:42,  2.82it/s] 13%|█▎        | 1676/12776 [15:56<1:01:04,  3.03it/s]                                                       13%|█▎        | 1676/12776 [15:56<1:01:04,  3.03it/s] 13%|█▎        | 1677/12776 [15:56<57:44,  3.20it/s]                                                       13%|█▎        | 1677/12776 [15:56<57:44,  3.20it/s] 13%|█▎        | 1678/12776 [15:56<54:33,  3.39it/s]                                                     13%|█▎        | 1678/12776 [15:56<54:33,  3.39it/s] 13%|█▎        | 1679/12776 [15:57<55:47,  3.32it/s]                                                     13%|█▎        | 1679/12776 [15:57<55:47,  3.32it/s] 13%|█▎        | 1680/12776 [15:57<52:25,  3.53it/s]                                                     13%|█▎        | 1680/12776 [15:57<52:25,  3.53it/s] 13%|█▎        | 1681/12776 [15:57<49:21,  3.75it/s]                                                     13%|█▎        | 1681/12776 [15:57<49:21,  3.75it/s] 13%|█▎        | 1682/12776 [15:57<46:55,  3.94it/s]                                                     13%|█▎        | 1682/12776 [15:57<46:55,  3.94it/s] 13%|█▎        | 1683/12776 [15:58<49:28,  3.74it/s]                                                     13%|█▎        | 1683/12776 [15:58<49:28,  3.74it/s] 13%|█▎        | 1684/12776 [15:58<47:51,  3.86it/s]                                                     13%|█▎        | 1684/12776 [15:58<47:51,  3.86it/s] 13%|█▎        | 1685/12776 [15:58<46:19,  3.99it/s]                                                     13%|█▎        | 1685/12776 [15:58<46:19,  3.99it/s] 13%|█▎        | 1686/12776 [15:58<45:02,  4.10it/s]                                                     13%|█▎        | 1686/12776 [15:58<45:02,  4.10it/s] 13%|█▎        | 1687/12776 [15:59<43:56,  4.21it/s]                                                     13%|█▎        | 1687/12776 [15:59<43:56,  4.21it/s] 13%|█▎        | 1688/12776 [15:59<47:25,  3.90it/s]                                                     13%|█▎        | 1688/12776 [15:59<47:25,  3.90it/s] 13%|█▎        | 1689/12776 [15:59<45:42,  4.04it/s]                                                     13%|█▎        | 1689/12776 [15:59<45:42,  4.04it/s] 13%|█▎        | 1690/12776 [15:59<43:27,  4.25it/s]                                                     13%|█▎        | 1690/12776 [15:59<43:27,  4.25it/s] 13%|█▎        | 1691/12776 [16:00<41:51,  4.41it/s]                                                     13%|█▎        | 1691/12776 [16:00<41:51,  4.41it/s] 13%|█▎        | 1692/12776 [16:00<40:34,  4.55it/s]                                                     13%|█▎        | 1692/12776 [16:00<40:34,  4.55it/s] 13%|█▎        | 1693/12776 [16:00<43:23,  4.26it/s]                                                     13%|█▎        | 1693/12776 [16:00<43:23,  4.26it/s] 13%|█▎        | 1694/12776 [16:00<41:16,  4.47it/s]                                                     13%|█▎        | 1694/12776 [16:00<41:16,  4.47it/s] 13%|█▎        | 1695/12776 [16:00<39:53,  4.63it/s]                                                     13%|█▎        | 1695/12776 [16:00<39:53,  4.63it/s] 13%|█▎        | 1696/12776 [16:01<38:41,  4.77it/s]                                                     13%|█▎        | 1696/12776 [16:01<38:41,  4.77it/s] 13%|█▎        | 1697/12776 [16:01<37:44,  4.89it/s]                                                     13%|█▎        | 1697/12776 [16:01<37:44,  4.89it/s] 13%|█▎        | 1698/12776 [16:01<39:13,  4.71it/s]                                                     13%|█▎        | 1698/12776 [16:01<39:13,  4.71it/s] 13%|█▎        | 1699/12776 [16:01<37:47,  4.89it/s]                                                    {'loss': 1.0734, 'grad_norm': 1.8827818632125854, 'learning_rate': 0.000272702834799609, 'epoch': 0.25}
+{'loss': 1.0196, 'grad_norm': 1.6230251789093018, 'learning_rate': 0.00027267839687194524, 'epoch': 0.25}
+{'loss': 0.8388, 'grad_norm': 1.9909404516220093, 'learning_rate': 0.0002726539589442815, 'epoch': 0.25}
+{'loss': 0.803, 'grad_norm': 1.6396865844726562, 'learning_rate': 0.00027262952101661774, 'epoch': 0.25}
+{'loss': 0.8712, 'grad_norm': 1.7983092069625854, 'learning_rate': 0.00027260508308895405, 'epoch': 0.25}
+{'loss': 0.9533, 'grad_norm': 2.2103474140167236, 'learning_rate': 0.0002725806451612903, 'epoch': 0.25}
+{'loss': 0.9976, 'grad_norm': 1.7631006240844727, 'learning_rate': 0.00027255620723362655, 'epoch': 0.25}
+{'loss': 1.206, 'grad_norm': 3.220822811126709, 'learning_rate': 0.00027253176930596286, 'epoch': 0.26}
+{'loss': 1.0133, 'grad_norm': 1.8891594409942627, 'learning_rate': 0.0002725073313782991, 'epoch': 0.26}
+{'loss': 0.8946, 'grad_norm': 1.985506534576416, 'learning_rate': 0.00027248289345063536, 'epoch': 0.26}
+{'loss': 1.3356, 'grad_norm': 3.2439825534820557, 'learning_rate': 0.00027245845552297166, 'epoch': 0.26}
+{'loss': 0.8516, 'grad_norm': 1.7113157510757446, 'learning_rate': 0.00027243401759530786, 'epoch': 0.26}
+{'loss': 1.5037, 'grad_norm': 2.8625941276550293, 'learning_rate': 0.00027240957966764417, 'epoch': 0.26}
+{'loss': 1.167, 'grad_norm': 1.7289516925811768, 'learning_rate': 0.0002723851417399804, 'epoch': 0.26}
+{'loss': 1.2579, 'grad_norm': 1.9103654623031616, 'learning_rate': 0.00027236070381231667, 'epoch': 0.26}
+{'loss': 1.6795, 'grad_norm': 2.2144134044647217, 'learning_rate': 0.000272336265884653, 'epoch': 0.26}
+{'loss': 1.2621, 'grad_norm': 2.3531837463378906, 'learning_rate': 0.0002723118279569892, 'epoch': 0.26}
+{'loss': 1.9715, 'grad_norm': 3.0877432823181152, 'learning_rate': 0.0002722873900293255, 'epoch': 0.26}
+{'loss': 1.0049, 'grad_norm': 1.3705271482467651, 'learning_rate': 0.0002722629521016618, 'epoch': 0.26}
+{'loss': 1.4665, 'grad_norm': 4.535170078277588, 'learning_rate': 0.00027223851417399803, 'epoch': 0.26}
+{'loss': 1.2625, 'grad_norm': 2.5787107944488525, 'learning_rate': 0.0002722140762463343, 'epoch': 0.26}
+{'loss': 1.1023, 'grad_norm': 2.6251347064971924, 'learning_rate': 0.00027218963831867054, 'epoch': 0.26}
+{'loss': 0.7943, 'grad_norm': 1.5550967454910278, 'learning_rate': 0.00027216520039100684, 'epoch': 0.26}
+{'loss': 1.731, 'grad_norm': 3.5643835067749023, 'learning_rate': 0.0002721407624633431, 'epoch': 0.26}
+{'loss': 1.2029, 'grad_norm': 1.838036298751831, 'learning_rate': 0.00027211632453567934, 'epoch': 0.26}
+{'loss': 0.5492, 'grad_norm': 1.7118128538131714, 'learning_rate': 0.00027209188660801565, 'epoch': 0.26}
+{'loss': 0.6806, 'grad_norm': 1.6779016256332397, 'learning_rate': 0.00027206744868035185, 'epoch': 0.26}
+{'loss': 0.4437, 'grad_norm': 1.5262585878372192, 'learning_rate': 0.00027204301075268815, 'epoch': 0.26}
+{'loss': 1.9419, 'grad_norm': 4.473031520843506, 'learning_rate': 0.0002720185728250244, 'epoch': 0.26}
+{'loss': 0.4091, 'grad_norm': 0.5745874643325806, 'learning_rate': 0.00027199413489736065, 'epoch': 0.26}
+{'loss': 0.3966, 'grad_norm': 0.6224820017814636, 'learning_rate': 0.00027196969696969696, 'epoch': 0.26}
+{'loss': 0.6685, 'grad_norm': 1.2538436651229858, 'learning_rate': 0.0002719452590420332, 'epoch': 0.26}
+{'loss': 0.3777, 'grad_norm': 0.558447003364563, 'learning_rate': 0.00027192082111436946, 'epoch': 0.26}
+{'loss': 0.3588, 'grad_norm': 0.6431753635406494, 'learning_rate': 0.00027189638318670577, 'epoch': 0.26}
+{'loss': 0.3271, 'grad_norm': 0.5876404047012329, 'learning_rate': 0.000271871945259042, 'epoch': 0.26}
+{'loss': 0.5317, 'grad_norm': 1.1488398313522339, 'learning_rate': 0.00027184750733137827, 'epoch': 0.26}
+{'loss': 0.7552, 'grad_norm': 1.7112611532211304, 'learning_rate': 0.0002718230694037145, 'epoch': 0.26}
+{'loss': 0.5836, 'grad_norm': 0.7966535687446594, 'learning_rate': 0.00027179863147605083, 'epoch': 0.26}
+{'loss': 0.3743, 'grad_norm': 0.847993016242981, 'learning_rate': 0.0002717741935483871, 'epoch': 0.26}
+{'loss': 0.5694, 'grad_norm': 1.1151633262634277, 'learning_rate': 0.00027174975562072333, 'epoch': 0.26}
+{'loss': 0.5105, 'grad_norm': 1.010614275932312, 'learning_rate': 0.00027172531769305964, 'epoch': 0.26}
+{'loss': 0.4341, 'grad_norm': 1.286232590675354, 'learning_rate': 0.0002717008797653959, 'epoch': 0.26}
+{'loss': 0.5435, 'grad_norm': 1.0436598062515259, 'learning_rate': 0.00027167644183773214, 'epoch': 0.26}
+{'loss': 0.703, 'grad_norm': 1.534713625907898, 'learning_rate': 0.00027165200391006844, 'epoch': 0.26}
+{'loss': 0.8479, 'grad_norm': 1.9755645990371704, 'learning_rate': 0.00027162756598240464, 'epoch': 0.26}
+{'loss': 0.9621, 'grad_norm': 2.3660671710968018, 'learning_rate': 0.00027160312805474095, 'epoch': 0.26}
+{'loss': 0.6693, 'grad_norm': 0.8814460039138794, 'learning_rate': 0.0002715786901270772, 'epoch': 0.26}
+{'loss': 0.5287, 'grad_norm': 1.214966893196106, 'learning_rate': 0.00027155425219941345, 'epoch': 0.26}
+{'loss': 0.9414, 'grad_norm': 1.7603254318237305, 'learning_rate': 0.00027152981427174975, 'epoch': 0.26}
+{'loss': 0.6992, 'grad_norm': 1.4818147420883179, 'learning_rate': 0.000271505376344086, 'epoch': 0.26}
+{'loss': 0.6655, 'grad_norm': 1.955994725227356, 'learning_rate': 0.00027148093841642226, 'epoch': 0.26}
+{'loss': 0.9702, 'grad_norm': 1.3508237600326538, 'learning_rate': 0.0002714565004887585, 'epoch': 0.26}
+{'loss': 1.1312, 'grad_norm': 2.0264861583709717, 'learning_rate': 0.0002714320625610948, 'epoch': 0.26}
+{'loss': 0.8464, 'grad_norm': 1.562551736831665, 'learning_rate': 0.00027140762463343106, 'epoch': 0.26}
+{'loss': 0.9695, 'grad_norm': 1.3331489562988281, 'learning_rate': 0.0002713831867057673, 'epoch': 0.26}
+{'loss': 0.8289, 'grad_norm': 1.8584641218185425, 'learning_rate': 0.0002713587487781036, 'epoch': 0.26}
+{'loss': 0.8063, 'grad_norm': 2.1813502311706543, 'learning_rate': 0.00027133431085043987, 'epoch': 0.26}
+{'loss': 0.9312, 'grad_norm': 1.9859607219696045, 'learning_rate': 0.0002713098729227761, 'epoch': 0.26}
+{'loss': 1.1479, 'grad_norm': 4.345942974090576, 'learning_rate': 0.00027128543499511243, 'epoch': 0.26}
+{'loss': 1.2104, 'grad_norm': 2.3821566104888916, 'learning_rate': 0.0002712609970674486, 'epoch': 0.26}
+{'loss': 1.3765, 'grad_norm': 1.971257209777832, 'learning_rate': 0.00027123655913978493, 'epoch': 0.26}
+{'loss': 1.3679, 'grad_norm': 2.1938669681549072, 'learning_rate': 0.0002712121212121212, 'epoch': 0.26}
+{'loss': 0.8468, 'grad_norm': 2.430959701538086, 'learning_rate': 0.00027118768328445743, 'epoch': 0.26}
+{'loss': 1.5774, 'grad_norm': 3.02817964553833, 'learning_rate': 0.00027116324535679374, 'epoch': 0.26}
+{'loss': 0.9381, 'grad_norm': 2.2461435794830322, 'learning_rate': 0.00027113880742913, 'epoch': 0.26}
+{'loss': 0.7952, 'grad_norm': 1.5934573411941528, 'learning_rate': 0.00027111436950146624, 'epoch': 0.26}
+{'loss': 1.4084, 'grad_norm': 2.398531436920166, 'learning_rate': 0.00027108993157380255, 'epoch': 0.26}
+{'loss': 1.2535, 'grad_norm': 2.056870698928833, 'learning_rate': 0.0002710654936461388, 'epoch': 0.26}
+{'loss': 1.2529, 'grad_norm': 2.2823963165283203, 'learning_rate': 0.00027104105571847505, 'epoch': 0.26}
+{'loss': 1.3346, 'grad_norm': 1.608046293258667, 'learning_rate': 0.0002710166177908113, 'epoch': 0.26}
+{'loss': 1.4861, 'grad_norm': 3.5349557399749756, 'learning_rate': 0.0002709921798631476, 'epoch': 0.26}
+{'loss': 1.4499, 'grad_norm': 2.209035634994507, 'learning_rate': 0.00027096774193548386, 'epoch': 0.27}
+{'loss': 0.9006, 'grad_norm': 2.169724464416504, 'learning_rate': 0.0002709433040078201, 'epoch': 0.27}
+{'loss': 1.2534, 'grad_norm': 2.5454139709472656, 'learning_rate': 0.0002709188660801564, 'epoch': 0.27}
+{'loss': 0.8634, 'grad_norm': 1.16903555393219, 'learning_rate': 0.0002708944281524926, 'epoch': 0.27}
+{'loss': 1.026, 'grad_norm': 2.4895148277282715, 'learning_rate': 0.0002708699902248289, 'epoch': 0.27}
+{'loss': 0.9984, 'grad_norm': 1.859477162361145, 'learning_rate': 0.00027084555229716517, 'epoch': 0.27}
+ 13%|█▎        | 1699/12776 [16:01<37:47,  4.89it/s] 13%|█▎        | 1700/12776 [16:02<1:06:17,  2.79it/s]                                                       13%|█▎        | 1700/12776 [16:02<1:06:17,  2.79it/s] 13%|█▎        | 1701/12776 [16:03<2:09:17,  1.43it/s]                                                       13%|█▎        | 1701/12776 [16:03<2:09:17,  1.43it/s] 13%|█▎        | 1702/12776 [16:04<2:27:24,  1.25it/s]                                                       13%|█▎        | 1702/12776 [16:04<2:27:24,  1.25it/s] 13%|█▎        | 1703/12776 [16:05<2:29:16,  1.24it/s]                                                       13%|█▎        | 1703/12776 [16:05<2:29:16,  1.24it/s] 13%|█▎        | 1704/12776 [16:06<2:27:35,  1.25it/s]                                                       13%|█▎        | 1704/12776 [16:06<2:27:35,  1.25it/s] 13%|█▎        | 1705/12776 [16:07<2:24:02,  1.28it/s]                                                       13%|█▎        | 1705/12776 [16:07<2:24:02,  1.28it/s] 13%|█▎        | 1706/12776 [16:08<2:20:02,  1.32it/s]                                                       13%|█▎        | 1706/12776 [16:08<2:20:02,  1.32it/s] 13%|█▎        | 1707/12776 [16:08<2:20:12,  1.32it/s]                                                       13%|█▎        | 1707/12776 [16:08<2:20:12,  1.32it/s] 13%|█▎        | 1708/12776 [16:09<2:21:37,  1.30it/s]                                                       13%|█▎        | 1708/12776 [16:09<2:21:37,  1.30it/s] 13%|█▎        | 1709/12776 [16:10<2:12:06,  1.40it/s]                                                       13%|█▎        | 1709/12776 [16:10<2:12:06,  1.40it/s] 13%|█▎        | 1710/12776 [16:10<2:04:59,  1.48it/s]                                                       13%|█▎        | 1710/12776 [16:10<2:04:59,  1.48it/s] 13%|█▎        | 1711/12776 [16:11<1:57:55,  1.56it/s]                                                       13%|█▎        | 1711/12776 [16:11<1:57:55,  1.56it/s] 13%|█▎        | 1712/12776 [16:11<1:54:56,  1.60it/s]                                                       13%|█▎        | 1712/12776 [16:11<1:54:56,  1.60it/s] 13%|█▎        | 1713/12776 [16:12<1:47:56,  1.71it/s]                                                       13%|█▎        | 1713/12776 [16:12<1:47:56,  1.71it/s] 13%|█▎        | 1714/12776 [16:12<1:40:51,  1.83it/s]                                                       13%|█▎        | 1714/12776 [16:12<1:40:51,  1.83it/s] 13%|█▎        | 1715/12776 [16:13<1:35:18,  1.93it/s]                                                       13%|█▎        | 1715/12776 [16:13<1:35:18,  1.93it/s] 13%|█▎        | 1716/12776 [16:13<1:30:27,  2.04it/s]                                                       13%|█▎        | 1716/12776 [16:13<1:30:27,  2.04it/s] 13%|█▎        | 1717/12776 [16:14<1:27:56,  2.10it/s]                                                       13%|█▎        | 1717/12776 [16:14<1:27:56,  2.10it/s] 13%|█▎        | 1718/12776 [16:14<1:23:24,  2.21it/s]                                                       13%|█▎        | 1718/12776 [16:14<1:23:24,  2.21it/s] 13%|█▎        | 1719/12776 [16:14<1:19:31,  2.32it/s]                                                       13%|█▎        | 1719/12776 [16:14<1:19:31,  2.32it/s] 13%|█▎        | 1720/12776 [16:15<1:18:35,  2.34it/s]                                                       13%|█▎        | 1720/12776 [16:15<1:18:35,  2.34it/s] 13%|█▎        | 1721/12776 [16:15<1:14:27,  2.47it/s]                                                       13%|█▎        | 1721/12776 [16:15<1:14:27,  2.47it/s] 13%|█▎        | 1722/12776 [16:16<1:10:58,  2.60it/s]                                                       13%|█▎        | 1722/12776 [16:16<1:10:58,  2.60it/s] 13%|█▎        | 1723/12776 [16:16<1:14:06,  2.49it/s]                                                       13%|█▎        | 1723/12776 [16:16<1:14:06,  2.49it/s] 13%|█▎        | 1724/12776 [16:16<1:09:49,  2.64it/s]                                                       13%|█▎        | 1724/12776 [16:16<1:09:49,  2.64it/s] 14%|█▎        | 1725/12776 [16:17<1:06:15,  2.78it/s]                                                       14%|█▎        | 1725/12776 [16:17<1:06:15,  2.78it/s] 14%|█▎        | 1726/12776 [16:17<1:03:15,  2.91it/s]                                                       14%|█▎        | 1726/12776 [16:17<1:03:15,  2.91it/s] 14%|█▎        | 1727/12776 [16:17<1:04:12,  2.87it/s]                                                       14%|█▎        | 1727/12776 [16:17<1:04:12,  2.87it/s] 14%|█▎        | 1728/12776 [16:18<1:01:02,  3.02it/s]                                                       14%|█▎        | 1728/12776 [16:18<1:01:02,  3.02it/s] 14%|█▎        | 1729/12776 [16:18<58:07,  3.17it/s]                                                       14%|█▎        | 1729/12776 [16:18<58:07,  3.17it/s] 14%|█▎        | 1730/12776 [16:18<55:45,  3.30it/s]                                                     14%|█▎        | 1730/12776 [16:18<55:45,  3.30it/s] 14%|█▎        | 1731/12776 [16:18<54:15,  3.39it/s]                                                     14%|█▎        | 1731/12776 [16:18<54:15,  3.39it/s] 14%|█▎        | 1732/12776 [16:19<52:24,  3.51it/s]                                                     14%|█▎        | 1732/12776 [16:19<52:24,  3.51it/s] 14%|█▎        | 1733/12776 [16:19<50:40,  3.63it/s]                                                     14%|█▎        | 1733/12776 [16:19<50:40,  3.63it/s] 14%|█▎        | 1734/12776 [16:19<49:11,  3.74it/s]                                                     14%|█▎        | 1734/12776 [16:19<49:11,  3.74it/s] 14%|█▎        | 1735/12776 [16:20<54:57,  3.35it/s]                                                     14%|█▎        | 1735/12776 [16:20<54:57,  3.35it/s] 14%|█▎        | 1736/12776 [16:20<51:38,  3.56it/s]                                                     14%|█▎        | 1736/12776 [16:20<51:38,  3.56it/s] 14%|█▎        | 1737/12776 [16:20<48:58,  3.76it/s]                                                     14%|█▎        | 1737/12776 [16:20<48:58,  3.76it/s] 14%|█▎        | 1738/12776 [16:20<46:51,  3.93it/s]                                                     14%|█▎        | 1738/12776 [16:20<46:51,  3.93it/s] 14%|█▎        | 1739/12776 [16:21<48:39,  3.78it/s]                                                     14%|█▎        | 1739/12776 [16:21<48:39,  3.78it/s] 14%|█▎        | 1740/12776 [16:21<45:50,  4.01it/s]                                                     14%|█▎        | 1740/12776 [16:21<45:50,  4.01it/s] 14%|█▎        | 1741/12776 [16:21<43:41,  4.21it/s]                                                     14%|█▎        | 1741/12776 [16:21<43:41,  4.21it/s] 14%|█▎        | 1742/12776 [16:21<42:09,  4.36it/s]                                                     14%|█▎        | 1742/12776 [16:21<42:09,  4.36it/s] 14%|█▎        | 1743/12776 [16:21<40:56,  4.49it/s]                                                     14%|█▎        | 1743/12776 [16:21<40:56,  4.49it/s] 14%|█▎        | 1744/12776 [16:22<44:07,  4.17it/s]                                                     14%|█▎        | 1744/12776 [16:22<44:07,  4.17it/s] 14%|█▎        | 1745/12776 [16:22<41:42,  4.41it/s]                                                     14%|█▎        | 1745/12776 [16:22<41:42,  4.41it/s] 14%|█▎        | 1746/12776 [16:22<39:47,  4.62it/s]                                                     14%|█▎        | 1746/12776 [16:22<39:47,  4.62it/s] 14%|█▎        | 1747/12776 [16:22<38:18,  4.80it/s]                                                     14%|█▎        | 1747/12776 [16:22<38:18,  4.80it/s] 14%|█▎        | 1748/12776 [16:22<36:59,  4.97it/s]                                                     14%|█▎        | 1748/12776 [16:22<36:59,  4.97it/s] 14%|█▎        | 1749/12776 [16:23<36:01,  5.10it/s]                                                     14%|█▎        | 1749/12776 [16:23<36:01,  5.10it/s] 14%|█▎        | 1750/12776 [16:23<1:04:54,  2.83it/s]                                                       14%|█▎        | 1750/12776 [16:23<1:04:54,  2.83it/s] 14%|█▎        | 1751/12776 [16:25<2:02:32,  1.50it/s]                                                       14%|█▎        | 1751/12776 [16:25<2:02:32,  1.50it/s] 14%|█▎        | 1752/12776 [16:26<2:19:30,  1.32it/s]                                                       14%|█▎        | 1752/12776 [16:26<2:19:30,  1.32it/s] 14%|█▎        | 1753/12776 [16:27<2:28:55,  1.23it/s]                                                       14%|█▎        | 1753/12776 [16:27<2:28:55,  1.23it/s] 14%|█▎        | 1754/12776 [16:27<2:30:04,  1.22it/s]                                                       14%|█▎        | 1754/12776 [16:27<2:30:04,  1.22it/s] 14%|█▎        | 1755/12776 [16:28<2:26:45,  1.25it/s]                                                       14%|█▎        | 1755/12776 [16:28<2:26:45,  1.25it/s] 14%|█▎        | 1756/12776 [16:29<2:25:34,  1.26it/s]                                                       14%|█▎        | 1756/12776 [16:29<2:25:34,  1.26it/s] 14%|█▍        | 1757/12776 [16:30<2:21:54,  1.29it/s]                                                       14%|█▍        | 1757/12776 [16:30<2:21:54,  1.29it/s] 14%|█▍        | 1758/12776 [16:30<2:14:56,  1.36it/s]                                                       14%|█▍        | 1758/12776 [16:30<2:14:56,  1.36it/s] 14%|█▍        | 1759/12776 [16:31<2:14:09,  1.37it/s]                                                       14%|█▍        | 1759/12776 [16:31<2:14:09,  1.37it/s] 14%|█▍        | 1760/12776 [16:32<2:05:28,  1.46it/s]                                                       14%|█▍        | 1760/12776 [16:32<2:05:28,  1.46it/s] 14%|█▍        | 1761/12776 [16:32<2:01:42,  1.51it/s]                                                       14%|█▍        | 1761/12776 [16:32<2:01:42,  1.51it/s] 14%|█▍        | 1762/12776 [16:33<1:53:15,  1.62it/s]                                                       14%|█▍        | 1762/12776 [16:33<1:53:15,  1.62it/s] 14%|█▍        | 1763/12776 [16:33<1:49:28,  1.68it/s]                                                       14%|█▍        | 1763/12776 [16:33<1:49:28,  1.68it/s] 14%|█▍        | 1764/12776 [16:34<1:41:33,  1.81it/s]                                                       14%|█▍        | 1764/12776 [16:34<1:41:33,  1.81it/s] 14%|█▍        | 1765/12776 [16:34<1:39:34,  1.84it/s]                                                       14%|█▍        | 1765/12776 [16:34<1:39:34,  1.84it/s] 14%|█▍        | 1766/12776 [16:35<1:32:23,  1.99it/s]                                                       14%|█▍        | 1766/12776 [16:35<1:32:23,  1.99it/s] 14%|█▍        | 1767/12776 [16:35<1:26:50,  2.11it/s]                                                       14%|█▍        | 1767/12776 [16:35<1:26:50,  2.11it/s] 14%|█▍        | 1768/12776 [16:36<1:29:09,  2.06it/s]                                                       14%|█▍        | 1768/12776 [16:36<1:29:09,  2.06it/s] 14%|█▍        | 1769/12776 [16:36<1:23:20,  2.20it/s]                                                       14%|█▍        | 1769/12776 [16:36<1:23:20,  2.20it/s] 14%|█▍        | 1770/12776 [16:36<1:18:02,  2.35it/s]                                                       14%|█▍        | 1770/12776 [16:36<1:18:02,  2.35it/s] 14%|█▍        | 1771/12776 [16:37<1:17:57,  2.35it/s]                                                       14%|█▍        | 1771/12776 [16:37<1:17:57,  2.35it/s] 14%|█▍        | 1772/12776 [16:37<1:13:22,  2.50it/s]                                                       14%|█▍        | 1772/12776 [16:37<1:13:22,  2.50it/s] 14%|█▍        | 1773/12776 [16:37<1:09:19,  2.65it/s]                                                       14%|█▍        | 1773/12776 [16:37<1:09:19,  2.65it/s] 14%|█▍        | 1774/12776 [16:38<1:11:22,  2.57it/s]                                                       14%|█▍        | 1774/12776 [16:38<1:11:22,  2.57it/s] 14%|█▍        | 1775/12776 [16:38<1:06:10,  2.77it/s]                                                       14%|█▍        | 1775/12776 [16:38<1:06:10,  2.77it/s] 14%|█▍        | 1776/12776 [16:38<1:01:59,  2.96it/s]                                                       14%|█▍        | 1776/12776 [16:38<1:01:59,  2.96it/s] 14%|█▍        | 1777/12776 [16:39<58:43,  3.12it/s]                                                      {'loss': 1.1007, 'grad_norm': 3.284623146057129, 'learning_rate': 0.0002708211143695014, 'epoch': 0.27}
+{'loss': 1.955, 'grad_norm': 3.7506582736968994, 'learning_rate': 0.0002707966764418377, 'epoch': 0.27}
+{'loss': 0.4298, 'grad_norm': 0.6009975671768188, 'learning_rate': 0.000270772238514174, 'epoch': 0.27}
+{'loss': 0.622, 'grad_norm': 1.3551870584487915, 'learning_rate': 0.00027074780058651023, 'epoch': 0.27}
+{'loss': 0.6164, 'grad_norm': 1.262035608291626, 'learning_rate': 0.00027072336265884653, 'epoch': 0.27}
+{'loss': 0.4694, 'grad_norm': 0.7926015257835388, 'learning_rate': 0.0002706989247311828, 'epoch': 0.27}
+{'loss': 0.66, 'grad_norm': 1.2867344617843628, 'learning_rate': 0.00027067448680351904, 'epoch': 0.27}
+{'loss': 0.9503, 'grad_norm': 1.134917140007019, 'learning_rate': 0.0002706500488758553, 'epoch': 0.27}
+{'loss': 0.9773, 'grad_norm': 3.0994365215301514, 'learning_rate': 0.0002706256109481916, 'epoch': 0.27}
+{'loss': 0.4688, 'grad_norm': 0.9256912469863892, 'learning_rate': 0.00027060117302052784, 'epoch': 0.27}
+{'loss': 0.6223, 'grad_norm': 1.101236343383789, 'learning_rate': 0.0002705767350928641, 'epoch': 0.27}
+{'loss': 0.4361, 'grad_norm': 0.7278368473052979, 'learning_rate': 0.0002705522971652004, 'epoch': 0.27}
+{'loss': 0.8, 'grad_norm': 2.448662757873535, 'learning_rate': 0.00027052785923753665, 'epoch': 0.27}
+{'loss': 0.5331, 'grad_norm': 1.1800535917282104, 'learning_rate': 0.0002705034213098729, 'epoch': 0.27}
+{'loss': 1.0572, 'grad_norm': 2.716262102127075, 'learning_rate': 0.00027047898338220915, 'epoch': 0.27}
+{'loss': 0.6161, 'grad_norm': 0.9216450452804565, 'learning_rate': 0.0002704545454545454, 'epoch': 0.27}
+{'loss': 0.6809, 'grad_norm': 1.1275360584259033, 'learning_rate': 0.0002704301075268817, 'epoch': 0.27}
+{'loss': 0.8089, 'grad_norm': 1.6321802139282227, 'learning_rate': 0.00027040566959921796, 'epoch': 0.27}
+{'loss': 0.9202, 'grad_norm': 1.1545872688293457, 'learning_rate': 0.0002703812316715542, 'epoch': 0.27}
+{'loss': 0.7897, 'grad_norm': 2.506645679473877, 'learning_rate': 0.0002703567937438905, 'epoch': 0.27}
+{'loss': 0.5747, 'grad_norm': 1.1123408079147339, 'learning_rate': 0.00027033235581622677, 'epoch': 0.27}
+{'loss': 0.5402, 'grad_norm': 0.9035351872444153, 'learning_rate': 0.000270307917888563, 'epoch': 0.27}
+{'loss': 0.5699, 'grad_norm': 1.5112823247909546, 'learning_rate': 0.00027028347996089927, 'epoch': 0.27}
+{'loss': 0.5282, 'grad_norm': 1.4688304662704468, 'learning_rate': 0.0002702590420332356, 'epoch': 0.27}
+{'loss': 0.6053, 'grad_norm': 1.3641126155853271, 'learning_rate': 0.00027023460410557183, 'epoch': 0.27}
+{'loss': 0.8346, 'grad_norm': 1.88762366771698, 'learning_rate': 0.0002702101661779081, 'epoch': 0.27}
+{'loss': 0.9424, 'grad_norm': 1.9120090007781982, 'learning_rate': 0.0002701857282502444, 'epoch': 0.27}
+{'loss': 0.9418, 'grad_norm': 1.8165515661239624, 'learning_rate': 0.00027016129032258064, 'epoch': 0.27}
+{'loss': 0.8333, 'grad_norm': 2.26983904838562, 'learning_rate': 0.0002701368523949169, 'epoch': 0.27}
+{'loss': 0.8788, 'grad_norm': 1.2658028602600098, 'learning_rate': 0.0002701124144672532, 'epoch': 0.27}
+{'loss': 0.8146, 'grad_norm': 2.336582660675049, 'learning_rate': 0.0002700879765395894, 'epoch': 0.27}
+{'loss': 0.8878, 'grad_norm': 1.6825093030929565, 'learning_rate': 0.0002700635386119257, 'epoch': 0.27}
+{'loss': 1.2894, 'grad_norm': 2.3204588890075684, 'learning_rate': 0.00027003910068426195, 'epoch': 0.27}
+{'loss': 1.2633, 'grad_norm': 2.438342809677124, 'learning_rate': 0.0002700146627565982, 'epoch': 0.27}
+{'loss': 1.0154, 'grad_norm': 2.8188915252685547, 'learning_rate': 0.0002699902248289345, 'epoch': 0.27}
+{'loss': 1.3788, 'grad_norm': 2.1101574897766113, 'learning_rate': 0.00026996578690127076, 'epoch': 0.27}
+{'loss': 1.0987, 'grad_norm': 2.3884902000427246, 'learning_rate': 0.000269941348973607, 'epoch': 0.27}
+{'loss': 1.1042, 'grad_norm': 3.389667510986328, 'learning_rate': 0.0002699169110459433, 'epoch': 0.27}
+{'loss': 1.6512, 'grad_norm': 2.0286519527435303, 'learning_rate': 0.00026989247311827956, 'epoch': 0.27}
+{'loss': 1.1985, 'grad_norm': 2.0738837718963623, 'learning_rate': 0.0002698680351906158, 'epoch': 0.27}
+{'loss': 1.225, 'grad_norm': 2.440089464187622, 'learning_rate': 0.00026984359726295207, 'epoch': 0.27}
+{'loss': 2.0094, 'grad_norm': 2.8969979286193848, 'learning_rate': 0.00026981915933528837, 'epoch': 0.27}
+{'loss': 1.367, 'grad_norm': 2.208665609359741, 'learning_rate': 0.0002697947214076246, 'epoch': 0.27}
+{'loss': 1.2281, 'grad_norm': 2.2335593700408936, 'learning_rate': 0.0002697702834799609, 'epoch': 0.27}
+{'loss': 0.98, 'grad_norm': 1.435701608657837, 'learning_rate': 0.0002697458455522972, 'epoch': 0.27}
+{'loss': 1.5297, 'grad_norm': 2.6410810947418213, 'learning_rate': 0.0002697214076246334, 'epoch': 0.27}
+{'loss': 1.528, 'grad_norm': 3.8984954357147217, 'learning_rate': 0.0002696969696969697, 'epoch': 0.27}
+{'loss': 1.1372, 'grad_norm': 1.9594566822052002, 'learning_rate': 0.00026967253176930593, 'epoch': 0.27}
+{'loss': 0.8802, 'grad_norm': 1.6567978858947754, 'learning_rate': 0.0002696480938416422, 'epoch': 0.27}
+{'loss': 0.7041, 'grad_norm': 1.2935997247695923, 'learning_rate': 0.0002696236559139785, 'epoch': 0.27}
+{'loss': 1.4213, 'grad_norm': 3.2540674209594727, 'learning_rate': 0.00026959921798631474, 'epoch': 0.27}
+{'loss': 1.246, 'grad_norm': 2.091407060623169, 'learning_rate': 0.000269574780058651, 'epoch': 0.27}
+{'loss': 0.4907, 'grad_norm': 0.7957701683044434, 'learning_rate': 0.0002695503421309873, 'epoch': 0.27}
+{'loss': 0.3429, 'grad_norm': 0.6549016833305359, 'learning_rate': 0.00026952590420332355, 'epoch': 0.27}
+{'loss': 0.4866, 'grad_norm': 0.8715705275535583, 'learning_rate': 0.0002695014662756598, 'epoch': 0.27}
+{'loss': 0.4019, 'grad_norm': 0.8090335726737976, 'learning_rate': 0.00026947702834799605, 'epoch': 0.27}
+{'loss': 0.4006, 'grad_norm': 0.5464131236076355, 'learning_rate': 0.00026945259042033236, 'epoch': 0.27}
+{'loss': 0.6231, 'grad_norm': 1.1286975145339966, 'learning_rate': 0.0002694281524926686, 'epoch': 0.27}
+{'loss': 0.4803, 'grad_norm': 1.059890627861023, 'learning_rate': 0.00026940371456500486, 'epoch': 0.28}
+{'loss': 0.4935, 'grad_norm': 1.019981861114502, 'learning_rate': 0.00026937927663734116, 'epoch': 0.28}
+{'loss': 0.5164, 'grad_norm': 0.9815790057182312, 'learning_rate': 0.0002693548387096774, 'epoch': 0.28}
+{'loss': 0.5096, 'grad_norm': 1.1376479864120483, 'learning_rate': 0.00026933040078201367, 'epoch': 0.28}
+{'loss': 0.4121, 'grad_norm': 0.9377894997596741, 'learning_rate': 0.0002693059628543499, 'epoch': 0.28}
+{'loss': 0.4009, 'grad_norm': 0.6440132856369019, 'learning_rate': 0.00026928152492668617, 'epoch': 0.28}
+{'loss': 0.5908, 'grad_norm': 1.1063646078109741, 'learning_rate': 0.0002692570869990225, 'epoch': 0.28}
+{'loss': 0.6754, 'grad_norm': 1.2366384267807007, 'learning_rate': 0.0002692326490713587, 'epoch': 0.28}
+{'loss': 0.6837, 'grad_norm': 1.511243224143982, 'learning_rate': 0.000269208211143695, 'epoch': 0.28}
+{'loss': 0.9414, 'grad_norm': 2.002196788787842, 'learning_rate': 0.0002691837732160313, 'epoch': 0.28}
+{'loss': 0.6387, 'grad_norm': 0.955176591873169, 'learning_rate': 0.00026915933528836753, 'epoch': 0.28}
+{'loss': 0.7687, 'grad_norm': 1.9328628778457642, 'learning_rate': 0.0002691348973607038, 'epoch': 0.28}
+{'loss': 0.5347, 'grad_norm': 1.1169955730438232, 'learning_rate': 0.00026911045943304004, 'epoch': 0.28}
+{'loss': 0.4861, 'grad_norm': 1.7990258932113647, 'learning_rate': 0.00026908602150537634, 'epoch': 0.28}
+{'loss': 0.7257, 'grad_norm': 1.4510148763656616, 'learning_rate': 0.0002690615835777126, 'epoch': 0.28}
+{'loss': 0.7709, 'grad_norm': 2.4672670364379883, 'learning_rate': 0.00026903714565004884, 'epoch': 0.28}
+{'loss': 0.7594, 'grad_norm': 1.4069247245788574, 'learning_rate': 0.00026901270772238515, 'epoch': 0.28}
+{'loss': 0.7431, 'grad_norm': 1.4989938735961914, 'learning_rate': 0.0002689882697947214, 'epoch': 0.28}
+{'loss': 1.0465, 'grad_norm': 1.9093029499053955, 'learning_rate': 0.00026896383186705765, 'epoch': 0.28}
+{'loss': 1.0814, 'grad_norm': 1.3027005195617676, 'learning_rate': 0.00026893939393939396, 'epoch': 0.28}
+ 14%|█▍        | 1777/12776 [16:39<58:43,  3.12it/s] 14%|█▍        | 1778/12776 [16:39<56:22,  3.25it/s]                                                     14%|█▍        | 1778/12776 [16:39<56:22,  3.25it/s] 14%|█▍        | 1779/12776 [16:39<53:53,  3.40it/s]                                                     14%|█▍        | 1779/12776 [16:39<53:53,  3.40it/s] 14%|█▍        | 1780/12776 [16:40<51:42,  3.54it/s]                                                     14%|█▍        | 1780/12776 [16:40<51:42,  3.54it/s] 14%|█▍        | 1781/12776 [16:40<50:03,  3.66it/s]                                                     14%|█▍        | 1781/12776 [16:40<50:03,  3.66it/s] 14%|█▍        | 1782/12776 [16:40<53:28,  3.43it/s]                                                     14%|█▍        | 1782/12776 [16:40<53:28,  3.43it/s] 14%|█▍        | 1783/12776 [16:40<50:47,  3.61it/s]                                                     14%|█▍        | 1783/12776 [16:40<50:47,  3.61it/s] 14%|█▍        | 1784/12776 [16:41<48:44,  3.76it/s]                                                     14%|█▍        | 1784/12776 [16:41<48:44,  3.76it/s] 14%|█▍        | 1785/12776 [16:41<46:49,  3.91it/s]                                                     14%|█▍        | 1785/12776 [16:41<46:49,  3.91it/s] 14%|█▍        | 1786/12776 [16:41<48:50,  3.75it/s]                                                     14%|█▍        | 1786/12776 [16:41<48:50,  3.75it/s] 14%|█▍        | 1787/12776 [16:41<46:11,  3.96it/s]                                                     14%|█▍        | 1787/12776 [16:41<46:11,  3.96it/s] 14%|█▍        | 1788/12776 [16:42<43:53,  4.17it/s]                                                     14%|█▍        | 1788/12776 [16:42<43:53,  4.17it/s] 14%|█▍        | 1789/12776 [16:42<42:34,  4.30it/s]                                                     14%|█▍        | 1789/12776 [16:42<42:34,  4.30it/s] 14%|█▍        | 1790/12776 [16:42<41:19,  4.43it/s]                                                     14%|█▍        | 1790/12776 [16:42<41:19,  4.43it/s] 14%|█▍        | 1791/12776 [16:42<43:37,  4.20it/s]                                                     14%|█▍        | 1791/12776 [16:42<43:37,  4.20it/s] 14%|█▍        | 1792/12776 [16:42<41:40,  4.39it/s]                                                     14%|█▍        | 1792/12776 [16:42<41:40,  4.39it/s] 14%|█▍        | 1793/12776 [16:43<40:07,  4.56it/s]                                                     14%|█▍        | 1793/12776 [16:43<40:07,  4.56it/s] 14%|█▍        | 1794/12776 [16:43<38:56,  4.70it/s]                                                     14%|█▍        | 1794/12776 [16:43<38:56,  4.70it/s] 14%|█▍        | 1795/12776 [16:43<37:59,  4.82it/s]                                                     14%|█▍        | 1795/12776 [16:43<37:59,  4.82it/s] 14%|█▍        | 1796/12776 [16:43<37:10,  4.92it/s]                                                     14%|█▍        | 1796/12776 [16:43<37:10,  4.92it/s] 14%|█▍        | 1797/12776 [16:44<40:30,  4.52it/s]                                                     14%|█▍        | 1797/12776 [16:44<40:30,  4.52it/s] 14%|█▍        | 1798/12776 [16:44<38:30,  4.75it/s]                                                     14%|█▍        | 1798/12776 [16:44<38:30,  4.75it/s] 14%|█▍        | 1799/12776 [16:44<37:03,  4.94it/s]                                                     14%|█▍        | 1799/12776 [16:44<37:03,  4.94it/s] 14%|█▍        | 1800/12776 [16:45<1:07:09,  2.72it/s]                                                       14%|█▍        | 1800/12776 [16:45<1:07:09,  2.72it/s] 14%|█▍        | 1801/12776 [16:46<2:06:28,  1.45it/s]                                                       14%|█▍        | 1801/12776 [16:46<2:06:28,  1.45it/s] 14%|█▍        | 1802/12776 [16:47<2:20:25,  1.30it/s]                                                       14%|█▍        | 1802/12776 [16:47<2:20:25,  1.30it/s] 14%|█▍        | 1803/12776 [16:48<2:32:30,  1.20it/s]                                                       14%|█▍        | 1803/12776 [16:48<2:32:30,  1.20it/s] 14%|█▍        | 1804/12776 [16:49<2:30:16,  1.22it/s]                                                       14%|█▍        | 1804/12776 [16:49<2:30:16,  1.22it/s] 14%|█▍        | 1805/12776 [16:50<2:25:59,  1.25it/s]                                                       14%|█▍        | 1805/12776 [16:50<2:25:59,  1.25it/s] 14%|█▍        | 1806/12776 [16:50<2:21:26,  1.29it/s]                                                       14%|█▍        | 1806/12776 [16:50<2:21:26,  1.29it/s] 14%|█▍        | 1807/12776 [16:51<2:14:34,  1.36it/s]                                                       14%|█▍        | 1807/12776 [16:51<2:14:34,  1.36it/s] 14%|█▍        | 1808/12776 [16:52<2:14:22,  1.36it/s]                                                       14%|█▍        | 1808/12776 [16:52<2:14:22,  1.36it/s] 14%|█▍        | 1809/12776 [16:52<2:05:50,  1.45it/s]                                                       14%|█▍        | 1809/12776 [16:52<2:05:50,  1.45it/s] 14%|█▍        | 1810/12776 [16:53<2:02:28,  1.49it/s]                                                       14%|█▍        | 1810/12776 [16:53<2:02:28,  1.49it/s] 14%|█▍        | 1811/12776 [16:53<1:53:53,  1.60it/s]                                                       14%|█▍        | 1811/12776 [16:53<1:53:53,  1.60it/s] 14%|█▍        | 1812/12776 [16:54<1:49:09,  1.67it/s]                                                       14%|█▍        | 1812/12776 [16:54<1:49:09,  1.67it/s] 14%|█▍        | 1813/12776 [16:54<1:41:20,  1.80it/s]                                                       14%|█▍        | 1813/12776 [16:54<1:41:20,  1.80it/s] 14%|█▍        | 1814/12776 [16:55<1:37:43,  1.87it/s]                                                       14%|█▍        | 1814/12776 [16:55<1:37:43,  1.87it/s] 14%|█▍        | 1815/12776 [16:55<1:31:32,  2.00it/s]                                                       14%|█▍        | 1815/12776 [16:55<1:31:32,  2.00it/s] 14%|█▍        | 1816/12776 [16:56<1:25:58,  2.12it/s]                                                       14%|█▍        | 1816/12776 [16:56<1:25:58,  2.12it/s] 14%|█▍        | 1817/12776 [16:56<1:26:54,  2.10it/s]                                                       14%|█▍        | 1817/12776 [16:56<1:26:54,  2.10it/s] 14%|█▍        | 1818/12776 [16:57<1:20:54,  2.26it/s]                                                       14%|█▍        | 1818/12776 [16:57<1:20:54,  2.26it/s] 14%|█▍        | 1819/12776 [16:57<1:15:48,  2.41it/s]                                                       14%|█▍        | 1819/12776 [16:57<1:15:48,  2.41it/s] 14%|█▍        | 1820/12776 [16:57<1:16:05,  2.40it/s]                                                       14%|█▍        | 1820/12776 [16:57<1:16:05,  2.40it/s] 14%|█▍        | 1821/12776 [16:58<1:11:51,  2.54it/s]                                                       14%|█▍        | 1821/12776 [16:58<1:11:51,  2.54it/s] 14%|█▍        | 1822/12776 [16:58<1:08:44,  2.66it/s]                                                       14%|█▍        | 1822/12776 [16:58<1:08:44,  2.66it/s] 14%|█▍        | 1823/12776 [16:58<1:13:15,  2.49it/s]                                                       14%|█▍        | 1823/12776 [16:58<1:13:15,  2.49it/s] 14%|█▍        | 1824/12776 [16:59<1:08:15,  2.67it/s]                                                       14%|█▍        | 1824/12776 [16:59<1:08:15,  2.67it/s] 14%|█▍        | 1825/12776 [16:59<1:04:36,  2.82it/s]                                                       14%|█▍        | 1825/12776 [16:59<1:04:36,  2.82it/s] 14%|█▍        | 1826/12776 [16:59<1:04:56,  2.81it/s]                                                       14%|█▍        | 1826/12776 [16:59<1:04:56,  2.81it/s] 14%|█▍        | 1827/12776 [17:00<1:01:25,  2.97it/s]                                                       14%|█▍        | 1827/12776 [17:00<1:01:25,  2.97it/s] 14%|█▍        | 1828/12776 [17:00<58:31,  3.12it/s]                                                       14%|█▍        | 1828/12776 [17:00<58:31,  3.12it/s] 14%|█▍        | 1829/12776 [17:00<56:46,  3.21it/s]                                                     14%|█▍        | 1829/12776 [17:00<56:46,  3.21it/s] 14%|█▍        | 1830/12776 [17:01<59:57,  3.04it/s]                                                     14%|█▍        | 1830/12776 [17:01<59:57,  3.04it/s] 14%|█▍        | 1831/12776 [17:01<55:56,  3.26it/s]                                                     14%|█▍        | 1831/12776 [17:01<55:56,  3.26it/s] 14%|█▍        | 1832/12776 [17:01<52:57,  3.44it/s]                                                     14%|█▍        | 1832/12776 [17:01<52:57,  3.44it/s] 14%|█▍        | 1833/12776 [17:01<50:38,  3.60it/s]                                                     14%|█▍        | 1833/12776 [17:01<50:38,  3.60it/s] 14%|█▍        | 1834/12776 [17:02<54:53,  3.32it/s]                                                     14%|█▍        | 1834/12776 [17:02<54:53,  3.32it/s] 14%|█▍        | 1835/12776 [17:02<51:30,  3.54it/s]                                                     14%|█▍        | 1835/12776 [17:02<51:30,  3.54it/s] 14%|█▍        | 1836/12776 [17:02<48:51,  3.73it/s]                                                     14%|█▍        | 1836/12776 [17:02<48:51,  3.73it/s] 14%|█▍        | 1837/12776 [17:02<46:33,  3.92it/s]                                                     14%|█▍        | 1837/12776 [17:02<46:33,  3.92it/s] 14%|█▍        | 1838/12776 [17:03<51:06,  3.57it/s]                                                     14%|█▍        | 1838/12776 [17:03<51:06,  3.57it/s] 14%|█▍        | 1839/12776 [17:03<47:22,  3.85it/s]                                                     14%|█▍        | 1839/12776 [17:03<47:22,  3.85it/s] 14%|█▍        | 1840/12776 [17:03<44:25,  4.10it/s]                                                     14%|█▍        | 1840/12776 [17:03<44:25,  4.10it/s] 14%|█▍        | 1841/12776 [17:03<42:30,  4.29it/s]                                                     14%|█▍        | 1841/12776 [17:03<42:30,  4.29it/s] 14%|█▍        | 1842/12776 [17:04<40:50,  4.46it/s]                                                     14%|█▍        | 1842/12776 [17:04<40:50,  4.46it/s] 14%|█▍        | 1843/12776 [17:04<44:51,  4.06it/s]                                                     14%|█▍        | 1843/12776 [17:04<44:51,  4.06it/s] 14%|█▍        | 1844/12776 [17:04<42:13,  4.31it/s]                                                     14%|█▍        | 1844/12776 [17:04<42:13,  4.31it/s] 14%|█▍        | 1845/12776 [17:04<40:16,  4.52it/s]                                                     14%|█▍        | 1845/12776 [17:04<40:16,  4.52it/s] 14%|█▍        | 1846/12776 [17:05<38:51,  4.69it/s]                                                     14%|█▍        | 1846/12776 [17:05<38:51,  4.69it/s] 14%|█▍        | 1847/12776 [17:05<37:37,  4.84it/s]                                                     14%|█▍        | 1847/12776 [17:05<37:37,  4.84it/s] 14%|█▍        | 1848/12776 [17:05<37:00,  4.92it/s]                                                     14%|█▍        | 1848/12776 [17:05<37:00,  4.92it/s] 14%|█▍        | 1849/12776 [17:05<41:01,  4.44it/s]                                                     14%|█▍        | 1849/12776 [17:05<41:01,  4.44it/s] 14%|█▍        | 1850/12776 [17:06<1:07:34,  2.69it/s]                                                       14%|█▍        | 1850/12776 [17:06<1:07:34,  2.69it/s] 14%|█▍        | 1851/12776 [17:07<2:13:42,  1.36it/s]                                                       14%|█▍        | 1851/12776 [17:07<2:13:42,  1.36it/s] 14%|█▍        | 1852/12776 [17:09<2:32:47,  1.19it/s]                                                       14%|█▍        | 1852/12776 [17:09<2:32:47,  1.19it/s] 15%|█▍        | 1853/12776 [17:09<2:34:39,  1.18it/s]                                                       15%|█▍        | 1853/12776 [17:09<2:34:39,  1.18it/s] 15%|█▍        | 1854/12776 [17:10<2:31:28,  1.20it/s]                                                      {'loss': 0.8204, 'grad_norm': 2.1441051959991455, 'learning_rate': 0.00026891495601173016, 'epoch': 0.28}
+{'loss': 1.1269, 'grad_norm': 2.118739128112793, 'learning_rate': 0.00026889051808406646, 'epoch': 0.28}
+{'loss': 0.8659, 'grad_norm': 1.8077781200408936, 'learning_rate': 0.0002688660801564027, 'epoch': 0.28}
+{'loss': 0.9428, 'grad_norm': 2.058222770690918, 'learning_rate': 0.00026884164222873896, 'epoch': 0.28}
+{'loss': 0.8841, 'grad_norm': 2.2564380168914795, 'learning_rate': 0.00026881720430107527, 'epoch': 0.28}
+{'loss': 0.7061, 'grad_norm': 3.851144313812256, 'learning_rate': 0.0002687927663734115, 'epoch': 0.28}
+{'loss': 1.2807, 'grad_norm': 1.6981579065322876, 'learning_rate': 0.00026876832844574777, 'epoch': 0.28}
+{'loss': 1.2373, 'grad_norm': 3.6595795154571533, 'learning_rate': 0.000268743890518084, 'epoch': 0.28}
+{'loss': 1.029, 'grad_norm': 2.8983232975006104, 'learning_rate': 0.0002687194525904203, 'epoch': 0.28}
+{'loss': 1.1833, 'grad_norm': 1.8303897380828857, 'learning_rate': 0.0002686950146627566, 'epoch': 0.28}
+{'loss': 1.5727, 'grad_norm': 2.681189775466919, 'learning_rate': 0.00026867057673509283, 'epoch': 0.28}
+{'loss': 1.4353, 'grad_norm': 2.851393699645996, 'learning_rate': 0.0002686461388074291, 'epoch': 0.28}
+{'loss': 1.3998, 'grad_norm': 2.83847713470459, 'learning_rate': 0.0002686217008797654, 'epoch': 0.28}
+{'loss': 1.2927, 'grad_norm': 2.401233673095703, 'learning_rate': 0.00026859726295210164, 'epoch': 0.28}
+{'loss': 1.3452, 'grad_norm': 2.81550931930542, 'learning_rate': 0.0002685728250244379, 'epoch': 0.28}
+{'loss': 1.2581, 'grad_norm': 2.0500288009643555, 'learning_rate': 0.00026854838709677414, 'epoch': 0.28}
+{'loss': 0.9827, 'grad_norm': 2.6344480514526367, 'learning_rate': 0.00026852394916911045, 'epoch': 0.28}
+{'loss': 0.8994, 'grad_norm': 1.7978938817977905, 'learning_rate': 0.0002684995112414467, 'epoch': 0.28}
+{'loss': 1.9596, 'grad_norm': 2.9591362476348877, 'learning_rate': 0.00026847507331378295, 'epoch': 0.28}
+{'loss': 1.1068, 'grad_norm': 2.2683701515197754, 'learning_rate': 0.00026845063538611925, 'epoch': 0.28}
+{'loss': 0.515, 'grad_norm': 1.6625263690948486, 'learning_rate': 0.0002684261974584555, 'epoch': 0.28}
+{'loss': 0.6351, 'grad_norm': 1.8417325019836426, 'learning_rate': 0.00026840175953079176, 'epoch': 0.28}
+{'loss': 1.6654, 'grad_norm': 2.7256836891174316, 'learning_rate': 0.00026837732160312806, 'epoch': 0.28}
+{'loss': 1.1826, 'grad_norm': 2.1390318870544434, 'learning_rate': 0.00026835288367546426, 'epoch': 0.28}
+{'loss': 0.4343, 'grad_norm': 0.6134200096130371, 'learning_rate': 0.00026832844574780056, 'epoch': 0.28}
+{'loss': 0.3972, 'grad_norm': 0.9784292578697205, 'learning_rate': 0.0002683040078201368, 'epoch': 0.28}
+{'loss': 0.4933, 'grad_norm': 0.6980729699134827, 'learning_rate': 0.00026827956989247307, 'epoch': 0.28}
+{'loss': 0.3944, 'grad_norm': 0.6361396312713623, 'learning_rate': 0.00026825513196480937, 'epoch': 0.28}
+{'loss': 0.443, 'grad_norm': 0.7284366488456726, 'learning_rate': 0.0002682306940371456, 'epoch': 0.28}
+{'loss': 0.5435, 'grad_norm': 0.8181057572364807, 'learning_rate': 0.0002682062561094819, 'epoch': 0.28}
+{'loss': 0.4323, 'grad_norm': 0.7504498362541199, 'learning_rate': 0.0002681818181818181, 'epoch': 0.28}
+{'loss': 0.5852, 'grad_norm': 1.1806275844573975, 'learning_rate': 0.00026815738025415443, 'epoch': 0.28}
+{'loss': 0.4522, 'grad_norm': 1.2418806552886963, 'learning_rate': 0.0002681329423264907, 'epoch': 0.28}
+{'loss': 0.6192, 'grad_norm': 1.2037335634231567, 'learning_rate': 0.00026810850439882693, 'epoch': 0.28}
+{'loss': 0.4402, 'grad_norm': 1.9327640533447266, 'learning_rate': 0.00026808406647116324, 'epoch': 0.28}
+{'loss': 0.4974, 'grad_norm': 1.134433388710022, 'learning_rate': 0.0002680596285434995, 'epoch': 0.28}
+{'loss': 0.4516, 'grad_norm': 0.9470615386962891, 'learning_rate': 0.00026803519061583574, 'epoch': 0.28}
+{'loss': 0.7034, 'grad_norm': 1.848379135131836, 'learning_rate': 0.00026801075268817205, 'epoch': 0.28}
+{'loss': 0.6472, 'grad_norm': 1.2914631366729736, 'learning_rate': 0.00026798631476050824, 'epoch': 0.28}
+{'loss': 0.5035, 'grad_norm': 1.4809837341308594, 'learning_rate': 0.00026796187683284455, 'epoch': 0.28}
+{'loss': 0.7554, 'grad_norm': 1.478471279144287, 'learning_rate': 0.0002679374389051808, 'epoch': 0.28}
+{'loss': 0.5768, 'grad_norm': 1.0676370859146118, 'learning_rate': 0.00026791300097751705, 'epoch': 0.28}
+{'loss': 0.6962, 'grad_norm': 1.5760576725006104, 'learning_rate': 0.00026788856304985336, 'epoch': 0.28}
+{'loss': 0.4515, 'grad_norm': 0.8975147604942322, 'learning_rate': 0.0002678641251221896, 'epoch': 0.28}
+{'loss': 0.6175, 'grad_norm': 2.1367173194885254, 'learning_rate': 0.00026783968719452586, 'epoch': 0.29}
+{'loss': 0.7304, 'grad_norm': 2.5454366207122803, 'learning_rate': 0.00026781524926686217, 'epoch': 0.29}
+{'loss': 0.7314, 'grad_norm': 1.3795878887176514, 'learning_rate': 0.0002677908113391984, 'epoch': 0.29}
+{'loss': 0.9597, 'grad_norm': 1.7124171257019043, 'learning_rate': 0.00026776637341153467, 'epoch': 0.29}
+{'loss': 1.0985, 'grad_norm': 2.3599631786346436, 'learning_rate': 0.0002677419354838709, 'epoch': 0.29}
+{'loss': 0.7528, 'grad_norm': 2.4604930877685547, 'learning_rate': 0.0002677174975562072, 'epoch': 0.29}
+{'loss': 1.0192, 'grad_norm': 2.542212724685669, 'learning_rate': 0.0002676930596285435, 'epoch': 0.29}
+{'loss': 1.0454, 'grad_norm': 2.8099851608276367, 'learning_rate': 0.00026766862170087973, 'epoch': 0.29}
+{'loss': 1.0623, 'grad_norm': 1.8754301071166992, 'learning_rate': 0.00026764418377321603, 'epoch': 0.29}
+{'loss': 0.9486, 'grad_norm': 1.9678064584732056, 'learning_rate': 0.0002676197458455523, 'epoch': 0.29}
+{'loss': 0.6753, 'grad_norm': 1.242255687713623, 'learning_rate': 0.00026759530791788854, 'epoch': 0.29}
+{'loss': 1.168, 'grad_norm': 2.036165475845337, 'learning_rate': 0.0002675708699902248, 'epoch': 0.29}
+{'loss': 0.8798, 'grad_norm': 2.8120241165161133, 'learning_rate': 0.00026754643206256104, 'epoch': 0.29}
+{'loss': 1.0636, 'grad_norm': 1.734350562095642, 'learning_rate': 0.00026752199413489734, 'epoch': 0.29}
+{'loss': 1.011, 'grad_norm': 2.014326572418213, 'learning_rate': 0.0002674975562072336, 'epoch': 0.29}
+{'loss': 1.0242, 'grad_norm': 2.908611536026001, 'learning_rate': 0.00026747311827956985, 'epoch': 0.29}
+{'loss': 1.1949, 'grad_norm': 2.8601036071777344, 'learning_rate': 0.00026744868035190615, 'epoch': 0.29}
+{'loss': 0.8634, 'grad_norm': 2.5931968688964844, 'learning_rate': 0.0002674242424242424, 'epoch': 0.29}
+{'loss': 1.0344, 'grad_norm': 2.2925524711608887, 'learning_rate': 0.00026739980449657865, 'epoch': 0.29}
+{'loss': 1.3297, 'grad_norm': 1.680653691291809, 'learning_rate': 0.0002673753665689149, 'epoch': 0.29}
+{'loss': 1.2022, 'grad_norm': 1.445891261100769, 'learning_rate': 0.0002673509286412512, 'epoch': 0.29}
+{'loss': 1.3316, 'grad_norm': 3.1469690799713135, 'learning_rate': 0.00026732649071358746, 'epoch': 0.29}
+{'loss': 1.4019, 'grad_norm': 1.903037190437317, 'learning_rate': 0.0002673020527859237, 'epoch': 0.29}
+{'loss': 1.8396, 'grad_norm': 5.236410617828369, 'learning_rate': 0.00026727761485826, 'epoch': 0.29}
+{'loss': 1.8712, 'grad_norm': 4.735574722290039, 'learning_rate': 0.00026725317693059627, 'epoch': 0.29}
+{'loss': 1.0403, 'grad_norm': 1.4214756488800049, 'learning_rate': 0.0002672287390029325, 'epoch': 0.29}
+{'loss': 1.6399, 'grad_norm': 2.3396573066711426, 'learning_rate': 0.0002672043010752688, 'epoch': 0.29}
+{'loss': 1.184, 'grad_norm': 3.088986873626709, 'learning_rate': 0.000267179863147605, 'epoch': 0.29}
+{'loss': 1.3135, 'grad_norm': 2.6393306255340576, 'learning_rate': 0.00026715542521994133, 'epoch': 0.29}
+{'loss': 1.4623, 'grad_norm': 1.8152228593826294, 'learning_rate': 0.0002671309872922776, 'epoch': 0.29}
+{'loss': 0.5949, 'grad_norm': 0.7032530903816223, 'learning_rate': 0.00026710654936461383, 'epoch': 0.29}
+{'loss': 0.3533, 'grad_norm': 0.7973015904426575, 'learning_rate': 0.00026708211143695014, 'epoch': 0.29}
+{'loss': 0.5212, 'grad_norm': 1.0158485174179077, 'learning_rate': 0.0002670576735092864, 'epoch': 0.29}
+ 15%|█▍        | 1854/12776 [17:10<2:31:28,  1.20it/s] 15%|█▍        | 1855/12776 [17:11<2:35:01,  1.17it/s]                                                       15%|█▍        | 1855/12776 [17:11<2:35:01,  1.17it/s] 15%|█▍        | 1856/12776 [17:12<2:26:22,  1.24it/s]                                                       15%|█▍        | 1856/12776 [17:12<2:26:22,  1.24it/s] 15%|█▍        | 1857/12776 [17:12<2:18:26,  1.31it/s]                                                       15%|█▍        | 1857/12776 [17:12<2:18:26,  1.31it/s] 15%|█▍        | 1858/12776 [17:13<2:12:56,  1.37it/s]                                                       15%|█▍        | 1858/12776 [17:13<2:12:56,  1.37it/s] 15%|█▍        | 1859/12776 [17:14<2:06:07,  1.44it/s]                                                       15%|█▍        | 1859/12776 [17:14<2:06:07,  1.44it/s] 15%|█▍        | 1860/12776 [17:14<2:00:28,  1.51it/s]                                                       15%|█▍        | 1860/12776 [17:14<2:00:28,  1.51it/s] 15%|█▍        | 1861/12776 [17:15<1:52:59,  1.61it/s]                                                       15%|█▍        | 1861/12776 [17:15<1:52:59,  1.61it/s] 15%|█▍        | 1862/12776 [17:16<1:54:31,  1.59it/s]                                                       15%|█▍        | 1862/12776 [17:16<1:54:31,  1.59it/s] 15%|█▍        | 1863/12776 [17:16<1:45:53,  1.72it/s]                                                       15%|█▍        | 1863/12776 [17:16<1:45:53,  1.72it/s] 15%|█▍        | 1864/12776 [17:16<1:38:19,  1.85it/s]                                                       15%|█▍        | 1864/12776 [17:16<1:38:19,  1.85it/s] 15%|█▍        | 1865/12776 [17:17<1:37:34,  1.86it/s]                                                       15%|█▍        | 1865/12776 [17:17<1:37:34,  1.86it/s] 15%|█▍        | 1866/12776 [17:17<1:30:30,  2.01it/s]                                                       15%|█▍        | 1866/12776 [17:17<1:30:30,  2.01it/s] 15%|█▍        | 1867/12776 [17:18<1:29:34,  2.03it/s]                                                       15%|█▍        | 1867/12776 [17:18<1:29:34,  2.03it/s] 15%|█▍        | 1868/12776 [17:18<1:23:31,  2.18it/s]                                                       15%|█▍        | 1868/12776 [17:18<1:23:31,  2.18it/s] 15%|█▍        | 1869/12776 [17:19<1:17:52,  2.33it/s]                                                       15%|█▍        | 1869/12776 [17:19<1:17:52,  2.33it/s] 15%|█▍        | 1870/12776 [17:19<1:16:08,  2.39it/s]                                                       15%|█▍        | 1870/12776 [17:19<1:16:08,  2.39it/s] 15%|█▍        | 1871/12776 [17:19<1:11:42,  2.53it/s]                                                       15%|█▍        | 1871/12776 [17:19<1:11:42,  2.53it/s] 15%|█▍        | 1872/12776 [17:20<1:08:21,  2.66it/s]                                                       15%|█▍        | 1872/12776 [17:20<1:08:21,  2.66it/s] 15%|█▍        | 1873/12776 [17:20<1:05:16,  2.78it/s]                                                       15%|█▍        | 1873/12776 [17:20<1:05:16,  2.78it/s] 15%|█▍        | 1874/12776 [17:20<1:01:56,  2.93it/s]                                                       15%|█▍        | 1874/12776 [17:20<1:01:56,  2.93it/s] 15%|█▍        | 1875/12776 [17:21<59:29,  3.05it/s]                                                       15%|█▍        | 1875/12776 [17:21<59:29,  3.05it/s] 15%|█▍        | 1876/12776 [17:21<57:21,  3.17it/s]                                                     15%|█▍        | 1876/12776 [17:21<57:21,  3.17it/s] 15%|█▍        | 1877/12776 [17:21<1:00:43,  2.99it/s]                                                       15%|█▍        | 1877/12776 [17:21<1:00:43,  2.99it/s] 15%|█▍        | 1878/12776 [17:21<57:19,  3.17it/s]                                                       15%|█▍        | 1878/12776 [17:21<57:19,  3.17it/s] 15%|█▍        | 1879/12776 [17:22<54:42,  3.32it/s]                                                     15%|█▍        | 1879/12776 [17:22<54:42,  3.32it/s] 15%|█▍        | 1880/12776 [17:22<52:40,  3.45it/s]                                                     15%|█▍        | 1880/12776 [17:22<52:40,  3.45it/s] 15%|█▍        | 1881/12776 [17:22<54:07,  3.35it/s]                                                     15%|█▍        | 1881/12776 [17:22<54:07,  3.35it/s] 15%|█▍        | 1882/12776 [17:23<51:19,  3.54it/s]                                                     15%|█▍        | 1882/12776 [17:23<51:19,  3.54it/s] 15%|█▍        | 1883/12776 [17:23<49:01,  3.70it/s]                                                     15%|█▍        | 1883/12776 [17:23<49:01,  3.70it/s] 15%|█▍        | 1884/12776 [17:23<47:11,  3.85it/s]                                                     15%|█▍        | 1884/12776 [17:23<47:11,  3.85it/s] 15%|█▍        | 1885/12776 [17:23<45:39,  3.98it/s]                                                     15%|█▍        | 1885/12776 [17:23<45:39,  3.98it/s] 15%|█▍        | 1886/12776 [17:24<48:49,  3.72it/s]                                                     15%|█▍        | 1886/12776 [17:24<48:49,  3.72it/s] 15%|█▍        | 1887/12776 [17:24<46:07,  3.93it/s]                                                     15%|█▍        | 1887/12776 [17:24<46:07,  3.93it/s] 15%|█▍        | 1888/12776 [17:24<44:01,  4.12it/s]                                                     15%|█▍        | 1888/12776 [17:24<44:01,  4.12it/s] 15%|█▍        | 1889/12776 [17:24<42:09,  4.30it/s]                                                     15%|█▍        | 1889/12776 [17:24<42:09,  4.30it/s] 15%|█▍        | 1890/12776 [17:24<40:56,  4.43it/s]                                                     15%|█▍        | 1890/12776 [17:24<40:56,  4.43it/s] 15%|█▍        | 1891/12776 [17:25<44:31,  4.07it/s]                                                     15%|█▍        | 1891/12776 [17:25<44:31,  4.07it/s] 15%|█▍        | 1892/12776 [17:25<42:13,  4.30it/s]                                                     15%|█▍        | 1892/12776 [17:25<42:13,  4.30it/s] 15%|█▍        | 1893/12776 [17:25<40:24,  4.49it/s]                                                     15%|█▍        | 1893/12776 [17:25<40:24,  4.49it/s] 15%|█▍        | 1894/12776 [17:25<39:07,  4.64it/s]                                                     15%|█▍        | 1894/12776 [17:25<39:07,  4.64it/s] 15%|█▍        | 1895/12776 [17:26<38:04,  4.76it/s]                                                     15%|█▍        | 1895/12776 [17:26<38:04,  4.76it/s] 15%|█▍        | 1896/12776 [17:26<41:47,  4.34it/s]                                                     15%|█▍        | 1896/12776 [17:26<41:47,  4.34it/s] 15%|█▍        | 1897/12776 [17:26<38:34,  4.70it/s]                                                     15%|█▍        | 1897/12776 [17:26<38:34,  4.70it/s] 15%|█▍        | 1898/12776 [17:26<37:19,  4.86it/s]                                                     15%|█▍        | 1898/12776 [17:26<37:19,  4.86it/s] 15%|█▍        | 1899/12776 [17:26<36:09,  5.01it/s]                                                     15%|█▍        | 1899/12776 [17:26<36:09,  5.01it/s] 15%|█▍        | 1900/12776 [17:27<1:04:23,  2.82it/s]                                                       15%|█▍        | 1900/12776 [17:27<1:04:23,  2.82it/s] 15%|█▍        | 1901/12776 [17:29<2:11:51,  1.37it/s]                                                       15%|█▍        | 1901/12776 [17:29<2:11:51,  1.37it/s] 15%|█▍        | 1902/12776 [17:30<2:30:59,  1.20it/s]                                                       15%|█▍        | 1902/12776 [17:30<2:30:59,  1.20it/s] 15%|█▍        | 1903/12776 [17:31<2:37:03,  1.15it/s]                                                       15%|█▍        | 1903/12776 [17:31<2:37:03,  1.15it/s] 15%|█▍        | 1904/12776 [17:32<2:33:43,  1.18it/s]                                                       15%|█▍        | 1904/12776 [17:32<2:33:43,  1.18it/s] 15%|█▍        | 1905/12776 [17:32<2:27:47,  1.23it/s]                                                       15%|█▍        | 1905/12776 [17:32<2:27:47,  1.23it/s] 15%|█▍        | 1906/12776 [17:33<2:24:35,  1.25it/s]                                                       15%|█▍        | 1906/12776 [17:33<2:24:35,  1.25it/s] 15%|█▍        | 1907/12776 [17:34<2:23:03,  1.27it/s]                                                       15%|█▍        | 1907/12776 [17:34<2:23:03,  1.27it/s] 15%|█▍        | 1908/12776 [17:34<2:13:09,  1.36it/s]                                                       15%|█▍        | 1908/12776 [17:34<2:13:09,  1.36it/s] 15%|█▍        | 1909/12776 [17:35<2:04:36,  1.45it/s]                                                       15%|█▍        | 1909/12776 [17:35<2:04:36,  1.45it/s] 15%|█▍        | 1910/12776 [17:36<1:56:58,  1.55it/s]                                                       15%|█▍        | 1910/12776 [17:36<1:56:58,  1.55it/s] 15%|█▍        | 1911/12776 [17:36<1:55:18,  1.57it/s]                                                       15%|█▍        | 1911/12776 [17:36<1:55:18,  1.57it/s] 15%|█▍        | 1912/12776 [17:37<1:48:06,  1.67it/s]                                                       15%|█▍        | 1912/12776 [17:37<1:48:06,  1.67it/s] 15%|█▍        | 1913/12776 [17:37<1:41:57,  1.78it/s]                                                       15%|█▍        | 1913/12776 [17:37<1:41:57,  1.78it/s] 15%|█▍        | 1914/12776 [17:38<1:38:31,  1.84it/s]                                                       15%|█▍        | 1914/12776 [17:38<1:38:31,  1.84it/s] 15%|█▍        | 1915/12776 [17:38<1:32:39,  1.95it/s]                                                       15%|█▍        | 1915/12776 [17:38<1:32:39,  1.95it/s] 15%|█▍        | 1916/12776 [17:39<1:31:42,  1.97it/s]                                                       15%|█▍        | 1916/12776 [17:39<1:31:42,  1.97it/s] 15%|█▌        | 1917/12776 [17:39<1:26:41,  2.09it/s]                                                       15%|█▌        | 1917/12776 [17:39<1:26:41,  2.09it/s] 15%|█▌        | 1918/12776 [17:39<1:23:32,  2.17it/s]                                                       15%|█▌        | 1918/12776 [17:39<1:23:32,  2.17it/s] 15%|█▌        | 1919/12776 [17:40<1:23:28,  2.17it/s]                                                       15%|█▌        | 1919/12776 [17:40<1:23:28,  2.17it/s] 15%|█▌        | 1920/12776 [17:40<1:18:25,  2.31it/s]                                                       15%|█▌        | 1920/12776 [17:40<1:18:25,  2.31it/s] 15%|█▌        | 1921/12776 [17:41<1:13:55,  2.45it/s]                                                       15%|█▌        | 1921/12776 [17:41<1:13:55,  2.45it/s] 15%|█▌        | 1922/12776 [17:41<1:13:18,  2.47it/s]                                                       15%|█▌        | 1922/12776 [17:41<1:13:18,  2.47it/s] 15%|█▌        | 1923/12776 [17:41<1:09:31,  2.60it/s]                                                       15%|█▌        | 1923/12776 [17:41<1:09:31,  2.60it/s] 15%|█▌        | 1924/12776 [17:42<1:06:23,  2.72it/s]                                                       15%|█▌        | 1924/12776 [17:42<1:06:23,  2.72it/s] 15%|█▌        | 1925/12776 [17:42<1:09:02,  2.62it/s]                                                       15%|█▌        | 1925/12776 [17:42<1:09:02,  2.62it/s] 15%|█▌        | 1926/12776 [17:42<1:04:41,  2.80it/s]                                                       15%|█▌        | 1926/12776 [17:42<1:04:41,  2.80it/s] 15%|█▌        | 1927/12776 [17:43<1:01:00,  2.96it/s]                                                       15%|█▌        | 1927/12776 [17:43<1:01:00,  2.96it/s] 15%|█▌        | 1928/12776 [17:43<1:04:14,  2.81it/s]                                                       15%|█▌        | 1928/12776 [17:43<1:04:14,  2.81it/s] 15%|█▌        | 1929/12776 [17:43<59:35,  3.03it/s]                                                       15%|█▌        | 1929/12776 [17:43<59:35,  3.03it/s] 15%|█▌        | 1930/12776 [17:44<56:00,  3.23it/s]                                                     15%|█▌        | 1930/12776 [17:44<56:00,  3.23it/s] 15%|█▌        | 1931/12776 [17:44<53:02,  3.41it/s]                                                     15%|█▌        | 1931/12776 [17:44<53:02,  3.41it/s] 15%|█▌        | 1932/12776 [17:44<56:04,  3.22it/s]                                                    {'loss': 0.5227, 'grad_norm': 0.6405438184738159, 'learning_rate': 0.00026703323558162264, 'epoch': 0.29}
+{'loss': 0.5069, 'grad_norm': 0.8163948059082031, 'learning_rate': 0.0002670087976539589, 'epoch': 0.29}
+{'loss': 0.9599, 'grad_norm': 2.606863021850586, 'learning_rate': 0.0002669843597262952, 'epoch': 0.29}
+{'loss': 0.5773, 'grad_norm': 0.8287584781646729, 'learning_rate': 0.00026695992179863145, 'epoch': 0.29}
+{'loss': 0.2348, 'grad_norm': 0.5821240544319153, 'learning_rate': 0.0002669354838709677, 'epoch': 0.29}
+{'loss': 0.3831, 'grad_norm': 0.9026615023612976, 'learning_rate': 0.000266911045943304, 'epoch': 0.29}
+{'loss': 0.4268, 'grad_norm': 1.199336290359497, 'learning_rate': 0.00026688660801564026, 'epoch': 0.29}
+{'loss': 0.5279, 'grad_norm': 1.1749603748321533, 'learning_rate': 0.0002668621700879765, 'epoch': 0.29}
+{'loss': 0.3649, 'grad_norm': 0.8679521083831787, 'learning_rate': 0.0002668377321603128, 'epoch': 0.29}
+{'loss': 0.6133, 'grad_norm': 1.6801387071609497, 'learning_rate': 0.000266813294232649, 'epoch': 0.29}
+{'loss': 0.4792, 'grad_norm': 1.3388230800628662, 'learning_rate': 0.0002667888563049853, 'epoch': 0.29}
+{'loss': 0.6117, 'grad_norm': 1.3929903507232666, 'learning_rate': 0.00026676441837732157, 'epoch': 0.29}
+{'loss': 0.9218, 'grad_norm': 1.5032627582550049, 'learning_rate': 0.0002667399804496578, 'epoch': 0.29}
+{'loss': 0.7628, 'grad_norm': 1.732055902481079, 'learning_rate': 0.0002667155425219941, 'epoch': 0.29}
+{'loss': 0.652, 'grad_norm': 1.4332983493804932, 'learning_rate': 0.0002666911045943304, 'epoch': 0.29}
+{'loss': 0.7529, 'grad_norm': 2.0694615840911865, 'learning_rate': 0.0002666666666666666, 'epoch': 0.29}
+{'loss': 0.7831, 'grad_norm': 1.9924960136413574, 'learning_rate': 0.00026664222873900293, 'epoch': 0.29}
+{'loss': 1.0141, 'grad_norm': 1.686233401298523, 'learning_rate': 0.0002666177908113392, 'epoch': 0.29}
+{'loss': 0.6643, 'grad_norm': 1.5367885828018188, 'learning_rate': 0.00026659335288367543, 'epoch': 0.29}
+{'loss': 0.8695, 'grad_norm': 1.6426396369934082, 'learning_rate': 0.0002665689149560117, 'epoch': 0.29}
+{'loss': 0.7657, 'grad_norm': 2.6876118183135986, 'learning_rate': 0.000266544477028348, 'epoch': 0.29}
+{'loss': 1.0113, 'grad_norm': 2.0909829139709473, 'learning_rate': 0.00026652003910068424, 'epoch': 0.29}
+{'loss': 1.0539, 'grad_norm': 1.8119468688964844, 'learning_rate': 0.0002664956011730205, 'epoch': 0.29}
+{'loss': 0.8561, 'grad_norm': 2.4083352088928223, 'learning_rate': 0.0002664711632453568, 'epoch': 0.29}
+{'loss': 0.6055, 'grad_norm': 1.916272521018982, 'learning_rate': 0.000266446725317693, 'epoch': 0.29}
+{'loss': 1.3, 'grad_norm': 1.9073026180267334, 'learning_rate': 0.0002664222873900293, 'epoch': 0.29}
+{'loss': 1.0639, 'grad_norm': 2.932504415512085, 'learning_rate': 0.00026639784946236555, 'epoch': 0.29}
+{'loss': 1.1687, 'grad_norm': 2.1795926094055176, 'learning_rate': 0.0002663734115347018, 'epoch': 0.29}
+{'loss': 1.2471, 'grad_norm': 2.62552809715271, 'learning_rate': 0.0002663489736070381, 'epoch': 0.29}
+{'loss': 1.0549, 'grad_norm': 2.430758237838745, 'learning_rate': 0.00026632453567937436, 'epoch': 0.29}
+{'loss': 1.0626, 'grad_norm': 1.8063433170318604, 'learning_rate': 0.0002663000977517106, 'epoch': 0.29}
+{'loss': 1.1665, 'grad_norm': 7.484827995300293, 'learning_rate': 0.0002662756598240469, 'epoch': 0.3}
+{'loss': 1.9209, 'grad_norm': 3.3804683685302734, 'learning_rate': 0.00026625122189638317, 'epoch': 0.3}
+{'loss': 1.1098, 'grad_norm': 1.9869343042373657, 'learning_rate': 0.0002662267839687194, 'epoch': 0.3}
+{'loss': 1.3265, 'grad_norm': 3.7342841625213623, 'learning_rate': 0.00026620234604105567, 'epoch': 0.3}
+{'loss': 1.5005, 'grad_norm': 1.7713956832885742, 'learning_rate': 0.000266177908113392, 'epoch': 0.3}
+{'loss': 0.8232, 'grad_norm': 2.1043894290924072, 'learning_rate': 0.0002661534701857282, 'epoch': 0.3}
+{'loss': 1.1112, 'grad_norm': 2.7855064868927, 'learning_rate': 0.0002661290322580645, 'epoch': 0.3}
+{'loss': 1.4161, 'grad_norm': 2.4430272579193115, 'learning_rate': 0.0002661045943304008, 'epoch': 0.3}
+{'loss': 1.8431, 'grad_norm': 2.2559332847595215, 'learning_rate': 0.00026608015640273703, 'epoch': 0.3}
+{'loss': 1.9822, 'grad_norm': 6.513355255126953, 'learning_rate': 0.0002660557184750733, 'epoch': 0.3}
+{'loss': 1.2078, 'grad_norm': 2.599079132080078, 'learning_rate': 0.0002660312805474096, 'epoch': 0.3}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0002660312805474096, 'epoch': 0.3}
+{'loss': 1.1353, 'grad_norm': 5.364233016967773, 'learning_rate': 0.0002660068426197458, 'epoch': 0.3}
+{'loss': 0.9987, 'grad_norm': 3.103199005126953, 'learning_rate': 0.0002659824046920821, 'epoch': 0.3}
+{'loss': 0.7689, 'grad_norm': 1.9960848093032837, 'learning_rate': 0.00026595796676441835, 'epoch': 0.3}
+{'loss': 1.1175, 'grad_norm': 1.9379894733428955, 'learning_rate': 0.0002659335288367546, 'epoch': 0.3}
+{'loss': 0.3971, 'grad_norm': 0.6607356667518616, 'learning_rate': 0.0002659090909090909, 'epoch': 0.3}
+{'loss': 0.4854, 'grad_norm': 0.8567434549331665, 'learning_rate': 0.00026588465298142715, 'epoch': 0.3}
+{'loss': 0.3991, 'grad_norm': 0.6764739751815796, 'learning_rate': 0.0002658602150537634, 'epoch': 0.3}
+{'loss': 0.3442, 'grad_norm': 0.630438506603241, 'learning_rate': 0.00026583577712609966, 'epoch': 0.3}
+{'loss': 0.3289, 'grad_norm': 0.5661547780036926, 'learning_rate': 0.00026581133919843596, 'epoch': 0.3}
+{'loss': 0.4095, 'grad_norm': 0.7079960107803345, 'learning_rate': 0.0002657869012707722, 'epoch': 0.3}
+{'loss': 0.4561, 'grad_norm': 0.7579357624053955, 'learning_rate': 0.00026576246334310846, 'epoch': 0.3}
+{'loss': 0.4748, 'grad_norm': 0.753103494644165, 'learning_rate': 0.00026573802541544477, 'epoch': 0.3}
+{'loss': 0.3129, 'grad_norm': 0.5085161328315735, 'learning_rate': 0.000265713587487781, 'epoch': 0.3}
+{'loss': 0.8525, 'grad_norm': 1.400939702987671, 'learning_rate': 0.00026568914956011727, 'epoch': 0.3}
+{'loss': 0.5729, 'grad_norm': 1.0124390125274658, 'learning_rate': 0.0002656647116324536, 'epoch': 0.3}
+{'loss': 0.6094, 'grad_norm': 1.1342532634735107, 'learning_rate': 0.0002656402737047898, 'epoch': 0.3}
+{'loss': 0.5798, 'grad_norm': 0.8373773694038391, 'learning_rate': 0.0002656158357771261, 'epoch': 0.3}
+{'loss': 0.6012, 'grad_norm': 1.389040470123291, 'learning_rate': 0.00026559139784946233, 'epoch': 0.3}
+{'loss': 0.7038, 'grad_norm': 0.9821066856384277, 'learning_rate': 0.0002655669599217986, 'epoch': 0.3}
+{'loss': 0.5822, 'grad_norm': 0.9827203750610352, 'learning_rate': 0.0002655425219941349, 'epoch': 0.3}
+{'loss': 0.4766, 'grad_norm': 1.2951587438583374, 'learning_rate': 0.00026551808406647114, 'epoch': 0.3}
+{'loss': 0.5216, 'grad_norm': 1.0430768728256226, 'learning_rate': 0.0002654936461388074, 'epoch': 0.3}
+{'loss': 0.4903, 'grad_norm': 0.9382654428482056, 'learning_rate': 0.0002654692082111437, 'epoch': 0.3}
+{'loss': 1.2844, 'grad_norm': 4.105330944061279, 'learning_rate': 0.00026544477028347995, 'epoch': 0.3}
+{'loss': 0.9027, 'grad_norm': 1.663966178894043, 'learning_rate': 0.0002654203323558162, 'epoch': 0.3}
+{'loss': 1.0099, 'grad_norm': 2.5586514472961426, 'learning_rate': 0.00026539589442815245, 'epoch': 0.3}
+{'loss': 0.6638, 'grad_norm': 1.8075790405273438, 'learning_rate': 0.00026537145650048875, 'epoch': 0.3}
+{'loss': 0.5234, 'grad_norm': 1.4178240299224854, 'learning_rate': 0.000265347018572825, 'epoch': 0.3}
+{'loss': 1.948, 'grad_norm': 8.215217590332031, 'learning_rate': 0.00026532258064516126, 'epoch': 0.3}
+{'loss': 0.7896, 'grad_norm': 2.2477364540100098, 'learning_rate': 0.00026529814271749756, 'epoch': 0.3}
+{'loss': 0.9689, 'grad_norm': 2.111276626586914, 'learning_rate': 0.00026527370478983376, 'epoch': 0.3}
+{'loss': 1.6356, 'grad_norm': 5.124137878417969, 'learning_rate': 0.00026524926686217006, 'epoch': 0.3}
+{'loss': 0.7388, 'grad_norm': 1.8129172325134277, 'learning_rate': 0.0002652248289345063, 'epoch': 0.3}
+{'loss': 0.8196, 'grad_norm': 2.3607561588287354, 'learning_rate': 0.00026520039100684257, 'epoch': 0.3}
+{'loss': 0.8933, 'grad_norm': 2.1512415409088135, 'learning_rate': 0.0002651759530791789, 'epoch': 0.3}
+ 15%|█▌        | 1932/12776 [17:44<56:04,  3.22it/s] 15%|█▌        | 1933/12776 [17:44<52:39,  3.43it/s]                                                     15%|█▌        | 1933/12776 [17:44<52:39,  3.43it/s] 15%|█▌        | 1934/12776 [17:45<50:03,  3.61it/s]                                                     15%|█▌        | 1934/12776 [17:45<50:03,  3.61it/s] 15%|█▌        | 1935/12776 [17:45<47:55,  3.77it/s]                                                     15%|█▌        | 1935/12776 [17:45<47:55,  3.77it/s] 15%|█▌        | 1936/12776 [17:45<46:02,  3.92it/s]                                                     15%|█▌        | 1936/12776 [17:45<46:02,  3.92it/s] 15%|█▌        | 1937/12776 [17:45<48:58,  3.69it/s]                                                     15%|█▌        | 1937/12776 [17:45<48:58,  3.69it/s] 15%|█▌        | 1938/12776 [17:46<46:08,  3.91it/s]                                                     15%|█▌        | 1938/12776 [17:46<46:08,  3.91it/s] 15%|█▌        | 1939/12776 [17:46<43:55,  4.11it/s]                                                     15%|█▌        | 1939/12776 [17:46<43:55,  4.11it/s] 15%|█▌        | 1940/12776 [17:46<42:02,  4.30it/s]                                                     15%|█▌        | 1940/12776 [17:46<42:02,  4.30it/s] 15%|█▌        | 1941/12776 [17:46<40:52,  4.42it/s]                                                     15%|█▌        | 1941/12776 [17:46<40:52,  4.42it/s] 15%|█▌        | 1942/12776 [17:47<43:17,  4.17it/s]                                                     15%|█▌        | 1942/12776 [17:47<43:17,  4.17it/s] 15%|█▌        | 1943/12776 [17:47<41:15,  4.38it/s]                                                     15%|█▌        | 1943/12776 [17:47<41:15,  4.38it/s] 15%|█▌        | 1944/12776 [17:47<39:34,  4.56it/s]                                                     15%|█▌        | 1944/12776 [17:47<39:34,  4.56it/s] 15%|█▌        | 1945/12776 [17:47<38:19,  4.71it/s]                                                     15%|█▌        | 1945/12776 [17:47<38:19,  4.71it/s] 15%|█▌        | 1946/12776 [17:47<37:25,  4.82it/s]                                                     15%|█▌        | 1946/12776 [17:47<37:25,  4.82it/s] 15%|█▌        | 1947/12776 [17:48<44:10,  4.09it/s]                                                     15%|█▌        | 1947/12776 [17:48<44:10,  4.09it/s] 15%|█▌        | 1948/12776 [17:48<41:14,  4.38it/s]                                                     15%|█▌        | 1948/12776 [17:48<41:14,  4.38it/s] 15%|█▌        | 1949/12776 [17:48<39:12,  4.60it/s]                                                     15%|█▌        | 1949/12776 [17:48<39:12,  4.60it/s] 15%|█▌        | 1950/12776 [17:49<1:09:09,  2.61it/s]                                                       15%|█▌        | 1950/12776 [17:49<1:09:09,  2.61it/s] 15%|█▌        | 1951/12776 [17:50<2:03:01,  1.47it/s]                                                       15%|█▌        | 1951/12776 [17:50<2:03:01,  1.47it/s] 15%|█▌        | 1952/12776 [17:51<2:14:46,  1.34it/s]                                                       15%|█▌        | 1952/12776 [17:51<2:14:46,  1.34it/s] 15%|█▌        | 1953/12776 [17:52<2:17:15,  1.31it/s]                                                       15%|█▌        | 1953/12776 [17:52<2:17:15,  1.31it/s] 15%|█▌        | 1954/12776 [17:53<2:17:17,  1.31it/s]                                                       15%|█▌        | 1954/12776 [17:53<2:17:17,  1.31it/s] 15%|█▌        | 1955/12776 [17:53<2:14:43,  1.34it/s]                                                       15%|█▌        | 1955/12776 [17:53<2:14:43,  1.34it/s] 15%|█▌        | 1956/12776 [17:54<2:10:17,  1.38it/s]                                                       15%|█▌        | 1956/12776 [17:54<2:10:17,  1.38it/s] 15%|█▌        | 1957/12776 [17:55<2:12:33,  1.36it/s]                                                       15%|█▌        | 1957/12776 [17:55<2:12:33,  1.36it/s] 15%|█▌        | 1958/12776 [17:55<2:05:42,  1.43it/s]                                                       15%|█▌        | 1958/12776 [17:55<2:05:42,  1.43it/s] 15%|█▌        | 1959/12776 [17:56<2:01:13,  1.49it/s]                                                       15%|█▌        | 1959/12776 [17:56<2:01:13,  1.49it/s] 15%|█▌        | 1960/12776 [17:57<1:54:48,  1.57it/s]                                                       15%|█▌        | 1960/12776 [17:57<1:54:48,  1.57it/s] 15%|█▌        | 1961/12776 [17:57<1:51:11,  1.62it/s]                                                       15%|█▌        | 1961/12776 [17:57<1:51:11,  1.62it/s] 15%|█▌        | 1962/12776 [17:58<1:45:08,  1.71it/s]                                                       15%|█▌        | 1962/12776 [17:58<1:45:08,  1.71it/s] 15%|█▌        | 1963/12776 [17:58<1:40:54,  1.79it/s]                                                       15%|█▌        | 1963/12776 [17:58<1:40:54,  1.79it/s] 15%|█▌        | 1964/12776 [17:59<1:34:57,  1.90it/s]                                                       15%|█▌        | 1964/12776 [17:59<1:34:57,  1.90it/s] 15%|█▌        | 1965/12776 [17:59<1:32:26,  1.95it/s]                                                       15%|█▌        | 1965/12776 [17:59<1:32:26,  1.95it/s] 15%|█▌        | 1966/12776 [18:00<1:26:51,  2.07it/s]                                                       15%|█▌        | 1966/12776 [18:00<1:26:51,  2.07it/s] 15%|█▌        | 1967/12776 [18:00<1:21:47,  2.20it/s]                                                       15%|█▌        | 1967/12776 [18:00<1:21:47,  2.20it/s] 15%|█▌        | 1968/12776 [18:00<1:23:56,  2.15it/s]                                                       15%|█▌        | 1968/12776 [18:00<1:23:56,  2.15it/s] 15%|█▌        | 1969/12776 [18:01<1:18:34,  2.29it/s]                                                       15%|█▌        | 1969/12776 [18:01<1:18:34,  2.29it/s] 15%|█▌        | 1970/12776 [18:01<1:13:54,  2.44it/s]                                                       15%|█▌        | 1970/12776 [18:01<1:13:54,  2.44it/s] 15%|█▌        | 1971/12776 [18:02<1:14:09,  2.43it/s]                                                       15%|█▌        | 1971/12776 [18:02<1:14:09,  2.43it/s] 15%|█▌        | 1972/12776 [18:02<1:09:48,  2.58it/s]                                                       15%|█▌        | 1972/12776 [18:02<1:09:48,  2.58it/s] 15%|█▌        | 1973/12776 [18:02<1:06:36,  2.70it/s]                                                       15%|█▌        | 1973/12776 [18:02<1:06:36,  2.70it/s] 15%|█▌        | 1974/12776 [18:03<1:05:18,  2.76it/s]                                                       15%|█▌        | 1974/12776 [18:03<1:05:18,  2.76it/s] 15%|█▌        | 1975/12776 [18:03<1:02:06,  2.90it/s]                                                       15%|█▌        | 1975/12776 [18:03<1:02:06,  2.90it/s] 15%|█▌        | 1976/12776 [18:03<59:25,  3.03it/s]                                                       15%|█▌        | 1976/12776 [18:03<59:25,  3.03it/s] 15%|█▌        | 1977/12776 [18:03<57:15,  3.14it/s]                                                     15%|█▌        | 1977/12776 [18:03<57:15,  3.14it/s] 15%|█▌        | 1978/12776 [18:04<1:01:24,  2.93it/s]                                                       15%|█▌        | 1978/12776 [18:04<1:01:24,  2.93it/s] 15%|█▌        | 1979/12776 [18:04<57:57,  3.10it/s]                                                       15%|█▌        | 1979/12776 [18:04<57:57,  3.10it/s] 15%|█▌        | 1980/12776 [18:04<55:07,  3.26it/s]                                                     15%|█▌        | 1980/12776 [18:04<55:07,  3.26it/s] 16%|█▌        | 1981/12776 [18:05<52:41,  3.41it/s]                                                     16%|█▌        | 1981/12776 [18:05<52:41,  3.41it/s] 16%|█▌        | 1982/12776 [18:05<56:00,  3.21it/s]                                                     16%|█▌        | 1982/12776 [18:05<56:00,  3.21it/s] 16%|█▌        | 1983/12776 [18:05<52:23,  3.43it/s]                                                     16%|█▌        | 1983/12776 [18:05<52:23,  3.43it/s] 16%|█▌        | 1984/12776 [18:05<49:34,  3.63it/s]                                                     16%|█▌        | 1984/12776 [18:05<49:34,  3.63it/s] 16%|█▌        | 1985/12776 [18:06<47:04,  3.82it/s]                                                     16%|█▌        | 1985/12776 [18:06<47:04,  3.82it/s] 16%|█▌        | 1986/12776 [18:06<48:17,  3.72it/s]                                                     16%|█▌        | 1986/12776 [18:06<48:17,  3.72it/s] 16%|█▌        | 1987/12776 [18:06<45:30,  3.95it/s]                                                     16%|█▌        | 1987/12776 [18:06<45:30,  3.95it/s] 16%|█▌        | 1988/12776 [18:06<43:15,  4.16it/s]                                                     16%|█▌        | 1988/12776 [18:06<43:15,  4.16it/s] 16%|█▌        | 1989/12776 [18:07<41:33,  4.33it/s]                                                     16%|█▌        | 1989/12776 [18:07<41:33,  4.33it/s] 16%|█▌        | 1990/12776 [18:07<40:24,  4.45it/s]                                                     16%|█▌        | 1990/12776 [18:07<40:24,  4.45it/s] 16%|█▌        | 1991/12776 [18:07<44:24,  4.05it/s]                                                     16%|█▌        | 1991/12776 [18:07<44:24,  4.05it/s] 16%|█▌        | 1992/12776 [18:07<42:00,  4.28it/s]                                                     16%|█▌        | 1992/12776 [18:07<42:00,  4.28it/s] 16%|█▌        | 1993/12776 [18:08<40:13,  4.47it/s]                                                     16%|█▌        | 1993/12776 [18:08<40:13,  4.47it/s] 16%|█▌        | 1994/12776 [18:08<38:55,  4.62it/s]                                                     16%|█▌        | 1994/12776 [18:08<38:55,  4.62it/s] 16%|█▌        | 1995/12776 [18:08<37:51,  4.75it/s]                                                     16%|█▌        | 1995/12776 [18:08<37:51,  4.75it/s] 16%|█▌        | 1996/12776 [18:08<37:02,  4.85it/s]                                                     16%|█▌        | 1996/12776 [18:08<37:02,  4.85it/s] 16%|█▌        | 1997/12776 [18:08<39:07,  4.59it/s]                                                     16%|█▌        | 1997/12776 [18:08<39:07,  4.59it/s] 16%|█▌        | 1998/12776 [18:09<37:24,  4.80it/s]                                                     16%|█▌        | 1998/12776 [18:09<37:24,  4.80it/s] 16%|█▌        | 1999/12776 [18:09<36:06,  4.97it/s]                                                     16%|█▌        | 1999/12776 [18:09<36:06,  4.97it/s] 16%|█▌        | 2000/12776 [18:10<1:07:12,  2.67it/s]                                                       16%|█▌        | 2000/12776 [18:10<1:07:12,  2.67it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.944, 'grad_norm': 1.5390371084213257, 'learning_rate': 0.0002651515151515151, 'epoch': 0.3}
+{'loss': 1.0435, 'grad_norm': 1.9705681800842285, 'learning_rate': 0.0002651270772238514, 'epoch': 0.3}
+{'loss': 0.7962, 'grad_norm': 1.8132861852645874, 'learning_rate': 0.0002651026392961877, 'epoch': 0.3}
+{'loss': 1.2715, 'grad_norm': 3.0275063514709473, 'learning_rate': 0.00026507820136852393, 'epoch': 0.3}
+{'loss': 1.5763, 'grad_norm': 1.90733003616333, 'learning_rate': 0.0002650537634408602, 'epoch': 0.3}
+{'loss': 1.7163, 'grad_norm': 2.937795400619507, 'learning_rate': 0.00026502932551319643, 'epoch': 0.3}
+{'loss': 1.1086, 'grad_norm': 3.4657444953918457, 'learning_rate': 0.00026500488758553274, 'epoch': 0.3}
+{'loss': 1.3036, 'grad_norm': 2.1386613845825195, 'learning_rate': 0.000264980449657869, 'epoch': 0.3}
+{'loss': 1.1613, 'grad_norm': 3.0543034076690674, 'learning_rate': 0.00026495601173020524, 'epoch': 0.3}
+{'loss': 1.7809, 'grad_norm': 4.59813117980957, 'learning_rate': 0.00026493157380254155, 'epoch': 0.3}
+{'loss': 1.3399, 'grad_norm': 1.5989995002746582, 'learning_rate': 0.0002649071358748778, 'epoch': 0.3}
+{'loss': 1.2275, 'grad_norm': 3.1781742572784424, 'learning_rate': 0.00026488269794721405, 'epoch': 0.3}
+{'loss': 1.4865, 'grad_norm': 1.8627430200576782, 'learning_rate': 0.0002648582600195503, 'epoch': 0.3}
+{'loss': 1.359, 'grad_norm': 2.872692823410034, 'learning_rate': 0.00026483382209188655, 'epoch': 0.3}
+{'loss': 1.3721, 'grad_norm': 2.231991767883301, 'learning_rate': 0.00026480938416422286, 'epoch': 0.3}
+{'loss': 1.0801, 'grad_norm': 4.821183681488037, 'learning_rate': 0.0002647849462365591, 'epoch': 0.3}
+{'loss': 0.9191, 'grad_norm': 3.1386799812316895, 'learning_rate': 0.00026476050830889536, 'epoch': 0.3}
+{'loss': 1.0087, 'grad_norm': 1.6399694681167603, 'learning_rate': 0.00026473607038123167, 'epoch': 0.31}
+{'loss': 1.5208, 'grad_norm': 1.6671782732009888, 'learning_rate': 0.0002647116324535679, 'epoch': 0.31}
+{'loss': 0.4585, 'grad_norm': 0.6199254393577576, 'learning_rate': 0.00026468719452590417, 'epoch': 0.31}
+{'loss': 0.4495, 'grad_norm': 0.783209502696991, 'learning_rate': 0.0002646627565982404, 'epoch': 0.31}
+{'loss': 0.452, 'grad_norm': 0.6731662154197693, 'learning_rate': 0.0002646383186705767, 'epoch': 0.31}
+{'loss': 0.4503, 'grad_norm': 0.6478911638259888, 'learning_rate': 0.000264613880742913, 'epoch': 0.31}
+{'loss': 0.4677, 'grad_norm': 1.0483900308609009, 'learning_rate': 0.00026458944281524923, 'epoch': 0.31}
+{'loss': 0.4934, 'grad_norm': 0.8272550106048584, 'learning_rate': 0.00026456500488758553, 'epoch': 0.31}
+{'loss': 0.3189, 'grad_norm': 0.5366849899291992, 'learning_rate': 0.0002645405669599218, 'epoch': 0.31}
+{'loss': 0.4301, 'grad_norm': 2.845299243927002, 'learning_rate': 0.00026451612903225804, 'epoch': 0.31}
+{'loss': 0.5793, 'grad_norm': 1.312286615371704, 'learning_rate': 0.00026449169110459434, 'epoch': 0.31}
+{'loss': 0.4355, 'grad_norm': 1.1284027099609375, 'learning_rate': 0.00026446725317693054, 'epoch': 0.31}
+{'loss': 0.6103, 'grad_norm': 1.250265121459961, 'learning_rate': 0.00026444281524926684, 'epoch': 0.31}
+{'loss': 0.4448, 'grad_norm': 1.059794545173645, 'learning_rate': 0.0002644183773216031, 'epoch': 0.31}
+{'loss': 0.4652, 'grad_norm': 1.0672193765640259, 'learning_rate': 0.00026439393939393935, 'epoch': 0.31}
+{'loss': 0.855, 'grad_norm': 1.6198948621749878, 'learning_rate': 0.00026436950146627565, 'epoch': 0.31}
+{'loss': 0.6643, 'grad_norm': 1.385394811630249, 'learning_rate': 0.0002643450635386119, 'epoch': 0.31}
+{'loss': 0.7861, 'grad_norm': 1.2622168064117432, 'learning_rate': 0.00026432062561094815, 'epoch': 0.31}
+{'loss': 0.5213, 'grad_norm': 1.0766664743423462, 'learning_rate': 0.0002642961876832844, 'epoch': 0.31}
+{'loss': 0.6966, 'grad_norm': 1.1218162775039673, 'learning_rate': 0.0002642717497556207, 'epoch': 0.31}
+{'loss': 0.538, 'grad_norm': 1.4492287635803223, 'learning_rate': 0.00026424731182795696, 'epoch': 0.31}
+{'loss': 0.8476, 'grad_norm': 1.70439875125885, 'learning_rate': 0.0002642228739002932, 'epoch': 0.31}
+{'loss': 0.641, 'grad_norm': 1.5610077381134033, 'learning_rate': 0.0002641984359726295, 'epoch': 0.31}
+{'loss': 0.9228, 'grad_norm': 1.6430991888046265, 'learning_rate': 0.00026417399804496577, 'epoch': 0.31}
+{'loss': 1.2539, 'grad_norm': 3.3181350231170654, 'learning_rate': 0.000264149560117302, 'epoch': 0.31}
+{'loss': 0.8461, 'grad_norm': 1.5349977016448975, 'learning_rate': 0.00026412512218963833, 'epoch': 0.31}
+{'loss': 0.8088, 'grad_norm': 1.3889473676681519, 'learning_rate': 0.0002641006842619745, 'epoch': 0.31}
+{'loss': 1.0715, 'grad_norm': 3.5759482383728027, 'learning_rate': 0.00026407624633431083, 'epoch': 0.31}
+{'loss': 1.1101, 'grad_norm': 5.998546123504639, 'learning_rate': 0.0002640518084066471, 'epoch': 0.31}
+{'loss': 0.9514, 'grad_norm': 1.6314876079559326, 'learning_rate': 0.00026402737047898333, 'epoch': 0.31}
+{'loss': 0.9667, 'grad_norm': 1.3417260646820068, 'learning_rate': 0.00026400293255131964, 'epoch': 0.31}
+{'loss': 0.9157, 'grad_norm': 1.557165265083313, 'learning_rate': 0.0002639784946236559, 'epoch': 0.31}
+{'loss': 0.7029, 'grad_norm': 1.574750542640686, 'learning_rate': 0.00026395405669599214, 'epoch': 0.31}
+{'loss': 1.3352, 'grad_norm': 2.596886157989502, 'learning_rate': 0.00026392961876832845, 'epoch': 0.31}
+{'loss': 0.6754, 'grad_norm': 1.6578302383422852, 'learning_rate': 0.0002639051808406647, 'epoch': 0.31}
+{'loss': 1.0541, 'grad_norm': 1.7295254468917847, 'learning_rate': 0.00026388074291300095, 'epoch': 0.31}
+{'loss': 1.1757, 'grad_norm': 1.5874801874160767, 'learning_rate': 0.0002638563049853372, 'epoch': 0.31}
+{'loss': 1.4245, 'grad_norm': 2.339158535003662, 'learning_rate': 0.0002638318670576735, 'epoch': 0.31}
+{'loss': 1.1071, 'grad_norm': 1.606540560722351, 'learning_rate': 0.00026380742913000976, 'epoch': 0.31}
+{'loss': 1.4799, 'grad_norm': 3.946507215499878, 'learning_rate': 0.000263782991202346, 'epoch': 0.31}
+{'loss': 1.7003, 'grad_norm': 3.6631851196289062, 'learning_rate': 0.0002637585532746823, 'epoch': 0.31}
+{'loss': 1.3078, 'grad_norm': 2.883183240890503, 'learning_rate': 0.00026373411534701856, 'epoch': 0.31}
+{'loss': 1.4201, 'grad_norm': 1.4559465646743774, 'learning_rate': 0.0002637096774193548, 'epoch': 0.31}
+{'loss': 1.3366, 'grad_norm': 1.9634108543395996, 'learning_rate': 0.00026368523949169107, 'epoch': 0.31}
+{'loss': 1.6637, 'grad_norm': 2.3829219341278076, 'learning_rate': 0.0002636608015640273, 'epoch': 0.31}
+{'loss': 1.1262, 'grad_norm': 2.1768481731414795, 'learning_rate': 0.0002636363636363636, 'epoch': 0.31}
+{'loss': 1.3264, 'grad_norm': 2.132509231567383, 'learning_rate': 0.0002636119257086999, 'epoch': 0.31}
+{'loss': 0.7347, 'grad_norm': 3.3182246685028076, 'learning_rate': 0.0002635874877810361, 'epoch': 0.31}
+{'loss': 0.9086, 'grad_norm': 1.8568942546844482, 'learning_rate': 0.00026356304985337243, 'epoch': 0.31}
+{'loss': 1.2884, 'grad_norm': 2.9322288036346436, 'learning_rate': 0.0002635386119257087, 'epoch': 0.31}
+{'loss': 1.1595, 'grad_norm': 5.352544784545898, 'learning_rate': 0.00026351417399804493, 'epoch': 0.31}
+{'loss': 1.4768, 'grad_norm': 2.5578665733337402, 'learning_rate': 0.0002634897360703812, 'epoch': 0.31}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:09,  5.97it/s][A
+  0%|          | 3/774 [00:00<02:48,  4.57it/s][A
+  1%|          | 4/774 [00:00<03:16,  3.93it/s][A
+  1%|          | 5/774 [00:01<03:16,  3.92it/s][A
+  1%|          | 6/774 [00:01<03:29,  3.67it/s][A
+  1%|          | 7/774 [00:01<03:25,  3.74it/s][A
+  1%|          | 8/774 [00:02<03:29,  3.66it/s][A
+  1%|          | 9/774 [00:02<03:17,  3.88it/s][A
+  1%|▏         | 10/774 [00:02<03:16,  3.89it/s][A
+  1%|▏         | 11/774 [00:02<03:31,  3.62it/s][A
+  2%|▏         | 12/774 [00:03<03:16,  3.89it/s][A
+  2%|▏         | 13/774 [00:03<03:08,  4.03it/s][A
+  2%|▏         | 14/774 [00:03<03:22,  3.75it/s][A
+  2%|▏         | 15/774 [00:03<03:40,  3.44it/s][A
+  2%|▏         | 16/774 [00:04<03:36,  3.51it/s][A
+  2%|▏         | 17/774 [00:04<03:13,  3.91it/s][A
+  2%|▏         | 18/774 [00:04<03:06,  4.06it/s][A
+  2%|▏         | 19/774 [00:04<03:15,  3.85it/s][A
+  3%|▎         | 20/774 [00:05<03:12,  3.91it/s][A
+  3%|▎         | 21/774 [00:05<03:16,  3.83it/s][A
+  3%|▎         | 22/774 [00:05<03:21,  3.73it/s][A
+  3%|▎         | 23/774 [00:06<03:33,  3.52it/s][A
+  3%|▎         | 24/774 [00:06<03:32,  3.53it/s][A
+  3%|▎         | 25/774 [00:06<03:38,  3.43it/s][A
+  3%|▎         | 26/774 [00:06<03:36,  3.45it/s][A
+  3%|▎         | 27/774 [00:07<03:38,  3.43it/s][A
+  4%|▎         | 28/774 [00:07<03:43,  3.33it/s][A
+  4%|▎         | 29/774 [00:07<03:48,  3.26it/s][A
+  4%|▍         | 30/774 [00:08<03:35,  3.46it/s][A
+  4%|▍         | 31/774 [00:08<03:34,  3.46it/s][A
+  4%|▍         | 32/774 [00:08<04:09,  2.97it/s][A
+  4%|▍         | 33/774 [00:09<03:56,  3.13it/s][A
+  4%|▍         | 34/774 [00:09<03:42,  3.32it/s][A
+  5%|▍         | 35/774 [00:09<03:49,  3.22it/s][A
+  5%|▍         | 36/774 [00:10<03:47,  3.24it/s][A
+  5%|▍         | 37/774 [00:10<03:47,  3.24it/s][A
+  5%|▍         | 38/774 [00:10<03:38,  3.37it/s][A
+  5%|▌         | 39/774 [00:10<03:23,  3.61it/s][A
+  5%|▌         | 40/774 [00:11<03:26,  3.56it/s][A
+  5%|▌         | 41/774 [00:11<03:25,  3.56it/s][A
+  5%|▌         | 42/774 [00:11<03:14,  3.77it/s][A
+  6%|▌         | 43/774 [00:11<03:25,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:29,  3.48it/s][A
+  6%|▌         | 45/774 [00:12<03:18,  3.68it/s][A
+  6%|▌         | 46/774 [00:12<03:02,  3.98it/s][A
+  6%|▌         | 47/774 [00:12<02:51,  4.25it/s][A
+  6%|▌         | 48/774 [00:13<02:52,  4.20it/s][A
+  6%|▋         | 49/774 [00:13<02:54,  4.16it/s][A
+  6%|▋         | 50/774 [00:13<02:56,  4.10it/s][A
+  7%|▋         | 51/774 [00:13<02:58,  4.06it/s][A
+  7%|▋         | 52/774 [00:14<02:56,  4.09it/s][A
+  7%|▋         | 53/774 [00:14<03:05,  3.89it/s][A
+  7%|▋         | 54/774 [00:14<03:07,  3.85it/s][A
+  7%|▋         | 55/774 [00:14<03:16,  3.66it/s][A
+  7%|▋         | 56/774 [00:15<03:16,  3.64it/s][A
+  7%|▋         | 57/774 [00:15<03:23,  3.52it/s][A
+  7%|▋         | 58/774 [00:15<03:21,  3.55it/s][A
+  8%|▊         | 59/774 [00:16<03:06,  3.83it/s][A
+  8%|▊         | 60/774 [00:16<02:53,  4.12it/s][A
+  8%|▊         | 61/774 [00:16<02:31,  4.70it/s][A
+  8%|▊         | 62/774 [00:16<02:29,  4.75it/s][A
+  8%|▊         | 63/774 [00:16<02:55,  4.06it/s][A
+  8%|▊         | 64/774 [00:17<02:46,  4.26it/s][A
+  8%|▊         | 65/774 [00:17<02:48,  4.20it/s][A
+  9%|▊         | 66/774 [00:17<02:46,  4.26it/s][A
+  9%|▊         | 67/774 [00:17<02:40,  4.40it/s][A
+  9%|▉         | 68/774 [00:18<02:36,  4.51it/s][A
+  9%|▉         | 69/774 [00:18<02:28,  4.76it/s][A
+  9%|▉         | 70/774 [00:18<02:36,  4.50it/s][A
+  9%|▉         | 71/774 [00:18<02:31,  4.65it/s][A
+  9%|▉         | 72/774 [00:18<02:41,  4.34it/s][A
+  9%|▉         | 73/774 [00:19<02:50,  4.10it/s][A
+ 10%|▉         | 74/774 [00:19<02:57,  3.93it/s][A
+ 10%|▉         | 75/774 [00:19<03:04,  3.79it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.87it/s][A
+ 10%|▉         | 77/774 [00:20<03:13,  3.61it/s][A
+ 10%|█         | 78/774 [00:20<02:54,  3.99it/s][A
+ 10%|█         | 79/774 [00:20<02:42,  4.28it/s][A
+ 10%|█         | 80/774 [00:20<02:39,  4.35it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.02it/s][A
+ 11%|█         | 82/774 [00:21<02:18,  4.99it/s][A
+ 11%|█         | 83/774 [00:21<02:21,  4.90it/s][A
+ 11%|█         | 84/774 [00:21<02:27,  4.69it/s][A
+ 11%|█         | 85/774 [00:21<02:35,  4.42it/s][A
+ 11%|█         | 86/774 [00:22<02:42,  4.22it/s][A
+ 11%|█         | 87/774 [00:22<02:43,  4.19it/s][A
+ 11%|█▏        | 88/774 [00:22<02:31,  4.53it/s][A
+ 11%|█▏        | 89/774 [00:22<02:26,  4.68it/s][A
+ 12%|█▏        | 90/774 [00:23<02:34,  4.43it/s][A
+ 12%|█▏        | 91/774 [00:23<02:48,  4.06it/s][A
+ 12%|█▏        | 92/774 [00:23<03:02,  3.74it/s][A
+ 12%|█▏        | 93/774 [00:23<02:57,  3.85it/s][A
+ 12%|█▏        | 94/774 [00:24<03:00,  3.76it/s][A
+ 12%|█▏        | 95/774 [00:24<02:59,  3.78it/s][A
+ 12%|█▏        | 96/774 [00:24<02:54,  3.88it/s][A
+ 13%|█▎        | 97/774 [00:24<02:39,  4.23it/s][A
+ 13%|█▎        | 98/774 [00:25<02:30,  4.48it/s][A
+ 13%|█▎        | 99/774 [00:25<02:44,  4.11it/s][A
+ 13%|█▎        | 100/774 [00:25<02:55,  3.83it/s][A
+ 13%|█▎        | 101/774 [00:26<03:02,  3.70it/s][A
+ 13%|█▎        | 102/774 [00:26<03:12,  3.49it/s][A
+ 13%|█▎        | 103/774 [00:26<03:15,  3.43it/s][A
+ 13%|█▎        | 104/774 [00:26<03:15,  3.43it/s][A
+ 14%|█▎        | 105/774 [00:27<03:15,  3.42it/s][A
+ 14%|█▎        | 106/774 [00:27<03:35,  3.09it/s][A
+ 14%|█▍        | 107/774 [00:27<03:48,  2.92it/s][A
+ 14%|█▍        | 108/774 [00:28<03:38,  3.05it/s][A
+ 14%|█▍        | 109/774 [00:28<03:34,  3.10it/s][A
+ 14%|█▍        | 110/774 [00:28<03:25,  3.23it/s][A
+ 14%|█▍        | 111/774 [00:29<03:26,  3.21it/s][A
+ 14%|█▍        | 112/774 [00:29<03:15,  3.39it/s][A
+ 15%|█▍        | 113/774 [00:29<03:18,  3.32it/s][A
+ 15%|█▍        | 114/774 [00:30<03:22,  3.25it/s][A
+ 15%|█▍        | 115/774 [00:30<03:16,  3.36it/s][A
+ 15%|█▍        | 116/774 [00:30<03:01,  3.63it/s][A
+ 15%|█▌        | 117/774 [00:30<03:06,  3.53it/s][A
+ 15%|█▌        | 118/774 [00:31<03:05,  3.54it/s][A
+ 15%|█▌        | 119/774 [00:31<02:57,  3.69it/s][A
+ 16%|█▌        | 120/774 [00:31<03:08,  3.48it/s][A
+ 16%|█▌        | 121/774 [00:32<03:02,  3.57it/s][A
+ 16%|█▌        | 122/774 [00:32<03:06,  3.50it/s][A
+ 16%|█▌        | 123/774 [00:32<02:57,  3.67it/s][A
+ 16%|█▌        | 124/774 [00:32<02:58,  3.63it/s][A
+ 16%|█▌        | 125/774 [00:33<03:00,  3.59it/s][A
+ 16%|█▋        | 126/774 [00:33<03:08,  3.44it/s][A
+ 16%|█▋        | 127/774 [00:33<03:18,  3.27it/s][A
+ 17%|█▋        | 128/774 [00:34<03:08,  3.43it/s][A
+ 17%|█▋        | 129/774 [00:34<03:09,  3.41it/s][A
+ 17%|█▋        | 130/774 [00:34<03:16,  3.28it/s][A
+ 17%|█▋        | 131/774 [00:34<03:06,  3.44it/s][A
+ 17%|█▋        | 132/774 [00:35<03:07,  3.42it/s][A
+ 17%|█▋        | 133/774 [00:35<03:03,  3.49it/s][A
+ 17%|█▋        | 134/774 [00:35<03:03,  3.50it/s][A
+ 17%|█▋        | 135/774 [00:36<03:20,  3.19it/s][A
+ 18%|█▊        | 136/774 [00:36<03:27,  3.07it/s][A
+ 18%|█▊        | 137/774 [00:36<03:25,  3.10it/s][A
+ 18%|█▊        | 138/774 [00:37<03:21,  3.15it/s][A
+ 18%|█▊        | 139/774 [00:37<03:23,  3.13it/s][A
+ 18%|█▊        | 140/774 [00:37<03:19,  3.18it/s][A
+ 18%|█▊        | 141/774 [00:38<03:10,  3.33it/s][A
+ 18%|█▊        | 142/774 [00:38<03:20,  3.14it/s][A
+ 18%|█▊        | 143/774 [00:38<03:18,  3.18it/s][A
+ 19%|█▊        | 144/774 [00:38<03:08,  3.34it/s][A
+ 19%|█▊        | 145/774 [00:39<03:01,  3.47it/s][A
+ 19%|█▉        | 146/774 [00:39<02:50,  3.69it/s][A
+ 19%|█▉        | 147/774 [00:39<02:41,  3.89it/s][A
+ 19%|█▉        | 148/774 [00:39<02:50,  3.66it/s][A
+ 19%|█▉        | 149/774 [00:40<03:01,  3.44it/s][A
+ 19%|█▉        | 150/774 [00:40<03:05,  3.37it/s][A
+ 20%|█▉        | 151/774 [00:40<02:55,  3.56it/s][A
+ 20%|█▉        | 152/774 [00:41<02:47,  3.71it/s][A
+ 20%|█▉        | 153/774 [00:41<02:54,  3.57it/s][A
+ 20%|█▉        | 154/774 [00:41<02:49,  3.66it/s][A
+ 20%|██        | 155/774 [00:41<02:46,  3.73it/s][A
+ 20%|██        | 156/774 [00:42<02:40,  3.84it/s][A
+ 20%|██        | 157/774 [00:42<02:33,  4.01it/s][A
+ 20%|██        | 158/774 [00:42<02:36,  3.93it/s][A
+ 21%|██        | 159/774 [00:42<02:39,  3.86it/s][A
+ 21%|██        | 160/774 [00:43<02:31,  4.06it/s][A
+ 21%|██        | 161/774 [00:43<02:40,  3.81it/s][A
+ 21%|██        | 162/774 [00:43<02:46,  3.67it/s][A
+ 21%|██        | 163/774 [00:44<02:45,  3.68it/s][A
+ 21%|██        | 164/774 [00:44<02:39,  3.82it/s][A
+ 21%|██▏       | 165/774 [00:44<02:37,  3.86it/s][A
+ 21%|██▏       | 166/774 [00:44<02:41,  3.75it/s][A
+ 22%|██▏       | 167/774 [00:45<02:43,  3.72it/s][A
+ 22%|██▏       | 168/774 [00:45<02:34,  3.92it/s][A
+ 22%|██▏       | 169/774 [00:45<02:27,  4.10it/s][A
+ 22%|██▏       | 170/774 [00:45<02:36,  3.86it/s][A
+ 22%|██▏       | 171/774 [00:46<02:46,  3.61it/s][A
+ 22%|██▏       | 172/774 [00:46<02:54,  3.45it/s][A
+ 22%|██▏       | 173/774 [00:46<02:50,  3.53it/s][A
+ 22%|██▏       | 174/774 [00:46<02:43,  3.68it/s][A
+ 23%|██▎       | 175/774 [00:47<02:44,  3.64it/s][A
+ 23%|██▎       | 176/774 [00:47<02:37,  3.79it/s][A
+ 23%|██▎       | 177/774 [00:47<02:51,  3.48it/s][A
+ 23%|██▎       | 178/774 [00:48<02:36,  3.82it/s][A
+ 23%|██▎       | 179/774 [00:48<02:22,  4.17it/s][A
+ 23%|██▎       | 180/774 [00:48<02:17,  4.33it/s][A
+ 23%|██▎       | 181/774 [00:48<02:20,  4.21it/s][A
+ 24%|██▎       | 182/774 [00:48<02:24,  4.09it/s][A
+ 24%|██▎       | 183/774 [00:49<02:25,  4.06it/s][A
+ 24%|██▍       | 184/774 [00:49<02:36,  3.78it/s][A
+ 24%|██▍       | 185/774 [00:49<02:43,  3.60it/s][A
+ 24%|██▍       | 186/774 [00:50<02:43,  3.61it/s][A
+ 24%|██▍       | 187/774 [00:50<02:36,  3.75it/s][A
+ 24%|██▍       | 188/774 [00:50<02:35,  3.77it/s][A
+ 24%|██▍       | 189/774 [00:50<02:32,  3.84it/s][A
+ 25%|██▍       | 190/774 [00:51<02:27,  3.97it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.82it/s][A
+ 25%|██▍       | 192/774 [00:51<02:37,  3.71it/s][A
+ 25%|██▍       | 193/774 [00:51<02:40,  3.62it/s][A
+ 25%|██▌       | 194/774 [00:52<02:48,  3.43it/s][A
+ 25%|██▌       | 195/774 [00:52<02:57,  3.27it/s][A
+ 25%|██▌       | 196/774 [00:52<02:57,  3.25it/s][A
+ 25%|██▌       | 197/774 [00:53<02:53,  3.33it/s][A
+ 26%|██▌       | 198/774 [00:53<02:43,  3.51it/s][A
+ 26%|██▌       | 199/774 [00:53<02:44,  3.50it/s][A
+ 26%|██▌       | 200/774 [00:53<02:39,  3.60it/s][A
+ 26%|██▌       | 201/774 [00:54<02:36,  3.66it/s][A
+ 26%|██▌       | 202/774 [00:54<02:34,  3.71it/s][A
+ 26%|██▌       | 203/774 [00:54<02:27,  3.88it/s][A
+ 26%|██▋       | 204/774 [00:55<02:32,  3.73it/s][A
+ 26%|██▋       | 205/774 [00:55<02:41,  3.52it/s][A
+ 27%|██▋       | 206/774 [00:55<02:35,  3.65it/s][A
+ 27%|██▋       | 207/774 [00:55<02:32,  3.72it/s][A
+ 27%|██▋       | 208/774 [00:56<02:32,  3.71it/s][A
+ 27%|██▋       | 209/774 [00:56<02:30,  3.76it/s][A
+ 27%|██▋       | 210/774 [00:56<02:28,  3.80it/s][A
+ 27%|██▋       | 211/774 [00:56<02:25,  3.87it/s][A
+ 27%|██▋       | 212/774 [00:57<02:15,  4.16it/s][A
+ 28%|██▊       | 213/774 [00:57<01:59,  4.68it/s][A
+ 28%|██▊       | 214/774 [00:57<02:02,  4.57it/s][A
+ 28%|██▊       | 215/774 [00:57<02:01,  4.58it/s][A
+ 28%|██▊       | 216/774 [00:57<02:00,  4.62it/s][A
+ 28%|██▊       | 217/774 [00:58<02:04,  4.49it/s][A
+ 28%|██▊       | 218/774 [00:58<02:09,  4.28it/s][A
+ 28%|██▊       | 219/774 [00:58<02:18,  4.00it/s][A
+ 28%|██▊       | 220/774 [00:58<02:16,  4.05it/s][A
+ 29%|██▊       | 221/774 [00:59<02:22,  3.88it/s][A
+ 29%|██▊       | 222/774 [00:59<02:31,  3.64it/s][A
+ 29%|██▉       | 223/774 [00:59<02:49,  3.26it/s][A
+ 29%|██▉       | 224/774 [01:00<02:59,  3.07it/s][A
+ 29%|██▉       | 225/774 [01:00<03:09,  2.89it/s][A
+ 29%|██▉       | 226/774 [01:01<03:13,  2.83it/s][A
+ 29%|██▉       | 227/774 [01:01<03:09,  2.89it/s][A
+ 29%|██▉       | 228/774 [01:01<03:01,  3.00it/s][A
+ 30%|██▉       | 229/774 [01:02<03:14,  2.81it/s][A
+ 30%|██▉       | 230/774 [01:02<03:01,  3.00it/s][A
+ 30%|██▉       | 231/774 [01:02<02:58,  3.04it/s][A
+ 30%|██▉       | 232/774 [01:02<02:50,  3.18it/s][A
+ 30%|███       | 233/774 [01:03<03:05,  2.91it/s][A
+ 30%|███       | 234/774 [01:03<03:09,  2.85it/s][A
+ 30%|███       | 235/774 [01:04<03:08,  2.86it/s][A
+ 30%|███       | 236/774 [01:04<03:12,  2.80it/s][A
+ 31%|███       | 237/774 [01:04<03:07,  2.86it/s][A
+ 31%|███       | 238/774 [01:05<02:58,  3.01it/s][A
+ 31%|███       | 239/774 [01:05<02:56,  3.03it/s][A
+ 31%|███       | 240/774 [01:05<02:56,  3.03it/s][A
+ 31%|███       | 241/774 [01:06<02:59,  2.97it/s][A
+ 31%|███▏      | 242/774 [01:06<03:10,  2.80it/s][A
+ 31%|███▏      | 243/774 [01:06<03:20,  2.65it/s][A
+ 32%|███▏      | 244/774 [01:07<03:14,  2.72it/s][A
+ 32%|███▏      | 245/774 [01:07<03:06,  2.84it/s][A
+ 32%|███▏      | 246/774 [01:07<03:05,  2.85it/s][A
+ 32%|███▏      | 247/774 [01:08<03:46,  2.32it/s][A
+ 32%|███▏      | 248/774 [01:08<03:50,  2.28it/s][A
+ 32%|███▏      | 249/774 [01:09<03:25,  2.55it/s][A
+ 32%|███▏      | 250/774 [01:09<03:18,  2.64it/s][A
+ 32%|███▏      | 251/774 [01:09<03:16,  2.66it/s][A
+ 33%|███▎      | 252/774 [01:10<03:12,  2.71it/s][A
+ 33%|███▎      | 253/774 [01:10<03:10,  2.74it/s][A
+ 33%|███▎      | 254/774 [01:11<03:05,  2.81it/s][A
+ 33%|███▎      | 255/774 [01:11<03:01,  2.87it/s][A
+ 33%|███▎      | 256/774 [01:11<02:55,  2.95it/s][A
+ 33%|███▎      | 257/774 [01:12<02:53,  2.97it/s][A
+ 33%|███▎      | 258/774 [01:12<02:39,  3.24it/s][A
+ 33%|███▎      | 259/774 [01:12<02:22,  3.62it/s][A
+ 34%|███▎      | 260/774 [01:12<02:21,  3.63it/s][A
+ 34%|███▎      | 261/774 [01:13<02:26,  3.51it/s][A
+ 34%|███▍      | 262/774 [01:13<02:11,  3.89it/s][A
+ 34%|███▍      | 263/774 [01:13<02:04,  4.11it/s][A
+ 34%|███▍      | 264/774 [01:13<02:13,  3.82it/s][A
+ 34%|███▍      | 265/774 [01:13<02:07,  3.99it/s][A
+ 34%|███▍      | 266/774 [01:14<02:00,  4.20it/s][A
+ 34%|███▍      | 267/774 [01:14<01:59,  4.24it/s][A
+ 35%|███▍      | 268/774 [01:14<02:06,  4.01it/s][A
+ 35%|███▍      | 269/774 [01:14<02:11,  3.83it/s][A
+ 35%|███▍      | 270/774 [01:15<02:16,  3.69it/s][A
+ 35%|███▌      | 271/774 [01:15<02:13,  3.77it/s][A
+ 35%|███▌      | 272/774 [01:15<02:02,  4.09it/s][A
+ 35%|███▌      | 273/774 [01:15<01:58,  4.21it/s][A
+ 35%|███▌      | 274/774 [01:16<02:03,  4.05it/s][A
+ 36%|███▌      | 275/774 [01:16<01:56,  4.28it/s][A
+ 36%|███▌      | 276/774 [01:16<01:50,  4.50it/s][A
+ 36%|███▌      | 277/774 [01:16<01:54,  4.34it/s][A
+ 36%|███▌      | 278/774 [01:17<01:57,  4.23it/s][A
+ 36%|███▌      | 279/774 [01:17<01:51,  4.44it/s][A
+ 36%|███▌      | 280/774 [01:17<01:52,  4.39it/s][A
+ 36%|███▋      | 281/774 [01:17<02:03,  4.00it/s][A
+ 36%|███▋      | 282/774 [01:18<02:14,  3.66it/s][A
+ 37%|███▋      | 283/774 [01:18<02:10,  3.78it/s][A
+ 37%|███▋      | 284/774 [01:18<02:11,  3.74it/s][A
+ 37%|███▋      | 285/774 [01:18<02:03,  3.95it/s][A
+ 37%|███▋      | 286/774 [01:19<01:58,  4.10it/s][A
+ 37%|███▋      | 287/774 [01:19<02:10,  3.75it/s][A
+ 37%|███▋      | 288/774 [01:19<02:13,  3.63it/s][A
+ 37%|███▋      | 289/774 [01:20<02:11,  3.69it/s][A
+ 37%|███▋      | 290/774 [01:20<02:07,  3.80it/s][A
+ 38%|███▊      | 291/774 [01:20<02:06,  3.83it/s][A
+ 38%|███▊      | 292/774 [01:20<02:03,  3.92it/s][A
+ 38%|███▊      | 293/774 [01:20<01:52,  4.27it/s][A
+ 38%|███▊      | 294/774 [01:21<01:49,  4.40it/s][A
+ 38%|███▊      | 295/774 [01:21<01:47,  4.45it/s][A
+ 38%|███▊      | 296/774 [01:21<01:42,  4.67it/s][A
+ 38%|███▊      | 297/774 [01:21<01:36,  4.92it/s][A
+ 39%|███▊      | 298/774 [01:21<01:41,  4.67it/s][A
+ 39%|███▊      | 299/774 [01:22<01:44,  4.53it/s][A
+ 39%|███▉      | 300/774 [01:22<01:51,  4.24it/s][A
+ 39%|███▉      | 301/774 [01:22<01:44,  4.53it/s][A
+ 39%|███▉      | 302/774 [01:22<01:38,  4.77it/s][A
+ 39%|███▉      | 303/774 [01:23<01:36,  4.90it/s][A
+ 39%|███▉      | 304/774 [01:23<01:25,  5.52it/s][A
+ 39%|███▉      | 305/774 [01:23<01:23,  5.63it/s][A
+ 40%|███▉      | 306/774 [01:23<01:35,  4.91it/s][A
+ 40%|███▉      | 307/774 [01:23<01:40,  4.64it/s][A
+ 40%|███▉      | 308/774 [01:24<01:36,  4.85it/s][A
+ 40%|███▉      | 309/774 [01:24<01:36,  4.81it/s][A
+ 40%|████      | 310/774 [01:24<01:41,  4.59it/s][A
+ 40%|████      | 311/774 [01:24<01:39,  4.64it/s][A
+ 40%|████      | 312/774 [01:24<01:37,  4.76it/s][A
+ 40%|████      | 313/774 [01:25<01:37,  4.74it/s][A
+ 41%|████      | 314/774 [01:25<01:38,  4.65it/s][A
+ 41%|████      | 315/774 [01:25<01:47,  4.28it/s][A
+ 41%|████      | 316/774 [01:25<01:38,  4.65it/s][A
+ 41%|████      | 317/774 [01:25<01:31,  5.00it/s][A
+ 41%|████      | 318/774 [01:26<01:34,  4.81it/s][A
+ 41%|████      | 319/774 [01:26<01:37,  4.67it/s][A
+ 41%|████▏     | 320/774 [01:26<01:37,  4.63it/s][A
+ 41%|████▏     | 321/774 [01:26<01:28,  5.10it/s][A
+ 42%|████▏     | 322/774 [01:26<01:23,  5.40it/s][A
+ 42%|████▏     | 323/774 [01:27<01:15,  5.97it/s][A
+ 42%|████▏     | 324/774 [01:27<01:22,  5.47it/s][A
+ 42%|████▏     | 325/774 [01:27<01:26,  5.17it/s][A
+ 42%|████▏     | 326/774 [01:27<01:23,  5.35it/s][A
+ 42%|████▏     | 327/774 [01:27<01:26,  5.16it/s][A
+ 42%|████▏     | 328/774 [01:28<01:24,  5.30it/s][A
+ 43%|████▎     | 329/774 [01:28<01:32,  4.81it/s][A
+ 43%|████▎     | 330/774 [01:28<01:28,  5.00it/s][A
+ 43%|████▎     | 331/774 [01:28<01:20,  5.48it/s][A
+ 43%|████▎     | 332/774 [01:28<01:18,  5.62it/s][A
+ 43%|████▎     | 333/774 [01:28<01:21,  5.38it/s][A
+ 43%|████▎     | 334/774 [01:29<01:24,  5.19it/s][A
+ 43%|████▎     | 335/774 [01:29<01:25,  5.14it/s][A
+ 43%|████▎     | 336/774 [01:29<01:24,  5.17it/s][A
+ 44%|████▎     | 337/774 [01:29<01:18,  5.56it/s][A
+ 44%|████▎     | 338/774 [01:29<01:13,  5.94it/s][A
+ 44%|████▍     | 339/774 [01:30<01:09,  6.29it/s][A
+ 44%|████▍     | 340/774 [01:30<01:08,  6.29it/s][A
+ 44%|████▍     | 341/774 [01:30<01:25,  5.04it/s][A
+ 44%|████▍     | 342/774 [01:30<01:35,  4.54it/s][A
+ 44%|████▍     | 343/774 [01:30<01:36,  4.48it/s][A
+ 44%|████▍     | 344/774 [01:31<01:39,  4.30it/s][A
+ 45%|████▍     | 345/774 [01:31<01:43,  4.16it/s][A
+ 45%|████▍     | 346/774 [01:31<01:45,  4.07it/s][A
+ 45%|████▍     | 347/774 [01:31<01:42,  4.18it/s][A
+ 45%|████▍     | 348/774 [01:32<01:37,  4.35it/s][A
+ 45%|████▌     | 349/774 [01:32<01:33,  4.53it/s][A
+ 45%|████▌     | 350/774 [01:32<01:36,  4.40it/s][A
+ 45%|████▌     | 351/774 [01:32<01:36,  4.38it/s][A
+ 45%|████▌     | 352/774 [01:33<01:32,  4.56it/s][A
+ 46%|��███▌     | 353/774 [01:33<01:32,  4.55it/s][A
+ 46%|████▌     | 354/774 [01:33<01:32,  4.55it/s][A
+ 46%|████▌     | 355/774 [01:33<01:36,  4.33it/s][A
+ 46%|████▌     | 356/774 [01:34<01:46,  3.91it/s][A
+ 46%|████▌     | 357/774 [01:34<02:03,  3.38it/s][A
+ 46%|████▋     | 358/774 [01:34<02:07,  3.27it/s][A
+ 46%|████▋     | 359/774 [01:35<02:05,  3.31it/s][A
+ 47%|████▋     | 360/774 [01:35<02:05,  3.30it/s][A
+ 47%|████▋     | 361/774 [01:35<01:59,  3.46it/s][A
+ 47%|████▋     | 362/774 [01:35<02:05,  3.28it/s][A
+ 47%|████▋     | 363/774 [01:36<02:03,  3.32it/s][A
+ 47%|████▋     | 364/774 [01:36<02:05,  3.27it/s][A
+ 47%|████▋     | 365/774 [01:36<02:02,  3.34it/s][A
+ 47%|████▋     | 366/774 [01:37<01:53,  3.60it/s][A
+ 47%|████▋     | 367/774 [01:37<01:47,  3.78it/s][A
+ 48%|████▊     | 368/774 [01:37<01:45,  3.85it/s][A
+ 48%|████▊     | 369/774 [01:37<01:52,  3.60it/s][A
+ 48%|████▊     | 370/774 [01:38<02:06,  3.19it/s][A
+ 48%|████▊     | 371/774 [01:38<01:57,  3.44it/s][A
+ 48%|████▊     | 372/774 [01:38<01:57,  3.41it/s][A
+ 48%|████▊     | 373/774 [01:39<01:56,  3.44it/s][A
+ 48%|████▊     | 374/774 [01:39<01:53,  3.52it/s][A
+ 48%|████▊     | 375/774 [01:39<01:53,  3.52it/s][A
+ 49%|████▊     | 376/774 [01:39<01:57,  3.37it/s][A
+ 49%|████▊     | 377/774 [01:40<02:10,  3.04it/s][A
+ 49%|████▉     | 378/774 [01:40<02:10,  3.02it/s][A
+ 49%|████▉     | 379/774 [01:40<02:01,  3.25it/s][A
+ 49%|████▉     | 380/774 [01:41<01:51,  3.53it/s][A
+ 49%|████▉     | 381/774 [01:41<01:43,  3.80it/s][A
+ 49%|████▉     | 382/774 [01:41<01:40,  3.91it/s][A
+ 49%|████▉     | 383/774 [01:41<01:37,  3.99it/s][A
+ 50%|████▉     | 384/774 [01:42<01:45,  3.71it/s][A
+ 50%|████▉     | 385/774 [01:42<01:53,  3.42it/s][A
+ 50%|████▉     | 386/774 [01:42<01:46,  3.64it/s][A
+ 50%|█████     | 387/774 [01:43<01:39,  3.88it/s][A
+ 50%|█████     | 388/774 [01:43<01:44,  3.68it/s][A
+ 50%|█████     | 389/774 [01:43<01:41,  3.80it/s][A
+ 50%|█████     | 390/774 [01:43<01:54,  3.35it/s][A
+ 51%|█████     | 391/774 [01:44<01:55,  3.30it/s][A
+ 51%|█████     | 392/774 [01:44<01:46,  3.58it/s][A
+ 51%|█████     | 393/774 [01:44<01:38,  3.87it/s][A
+ 51%|█████     | 394/774 [01:44<01:38,  3.84it/s][A
+ 51%|█████     | 395/774 [01:45<01:46,  3.57it/s][A
+ 51%|█████     | 396/774 [01:45<01:42,  3.68it/s][A
+ 51%|█████▏    | 397/774 [01:45<01:46,  3.55it/s][A
+ 51%|█████▏    | 398/774 [01:46<01:42,  3.67it/s][A
+ 52%|█████▏    | 399/774 [01:46<01:42,  3.67it/s][A
+ 52%|█████▏    | 400/774 [01:46<01:34,  3.95it/s][A
+ 52%|█████▏    | 401/774 [01:46<01:30,  4.10it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:30,  4.13it/s][A
+ 52%|█████▏    | 403/774 [01:47<01:33,  3.96it/s][A
+ 52%|█████▏    | 404/774 [01:47<01:39,  3.71it/s][A
+ 52%|█████▏    | 405/774 [01:47<01:38,  3.76it/s][A
+ 52%|█████▏    | 406/774 [01:48<01:40,  3.67it/s][A
+ 53%|█████▎    | 407/774 [01:48<01:45,  3.47it/s][A
+ 53%|█████▎    | 408/774 [01:48<01:41,  3.61it/s][A
+ 53%|█████▎    | 409/774 [01:48<01:37,  3.76it/s][A
+ 53%|█████▎    | 410/774 [01:49<01:38,  3.71it/s][A
+ 53%|█████▎    | 411/774 [01:49<01:38,  3.69it/s][A
+ 53%|█████▎    | 412/774 [01:49<01:39,  3.64it/s][A
+ 53%|█████▎    | 413/774 [01:50<01:36,  3.72it/s][A
+ 53%|█████▎    | 414/774 [01:50<01:34,  3.82it/s][A
+ 54%|█████▎    | 415/774 [01:50<01:23,  4.31it/s][A
+ 54%|█████▎    | 416/774 [01:50<01:23,  4.27it/s][A
+ 54%|█████▍    | 417/774 [01:50<01:22,  4.31it/s][A
+ 54%|█████▍    | 418/774 [01:51<01:16,  4.65it/s][A
+ 54%|█████▍    | 419/774 [01:51<01:30,  3.92it/s][A
+ 54%|█████▍    | 420/774 [01:51<01:35,  3.72it/s][A
+ 54%|█████▍    | 421/774 [01:52<01:34,  3.72it/s][A
+ 55%|█████▍    | 422/774 [01:52<01:34,  3.72it/s][A
+ 55%|█████▍    | 423/774 [01:52<01:34,  3.70it/s][A
+ 55%|█████▍    | 424/774 [01:52<01:33,  3.76it/s][A
+ 55%|█████▍    | 425/774 [01:53<01:22,  4.23it/s][A
+ 55%|█████▌    | 426/774 [01:53<01:16,  4.55it/s][A
+ 55%|█████▌    | 427/774 [01:53<01:12,  4.75it/s][A
+ 55%|█████▌    | 428/774 [01:53<01:14,  4.63it/s][A
+ 55%|█████▌    | 429/774 [01:53<01:17,  4.47it/s][A
+ 56%|█████▌    | 430/774 [01:54<01:21,  4.20it/s][A
+ 56%|█████▌    | 431/774 [01:54<01:34,  3.63it/s][A
+ 56%|█████▌    | 432/774 [01:54<01:32,  3.68it/s][A
+ 56%|█████▌    | 433/774 [01:54<01:26,  3.95it/s][A
+ 56%|█████▌    | 434/774 [01:55<01:21,  4.18it/s][A
+ 56%|█████▌    | 435/774 [01:55<01:20,  4.21it/s][A
+ 56%|█████▋    | 436/774 [01:55<01:22,  4.12it/s][A
+ 56%|█████▋    | 437/774 [01:55<01:18,  4.29it/s][A
+ 57%|█████▋    | 438/774 [01:56<01:14,  4.49it/s][A
+ 57%|█████▋    | 439/774 [01:56<01:17,  4.30it/s][A
+ 57%|█████▋    | 440/774 [01:56<01:21,  4.09it/s][A
+ 57%|█████▋    | 441/774 [01:56<01:25,  3.88it/s][A
+ 57%|█████▋    | 442/774 [01:57<01:26,  3.82it/s][A
+ 57%|█████▋    | 443/774 [01:57<01:24,  3.90it/s][A
+ 57%|█████▋    | 444/774 [01:57<01:23,  3.95it/s][A
+ 57%|█████▋    | 445/774 [01:57<01:23,  3.93it/s][A
+ 58%|█████▊    | 446/774 [01:58<01:21,  4.04it/s][A
+ 58%|█████▊    | 447/774 [01:58<01:19,  4.11it/s][A
+ 58%|█████▊    | 448/774 [01:58<01:12,  4.49it/s][A
+ 58%|█████▊    | 449/774 [01:58<01:13,  4.41it/s][A
+ 58%|█████▊    | 450/774 [01:59<01:16,  4.25it/s][A
+ 58%|█████▊    | 451/774 [01:59<01:14,  4.34it/s][A
+ 58%|█████▊    | 452/774 [01:59<01:09,  4.62it/s][A
+ 59%|█████▊    | 453/774 [01:59<01:08,  4.66it/s][A
+ 59%|█████▊    | 454/774 [01:59<01:14,  4.28it/s][A
+ 59%|█████▉    | 455/774 [02:00<01:19,  4.02it/s][A
+ 59%|█████▉    | 456/774 [02:00<01:24,  3.77it/s][A
+ 59%|█████▉    | 457/774 [02:00<01:18,  4.06it/s][A
+ 59%|█████▉    | 458/774 [02:00<01:18,  4.03it/s][A
+ 59%|█████▉    | 459/774 [02:01<01:16,  4.13it/s][A
+ 59%|█████▉    | 460/774 [02:01<01:21,  3.86it/s][A
+ 60%|█████▉    | 461/774 [02:01<01:27,  3.58it/s][A
+ 60%|█████▉    | 462/774 [02:02<01:25,  3.65it/s][A
+ 60%|█████▉    | 463/774 [02:02<01:22,  3.76it/s][A
+ 60%|█████▉    | 464/774 [02:02<01:22,  3.75it/s][A
+ 60%|██████    | 465/774 [02:02<01:14,  4.16it/s][A
+ 60%|██████    | 466/774 [02:02<01:10,  4.35it/s][A
+ 60%|██████    | 467/774 [02:03<01:06,  4.60it/s][A
+ 60%|██████    | 468/774 [02:03<01:07,  4.56it/s][A
+ 61%|██████    | 469/774 [02:03<01:01,  4.97it/s][A
+ 61%|██████    | 470/774 [02:03<00:58,  5.16it/s][A
+ 61%|██████    | 471/774 [02:03<01:01,  4.96it/s][A
+ 61%|██████    | 472/774 [02:04<01:05,  4.62it/s][A
+ 61%|██████    | 473/774 [02:04<01:08,  4.40it/s][A
+ 61%|██████    | 474/774 [02:04<01:06,  4.48it/s][A
+ 61%|██████▏   | 475/774 [02:04<01:08,  4.39it/s][A
+ 61%|██████▏   | 476/774 [02:05<01:16,  3.91it/s][A
+ 62%|██████▏   | 477/774 [02:05<01:30,  3.29it/s][A
+ 62%|██████▏   | 478/774 [02:05<01:31,  3.25it/s][A
+ 62%|██████▏   | 479/774 [02:06<01:28,  3.32it/s][A
+ 62%|██████▏   | 480/774 [02:06<01:25,  3.45it/s][A
+ 62%|██████▏   | 481/774 [02:06<01:26,  3.39it/s][A
+ 62%|██████▏   | 482/774 [02:07<01:24,  3.45it/s][A
+ 62%|██████▏   | 483/774 [02:07<01:22,  3.53it/s][A
+ 63%|██████▎   | 484/774 [02:07<01:23,  3.46it/s][A
+ 63%|██████▎   | 485/774 [02:07<01:26,  3.35it/s][A
+ 63%|██████▎   | 486/774 [02:08<01:22,  3.47it/s][A
+ 63%|██████▎   | 487/774 [02:08<01:24,  3.41it/s][A
+ 63%|██████▎   | 488/774 [02:08<01:21,  3.52it/s][A
+ 63%|██████▎   | 489/774 [02:09<01:16,  3.71it/s][A
+ 63%|██████▎   | 490/774 [02:09<01:16,  3.69it/s][A
+ 63%|██████▎   | 491/774 [02:09<01:15,  3.73it/s][A
+ 64%|██████▎   | 492/774 [02:09<01:17,  3.65it/s][A
+ 64%|██████▎   | 493/774 [02:10<01:18,  3.60it/s][A
+ 64%|██████▍   | 494/774 [02:10<01:16,  3.64it/s][A
+ 64%|██████▍   | 495/774 [02:10<01:16,  3.65it/s][A
+ 64%|██████▍   | 496/774 [02:11<01:21,  3.41it/s][A
+ 64%|██████▍   | 497/774 [02:11<01:22,  3.35it/s][A
+ 64%|██████▍   | 498/774 [02:11<01:21,  3.40it/s][A
+ 64%|██████▍   | 499/774 [02:11<01:19,  3.48it/s][A
+ 65%|██████▍   | 500/774 [02:12<01:16,  3.57it/s][A
+ 65%|██████▍   | 501/774 [02:12<01:13,  3.70it/s][A
+ 65%|██████▍   | 502/774 [02:12<01:13,  3.71it/s][A
+ 65%|██████▍   | 503/774 [02:13<01:19,  3.41it/s][A
+ 65%|██████▌   | 504/774 [02:13<01:21,  3.31it/s][A
+ 65%|██████▌   | 505/774 [02:13<01:18,  3.44it/s][A
+ 65%|█████��▌   | 506/774 [02:13<01:18,  3.42it/s][A
+ 66%|██████▌   | 507/774 [02:14<01:22,  3.22it/s][A
+ 66%|██████▌   | 508/774 [02:14<01:20,  3.31it/s][A
+ 66%|██████▌   | 509/774 [02:14<01:19,  3.35it/s][A
+ 66%|██████▌   | 510/774 [02:15<01:16,  3.46it/s][A
+ 66%|██████▌   | 511/774 [02:15<01:12,  3.63it/s][A
+ 66%|██████▌   | 512/774 [02:15<01:10,  3.72it/s][A
+ 66%|██████▋   | 513/774 [02:15<01:14,  3.53it/s][A
+ 66%|██████▋   | 514/774 [02:16<01:15,  3.44it/s][A
+ 67%|██████▋   | 515/774 [02:16<01:21,  3.18it/s][A
+ 67%|██████▋   | 516/774 [02:16<01:15,  3.40it/s][A
+ 67%|██████▋   | 517/774 [02:17<01:09,  3.69it/s][A
+ 67%|██████▋   | 518/774 [02:17<01:06,  3.82it/s][A
+ 67%|██████▋   | 519/774 [02:17<01:09,  3.67it/s][A
+ 67%|██████▋   | 520/774 [02:17<01:08,  3.69it/s][A
+ 67%|██████▋   | 521/774 [02:18<01:06,  3.82it/s][A
+ 67%|██████▋   | 522/774 [02:18<01:02,  4.03it/s][A
+ 68%|██████▊   | 523/774 [02:18<01:00,  4.13it/s][A
+ 68%|██████▊   | 524/774 [02:18<01:05,  3.83it/s][A
+ 68%|██████▊   | 525/774 [02:19<01:06,  3.75it/s][A
+ 68%|██████▊   | 526/774 [02:19<01:09,  3.59it/s][A
+ 68%|██████▊   | 527/774 [02:19<01:10,  3.51it/s][A
+ 68%|██████▊   | 528/774 [02:20<01:09,  3.52it/s][A
+ 68%|██████▊   | 529/774 [02:20<01:06,  3.71it/s][A
+ 68%|██████▊   | 530/774 [02:20<01:04,  3.77it/s][A
+ 69%|██████▊   | 531/774 [02:20<01:04,  3.79it/s][A
+ 69%|██████▊   | 532/774 [02:21<01:02,  3.89it/s][A
+ 69%|██████▉   | 533/774 [02:21<00:58,  4.09it/s][A
+ 69%|██████▉   | 534/774 [02:21<00:56,  4.28it/s][A
+ 69%|██████▉   | 535/774 [02:21<00:58,  4.09it/s][A
+ 69%|██████▉   | 536/774 [02:21<01:00,  3.93it/s][A
+ 69%|██████▉   | 537/774 [02:22<01:01,  3.88it/s][A
+ 70%|██████▉   | 538/774 [02:22<01:04,  3.64it/s][A
+ 70%|██████▉   | 539/774 [02:22<01:04,  3.65it/s][A
+ 70%|██████▉   | 540/774 [02:23<01:03,  3.66it/s][A
+ 70%|██████▉   | 541/774 [02:23<01:01,  3.77it/s][A
+ 70%|███████   | 542/774 [02:23<01:01,  3.77it/s][A
+ 70%|███████   | 543/774 [02:23<01:02,  3.68it/s][A
+ 70%|███████   | 544/774 [02:24<01:02,  3.66it/s][A
+ 70%|███████   | 545/774 [02:24<01:00,  3.79it/s][A
+ 71%|███████   | 546/774 [02:24<00:56,  4.00it/s][A
+ 71%|███████   | 547/774 [02:24<00:54,  4.16it/s][A
+ 71%|███████   | 548/774 [02:25<00:53,  4.21it/s][A
+ 71%|███████   | 549/774 [02:25<00:54,  4.15it/s][A
+ 71%|███████   | 550/774 [02:25<00:57,  3.87it/s][A
+ 71%|███████   | 551/774 [02:25<01:00,  3.69it/s][A
+ 71%|███████▏  | 552/774 [02:26<01:03,  3.50it/s][A
+ 71%|███████▏  | 553/774 [02:26<01:07,  3.27it/s][A
+ 72%|███████▏  | 554/774 [02:26<01:06,  3.32it/s][A
+ 72%|███████▏  | 555/774 [02:27<01:05,  3.32it/s][A
+ 72%|███████▏  | 556/774 [02:27<01:02,  3.50it/s][A
+ 72%|███████▏  | 557/774 [02:27<01:05,  3.29it/s][A
+ 72%|███████▏  | 558/774 [02:28<01:00,  3.60it/s][A
+ 72%|███████▏  | 559/774 [02:28<00:55,  3.90it/s][A
+ 72%|███████▏  | 560/774 [02:28<00:59,  3.59it/s][A
+ 72%|███████▏  | 561/774 [02:28<00:56,  3.79it/s][A
+ 73%|███████▎  | 562/774 [02:28<00:51,  4.10it/s][A
+ 73%|███████▎  | 563/774 [02:29<00:49,  4.25it/s][A
+ 73%|███████▎  | 564/774 [02:29<00:51,  4.10it/s][A
+ 73%|███████▎  | 565/774 [02:29<00:52,  3.94it/s][A
+ 73%|███████▎  | 566/774 [02:29<00:49,  4.24it/s][A
+ 73%|███████▎  | 567/774 [02:30<00:45,  4.58it/s][A
+ 73%|███████▎  | 568/774 [02:30<00:46,  4.39it/s][A
+ 74%|███████▎  | 569/774 [02:30<00:47,  4.31it/s][A
+ 74%|███████▎  | 570/774 [02:30<00:47,  4.34it/s][A
+ 74%|███████▍  | 571/774 [02:31<00:51,  3.97it/s][A
+ 74%|███████▍  | 572/774 [02:31<00:53,  3.80it/s][A
+ 74%|███████▍  | 573/774 [02:31<00:52,  3.81it/s][A
+ 74%|███████▍  | 574/774 [02:31<00:51,  3.92it/s][A
+ 74%|███████▍  | 575/774 [02:32<00:50,  3.94it/s][A
+ 74%|███████▍  | 576/774 [02:32<00:55,  3.54it/s][A
+ 75%|███████▍  | 577/774 [02:32<00:53,  3.66it/s][A
+ 75%|███████▍  | 578/774 [02:33<00:52,  3.71it/s][A
+ 75%|███████▍  | 579/774 [02:33<00:54,  3.56it/s][A
+ 75%|███████▍  | 580/774 [02:33<00:54,  3.56it/s][A
+ 75%|███████▌  | 581/774 [02:33<00:53,  3.59it/s][A
+ 75%|███████▌  | 582/774 [02:34<00:52,  3.69it/s][A
+ 75%|███████▌  | 583/774 [02:34<00:49,  3.83it/s][A
+ 75%|███████▌  | 584/774 [02:34<00:49,  3.85it/s][A
+ 76%|███████▌  | 585/774 [02:34<00:51,  3.68it/s][A
+ 76%|███████▌  | 586/774 [02:35<00:51,  3.62it/s][A
+ 76%|███████▌  | 587/774 [02:35<00:50,  3.69it/s][A
+ 76%|███████▌  | 588/774 [02:35<00:49,  3.76it/s][A
+ 76%|███████▌  | 589/774 [02:35<00:48,  3.83it/s][A
+ 76%|███████▌  | 590/774 [02:36<00:44,  4.10it/s][A
+ 76%|███████▋  | 591/774 [02:36<00:46,  3.96it/s][A
+ 76%|███████▋  | 592/774 [02:36<00:48,  3.72it/s][A
+ 77%|███████▋  | 593/774 [02:37<00:49,  3.65it/s][A
+ 77%|███████▋  | 594/774 [02:37<00:49,  3.61it/s][A
+ 77%|███████▋  | 595/774 [02:37<00:53,  3.34it/s][A
+ 77%|███████▋  | 596/774 [02:38<00:55,  3.19it/s][A
+ 77%|███████▋  | 597/774 [02:38<00:55,  3.16it/s][A
+ 77%|███████▋  | 598/774 [02:38<00:56,  3.10it/s][A
+ 77%|███████▋  | 599/774 [02:39<00:57,  3.05it/s][A
+ 78%|███████▊  | 600/774 [02:39<00:57,  3.03it/s][A
+ 78%|███████▊  | 601/774 [02:39<00:57,  3.01it/s][A
+ 78%|███████▊  | 602/774 [02:40<00:57,  3.00it/s][A
+ 78%|███████▊  | 603/774 [02:40<00:56,  3.05it/s][A
+ 78%|███████▊  | 604/774 [02:40<00:56,  3.00it/s][A
+ 78%|███████▊  | 605/774 [02:41<00:55,  3.06it/s][A
+ 78%|███████▊  | 606/774 [02:41<00:56,  2.98it/s][A
+ 78%|███████▊  | 607/774 [02:41<00:55,  3.00it/s][A
+ 79%|███████▊  | 608/774 [02:42<00:55,  2.99it/s][A
+ 79%|███████▊  | 609/774 [02:42<00:52,  3.12it/s][A
+ 79%|███████▉  | 610/774 [02:42<00:53,  3.04it/s][A
+ 79%|███████▉  | 611/774 [02:43<00:58,  2.79it/s][A
+ 79%|███████▉  | 612/774 [02:43<01:00,  2.68it/s][A
+ 79%|███████▉  | 613/774 [02:43<00:56,  2.87it/s][A
+ 79%|███████▉  | 614/774 [02:44<00:54,  2.92it/s][A
+ 79%|███████▉  | 615/774 [02:44<00:52,  3.05it/s][A
+ 80%|███████▉  | 616/774 [02:44<00:51,  3.10it/s][A
+ 80%|███████▉  | 617/774 [02:45<00:50,  3.12it/s][A
+ 80%|███████▉  | 618/774 [02:45<00:47,  3.27it/s][A
+ 80%|███████▉  | 619/774 [02:45<00:45,  3.44it/s][A
+ 80%|████████  | 620/774 [02:45<00:44,  3.48it/s][A
+ 80%|████████  | 621/774 [02:46<00:40,  3.76it/s][A
+ 80%|████████  | 622/774 [02:46<00:37,  4.01it/s][A
+ 80%|████████  | 623/774 [02:46<00:37,  3.98it/s][A
+ 81%|████████  | 624/774 [02:46<00:40,  3.66it/s][A
+ 81%|████████  | 625/774 [02:47<00:40,  3.64it/s][A
+ 81%|████████  | 626/774 [02:47<00:44,  3.36it/s][A
+ 81%|████████  | 627/774 [02:47<00:44,  3.27it/s][A
+ 81%|████████  | 628/774 [02:48<00:44,  3.27it/s][A
+ 81%|████████▏ | 629/774 [02:48<00:43,  3.37it/s][A
+ 81%|████████▏ | 630/774 [02:48<00:40,  3.60it/s][A
+ 82%|████████▏ | 631/774 [02:48<00:37,  3.79it/s][A
+ 82%|████████▏ | 632/774 [02:49<00:37,  3.80it/s][A
+ 82%|████████▏ | 633/774 [02:49<00:38,  3.64it/s][A
+ 82%|████████▏ | 634/774 [02:49<00:39,  3.53it/s][A
+ 82%|████████▏ | 635/774 [02:50<00:38,  3.57it/s][A
+ 82%|████████▏ | 636/774 [02:50<00:39,  3.50it/s][A
+ 82%|████████▏ | 637/774 [02:50<00:38,  3.57it/s][A
+ 82%|████████▏ | 638/774 [02:50<00:38,  3.54it/s][A
+ 83%|████████▎ | 639/774 [02:51<00:43,  3.12it/s][A
+ 83%|████████▎ | 640/774 [02:51<00:49,  2.71it/s][A
+ 83%|████████▎ | 641/774 [02:52<00:48,  2.73it/s][A
+ 83%|████████▎ | 642/774 [02:52<00:45,  2.90it/s][A
+ 83%|████████▎ | 643/774 [02:52<00:44,  2.93it/s][A
+ 83%|████████▎ | 644/774 [02:52<00:40,  3.18it/s][A
+ 83%|████████▎ | 645/774 [02:53<00:37,  3.48it/s][A
+ 83%|████████▎ | 646/774 [02:53<00:34,  3.71it/s][A
+ 84%|████████▎ | 647/774 [02:53<00:31,  3.98it/s][A
+ 84%|████████▎ | 648/774 [02:53<00:30,  4.14it/s][A
+ 84%|████████▍ | 649/774 [02:54<00:30,  4.16it/s][A
+ 84%|████████▍ | 650/774 [02:54<00:28,  4.38it/s][A
+ 84%|███���████▍ | 651/774 [02:54<00:28,  4.34it/s][A
+ 84%|████████▍ | 652/774 [02:54<00:29,  4.18it/s][A
+ 84%|████████▍ | 653/774 [02:55<00:30,  3.91it/s][A
+ 84%|████████▍ | 654/774 [02:55<00:28,  4.18it/s][A
+ 85%|████████▍ | 655/774 [02:55<00:26,  4.49it/s][A
+ 85%|████████▍ | 656/774 [02:55<00:27,  4.30it/s][A
+ 85%|████████▍ | 657/774 [02:55<00:25,  4.51it/s][A
+ 85%|████████▌ | 658/774 [02:56<00:26,  4.30it/s][A
+ 85%|████████▌ | 659/774 [02:56<00:28,  3.99it/s][A
+ 85%|████████▌ | 660/774 [02:56<00:29,  3.83it/s][A
+ 85%|████████▌ | 661/774 [02:57<00:30,  3.76it/s][A
+ 86%|████████▌ | 662/774 [02:57<00:28,  3.95it/s][A
+ 86%|████████▌ | 663/774 [02:57<00:29,  3.72it/s][A
+ 86%|████████▌ | 664/774 [02:57<00:29,  3.68it/s][A
+ 86%|████████▌ | 665/774 [02:58<00:27,  3.97it/s][A
+ 86%|████████▌ | 666/774 [02:58<00:24,  4.39it/s][A
+ 86%|████████▌ | 667/774 [02:58<00:22,  4.66it/s][A
+ 86%|████████▋ | 668/774 [02:58<00:23,  4.49it/s][A
+ 86%|████████▋ | 669/774 [02:58<00:24,  4.25it/s][A
+ 87%|████████▋ | 670/774 [02:59<00:23,  4.38it/s][A
+ 87%|████████▋ | 671/774 [02:59<00:25,  3.98it/s][A
+ 87%|████████▋ | 672/774 [02:59<00:25,  4.05it/s][A
+ 87%|████████▋ | 673/774 [02:59<00:24,  4.16it/s][A
+ 87%|████████▋ | 674/774 [03:00<00:24,  4.10it/s][A
+ 87%|████████▋ | 675/774 [03:00<00:22,  4.31it/s][A
+ 87%|████████▋ | 676/774 [03:00<00:21,  4.53it/s][A
+ 87%|████████▋ | 677/774 [03:00<00:21,  4.49it/s][A
+ 88%|████████▊ | 678/774 [03:00<00:21,  4.54it/s][A
+ 88%|████████▊ | 679/774 [03:01<00:22,  4.29it/s][A
+ 88%|████████▊ | 680/774 [03:01<00:22,  4.24it/s][A
+ 88%|████████▊ | 681/774 [03:01<00:20,  4.52it/s][A
+ 88%|████████▊ | 682/774 [03:01<00:20,  4.56it/s][A
+ 88%|████████▊ | 683/774 [03:02<00:21,  4.19it/s][A
+ 88%|████████▊ | 684/774 [03:02<00:22,  3.93it/s][A
+ 89%|████████▊ | 685/774 [03:02<00:23,  3.75it/s][A
+ 89%|████████▊ | 686/774 [03:03<00:22,  3.87it/s][A
+ 89%|████████▉ | 687/774 [03:03<00:21,  4.11it/s][A
+ 89%|████████▉ | 688/774 [03:03<00:20,  4.12it/s][A
+ 89%|████████▉ | 689/774 [03:03<00:19,  4.27it/s][A
+ 89%|████████▉ | 690/774 [03:03<00:19,  4.38it/s][A
+ 89%|████████▉ | 691/774 [03:04<00:18,  4.48it/s][A
+ 89%|████████▉ | 692/774 [03:04<00:18,  4.53it/s][A
+ 90%|████████▉ | 693/774 [03:04<00:17,  4.54it/s][A
+ 90%|████████▉ | 694/774 [03:04<00:18,  4.24it/s][A
+ 90%|████████▉ | 695/774 [03:05<00:20,  3.91it/s][A
+ 90%|████████▉ | 696/774 [03:05<00:19,  4.01it/s][A
+ 90%|█████████ | 697/774 [03:05<00:19,  4.02it/s][A
+ 90%|█████████ | 698/774 [03:05<00:17,  4.44it/s][A
+ 90%|█████████ | 699/774 [03:05<00:15,  4.82it/s][A
+ 90%|█████████ | 700/774 [03:06<00:16,  4.41it/s][A
+ 91%|█████████ | 701/774 [03:06<00:16,  4.48it/s][A
+ 91%|█████████ | 702/774 [03:06<00:16,  4.47it/s][A
+ 91%|█████████ | 703/774 [03:06<00:15,  4.46it/s][A
+ 91%|█████████ | 704/774 [03:07<00:16,  4.30it/s][A
+ 91%|█████████ | 705/774 [03:07<00:14,  4.66it/s][A
+ 91%|█████████ | 706/774 [03:07<00:14,  4.83it/s][A
+ 91%|█████████▏| 707/774 [03:07<00:14,  4.74it/s][A
+ 91%|█████████▏| 708/774 [03:07<00:13,  5.01it/s][A
+ 92%|█████████▏| 709/774 [03:08<00:13,  4.89it/s][A
+ 92%|█████████▏| 710/774 [03:08<00:13,  4.78it/s][A
+ 92%|█████████▏| 711/774 [03:08<00:12,  4.95it/s][A
+ 92%|█████████▏| 712/774 [03:08<00:11,  5.19it/s][A
+ 92%|█████████▏| 713/774 [03:08<00:12,  5.02it/s][A
+ 92%|█████████▏| 714/774 [03:09<00:12,  4.72it/s][A
+ 92%|█████████▏| 715/774 [03:09<00:12,  4.88it/s][A
+ 93%|█████████▎| 716/774 [03:09<00:10,  5.37it/s][A
+ 93%|█████████▎| 717/774 [03:09<00:10,  5.43it/s][A
+ 93%|█████████▎| 718/774 [03:09<00:11,  4.85it/s][A
+ 93%|█████████▎| 719/774 [03:10<00:11,  4.71it/s][A
+ 93%|█████████▎| 720/774 [03:10<00:10,  5.04it/s][A
+ 93%|█████████▎| 721/774 [03:10<00:09,  5.37it/s][A
+ 93%|█████████▎| 722/774 [03:10<00:09,  5.76it/s][A
+ 93%|█████████▎| 723/774 [03:10<00:09,  5.53it/s][A
+ 94%|█████████▎| 724/774 [03:10<00:09,  5.47it/s][A
+ 94%|█████████▎| 725/774 [03:11<00:08,  5.61it/s][A
+ 94%|█████████▍| 726/774 [03:11<00:08,  5.66it/s][A
+ 94%|█████████▍| 727/774 [03:11<00:08,  5.44it/s][A
+ 94%|█████████▍| 728/774 [03:11<00:09,  4.89it/s][A
+ 94%|█████████▍| 729/774 [03:11<00:08,  5.19it/s][A
+ 94%|█████████▍| 730/774 [03:12<00:08,  5.48it/s][A
+ 94%|█████████▍| 731/774 [03:12<00:07,  5.48it/s][A
+ 95%|█████████▍| 732/774 [03:12<00:07,  5.63it/s][A
+ 95%|█████████▍| 733/774 [03:12<00:07,  5.62it/s][A
+ 95%|█████████▍| 734/774 [03:12<00:06,  5.76it/s][A
+ 95%|█████████▍| 735/774 [03:12<00:06,  5.87it/s][A
+ 95%|█████████▌| 736/774 [03:13<00:06,  5.94it/s][A
+ 95%|█████████▌| 737/774 [03:13<00:06,  5.85it/s][A
+ 95%|█████████▌| 738/774 [03:13<00:06,  5.64it/s][A
+ 95%|█████████▌| 739/774 [03:13<00:06,  5.59it/s][A
+ 96%|█████████▌| 740/774 [03:13<00:06,  5.49it/s][A
+ 96%|█████████▌| 741/774 [03:14<00:06,  5.16it/s][A
+ 96%|█████████▌| 742/774 [03:14<00:05,  5.36it/s][A
+ 96%|█████████▌| 743/774 [03:14<00:05,  5.68it/s][A
+ 96%|█████████▌| 744/774 [03:14<00:05,  5.47it/s][A
+ 96%|█████████▋| 745/774 [03:14<00:06,  4.56it/s][A
+ 96%|█████████▋| 746/774 [03:15<00:07,  3.96it/s][A
+ 97%|█████████▋| 747/774 [03:15<00:06,  4.16it/s][A
+ 97%|█████████▋| 748/774 [03:15<00:05,  4.36it/s][A
+ 97%|█████████▋| 749/774 [03:15<00:05,  4.66it/s][A
+ 97%|█████████▋| 750/774 [03:16<00:05,  4.32it/s][A
+ 97%|█████████▋| 751/774 [03:16<00:05,  4.57it/s][A
+ 97%|█████████▋| 752/774 [03:16<00:04,  4.50it/s][A
+ 97%|█████████▋| 753/774 [03:16<00:04,  4.79it/s][A
+ 97%|█████████▋| 754/774 [03:16<00:03,  5.42it/s][A
+ 98%|█████████▊| 755/774 [03:16<00:03,  5.71it/s][A
+ 98%|█████████▊| 756/774 [03:17<00:03,  5.56it/s][A
+ 98%|█████████▊| 757/774 [03:17<00:03,  5.37it/s][A
+ 98%|█████████▊| 758/774 [03:17<00:03,  5.28it/s][A
+ 98%|█████████▊| 759/774 [03:17<00:02,  5.54it/s][A
+ 98%|█████████▊| 760/774 [03:17<00:02,  5.52it/s][A
+ 98%|█████████▊| 761/774 [03:18<00:02,  6.00it/s][A
+ 98%|█████████▊| 762/774 [03:18<00:01,  6.10it/s][A
+ 99%|█████████▊| 763/774 [03:18<00:01,  6.28it/s][A
+ 99%|█████████▊| 764/774 [03:18<00:01,  6.39it/s][A
+ 99%|█████████▉| 765/774 [03:18<00:01,  6.35it/s][A
+ 99%|█████████▉| 766/774 [03:18<00:01,  5.43it/s][A
+ 99%|█████████▉| 767/774 [03:19<00:01,  5.60it/s][A
+ 99%|█████████▉| 768/774 [03:19<00:01,  5.58it/s][A
+ 99%|█████████▉| 769/774 [03:19<00:00,  5.28it/s][A
+ 99%|█████████▉| 770/774 [03:19<00:00,  5.14it/s][A
+100%|█████████▉| 771/774 [03:19<00:00,  5.47it/s][A
+100%|█████████▉| 772/774 [03:20<00:00,  5.17it/s][A
+100%|█████████▉| 773/774 [03:20<00:00,  4.99it/s][A                                                      
+                                                 [A 16%|█▌        | 2000/12776 [21:33<1:07:12,  2.67it/s]
+100%|██████████| 774/774 [03:22<00:00,  4.99it/s][A
+                                                 [ASaving model checkpoint to ./checkpoint-2000
+Configuration saved in ./checkpoint-2000/config.json
+Model weights saved in ./checkpoint-2000/model.safetensors
+Feature extractor saved in ./checkpoint-2000/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json
+Special tokens file saved in ./checkpoint-2000/special_tokens_map.json
+added tokens file saved in ./checkpoint-2000/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-800] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 16%|█▌        | 2001/12776 [21:39<189:10:37, 63.21s/it]                                                         16%|█▌        | 2001/12776 [21:39<189:10:37, 63.21s/it] 16%|█▌        | 2002/12776 [21:40<133:19:03, 44.55s/it]                                                         16%|█▌        | 2002/12776 [21:40<133:19:03, 44.55s/it] 16%|█▌        | 2003/12776 [21:41<94:07:45, 31.46s/it]                                                         16%|█▌        | 2003/12776 [21:41<94:07:45, 31.46s/it] 16%|█▌        | 2004/12776 [21:42<66:37:13, 22.26s/it]                                                        16%|█▌        | 2004/12776 [21:42<66:37:13, 22.26s/it] 16%|█▌        | 2005/12776 [21:43<47:14:31, 15.79s/it]                                                        16%|█▌        | 2005/12776 [21:43<47:14:31, 15.79s/it] 16%|█▌        | 2006/12776 [21:43<33:44:44, 11.28s/it]                                                        16%|█▌        | 2006/12776 [21:43<33:44:44, 11.28s/it] 16%|█▌        | 2007/12776 [21:44<24:09:59,  8.08s/it]                                                        16%|█▌        | 2007/12776 [21:44<24:09:59,  8.08s/it] 16%|█▌        | 2008/12776 [21:45<17:28:25,  5.84s/it]                                                        16%|█▌        | 2008/12776 [21:45<17:28:25,  5.84s/it] 16%|█▌        | 2009/12776 [21:45<12:42:42,  4.25s/it]                                                        16%|█▌        | 2009/12776 [21:45<12:42:42,  4.25s/it] 16%|█▌        | 2010/12776 [21:46<9:23:26,  3.14s/it]                                                        16%|█▌        | 2010/12776 [21:46<9:23:26,  3.14s/it] 16%|█▌        | 2011/12776 [21:46<7:00:40,  2.34s/it]                                                       16%|█▌        | 2011/12776 [21:46<7:00:40,  2.34s/it] 16%|█▌        | 2012/12776 [21:47<5:25:53,  1.82s/it]                                                       16%|█▌        | 2012/12776 [21:47<5:25:53,  1.82s/it] 16%|█▌        | 2013/12776 [21:47<4:10:55,  1.40s/it]                                                       16%|█▌        | 2013/12776 [21:47<4:10:55,  1.40s/it] 16%|█▌        | 2014/12776 [21:48<3:24:25,  1.14s/it]                                                       16%|█▌        | 2014/12776 [21:48<3:24:25,  1.14s/it] 16%|█▌        | 2015/12776 [21:48<2:44:39,  1.09it/s]                                                       16%|█▌        | 2015/12776 [21:48<2:44:39,  1.09it/s] 16%|█▌        | 2016/12776 [21:49<2:15:38,  1.32it/s]                                                       16%|█▌        | 2016/12776 [21:49<2:15:38,  1.32it/s] 16%|█▌        | 2017/12776 [21:49<1:56:50,  1.53it/s]                                                       16%|█▌        | 2017/12776 [21:49<1:56:50,  1.53it/s] 16%|█▌        | 2018/12776 [21:49<1:40:57,  1.78it/s]                                                       16%|█▌        | 2018/12776 [21:49<1:40:57,  1.78it/s] 16%|█▌        | 2019/12776 [21:50<1:29:31,  2.00it/s]                                                       16%|█▌        | 2019/12776 [21:50<1:29:31,  2.00it/s] 16%|█▌        | 2020/12776 [21:50<1:24:20,  2.13it/s]                                                       16%|█▌        | 2020/12776 [21:50<1:24:20,  2.13it/s] 16%|█▌        | 2021/12776 [21:50<1:16:16,  2.35it/s]                                                       16%|█▌        | 2021/12776 [21:50<1:16:16,  2.35it/s] 16%|█▌        | 2022/12776 [21:51<1:10:23,  2.55it/s]                                                       16%|█▌        | 2022/12776 [21:51<1:10:23,  2.55it/s] 16%|█▌        | 2023/12776 [21:51<1:05:45,  2.73it/s]                                                       16%|█▌        | 2023/12776 [21:51<1:05:45,  2.73it/s] 16%|█▌        | 2024/12776 [21:51<1:07:23,  2.66it/s]                                                       16%|█▌        | 2024/12776 [21:51<1:07:23,  2.66it/s] 16%|█▌        | 2025/12776 [21:52<1:03:16,  2.83it/s]                                                       16%|█▌        | 2025/12776 [21:52<1:03:16,  2.83it/s] 16%|█▌        | 2026/12776 [21:52<59:32,  3.01it/s]                                                       16%|█▌        | 2026/12776 [21:52<59:32,  3.01it/s] 16%|█▌        | 2027/12776 [21:52<1:03:26,  2.82it/s]                                                       16%|█▌        | 2027/12776 [21:52<1:03:26,  2.82it/s] 16%|█▌        | 2028/12776 [21:53<58:40,  3.05it/s]                                                       16%|█▌        | 2028/12776 [21:53<58:40,  3.05it/s] 16%|█▌        | 2029/12776 [21:53<54:49,  3.27it/s]                                                     16%|█▌        | 2029/12776 [21:53<54:49,  3.27it/s] 16%|█▌        | 2030/12776 [21:53<51:34,  3.47it/s]                                                     16%|█▌        | 2030/12776 [21:53<51:34,  3.47it/s] 16%|█▌        | 2031/12776 [21:54<55:54,  3.20it/s]                                                     16%|█▌        | 2031/12776 [21:54<55:54,  3.20it/s] 16%|█▌        | 2032/12776 [21:54<51:20,  3.49it/s]                                                     16%|█▌        | 2032/12776 [21:54<51:20,  3.49it/s] 16%|█▌        | 2033/12776 [21:54<47:50,  3.74it/s]                                                     16%|█▌        | 2033/12776 [21:54<47:50,  3.74it/s] 16%|█▌        | 2034/12776 [21:54<45:07,  3.97it/s]                                                     16%|█▌        | 2034/12776 [21:54<45:07,  3.97it/s] 16%|█▌        | 2035/12776 [21:54<42:54,  4.17it/s]                                                     16%|█▌        | 2035/12776 [21:54<42:54,  4.17it/s] 16%|█▌        | 2036/12776 [21:55<44:29,  4.02it/s]                                                     16%|█▌        | 2036/12776 [21:55<44:29,  4.02it/s] 16%|█▌        | 2037/12776 [21:55<41:43,  4.29it/s]                                                     16%|█▌        | 2037/12776 [21:55<41:43,  4.29it/s] 16%|█▌        | 2038/12776 [21:55<39:22,  4.55it/s]                                                     16%|█▌        | 2038/12776 [21:55<39:22,  4.55it/s] 16%|█▌        | 2039/12776 [21:55<37:33,  4.76it/s]                                                     16%|█▌        | 2039/12776 [21:55<37:33,  4.76it/s] 16%|█▌        | 2040/12776 [21:56<36:18,  4.93it/s]                                                     16%|█▌        | 2040/12776 [21:56<36:18,  4.93it/s] 16%|█▌        | 2041/12776 [21:56<35:05,  5.10it/s]                                                     16%|█▌        | 2041/12776 [21:56<35:05,  5.10it/s] 16%|█▌        | 2042/12776 [21:56<39:45,  4.50it/s]                                                     16%|█▌        | 2042/12776 [21:56<39:45,  4.50it/s] 16%|█▌        | 2043/12776 [21:56<37:10,  4.81it/s]                                                     16%|█▌        | 2043/12776 [21:56<37:10,  4.81it/s] 16%|█▌        | 2044/12776 [21:56<35:20,  5.06it/s]                                                     16%|█▌        | 2044/12776 [21:56<35:20,  5.06it/s] 16%|█▌        | 2045/12776 [21:57<34:10,  5.23it/s]                                                     16%|█▌        | 2045/12776 [21:57<34:10,  5.23it/s] 16%|█▌        | 2046/12776 [21:57<32:56,  5.43it/s]                                                     16%|█▌        | 2046/12776 [21:57<32:56,  5.43it/s] 16%|█▌        | 2047/12776 [21:57<31:57,  5.59it/s]                                                     16%|█▌        | 2047/12776 [21:57<31:57,  5.59it/s] 16%|█▌        | 2048/12776 [21:57<35:51,  4.99it/s]                                                     16%|█▌        | 2048/12776 [21:57<35:51,  4.99it/s] 16%|█▌        | 2049/12776 [21:57<33:41,  5.31it/s]                                                     16%|█▌        | 2049/12776 [21:57<33:41,  5.31it/s] 16%|█▌        | 2050/12776 [21:58<59:06,  3.02it/s]                                                     16%|█▌        | 2050/12776 [21:58<59:06,  3.02it/s] 16%|█▌        | 2051/12776 [21:59<1:55:38,  1.55it/s]                                                       16%|█▌        | 2051/12776 [21:59<1:55:38,  1.55it/s] 16%|█▌        | 2052/12776 [22:00<2:10:20,  1.37it/s]                                                       16%|█▌        | 2052/12776 [22:00<2:10:20,  1.37it/s] 16%|█▌        | 2053/12776 [22:01<2:16:11,  1.31it/s]                                                       16%|█▌        | 2053/12776 [22:01<2:16:11,  1.31it/s] 16%|█▌        | 2054/12776 [22:02<2:18:57,  1.29it/s]                                                       16%|█▌        | 2054/12776 [22:02<2:18:57,  1.29it/s] 16%|█▌        | 2055/12776 [22:03<2:21:44,  1.26it/s]                                                       16%|█▌        | 2055/12776 [22:03<2:21:44,  1.26it/s] 16%|█▌        | 2056/12776 [22:03<2:15:38,  1.32it/s]                                                       16%|█▌        | 2056/12776 [22:03<2:15:38,  1.32it/s] 16%|█▌        | 2057/12776 [22:04<2:13:19,  1.34it/s]                                                       16%|█▌        | 2057/12776 [22:04<2:13:19,  1.34it/s] 16%|█▌        | 2058/12776 [22:05<2:06:25,  1.41it/s]                                                       16%|█▌        | 2058/12776 [22:05<2:06:25,  1.41it/s] 16%|█▌        | 2059/12776 [22:05<1:59:39,  1.49it/s]                                                       16%|█▌        | 2059/12776 [22:05<1:59:39,  1.49it/s] 16%|█▌        | 2060/12776 [22:06<1:53:27,  1.57it/s]                                                       16%|█▌        | 2060/12776 [22:06<1:53:27,  1.57it/s] 16%|█▌        | 2061/12776 [22:06<1:48:06,  1.65it/s]                                                       16%|█▌        | 2061/12776 [22:06<1:48:06,  1.65it/s] 16%|█▌        | 2062/12776 [22:07<1:43:14,  1.73it/s]                                                       16%|█▌        | 2062/12776 [22:07<1:43:14,  1.73it/s] 16%|█▌        | 2063/12776 [22:07<1:42:56,  1.73it/s]                                                       16%|█▌        | 2063/12776 [22:07<1:42:56,  1.73it/s] 16%|█▌        | 2064/12776 [22:08<1:36:53,  1.84it/s]                                                       16%|█▌        | 2064/12776 [22:08<1:36:53,  1.84it/s] 16%|█▌        | 2065/12776 [22:08<1:30:29,  1.97it/s]                                                       16%|█▌        | 2065/12776 [22:08<1:30:29,  1.97it/s] 16%|█▌        | 2066/12776 [22:09<1:24:21,  2.12it/s]                                                       16%|█▌        | 2066/12776 [22:09<1:24:21,  2.12it/s] 16%|█▌        | 2067/12776 [22:09<1:19:00,  2.26it/s]                                                       16%|█▌        | 2067/12776 [22:09<1:19:00,  2.26it/s] 16%|█▌        | 2068/12776 [22:10<1:18:08,  2.28it/s]                                                       16%|█▌        | 2068/12776 [22:10<1:18:08,  2.28it/s] 16%|█▌        | 2069/12776 [22:10<1:13:20,  2.43it/s]                                                       16%|█▌        | 2069/12776 [22:10<1:13:20,  2.43it/s] 16%|█▌        | 2070/12776 [22:10<1:09:04,  2.58it/s]                                                       16%|█▌        | 2070/12776 [22:10<1:09:04,  2.58it/s] 16%|█▌        | 2071/12776 [22:11<1:05:18,  2.73it/s]                                                       16%|█▌        | 2071/12776 [22:11<1:05:18,  2.73it/s] 16%|█▌        | 2072/12776 [22:11<1:04:34,  2.76it/s]                                                       16%|█▌        | 2072/12776 [22:11<1:04:34,  2.76it/s] 16%|█▌        | 2073/12776 [22:11<1:01:28,  2.90it/s]                                                       16%|█▌        | 2073/12776 [22:11<1:01:28,  2.90it/s] 16%|█▌        | 2074/12776 [22:11<58:37,  3.04it/s]                                                       16%|█▌        | 2074/12776 [22:11<58:37,  3.04it/s] 16%|█▌        | 2075/12776 [22:12<59:56,  2.98it/s]                                                     16%|█▌        | 2075/12776 [22:12<59:56,  2.98it/s] 16%|█▌        | 2076/12776 [22:12<56:19,  3.17it/s]                                                     16%|█▌        | 2076/12776 [22:12<56:19,  3.17it/s] 16%|█▋        | 2077/12776 [22:12<53:08,  3.36it/s]                                                    {'eval_loss': 0.7156243920326233, 'eval_wer': 0.4355993794759547, 'eval_runtime': 203.0672, 'eval_samples_per_second': 60.98, 'eval_steps_per_second': 3.812, 'epoch': 0.31}
+{'loss': 0.5763, 'grad_norm': 0.7377122044563293, 'learning_rate': 0.0002634652981427175, 'epoch': 0.31}
+{'loss': 0.4244, 'grad_norm': 0.6151639819145203, 'learning_rate': 0.00026344086021505374, 'epoch': 0.31}
+{'loss': 0.4682, 'grad_norm': 0.7877262830734253, 'learning_rate': 0.00026341642228739, 'epoch': 0.31}
+{'loss': 0.4186, 'grad_norm': 0.6403424143791199, 'learning_rate': 0.0002633919843597263, 'epoch': 0.31}
+{'loss': 0.5447, 'grad_norm': 0.7786056995391846, 'learning_rate': 0.00026336754643206255, 'epoch': 0.31}
+{'loss': 0.4301, 'grad_norm': 1.1472690105438232, 'learning_rate': 0.0002633431085043988, 'epoch': 0.31}
+{'loss': 0.3747, 'grad_norm': 0.6084722280502319, 'learning_rate': 0.0002633186705767351, 'epoch': 0.31}
+{'loss': 0.6128, 'grad_norm': 0.780753493309021, 'learning_rate': 0.0002632942326490713, 'epoch': 0.31}
+{'loss': 0.5328, 'grad_norm': 0.6669083833694458, 'learning_rate': 0.0002632697947214076, 'epoch': 0.31}
+{'loss': 0.457, 'grad_norm': 1.1285808086395264, 'learning_rate': 0.00026324535679374386, 'epoch': 0.31}
+{'loss': 0.5403, 'grad_norm': 0.7840350270271301, 'learning_rate': 0.0002632209188660801, 'epoch': 0.31}
+{'loss': 0.5125, 'grad_norm': 0.7977207899093628, 'learning_rate': 0.0002631964809384164, 'epoch': 0.31}
+{'loss': 0.5996, 'grad_norm': 3.0473263263702393, 'learning_rate': 0.00026317204301075267, 'epoch': 0.32}
+{'loss': 0.5371, 'grad_norm': 1.2086435556411743, 'learning_rate': 0.0002631476050830889, 'epoch': 0.32}
+{'loss': 0.5623, 'grad_norm': 1.242156744003296, 'learning_rate': 0.00026312316715542517, 'epoch': 0.32}
+{'loss': 0.7507, 'grad_norm': 1.3142002820968628, 'learning_rate': 0.0002630987292277615, 'epoch': 0.32}
+{'loss': 0.5674, 'grad_norm': 1.0907604694366455, 'learning_rate': 0.0002630742913000977, 'epoch': 0.32}
+{'loss': 0.9088, 'grad_norm': 2.107612133026123, 'learning_rate': 0.000263049853372434, 'epoch': 0.32}
+{'loss': 0.9719, 'grad_norm': 2.506220817565918, 'learning_rate': 0.0002630254154447703, 'epoch': 0.32}
+{'loss': 0.7515, 'grad_norm': 1.3462073802947998, 'learning_rate': 0.00026300097751710654, 'epoch': 0.32}
+{'loss': 0.7944, 'grad_norm': 2.0875937938690186, 'learning_rate': 0.0002629765395894428, 'epoch': 0.32}
+{'loss': 1.2749, 'grad_norm': 2.3353264331817627, 'learning_rate': 0.0002629521016617791, 'epoch': 0.32}
+{'loss': 1.3145, 'grad_norm': 2.3827357292175293, 'learning_rate': 0.0002629276637341153, 'epoch': 0.32}
+{'loss': 0.6288, 'grad_norm': 1.2165812253952026, 'learning_rate': 0.0002629032258064516, 'epoch': 0.32}
+{'loss': 0.9532, 'grad_norm': 2.692824363708496, 'learning_rate': 0.00026287878787878785, 'epoch': 0.32}
+{'loss': 0.6626, 'grad_norm': 1.6349315643310547, 'learning_rate': 0.0002628543499511241, 'epoch': 0.32}
+{'loss': 0.6427, 'grad_norm': 1.819525957107544, 'learning_rate': 0.0002628299120234604, 'epoch': 0.32}
+{'loss': 0.7039, 'grad_norm': 1.595823049545288, 'learning_rate': 0.00026280547409579665, 'epoch': 0.32}
+{'loss': 1.2991, 'grad_norm': 1.4594745635986328, 'learning_rate': 0.0002627810361681329, 'epoch': 0.32}
+{'loss': 0.9092, 'grad_norm': 2.08130145072937, 'learning_rate': 0.0002627565982404692, 'epoch': 0.32}
+{'loss': 0.8779, 'grad_norm': 1.4197502136230469, 'learning_rate': 0.00026273216031280546, 'epoch': 0.32}
+{'loss': 0.8164, 'grad_norm': 1.6842551231384277, 'learning_rate': 0.0002627077223851417, 'epoch': 0.32}
+{'loss': 1.2897, 'grad_norm': 2.0086166858673096, 'learning_rate': 0.00026268328445747796, 'epoch': 0.32}
+{'loss': 0.9483, 'grad_norm': 1.6585288047790527, 'learning_rate': 0.00026265884652981427, 'epoch': 0.32}
+{'loss': 0.9985, 'grad_norm': 2.2773189544677734, 'learning_rate': 0.0002626344086021505, 'epoch': 0.32}
+{'loss': 0.8284, 'grad_norm': 1.6106942892074585, 'learning_rate': 0.00026260997067448677, 'epoch': 0.32}
+{'loss': 1.1095, 'grad_norm': 4.228276252746582, 'learning_rate': 0.0002625855327468231, 'epoch': 0.32}
+{'loss': 1.3409, 'grad_norm': 2.243229866027832, 'learning_rate': 0.0002625610948191593, 'epoch': 0.32}
+{'loss': 1.0103, 'grad_norm': 1.684822678565979, 'learning_rate': 0.0002625366568914956, 'epoch': 0.32}
+{'loss': 1.5314, 'grad_norm': 2.6656551361083984, 'learning_rate': 0.00026251221896383183, 'epoch': 0.32}
+{'loss': 1.4516, 'grad_norm': 2.3108103275299072, 'learning_rate': 0.0002624877810361681, 'epoch': 0.32}
+{'loss': 1.5163, 'grad_norm': 3.1501247882843018, 'learning_rate': 0.0002624633431085044, 'epoch': 0.32}
+{'loss': 1.4617, 'grad_norm': 3.26369571685791, 'learning_rate': 0.00026243890518084064, 'epoch': 0.32}
+{'loss': 1.5977, 'grad_norm': 2.4706554412841797, 'learning_rate': 0.0002624144672531769, 'epoch': 0.32}
+{'loss': 1.0742, 'grad_norm': 2.1074883937835693, 'learning_rate': 0.0002623900293255132, 'epoch': 0.32}
+{'loss': 0.9828, 'grad_norm': 1.818274974822998, 'learning_rate': 0.00026236559139784945, 'epoch': 0.32}
+{'loss': 1.3465, 'grad_norm': 4.391941070556641, 'learning_rate': 0.0002623411534701857, 'epoch': 0.32}
+{'loss': 1.4826, 'grad_norm': 2.8843443393707275, 'learning_rate': 0.00026231671554252195, 'epoch': 0.32}
+{'loss': 0.8778, 'grad_norm': 1.6411322355270386, 'learning_rate': 0.00026229227761485825, 'epoch': 0.32}
+{'loss': 1.4236, 'grad_norm': 1.7798197269439697, 'learning_rate': 0.0002622678396871945, 'epoch': 0.32}
+{'loss': 0.3424, 'grad_norm': 0.6216705441474915, 'learning_rate': 0.00026224340175953076, 'epoch': 0.32}
+{'loss': 0.3667, 'grad_norm': 0.7121636271476746, 'learning_rate': 0.00026221896383186706, 'epoch': 0.32}
+{'loss': 0.5881, 'grad_norm': 1.5780848264694214, 'learning_rate': 0.0002621945259042033, 'epoch': 0.32}
+{'loss': 0.6034, 'grad_norm': 1.3239952325820923, 'learning_rate': 0.00026217008797653957, 'epoch': 0.32}
+{'loss': 0.4193, 'grad_norm': 0.8389119505882263, 'learning_rate': 0.00026214565004887587, 'epoch': 0.32}
+{'loss': 0.4454, 'grad_norm': 1.1425360441207886, 'learning_rate': 0.00026212121212121207, 'epoch': 0.32}
+{'loss': 0.4994, 'grad_norm': 0.9988243579864502, 'learning_rate': 0.0002620967741935484, 'epoch': 0.32}
+{'loss': 0.4687, 'grad_norm': 0.9738520383834839, 'learning_rate': 0.0002620723362658846, 'epoch': 0.32}
+{'loss': 0.5803, 'grad_norm': 0.7718535661697388, 'learning_rate': 0.0002620478983382209, 'epoch': 0.32}
+{'loss': 0.5174, 'grad_norm': 1.0445855855941772, 'learning_rate': 0.0002620234604105572, 'epoch': 0.32}
+{'loss': 0.5589, 'grad_norm': 1.0972551107406616, 'learning_rate': 0.00026199902248289343, 'epoch': 0.32}
+{'loss': 0.6339, 'grad_norm': 0.9006467461585999, 'learning_rate': 0.0002619745845552297, 'epoch': 0.32}
+{'loss': 0.6673, 'grad_norm': 1.221784234046936, 'learning_rate': 0.00026195014662756594, 'epoch': 0.32}
+{'loss': 0.5886, 'grad_norm': 0.9309597611427307, 'learning_rate': 0.00026192570869990224, 'epoch': 0.32}
+{'loss': 0.8406, 'grad_norm': 1.0560715198516846, 'learning_rate': 0.0002619012707722385, 'epoch': 0.32}
+{'loss': 0.4253, 'grad_norm': 1.26978600025177, 'learning_rate': 0.00026187683284457474, 'epoch': 0.32}
+{'loss': 0.6387, 'grad_norm': 1.005332350730896, 'learning_rate': 0.00026185239491691105, 'epoch': 0.32}
+{'loss': 0.7234, 'grad_norm': 2.363250970840454, 'learning_rate': 0.0002618279569892473, 'epoch': 0.32}
+{'loss': 0.6617, 'grad_norm': 1.3625165224075317, 'learning_rate': 0.00026180351906158355, 'epoch': 0.32}
+{'loss': 0.7068, 'grad_norm': 1.591560959815979, 'learning_rate': 0.00026177908113391986, 'epoch': 0.32}
+{'loss': 0.7811, 'grad_norm': 1.429697036743164, 'learning_rate': 0.00026175464320625605, 'epoch': 0.32}
+{'loss': 0.885, 'grad_norm': 2.3727924823760986, 'learning_rate': 0.00026173020527859236, 'epoch': 0.32}
+{'loss': 0.8557, 'grad_norm': 1.7005720138549805, 'learning_rate': 0.0002617057673509286, 'epoch': 0.32}
+{'loss': 0.8502, 'grad_norm': 1.7432682514190674, 'learning_rate': 0.00026168132942326486, 'epoch': 0.32}
+{'loss': 1.4696, 'grad_norm': 2.942030906677246, 'learning_rate': 0.00026165689149560117, 'epoch': 0.32}
+{'loss': 0.5626, 'grad_norm': 1.5261107683181763, 'learning_rate': 0.0002616324535679374, 'epoch': 0.32}
+ 16%|█▋        | 2077/12776 [22:12<53:08,  3.36it/s] 16%|█▋        | 2078/12776 [22:13<50:29,  3.53it/s]                                                     16%|█▋        | 2078/12776 [22:13<50:29,  3.53it/s] 16%|█▋        | 2079/12776 [22:13<54:19,  3.28it/s]                                                     16%|█▋        | 2079/12776 [22:13<54:19,  3.28it/s] 16%|█▋        | 2080/12776 [22:13<50:33,  3.53it/s]                                                     16%|█▋        | 2080/12776 [22:13<50:33,  3.53it/s] 16%|█▋        | 2081/12776 [22:13<47:17,  3.77it/s]                                                     16%|█▋        | 2081/12776 [22:13<47:17,  3.77it/s] 16%|█▋        | 2082/12776 [22:14<44:45,  3.98it/s]                                                     16%|█▋        | 2082/12776 [22:14<44:45,  3.98it/s] 16%|█▋        | 2083/12776 [22:14<42:45,  4.17it/s]                                                     16%|█▋        | 2083/12776 [22:14<42:45,  4.17it/s] 16%|█▋        | 2084/12776 [22:14<44:13,  4.03it/s]                                                     16%|█▋        | 2084/12776 [22:14<44:13,  4.03it/s] 16%|█▋        | 2085/12776 [22:14<41:45,  4.27it/s]                                                     16%|█▋        | 2085/12776 [22:14<41:45,  4.27it/s] 16%|█▋        | 2086/12776 [22:15<40:07,  4.44it/s]                                                     16%|█▋        | 2086/12776 [22:15<40:07,  4.44it/s] 16%|█▋        | 2087/12776 [22:15<38:16,  4.65it/s]                                                     16%|█▋        | 2087/12776 [22:15<38:16,  4.65it/s] 16%|█▋        | 2088/12776 [22:15<36:51,  4.83it/s]                                                     16%|█▋        | 2088/12776 [22:15<36:51,  4.83it/s] 16%|█▋        | 2089/12776 [22:15<35:38,  5.00it/s]                                                     16%|█▋        | 2089/12776 [22:15<35:38,  5.00it/s] 16%|█▋        | 2090/12776 [22:15<41:06,  4.33it/s]                                                     16%|█▋        | 2090/12776 [22:15<41:06,  4.33it/s] 16%|█▋        | 2091/12776 [22:16<38:17,  4.65it/s]                                                     16%|█▋        | 2091/12776 [22:16<38:17,  4.65it/s] 16%|█▋        | 2092/12776 [22:16<36:49,  4.83it/s]                                                     16%|█▋        | 2092/12776 [22:16<36:49,  4.83it/s] 16%|█▋        | 2093/12776 [22:16<36:07,  4.93it/s]                                                     16%|█▋        | 2093/12776 [22:16<36:07,  4.93it/s] 16%|█▋        | 2094/12776 [22:16<35:30,  5.01it/s]                                                     16%|█▋        | 2094/12776 [22:16<35:30,  5.01it/s] 16%|█▋        | 2095/12776 [22:16<38:41,  4.60it/s]                                                     16%|█▋        | 2095/12776 [22:16<38:41,  4.60it/s] 16%|█▋        | 2096/12776 [22:17<37:02,  4.80it/s]                                                     16%|█▋        | 2096/12776 [22:17<37:02,  4.80it/s] 16%|█▋        | 2097/12776 [22:17<35:49,  4.97it/s]                                                     16%|█▋        | 2097/12776 [22:17<35:49,  4.97it/s] 16%|█▋        | 2098/12776 [22:17<34:39,  5.14it/s]                                                     16%|█▋        | 2098/12776 [22:17<34:39,  5.14it/s] 16%|█▋        | 2099/12776 [22:17<33:42,  5.28it/s]                                                     16%|█▋        | 2099/12776 [22:17<33:42,  5.28it/s] 16%|█▋        | 2100/12776 [22:18<58:52,  3.02it/s]                                                     16%|█▋        | 2100/12776 [22:18<58:52,  3.02it/s] 16%|█▋        | 2101/12776 [22:19<1:56:52,  1.52it/s]                                                       16%|█▋        | 2101/12776 [22:19<1:56:52,  1.52it/s] 16%|█▋        | 2102/12776 [22:20<2:15:32,  1.31it/s]                                                       16%|█▋        | 2102/12776 [22:20<2:15:32,  1.31it/s] 16%|█▋        | 2103/12776 [22:21<2:19:08,  1.28it/s]                                                       16%|█▋        | 2103/12776 [22:21<2:19:08,  1.28it/s] 16%|█▋        | 2104/12776 [22:22<2:16:12,  1.31it/s]                                                       16%|█▋        | 2104/12776 [22:22<2:16:12,  1.31it/s] 16%|█▋        | 2105/12776 [22:23<2:16:17,  1.30it/s]                                                       16%|█▋        | 2105/12776 [22:23<2:16:17,  1.30it/s] 16%|█▋        | 2106/12776 [22:23<2:12:57,  1.34it/s]                                                       16%|█▋        | 2106/12776 [22:23<2:12:57,  1.34it/s] 16%|█▋        | 2107/12776 [22:24<2:06:06,  1.41it/s]                                                       16%|█▋        | 2107/12776 [22:24<2:06:06,  1.41it/s] 16%|█▋        | 2108/12776 [22:24<1:58:52,  1.50it/s]                                                       16%|█▋        | 2108/12776 [22:24<1:58:52,  1.50it/s] 17%|█▋        | 2109/12776 [22:25<1:52:20,  1.58it/s]                                                       17%|█▋        | 2109/12776 [22:25<1:52:20,  1.58it/s] 17%|█▋        | 2110/12776 [22:26<1:49:54,  1.62it/s]                                                       17%|█▋        | 2110/12776 [22:26<1:49:54,  1.62it/s] 17%|█▋        | 2111/12776 [22:26<1:43:28,  1.72it/s]                                                       17%|█▋        | 2111/12776 [22:26<1:43:28,  1.72it/s] 17%|█▋        | 2112/12776 [22:27<1:38:00,  1.81it/s]                                                       17%|█▋        | 2112/12776 [22:27<1:38:00,  1.81it/s] 17%|█▋        | 2113/12776 [22:27<1:33:59,  1.89it/s]                                                       17%|█▋        | 2113/12776 [22:27<1:33:59,  1.89it/s] 17%|█▋        | 2114/12776 [22:27<1:28:53,  2.00it/s]                                                       17%|█▋        | 2114/12776 [22:27<1:28:53,  2.00it/s] 17%|█▋        | 2115/12776 [22:28<1:26:20,  2.06it/s]                                                       17%|█▋        | 2115/12776 [22:28<1:26:20,  2.06it/s] 17%|█▋        | 2116/12776 [22:28<1:21:38,  2.18it/s]                                                       17%|█▋        | 2116/12776 [22:28<1:21:38,  2.18it/s] 17%|█▋        | 2117/12776 [22:29<1:17:48,  2.28it/s]                                                       17%|█▋        | 2117/12776 [22:29<1:17:48,  2.28it/s] 17%|█▋        | 2118/12776 [22:29<1:20:47,  2.20it/s]                                                       17%|█▋        | 2118/12776 [22:29<1:20:47,  2.20it/s] 17%|█▋        | 2119/12776 [22:30<1:15:32,  2.35it/s]                                                       17%|█▋        | 2119/12776 [22:30<1:15:32,  2.35it/s] 17%|█▋        | 2120/12776 [22:30<1:11:45,  2.47it/s]                                                       17%|█▋        | 2120/12776 [22:30<1:11:45,  2.47it/s] 17%|█▋        | 2121/12776 [22:30<1:12:05,  2.46it/s]                                                       17%|█▋        | 2121/12776 [22:30<1:12:05,  2.46it/s] 17%|█▋        | 2122/12776 [22:31<1:08:03,  2.61it/s]                                                       17%|█▋        | 2122/12776 [22:31<1:08:03,  2.61it/s] 17%|█▋        | 2123/12776 [22:31<1:04:45,  2.74it/s]                                                       17%|█▋        | 2123/12776 [22:31<1:04:45,  2.74it/s] 17%|█▋        | 2124/12776 [22:31<1:03:15,  2.81it/s]                                                       17%|█▋        | 2124/12776 [22:31<1:03:15,  2.81it/s] 17%|█▋        | 2125/12776 [22:32<1:00:03,  2.96it/s]                                                       17%|█▋        | 2125/12776 [22:32<1:00:03,  2.96it/s] 17%|█▋        | 2126/12776 [22:32<57:28,  3.09it/s]                                                       17%|█▋        | 2126/12776 [22:32<57:28,  3.09it/s] 17%|█▋        | 2127/12776 [22:32<55:19,  3.21it/s]                                                     17%|█▋        | 2127/12776 [22:32<55:19,  3.21it/s] 17%|█▋        | 2128/12776 [22:33<59:16,  2.99it/s]                                                     17%|█▋        | 2128/12776 [22:33<59:16,  2.99it/s] 17%|█▋        | 2129/12776 [22:33<55:32,  3.20it/s]                                                     17%|█▋        | 2129/12776 [22:33<55:32,  3.20it/s] 17%|█▋        | 2130/12776 [22:33<52:26,  3.38it/s]                                                     17%|█▋        | 2130/12776 [22:33<52:26,  3.38it/s] 17%|█▋        | 2131/12776 [22:33<49:43,  3.57it/s]                                                     17%|█▋        | 2131/12776 [22:33<49:43,  3.57it/s] 17%|█▋        | 2132/12776 [22:34<53:31,  3.31it/s]                                                     17%|█▋        | 2132/12776 [22:34<53:31,  3.31it/s] 17%|█▋        | 2133/12776 [22:34<49:59,  3.55it/s]                                                     17%|█▋        | 2133/12776 [22:34<49:59,  3.55it/s] 17%|█▋        | 2134/12776 [22:34<47:13,  3.76it/s]                                                     17%|█▋        | 2134/12776 [22:34<47:13,  3.76it/s] 17%|█▋        | 2135/12776 [22:34<44:54,  3.95it/s]                                                     17%|█▋        | 2135/12776 [22:34<44:54,  3.95it/s] 17%|█▋        | 2136/12776 [22:35<48:26,  3.66it/s]                                                     17%|█▋        | 2136/12776 [22:35<48:26,  3.66it/s] 17%|█▋        | 2137/12776 [22:35<48:53,  3.63it/s]                                                     17%|█▋        | 2137/12776 [22:35<48:53,  3.63it/s] 17%|█▋        | 2138/12776 [22:35<45:13,  3.92it/s]                                                     17%|█▋        | 2138/12776 [22:35<45:13,  3.92it/s] 17%|█▋        | 2139/12776 [22:35<42:23,  4.18it/s]                                                     17%|█▋        | 2139/12776 [22:35<42:23,  4.18it/s] 17%|█▋        | 2140/12776 [22:36<40:26,  4.38it/s]                                                     17%|█▋        | 2140/12776 [22:36<40:26,  4.38it/s] 17%|█▋        | 2141/12776 [22:36<41:13,  4.30it/s]                                                     17%|█▋        | 2141/12776 [22:36<41:13,  4.30it/s] 17%|█▋        | 2142/12776 [22:36<39:07,  4.53it/s]                                                     17%|█▋        | 2142/12776 [22:36<39:07,  4.53it/s] 17%|█▋        | 2143/12776 [22:36<37:35,  4.71it/s]                                                     17%|█▋        | 2143/12776 [22:36<37:35,  4.71it/s] 17%|█▋        | 2144/12776 [22:36<36:21,  4.87it/s]                                                     17%|█▋        | 2144/12776 [22:36<36:21,  4.87it/s] 17%|█▋        | 2145/12776 [22:37<35:27,  5.00it/s]                                                     17%|█▋        | 2145/12776 [22:37<35:27,  5.00it/s] 17%|█▋        | 2146/12776 [22:37<34:44,  5.10it/s]                                                     17%|█▋        | 2146/12776 [22:37<34:44,  5.10it/s] 17%|█▋        | 2147/12776 [22:37<39:20,  4.50it/s]                                                     17%|█▋        | 2147/12776 [22:37<39:20,  4.50it/s] 17%|█▋        | 2148/12776 [22:37<37:03,  4.78it/s]                                                     17%|█▋        | 2148/12776 [22:37<37:03,  4.78it/s] 17%|█▋        | 2149/12776 [22:37<35:18,  5.02it/s]                                                     17%|█▋        | 2149/12776 [22:37<35:18,  5.02it/s] 17%|█▋        | 2150/12776 [22:38<1:04:29,  2.75it/s]                                                       17%|█▋        | 2150/12776 [22:38<1:04:29,  2.75it/s] 17%|█▋        | 2151/12776 [22:39<1:50:08,  1.61it/s]                                                       17%|█▋        | 2151/12776 [22:39<1:50:08,  1.61it/s] 17%|█▋        | 2152/12776 [22:40<2:08:12,  1.38it/s]                                                       17%|█▋        | 2152/12776 [22:40<2:08:12,  1.38it/s] 17%|█▋        | 2153/12776 [22:41<2:13:06,  1.33it/s]                                                       17%|█▋        | 2153/12776 [22:41<2:13:06,  1.33it/s] 17%|█▋        | 2154/12776 [22:42<2:13:19,  1.33it/s]                                                       17%|█▋        | 2154/12776 [22:42<2:13:19,  1.33it/s] 17%|█▋        | 2155/12776 [22:43<2:11:07,  1.35it/s]                                                      {'loss': 0.7561, 'grad_norm': 1.8354300260543823, 'learning_rate': 0.00026160801564027367, 'epoch': 0.33}
+{'loss': 0.4853, 'grad_norm': 1.2063688039779663, 'learning_rate': 0.00026158357771261, 'epoch': 0.33}
+{'loss': 0.8312, 'grad_norm': 2.0545923709869385, 'learning_rate': 0.0002615591397849462, 'epoch': 0.33}
+{'loss': 1.2077, 'grad_norm': 8.21367073059082, 'learning_rate': 0.0002615347018572825, 'epoch': 0.33}
+{'loss': 0.5682, 'grad_norm': 1.7242724895477295, 'learning_rate': 0.00026151026392961873, 'epoch': 0.33}
+{'loss': 1.3158, 'grad_norm': 3.253326177597046, 'learning_rate': 0.00026148582600195503, 'epoch': 0.33}
+{'loss': 1.1925, 'grad_norm': 2.2287137508392334, 'learning_rate': 0.0002614613880742913, 'epoch': 0.33}
+{'loss': 1.5877, 'grad_norm': 2.752636194229126, 'learning_rate': 0.00026143695014662754, 'epoch': 0.33}
+{'loss': 0.4494, 'grad_norm': 1.1275129318237305, 'learning_rate': 0.00026141251221896384, 'epoch': 0.33}
+{'loss': 1.5737, 'grad_norm': 2.6209516525268555, 'learning_rate': 0.00026138807429130004, 'epoch': 0.33}
+{'loss': 0.9766, 'grad_norm': 2.409445285797119, 'learning_rate': 0.00026136363636363634, 'epoch': 0.33}
+{'loss': 1.4515, 'grad_norm': 3.750723123550415, 'learning_rate': 0.0002613391984359726, 'epoch': 0.33}
+{'loss': 0.7738, 'grad_norm': 2.0334136486053467, 'learning_rate': 0.00026131476050830885, 'epoch': 0.33}
+{'loss': 1.5601, 'grad_norm': 3.3774590492248535, 'learning_rate': 0.00026129032258064515, 'epoch': 0.33}
+{'loss': 0.8408, 'grad_norm': 1.3902050256729126, 'learning_rate': 0.0002612658846529814, 'epoch': 0.33}
+{'loss': 0.886, 'grad_norm': 1.9110989570617676, 'learning_rate': 0.00026124144672531765, 'epoch': 0.33}
+{'loss': 1.8936, 'grad_norm': 1.9537615776062012, 'learning_rate': 0.00026121700879765396, 'epoch': 0.33}
+{'loss': 1.3683, 'grad_norm': 3.947450876235962, 'learning_rate': 0.0002611925708699902, 'epoch': 0.33}
+{'loss': 1.0555, 'grad_norm': 1.9001954793930054, 'learning_rate': 0.00026116813294232646, 'epoch': 0.33}
+{'loss': 0.8577, 'grad_norm': 1.3852555751800537, 'learning_rate': 0.0002611436950146627, 'epoch': 0.33}
+{'loss': 0.5502, 'grad_norm': 1.6378505229949951, 'learning_rate': 0.000261119257086999, 'epoch': 0.33}
+{'loss': 1.164, 'grad_norm': 1.4456698894500732, 'learning_rate': 0.00026109481915933527, 'epoch': 0.33}
+{'loss': 1.174, 'grad_norm': 2.254760265350342, 'learning_rate': 0.0002610703812316715, 'epoch': 0.33}
+{'loss': 1.4058, 'grad_norm': 2.9925920963287354, 'learning_rate': 0.00026104594330400783, 'epoch': 0.33}
+{'loss': 0.4183, 'grad_norm': 0.5673936605453491, 'learning_rate': 0.0002610215053763441, 'epoch': 0.33}
+{'loss': 0.3369, 'grad_norm': 0.6673199534416199, 'learning_rate': 0.00026099706744868033, 'epoch': 0.33}
+{'loss': 0.4785, 'grad_norm': 0.8384460806846619, 'learning_rate': 0.0002609726295210166, 'epoch': 0.33}
+{'loss': 0.5129, 'grad_norm': 0.8750125765800476, 'learning_rate': 0.00026094819159335283, 'epoch': 0.33}
+{'loss': 0.5408, 'grad_norm': 0.729326069355011, 'learning_rate': 0.00026092375366568914, 'epoch': 0.33}
+{'loss': 0.5732, 'grad_norm': 0.8103402256965637, 'learning_rate': 0.0002608993157380254, 'epoch': 0.33}
+{'loss': 0.5434, 'grad_norm': 1.0650441646575928, 'learning_rate': 0.00026087487781036164, 'epoch': 0.33}
+{'loss': 0.4231, 'grad_norm': 0.8528415560722351, 'learning_rate': 0.00026085043988269795, 'epoch': 0.33}
+{'loss': 0.533, 'grad_norm': 1.0440559387207031, 'learning_rate': 0.0002608260019550342, 'epoch': 0.33}
+{'loss': 0.6891, 'grad_norm': 0.9997760653495789, 'learning_rate': 0.00026080156402737045, 'epoch': 0.33}
+{'loss': 0.4688, 'grad_norm': 1.1891223192214966, 'learning_rate': 0.0002607771260997067, 'epoch': 0.33}
+{'loss': 0.5096, 'grad_norm': 1.3714631795883179, 'learning_rate': 0.000260752688172043, 'epoch': 0.33}
+{'loss': 0.6755, 'grad_norm': 1.0900437831878662, 'learning_rate': 0.00026072825024437926, 'epoch': 0.33}
+{'loss': 0.8436, 'grad_norm': 2.1338839530944824, 'learning_rate': 0.0002607038123167155, 'epoch': 0.33}
+{'loss': 0.7621, 'grad_norm': 1.6171543598175049, 'learning_rate': 0.0002606793743890518, 'epoch': 0.33}
+{'loss': 0.491, 'grad_norm': 0.8001659512519836, 'learning_rate': 0.00026065493646138806, 'epoch': 0.33}
+{'loss': 0.6333, 'grad_norm': 1.0575730800628662, 'learning_rate': 0.0002606304985337243, 'epoch': 0.33}
+{'loss': 0.73, 'grad_norm': 1.6386489868164062, 'learning_rate': 0.0002606060606060606, 'epoch': 0.33}
+{'loss': 0.5608, 'grad_norm': 1.1848156452178955, 'learning_rate': 0.0002605816226783968, 'epoch': 0.33}
+{'loss': 0.5895, 'grad_norm': 1.1664621829986572, 'learning_rate': 0.0002605571847507331, 'epoch': 0.33}
+{'loss': 0.5018, 'grad_norm': 1.0166929960250854, 'learning_rate': 0.0002605327468230694, 'epoch': 0.33}
+{'loss': 0.7414, 'grad_norm': 3.1172375679016113, 'learning_rate': 0.0002605083088954056, 'epoch': 0.33}
+{'loss': 0.5268, 'grad_norm': 1.3032346963882446, 'learning_rate': 0.00026048387096774193, 'epoch': 0.33}
+{'loss': 0.8175, 'grad_norm': 2.1473283767700195, 'learning_rate': 0.0002604594330400782, 'epoch': 0.33}
+{'loss': 1.0549, 'grad_norm': 2.331975221633911, 'learning_rate': 0.00026043499511241443, 'epoch': 0.33}
+{'loss': 1.1403, 'grad_norm': 2.3260395526885986, 'learning_rate': 0.0002604105571847507, 'epoch': 0.33}
+{'loss': 0.8268, 'grad_norm': 3.693405866622925, 'learning_rate': 0.000260386119257087, 'epoch': 0.33}
+{'loss': 1.0704, 'grad_norm': 1.5741246938705444, 'learning_rate': 0.00026036168132942324, 'epoch': 0.33}
+{'loss': 0.8378, 'grad_norm': 1.5389891862869263, 'learning_rate': 0.0002603372434017595, 'epoch': 0.33}
+{'loss': 1.0205, 'grad_norm': 1.7030190229415894, 'learning_rate': 0.0002603128054740958, 'epoch': 0.33}
+{'loss': 1.128, 'grad_norm': 2.1614034175872803, 'learning_rate': 0.00026028836754643205, 'epoch': 0.33}
+{'loss': 0.7876, 'grad_norm': 1.5320353507995605, 'learning_rate': 0.0002602639296187683, 'epoch': 0.33}
+{'loss': 0.8237, 'grad_norm': 1.6267305612564087, 'learning_rate': 0.0002602394916911046, 'epoch': 0.33}
+{'loss': 1.1185, 'grad_norm': 3.2950265407562256, 'learning_rate': 0.0002602150537634408, 'epoch': 0.33}
+{'loss': 0.91, 'grad_norm': 2.2613959312438965, 'learning_rate': 0.0002601906158357771, 'epoch': 0.33}
+{'loss': 1.006, 'grad_norm': 1.922223687171936, 'learning_rate': 0.00026016617790811336, 'epoch': 0.33}
+{'loss': 1.4087, 'grad_norm': 3.277189016342163, 'learning_rate': 0.0002601417399804496, 'epoch': 0.33}
+{'loss': 1.3359, 'grad_norm': 2.7033369541168213, 'learning_rate': 0.0002601173020527859, 'epoch': 0.33}
+{'loss': 0.9543, 'grad_norm': 1.8570423126220703, 'learning_rate': 0.00026009286412512217, 'epoch': 0.33}
+{'loss': 1.0798, 'grad_norm': 4.569610595703125, 'learning_rate': 0.0002600684261974584, 'epoch': 0.34}
+{'loss': 2.1632, 'grad_norm': 2.035679578781128, 'learning_rate': 0.0002600439882697947, 'epoch': 0.34}
+{'loss': 1.4521, 'grad_norm': 1.7678215503692627, 'learning_rate': 0.000260019550342131, 'epoch': 0.34}
+{'loss': 1.5297, 'grad_norm': 2.673259735107422, 'learning_rate': 0.00025999511241446723, 'epoch': 0.34}
+{'loss': 1.1949, 'grad_norm': 2.435270309448242, 'learning_rate': 0.0002599706744868035, 'epoch': 0.34}
+{'loss': 1.356, 'grad_norm': 2.4045560359954834, 'learning_rate': 0.0002599462365591398, 'epoch': 0.34}
+{'loss': 0.9344, 'grad_norm': 2.4822990894317627, 'learning_rate': 0.00025992179863147604, 'epoch': 0.34}
+{'loss': 0.7547, 'grad_norm': 1.8672354221343994, 'learning_rate': 0.0002598973607038123, 'epoch': 0.34}
+{'loss': 0.8016, 'grad_norm': 0.980976939201355, 'learning_rate': 0.0002598729227761486, 'epoch': 0.34}
+{'loss': 1.0073, 'grad_norm': 1.5991452932357788, 'learning_rate': 0.00025984848484848484, 'epoch': 0.34}
+{'loss': 1.0327, 'grad_norm': 1.6782243251800537, 'learning_rate': 0.0002598240469208211, 'epoch': 0.34}
+{'loss': 0.4323, 'grad_norm': 0.7231286764144897, 'learning_rate': 0.00025979960899315735, 'epoch': 0.34}
+{'loss': 0.5182, 'grad_norm': 0.8727384805679321, 'learning_rate': 0.0002597751710654936, 'epoch': 0.34}
+{'loss': 0.374, 'grad_norm': 0.49551793932914734, 'learning_rate': 0.0002597507331378299, 'epoch': 0.34}
+{'loss': 0.4585, 'grad_norm': 1.4629757404327393, 'learning_rate': 0.00025972629521016615, 'epoch': 0.34}
+ 17%|█▋        | 2155/12776 [22:43<2:11:07,  1.35it/s] 17%|█▋        | 2156/12776 [22:43<2:06:19,  1.40it/s]                                                       17%|█▋        | 2156/12776 [22:43<2:06:19,  1.40it/s] 17%|█▋        | 2157/12776 [22:44<2:07:08,  1.39it/s]                                                       17%|█▋        | 2157/12776 [22:44<2:07:08,  1.39it/s] 17%|█▋        | 2158/12776 [22:45<2:00:46,  1.47it/s]                                                       17%|█▋        | 2158/12776 [22:45<2:00:46,  1.47it/s] 17%|█▋        | 2159/12776 [22:45<1:56:28,  1.52it/s]                                                       17%|█▋        | 2159/12776 [22:45<1:56:28,  1.52it/s] 17%|█▋        | 2160/12776 [22:46<1:49:28,  1.62it/s]                                                       17%|█▋        | 2160/12776 [22:46<1:49:28,  1.62it/s] 17%|█▋        | 2161/12776 [22:46<1:46:49,  1.66it/s]                                                       17%|█▋        | 2161/12776 [22:46<1:46:49,  1.66it/s] 17%|█▋        | 2162/12776 [22:47<1:39:27,  1.78it/s]                                                       17%|█▋        | 2162/12776 [22:47<1:39:27,  1.78it/s] 17%|█▋        | 2163/12776 [22:47<1:39:54,  1.77it/s]                                                       17%|█▋        | 2163/12776 [22:47<1:39:54,  1.77it/s] 17%|█▋        | 2164/12776 [22:48<1:32:41,  1.91it/s]                                                       17%|█▋        | 2164/12776 [22:48<1:32:41,  1.91it/s] 17%|█▋        | 2165/12776 [22:48<1:31:46,  1.93it/s]                                                       17%|█▋        | 2165/12776 [22:48<1:31:46,  1.93it/s] 17%|█▋        | 2166/12776 [22:49<1:25:30,  2.07it/s]                                                       17%|█▋        | 2166/12776 [22:49<1:25:30,  2.07it/s] 17%|█▋        | 2167/12776 [22:49<1:20:37,  2.19it/s]                                                       17%|█▋        | 2167/12776 [22:49<1:20:37,  2.19it/s] 17%|█▋        | 2168/12776 [22:50<1:22:40,  2.14it/s]                                                       17%|█▋        | 2168/12776 [22:50<1:22:40,  2.14it/s] 17%|█▋        | 2169/12776 [22:50<1:16:45,  2.30it/s]                                                       17%|█▋        | 2169/12776 [22:50<1:16:45,  2.30it/s] 17%|█▋        | 2170/12776 [22:50<1:11:44,  2.46it/s]                                                       17%|█▋        | 2170/12776 [22:50<1:11:44,  2.46it/s] 17%|█▋        | 2171/12776 [22:51<1:13:38,  2.40it/s]                                                       17%|█▋        | 2171/12776 [22:51<1:13:38,  2.40it/s] 17%|█▋        | 2172/12776 [22:51<1:08:30,  2.58it/s]                                                       17%|█▋        | 2172/12776 [22:51<1:08:30,  2.58it/s] 17%|█▋        | 2173/12776 [22:51<1:04:23,  2.74it/s]                                                       17%|█▋        | 2173/12776 [22:51<1:04:23,  2.74it/s] 17%|█▋        | 2174/12776 [22:52<1:03:39,  2.78it/s]                                                       17%|█▋        | 2174/12776 [22:52<1:03:39,  2.78it/s] 17%|█▋        | 2175/12776 [22:52<59:51,  2.95it/s]                                                       17%|█▋        | 2175/12776 [22:52<59:51,  2.95it/s] 17%|█▋        | 2176/12776 [22:52<56:51,  3.11it/s]                                                     17%|█▋        | 2176/12776 [22:52<56:51,  3.11it/s] 17%|█▋        | 2177/12776 [22:53<54:13,  3.26it/s]                                                     17%|█▋        | 2177/12776 [22:53<54:13,  3.26it/s] 17%|█▋        | 2178/12776 [22:53<53:44,  3.29it/s]                                                     17%|█▋        | 2178/12776 [22:53<53:44,  3.29it/s] 17%|█▋        | 2179/12776 [22:53<51:11,  3.45it/s]                                                     17%|█▋        | 2179/12776 [22:53<51:11,  3.45it/s] 17%|█▋        | 2180/12776 [22:53<48:31,  3.64it/s]                                                     17%|█▋        | 2180/12776 [22:53<48:31,  3.64it/s] 17%|█▋        | 2181/12776 [22:54<46:47,  3.77it/s]                                                     17%|█▋        | 2181/12776 [22:54<46:47,  3.77it/s] 17%|█▋        | 2182/12776 [22:54<45:22,  3.89it/s]                                                     17%|█▋        | 2182/12776 [22:54<45:22,  3.89it/s] 17%|█▋        | 2183/12776 [22:54<46:40,  3.78it/s]                                                     17%|█▋        | 2183/12776 [22:54<46:40,  3.78it/s] 17%|█▋        | 2184/12776 [22:54<44:53,  3.93it/s]                                                     17%|█▋        | 2184/12776 [22:54<44:53,  3.93it/s] 17%|█▋        | 2185/12776 [22:55<43:14,  4.08it/s]                                                     17%|█▋        | 2185/12776 [22:55<43:14,  4.08it/s] 17%|█▋        | 2186/12776 [22:55<41:47,  4.22it/s]                                                     17%|█▋        | 2186/12776 [22:55<41:47,  4.22it/s] 17%|█▋        | 2187/12776 [22:55<45:54,  3.84it/s]                                                     17%|█▋        | 2187/12776 [22:55<45:54,  3.84it/s] 17%|█▋        | 2188/12776 [22:55<43:11,  4.09it/s]                                                     17%|█▋        | 2188/12776 [22:55<43:11,  4.09it/s] 17%|█▋        | 2189/12776 [22:55<41:00,  4.30it/s]                                                     17%|█▋        | 2189/12776 [22:55<41:00,  4.30it/s] 17%|█▋        | 2190/12776 [22:56<39:25,  4.48it/s]                                                     17%|█▋        | 2190/12776 [22:56<39:25,  4.48it/s] 17%|█▋        | 2191/12776 [22:56<38:27,  4.59it/s]                                                     17%|█▋        | 2191/12776 [22:56<38:27,  4.59it/s] 17%|█▋        | 2192/12776 [22:56<42:27,  4.15it/s]                                                     17%|█▋        | 2192/12776 [22:56<42:27,  4.15it/s] 17%|█▋        | 2193/12776 [22:56<40:09,  4.39it/s]                                                     17%|█▋        | 2193/12776 [22:56<40:09,  4.39it/s] 17%|█▋        | 2194/12776 [22:57<38:15,  4.61it/s]                                                     17%|█▋        | 2194/12776 [22:57<38:15,  4.61it/s] 17%|█▋        | 2195/12776 [22:57<36:53,  4.78it/s]                                                     17%|█▋        | 2195/12776 [22:57<36:53,  4.78it/s] 17%|█▋        | 2196/12776 [22:57<35:53,  4.91it/s]                                                     17%|█▋        | 2196/12776 [22:57<35:53,  4.91it/s] 17%|█▋        | 2197/12776 [22:57<34:59,  5.04it/s]                                                     17%|█▋        | 2197/12776 [22:57<34:59,  5.04it/s] 17%|█▋        | 2198/12776 [22:57<39:43,  4.44it/s]                                                     17%|█▋        | 2198/12776 [22:57<39:43,  4.44it/s] 17%|█▋        | 2199/12776 [22:58<37:15,  4.73it/s]                                                     17%|█▋        | 2199/12776 [22:58<37:15,  4.73it/s] 17%|█▋        | 2200/12776 [22:58<1:10:22,  2.50it/s]                                                       17%|█▋        | 2200/12776 [22:58<1:10:22,  2.50it/s] 17%|█▋        | 2201/12776 [23:00<2:07:46,  1.38it/s]                                                       17%|█▋        | 2201/12776 [23:00<2:07:46,  1.38it/s] 17%|█▋        | 2202/12776 [23:01<2:20:10,  1.26it/s]                                                       17%|█▋        | 2202/12776 [23:01<2:20:10,  1.26it/s] 17%|█▋        | 2203/12776 [23:02<2:20:48,  1.25it/s]                                                       17%|█▋        | 2203/12776 [23:02<2:20:48,  1.25it/s] 17%|█▋        | 2204/12776 [23:02<2:17:34,  1.28it/s]                                                       17%|█▋        | 2204/12776 [23:02<2:17:34,  1.28it/s] 17%|█▋        | 2205/12776 [23:03<2:12:50,  1.33it/s]                                                       17%|█▋        | 2205/12776 [23:03<2:12:50,  1.33it/s] 17%|█▋        | 2206/12776 [23:04<2:07:09,  1.39it/s]                                                       17%|█▋        | 2206/12776 [23:04<2:07:09,  1.39it/s] 17%|█▋        | 2207/12776 [23:04<2:04:37,  1.41it/s]                                                       17%|█▋        | 2207/12776 [23:04<2:04:37,  1.41it/s] 17%|█▋        | 2208/12776 [23:05<1:57:38,  1.50it/s]                                                       17%|█▋        | 2208/12776 [23:05<1:57:38,  1.50it/s] 17%|█▋        | 2209/12776 [23:06<1:55:21,  1.53it/s]                                                       17%|█▋        | 2209/12776 [23:06<1:55:21,  1.53it/s] 17%|█▋        | 2210/12776 [23:06<1:49:13,  1.61it/s]                                                       17%|█▋        | 2210/12776 [23:06<1:49:13,  1.61it/s] 17%|█▋        | 2211/12776 [23:07<1:46:12,  1.66it/s]                                                       17%|█▋        | 2211/12776 [23:07<1:46:12,  1.66it/s] 17%|█▋        | 2212/12776 [23:07<1:40:44,  1.75it/s]                                                       17%|█▋        | 2212/12776 [23:07<1:40:44,  1.75it/s] 17%|█▋        | 2213/12776 [23:08<1:38:14,  1.79it/s]                                                       17%|█▋        | 2213/12776 [23:08<1:38:14,  1.79it/s] 17%|█▋        | 2214/12776 [23:08<1:32:21,  1.91it/s]                                                       17%|█▋        | 2214/12776 [23:08<1:32:21,  1.91it/s] 17%|█▋        | 2215/12776 [23:09<1:31:12,  1.93it/s]                                                       17%|█▋        | 2215/12776 [23:09<1:31:12,  1.93it/s] 17%|█▋        | 2216/12776 [23:09<1:25:57,  2.05it/s]                                                       17%|█▋        | 2216/12776 [23:09<1:25:57,  2.05it/s] 17%|█▋        | 2217/12776 [23:10<1:21:25,  2.16it/s]                                                       17%|█▋        | 2217/12776 [23:10<1:21:25,  2.16it/s] 17%|█▋        | 2218/12776 [23:10<1:23:32,  2.11it/s]                                                       17%|█▋        | 2218/12776 [23:10<1:23:32,  2.11it/s] 17%|█▋        | 2219/12776 [23:10<1:18:19,  2.25it/s]                                                       17%|█▋        | 2219/12776 [23:10<1:18:19,  2.25it/s] 17%|█▋        | 2220/12776 [23:11<1:13:55,  2.38it/s]                                                       17%|█▋        | 2220/12776 [23:11<1:13:55,  2.38it/s] 17%|█▋        | 2221/12776 [23:11<1:15:04,  2.34it/s]                                                       17%|█▋        | 2221/12776 [23:11<1:15:04,  2.34it/s] 17%|█▋        | 2222/12776 [23:12<1:10:06,  2.51it/s]                                                       17%|█▋        | 2222/12776 [23:12<1:10:06,  2.51it/s] 17%|█▋        | 2223/12776 [23:12<1:06:03,  2.66it/s]                                                       17%|█▋        | 2223/12776 [23:12<1:06:03,  2.66it/s] 17%|█▋        | 2224/12776 [23:12<1:06:51,  2.63it/s]                                                       17%|█▋        | 2224/12776 [23:12<1:06:51,  2.63it/s] 17%|█▋        | 2225/12776 [23:13<1:02:33,  2.81it/s]                                                       17%|█▋        | 2225/12776 [23:13<1:02:33,  2.81it/s] 17%|█▋        | 2226/12776 [23:13<59:01,  2.98it/s]                                                       17%|█▋        | 2226/12776 [23:13<59:01,  2.98it/s] 17%|█▋        | 2227/12776 [23:13<1:01:05,  2.88it/s]                                                       17%|█▋        | 2227/12776 [23:13<1:01:05,  2.88it/s] 17%|█▋        | 2228/12776 [23:14<57:00,  3.08it/s]                                                       17%|█▋        | 2228/12776 [23:14<57:00,  3.08it/s] 17%|█▋        | 2229/12776 [23:14<53:43,  3.27it/s]                                                     17%|█▋        | 2229/12776 [23:14<53:43,  3.27it/s] 17%|█▋        | 2230/12776 [23:14<51:33,  3.41it/s]                                                     17%|█▋        | 2230/12776 [23:14<51:33,  3.41it/s] 17%|█▋        | 2231/12776 [23:14<53:50,  3.26it/s]                                                     17%|█▋        | 2231/12776 [23:14<53:50,  3.26it/s] 17%|█▋        | 2232/12776 [23:15<50:21,  3.49it/s]                                                    {'loss': 0.4198, 'grad_norm': 0.6694740653038025, 'learning_rate': 0.0002597018572825024, 'epoch': 0.34}
+{'loss': 0.4105, 'grad_norm': 0.9819236993789673, 'learning_rate': 0.0002596774193548387, 'epoch': 0.34}
+{'loss': 0.5237, 'grad_norm': 1.0089393854141235, 'learning_rate': 0.00025965298142717496, 'epoch': 0.34}
+{'loss': 0.4374, 'grad_norm': 0.5175392627716064, 'learning_rate': 0.0002596285434995112, 'epoch': 0.34}
+{'loss': 0.5646, 'grad_norm': 0.8120709657669067, 'learning_rate': 0.00025960410557184746, 'epoch': 0.34}
+{'loss': 0.5908, 'grad_norm': 0.9096924662590027, 'learning_rate': 0.00025957966764418377, 'epoch': 0.34}
+{'loss': 0.4136, 'grad_norm': 0.8007375001907349, 'learning_rate': 0.00025955522971652, 'epoch': 0.34}
+{'loss': 0.7746, 'grad_norm': 1.0222002267837524, 'learning_rate': 0.00025953079178885627, 'epoch': 0.34}
+{'loss': 0.4193, 'grad_norm': 0.897656261920929, 'learning_rate': 0.0002595063538611926, 'epoch': 0.34}
+{'loss': 0.3561, 'grad_norm': 1.0817437171936035, 'learning_rate': 0.00025948191593352883, 'epoch': 0.34}
+{'loss': 0.4638, 'grad_norm': 1.0257635116577148, 'learning_rate': 0.0002594574780058651, 'epoch': 0.34}
+{'loss': 0.8187, 'grad_norm': 1.651721715927124, 'learning_rate': 0.0002594330400782014, 'epoch': 0.34}
+{'loss': 0.5079, 'grad_norm': 1.1372004747390747, 'learning_rate': 0.0002594086021505376, 'epoch': 0.34}
+{'loss': 0.7663, 'grad_norm': 1.8931221961975098, 'learning_rate': 0.0002593841642228739, 'epoch': 0.34}
+{'loss': 0.6915, 'grad_norm': 1.1629186868667603, 'learning_rate': 0.00025935972629521014, 'epoch': 0.34}
+{'loss': 0.9156, 'grad_norm': 1.5948940515518188, 'learning_rate': 0.0002593352883675464, 'epoch': 0.34}
+{'loss': 0.6997, 'grad_norm': 1.6868611574172974, 'learning_rate': 0.0002593108504398827, 'epoch': 0.34}
+{'loss': 0.744, 'grad_norm': 2.6906371116638184, 'learning_rate': 0.00025928641251221895, 'epoch': 0.34}
+{'loss': 0.5701, 'grad_norm': 1.4386720657348633, 'learning_rate': 0.0002592619745845552, 'epoch': 0.34}
+{'loss': 0.9001, 'grad_norm': 1.7945785522460938, 'learning_rate': 0.00025923753665689145, 'epoch': 0.34}
+{'loss': 0.7199, 'grad_norm': 1.8731091022491455, 'learning_rate': 0.00025921309872922776, 'epoch': 0.34}
+{'loss': 1.005, 'grad_norm': 2.1651723384857178, 'learning_rate': 0.000259188660801564, 'epoch': 0.34}
+{'loss': 1.1601, 'grad_norm': 2.3972442150115967, 'learning_rate': 0.00025916422287390026, 'epoch': 0.34}
+{'loss': 0.8559, 'grad_norm': 1.9120584726333618, 'learning_rate': 0.00025913978494623656, 'epoch': 0.34}
+{'loss': 0.9065, 'grad_norm': 2.741961717605591, 'learning_rate': 0.0002591153470185728, 'epoch': 0.34}
+{'loss': 1.1946, 'grad_norm': 2.225860834121704, 'learning_rate': 0.00025909090909090907, 'epoch': 0.34}
+{'loss': 1.0144, 'grad_norm': 2.113170623779297, 'learning_rate': 0.00025906647116324537, 'epoch': 0.34}
+{'loss': 1.0589, 'grad_norm': 3.7552993297576904, 'learning_rate': 0.00025904203323558157, 'epoch': 0.34}
+{'loss': 1.034, 'grad_norm': 2.8972368240356445, 'learning_rate': 0.0002590175953079179, 'epoch': 0.34}
+{'loss': 1.3211, 'grad_norm': 9.414438247680664, 'learning_rate': 0.0002589931573802541, 'epoch': 0.34}
+{'loss': 1.1937, 'grad_norm': 2.1980931758880615, 'learning_rate': 0.0002589687194525904, 'epoch': 0.34}
+{'loss': 1.2868, 'grad_norm': 1.7976131439208984, 'learning_rate': 0.0002589442815249267, 'epoch': 0.34}
+{'loss': 1.3121, 'grad_norm': 3.20818829536438, 'learning_rate': 0.00025891984359726293, 'epoch': 0.34}
+{'loss': 1.4822, 'grad_norm': 2.546349287033081, 'learning_rate': 0.0002588954056695992, 'epoch': 0.34}
+{'loss': 1.3909, 'grad_norm': 2.756331205368042, 'learning_rate': 0.0002588709677419355, 'epoch': 0.34}
+{'loss': 1.4207, 'grad_norm': 2.7843143939971924, 'learning_rate': 0.00025884652981427174, 'epoch': 0.34}
+{'loss': 1.0539, 'grad_norm': 2.695613145828247, 'learning_rate': 0.000258822091886608, 'epoch': 0.34}
+{'loss': 1.2176, 'grad_norm': 2.704921007156372, 'learning_rate': 0.00025879765395894424, 'epoch': 0.34}
+{'loss': 1.6507, 'grad_norm': 2.946237802505493, 'learning_rate': 0.00025877321603128055, 'epoch': 0.34}
+{'loss': 1.5005, 'grad_norm': 2.0598270893096924, 'learning_rate': 0.0002587487781036168, 'epoch': 0.34}
+{'loss': 0.7967, 'grad_norm': 2.0431604385375977, 'learning_rate': 0.00025872434017595305, 'epoch': 0.34}
+{'loss': 1.6454, 'grad_norm': 3.8533642292022705, 'learning_rate': 0.00025869990224828936, 'epoch': 0.34}
+{'loss': 1.0377, 'grad_norm': 1.8645732402801514, 'learning_rate': 0.00025867546432062555, 'epoch': 0.34}
+{'loss': 0.9068, 'grad_norm': 2.5659990310668945, 'learning_rate': 0.00025865102639296186, 'epoch': 0.34}
+{'loss': 0.9772, 'grad_norm': 2.7070820331573486, 'learning_rate': 0.0002586265884652981, 'epoch': 0.34}
+{'loss': 1.4371, 'grad_norm': 1.378251075744629, 'learning_rate': 0.00025860215053763436, 'epoch': 0.34}
+{'loss': 0.4837, 'grad_norm': 0.5317636132240295, 'learning_rate': 0.00025857771260997067, 'epoch': 0.34}
+{'loss': 0.579, 'grad_norm': 0.7095066905021667, 'learning_rate': 0.0002585532746823069, 'epoch': 0.34}
+{'loss': 0.6504, 'grad_norm': 0.9275551438331604, 'learning_rate': 0.00025852883675464317, 'epoch': 0.34}
+{'loss': 0.4966, 'grad_norm': 1.716207504272461, 'learning_rate': 0.0002585043988269795, 'epoch': 0.35}
+{'loss': 0.5413, 'grad_norm': 0.6072659492492676, 'learning_rate': 0.0002584799608993157, 'epoch': 0.35}
+{'loss': 0.4245, 'grad_norm': 1.4741132259368896, 'learning_rate': 0.000258455522971652, 'epoch': 0.35}
+{'loss': 0.5105, 'grad_norm': 0.9101395010948181, 'learning_rate': 0.00025843108504398823, 'epoch': 0.35}
+{'loss': 0.5917, 'grad_norm': 0.8929093480110168, 'learning_rate': 0.00025840664711632453, 'epoch': 0.35}
+{'loss': 0.618, 'grad_norm': 0.9677044153213501, 'learning_rate': 0.0002583822091886608, 'epoch': 0.35}
+{'loss': 0.5954, 'grad_norm': 0.9117663502693176, 'learning_rate': 0.00025835777126099704, 'epoch': 0.35}
+{'loss': 0.4431, 'grad_norm': 0.8379285335540771, 'learning_rate': 0.00025833333333333334, 'epoch': 0.35}
+{'loss': 0.4436, 'grad_norm': 0.8588935136795044, 'learning_rate': 0.0002583088954056696, 'epoch': 0.35}
+{'loss': 0.3352, 'grad_norm': 1.0925568342208862, 'learning_rate': 0.00025828445747800584, 'epoch': 0.35}
+{'loss': 0.6069, 'grad_norm': 1.6619585752487183, 'learning_rate': 0.00025826001955034215, 'epoch': 0.35}
+{'loss': 0.7189, 'grad_norm': 2.354327440261841, 'learning_rate': 0.00025823558162267835, 'epoch': 0.35}
+{'loss': 0.4776, 'grad_norm': 1.1300790309906006, 'learning_rate': 0.00025821114369501465, 'epoch': 0.35}
+{'loss': 0.8701, 'grad_norm': 1.8510233163833618, 'learning_rate': 0.0002581867057673509, 'epoch': 0.35}
+{'loss': 0.621, 'grad_norm': 1.0941369533538818, 'learning_rate': 0.00025816226783968716, 'epoch': 0.35}
+{'loss': 0.4966, 'grad_norm': 1.9270963668823242, 'learning_rate': 0.00025813782991202346, 'epoch': 0.35}
+{'loss': 0.7897, 'grad_norm': 2.781996726989746, 'learning_rate': 0.0002581133919843597, 'epoch': 0.35}
+{'loss': 0.7361, 'grad_norm': 1.3982248306274414, 'learning_rate': 0.00025808895405669596, 'epoch': 0.35}
+{'loss': 0.6928, 'grad_norm': 1.4898085594177246, 'learning_rate': 0.0002580645161290322, 'epoch': 0.35}
+{'loss': 0.6501, 'grad_norm': 1.5134382247924805, 'learning_rate': 0.0002580400782013685, 'epoch': 0.35}
+{'loss': 0.8948, 'grad_norm': 2.149883985519409, 'learning_rate': 0.00025801564027370477, 'epoch': 0.35}
+{'loss': 0.9539, 'grad_norm': 2.79316782951355, 'learning_rate': 0.000257991202346041, 'epoch': 0.35}
+{'loss': 0.6291, 'grad_norm': 1.0646382570266724, 'learning_rate': 0.00025796676441837733, 'epoch': 0.35}
+{'loss': 0.8177, 'grad_norm': 1.6639641523361206, 'learning_rate': 0.0002579423264907136, 'epoch': 0.35}
+{'loss': 0.6531, 'grad_norm': 1.5588454008102417, 'learning_rate': 0.00025791788856304983, 'epoch': 0.35}
+{'loss': 0.976, 'grad_norm': 2.3921139240264893, 'learning_rate': 0.00025789345063538614, 'epoch': 0.35}
+{'loss': 1.2253, 'grad_norm': 3.0724098682403564, 'learning_rate': 0.00025786901270772233, 'epoch': 0.35}
+{'loss': 1.3439, 'grad_norm': 1.900876760482788, 'learning_rate': 0.00025784457478005864, 'epoch': 0.35}
+ 17%|█▋        | 2232/12776 [23:15<50:21,  3.49it/s] 17%|█▋        | 2233/12776 [23:15<47:34,  3.69it/s]                                                     17%|█▋        | 2233/12776 [23:15<47:34,  3.69it/s] 17%|█▋        | 2234/12776 [23:15<45:29,  3.86it/s]                                                     17%|█▋        | 2234/12776 [23:15<45:29,  3.86it/s] 17%|█▋        | 2235/12776 [23:15<43:46,  4.01it/s]                                                     17%|█▋        | 2235/12776 [23:15<43:46,  4.01it/s] 18%|█▊        | 2236/12776 [23:16<45:59,  3.82it/s]                                                     18%|█▊        | 2236/12776 [23:16<45:59,  3.82it/s] 18%|█▊        | 2237/12776 [23:16<43:28,  4.04it/s]                                                     18%|█▊        | 2237/12776 [23:16<43:28,  4.04it/s] 18%|█▊        | 2238/12776 [23:16<41:19,  4.25it/s]                                                     18%|█▊        | 2238/12776 [23:16<41:19,  4.25it/s] 18%|█▊        | 2239/12776 [23:16<39:38,  4.43it/s]                                                     18%|█▊        | 2239/12776 [23:16<39:38,  4.43it/s] 18%|█▊        | 2240/12776 [23:16<38:29,  4.56it/s]                                                     18%|█▊        | 2240/12776 [23:16<38:29,  4.56it/s] 18%|█▊        | 2241/12776 [23:17<42:25,  4.14it/s]                                                     18%|█▊        | 2241/12776 [23:17<42:25,  4.14it/s] 18%|█▊        | 2242/12776 [23:17<40:04,  4.38it/s]                                                     18%|█▊        | 2242/12776 [23:17<40:04,  4.38it/s] 18%|█▊        | 2243/12776 [23:17<38:21,  4.58it/s]                                                     18%|█▊        | 2243/12776 [23:17<38:21,  4.58it/s] 18%|█▊        | 2244/12776 [23:17<37:02,  4.74it/s]                                                     18%|█▊        | 2244/12776 [23:17<37:02,  4.74it/s] 18%|█▊        | 2245/12776 [23:18<36:03,  4.87it/s]                                                     18%|█▊        | 2245/12776 [23:18<36:03,  4.87it/s] 18%|█▊        | 2246/12776 [23:18<38:24,  4.57it/s]                                                     18%|█▊        | 2246/12776 [23:18<38:24,  4.57it/s] 18%|█▊        | 2247/12776 [23:18<36:39,  4.79it/s]                                                     18%|█▊        | 2247/12776 [23:18<36:39,  4.79it/s] 18%|█▊        | 2248/12776 [23:18<35:11,  4.99it/s]                                                     18%|█▊        | 2248/12776 [23:18<35:11,  4.99it/s] 18%|█▊        | 2249/12776 [23:18<36:43,  4.78it/s]                                                     18%|█▊        | 2249/12776 [23:18<36:43,  4.78it/s] 18%|█▊        | 2250/12776 [23:19<1:05:19,  2.69it/s]                                                       18%|█▊        | 2250/12776 [23:19<1:05:19,  2.69it/s] 18%|█▊        | 2251/12776 [23:21<2:16:02,  1.29it/s]                                                       18%|█▊        | 2251/12776 [23:21<2:16:02,  1.29it/s] 18%|█▊        | 2252/12776 [23:22<2:27:14,  1.19it/s]                                                       18%|█▊        | 2252/12776 [23:22<2:27:14,  1.19it/s] 18%|█▊        | 2253/12776 [23:23<2:31:52,  1.15it/s]                                                       18%|█▊        | 2253/12776 [23:23<2:31:52,  1.15it/s] 18%|█▊        | 2254/12776 [23:24<2:36:17,  1.12it/s]                                                       18%|█▊        | 2254/12776 [23:24<2:36:17,  1.12it/s] 18%|█▊        | 2255/12776 [23:25<2:32:18,  1.15it/s]                                                       18%|█▊        | 2255/12776 [23:25<2:32:18,  1.15it/s] 18%|█▊        | 2256/12776 [23:25<2:24:39,  1.21it/s]                                                       18%|█▊        | 2256/12776 [23:25<2:24:39,  1.21it/s] 18%|█▊        | 2257/12776 [23:26<2:17:55,  1.27it/s]                                                       18%|█▊        | 2257/12776 [23:26<2:17:55,  1.27it/s] 18%|█▊        | 2258/12776 [23:27<2:09:48,  1.35it/s]                                                       18%|█▊        | 2258/12776 [23:27<2:09:48,  1.35it/s] 18%|█▊        | 2259/12776 [23:27<2:02:09,  1.43it/s]                                                       18%|█▊        | 2259/12776 [23:27<2:02:09,  1.43it/s] 18%|█▊        | 2260/12776 [23:28<1:54:56,  1.52it/s]                                                       18%|█▊        | 2260/12776 [23:28<1:54:56,  1.52it/s] 18%|█▊        | 2261/12776 [23:28<1:49:40,  1.60it/s]                                                       18%|█▊        | 2261/12776 [23:28<1:49:40,  1.60it/s] 18%|█▊        | 2262/12776 [23:29<1:43:43,  1.69it/s]                                                       18%|█▊        | 2262/12776 [23:29<1:43:43,  1.69it/s] 18%|█▊        | 2263/12776 [23:29<1:43:51,  1.69it/s]                                                       18%|█▊        | 2263/12776 [23:29<1:43:51,  1.69it/s] 18%|█▊        | 2264/12776 [23:30<1:37:28,  1.80it/s]                                                       18%|█▊        | 2264/12776 [23:30<1:37:28,  1.80it/s] 18%|█▊        | 2265/12776 [23:30<1:36:10,  1.82it/s]                                                       18%|█▊        | 2265/12776 [23:30<1:36:10,  1.82it/s] 18%|█▊        | 2266/12776 [23:31<1:29:52,  1.95it/s]                                                       18%|█▊        | 2266/12776 [23:31<1:29:52,  1.95it/s] 18%|█▊        | 2267/12776 [23:31<1:31:38,  1.91it/s]                                                       18%|█▊        | 2267/12776 [23:31<1:31:38,  1.91it/s] 18%|█▊        | 2268/12776 [23:32<1:24:53,  2.06it/s]                                                       18%|█▊        | 2268/12776 [23:32<1:24:53,  2.06it/s] 18%|█▊        | 2269/12776 [23:32<1:19:22,  2.21it/s]                                                       18%|█▊        | 2269/12776 [23:32<1:19:22,  2.21it/s] 18%|█▊        | 2270/12776 [23:33<1:16:58,  2.27it/s]                                                       18%|█▊        | 2270/12776 [23:33<1:16:58,  2.27it/s] 18%|█▊        | 2271/12776 [23:33<1:12:17,  2.42it/s]                                                       18%|█▊        | 2271/12776 [23:33<1:12:17,  2.42it/s] 18%|█▊        | 2272/12776 [23:33<1:08:28,  2.56it/s]                                                       18%|█▊        | 2272/12776 [23:33<1:08:28,  2.56it/s] 18%|█▊        | 2273/12776 [23:34<1:09:48,  2.51it/s]                                                       18%|█▊        | 2273/12776 [23:34<1:09:48,  2.51it/s] 18%|█▊        | 2274/12776 [23:34<1:05:29,  2.67it/s]                                                       18%|█▊        | 2274/12776 [23:34<1:05:29,  2.67it/s] 18%|█▊        | 2275/12776 [23:34<1:01:29,  2.85it/s]                                                       18%|█▊        | 2275/12776 [23:34<1:01:29,  2.85it/s] 18%|█▊        | 2276/12776 [23:35<57:58,  3.02it/s]                                                       18%|█▊        | 2276/12776 [23:35<57:58,  3.02it/s] 18%|█▊        | 2277/12776 [23:35<1:00:06,  2.91it/s]                                                       18%|█▊        | 2277/12776 [23:35<1:00:06,  2.91it/s] 18%|█▊        | 2278/12776 [23:35<56:18,  3.11it/s]                                                       18%|█▊        | 2278/12776 [23:35<56:18,  3.11it/s] 18%|█▊        | 2279/12776 [23:35<53:13,  3.29it/s]                                                     18%|█▊        | 2279/12776 [23:35<53:13,  3.29it/s] 18%|█▊        | 2280/12776 [23:36<50:27,  3.47it/s]                                                     18%|█▊        | 2280/12776 [23:36<50:27,  3.47it/s] 18%|█▊        | 2281/12776 [23:36<52:11,  3.35it/s]                                                     18%|█▊        | 2281/12776 [23:36<52:11,  3.35it/s] 18%|█▊        | 2282/12776 [23:36<49:15,  3.55it/s]                                                     18%|█▊        | 2282/12776 [23:36<49:15,  3.55it/s] 18%|█▊        | 2283/12776 [23:37<46:56,  3.73it/s]                                                     18%|█▊        | 2283/12776 [23:37<46:56,  3.73it/s] 18%|█▊        | 2284/12776 [23:37<45:02,  3.88it/s]                                                     18%|█▊        | 2284/12776 [23:37<45:02,  3.88it/s] 18%|█▊        | 2285/12776 [23:37<47:06,  3.71it/s]                                                     18%|█▊        | 2285/12776 [23:37<47:06,  3.71it/s] 18%|█▊        | 2286/12776 [23:37<44:24,  3.94it/s]                                                     18%|█▊        | 2286/12776 [23:37<44:24,  3.94it/s] 18%|█▊        | 2287/12776 [23:37<42:14,  4.14it/s]                                                     18%|█▊        | 2287/12776 [23:37<42:14,  4.14it/s] 18%|█▊        | 2288/12776 [23:38<40:34,  4.31it/s]                                                     18%|█▊        | 2288/12776 [23:38<40:34,  4.31it/s] 18%|█▊        | 2289/12776 [23:38<39:25,  4.43it/s]                                                     18%|█▊        | 2289/12776 [23:38<39:25,  4.43it/s] 18%|█▊        | 2290/12776 [23:38<42:33,  4.11it/s]                                                     18%|█▊        | 2290/12776 [23:38<42:33,  4.11it/s] 18%|█▊        | 2291/12776 [23:38<40:33,  4.31it/s]                                                     18%|█▊        | 2291/12776 [23:38<40:33,  4.31it/s] 18%|█▊        | 2292/12776 [23:39<38:58,  4.48it/s]                                                     18%|█▊        | 2292/12776 [23:39<38:58,  4.48it/s] 18%|█▊        | 2293/12776 [23:39<37:51,  4.62it/s]                                                     18%|█▊        | 2293/12776 [23:39<37:51,  4.62it/s] 18%|█▊        | 2294/12776 [23:39<36:52,  4.74it/s]                                                     18%|█▊        | 2294/12776 [23:39<36:52,  4.74it/s] 18%|█▊        | 2295/12776 [23:39<44:00,  3.97it/s]                                                     18%|█▊        | 2295/12776 [23:39<44:00,  3.97it/s] 18%|█▊        | 2296/12776 [23:40<40:49,  4.28it/s]                                                     18%|█▊        | 2296/12776 [23:40<40:49,  4.28it/s] 18%|█▊        | 2297/12776 [23:40<38:34,  4.53it/s]                                                     18%|█▊        | 2297/12776 [23:40<38:34,  4.53it/s] 18%|█▊        | 2298/12776 [23:40<36:40,  4.76it/s]                                                     18%|█▊        | 2298/12776 [23:40<36:40,  4.76it/s] 18%|█▊        | 2299/12776 [23:40<35:21,  4.94it/s]                                                     18%|█▊        | 2299/12776 [23:40<35:21,  4.94it/s] 18%|█▊        | 2300/12776 [23:41<1:04:10,  2.72it/s]                                                       18%|█▊        | 2300/12776 [23:41<1:04:10,  2.72it/s] 18%|█▊        | 2301/12776 [23:42<2:05:20,  1.39it/s]                                                       18%|█▊        | 2301/12776 [23:42<2:05:20,  1.39it/s] 18%|█▊        | 2302/12776 [23:43<2:20:01,  1.25it/s]                                                       18%|█▊        | 2302/12776 [23:43<2:20:01,  1.25it/s] 18%|█▊        | 2303/12776 [23:44<2:26:46,  1.19it/s]                                                       18%|█▊        | 2303/12776 [23:44<2:26:46,  1.19it/s] 18%|█▊        | 2304/12776 [23:45<2:26:31,  1.19it/s]                                                       18%|█▊        | 2304/12776 [23:45<2:26:31,  1.19it/s] 18%|█▊        | 2305/12776 [23:46<2:22:02,  1.23it/s]                                                       18%|█▊        | 2305/12776 [23:46<2:22:02,  1.23it/s] 18%|█▊        | 2306/12776 [23:47<2:21:39,  1.23it/s]                                                       18%|█▊        | 2306/12776 [23:47<2:21:39,  1.23it/s] 18%|█▊        | 2307/12776 [23:47<2:17:21,  1.27it/s]                                                       18%|█▊        | 2307/12776 [23:47<2:17:21,  1.27it/s] 18%|█▊        | 2308/12776 [23:48<2:09:34,  1.35it/s]                                                       18%|█▊        | 2308/12776 [23:48<2:09:34,  1.35it/s] 18%|█▊        | 2309/12776 [23:49<2:11:27,  1.33it/s]                                                       18%|█▊        | 2309/12776 [23:49<2:11:27,  1.33it/s] 18%|█▊        | 2310/12776 [23:49<2:02:07,  1.43it/s]                                                      {'loss': 1.0304, 'grad_norm': 1.975421667098999, 'learning_rate': 0.0002578201368523949, 'epoch': 0.35}
+{'loss': 0.8912, 'grad_norm': 1.7853161096572876, 'learning_rate': 0.00025779569892473114, 'epoch': 0.35}
+{'loss': 0.7942, 'grad_norm': 2.0983622074127197, 'learning_rate': 0.00025777126099706745, 'epoch': 0.35}
+{'loss': 1.3245, 'grad_norm': 2.663015127182007, 'learning_rate': 0.0002577468230694037, 'epoch': 0.35}
+{'loss': 1.5237, 'grad_norm': 3.564725399017334, 'learning_rate': 0.00025772238514173995, 'epoch': 0.35}
+{'loss': 0.9078, 'grad_norm': 3.92618727684021, 'learning_rate': 0.00025769794721407625, 'epoch': 0.35}
+{'loss': 1.2595, 'grad_norm': 2.12117862701416, 'learning_rate': 0.0002576735092864125, 'epoch': 0.35}
+{'loss': 1.0926, 'grad_norm': 2.0806806087493896, 'learning_rate': 0.00025764907135874876, 'epoch': 0.35}
+{'loss': 1.1132, 'grad_norm': 2.7820568084716797, 'learning_rate': 0.000257624633431085, 'epoch': 0.35}
+{'loss': 1.3909, 'grad_norm': 2.921229362487793, 'learning_rate': 0.0002576001955034213, 'epoch': 0.35}
+{'loss': 1.3431, 'grad_norm': 2.3714778423309326, 'learning_rate': 0.00025757575757575756, 'epoch': 0.35}
+{'loss': 1.0429, 'grad_norm': 1.7340811491012573, 'learning_rate': 0.0002575513196480938, 'epoch': 0.35}
+{'loss': 1.5378, 'grad_norm': 2.6049928665161133, 'learning_rate': 0.0002575268817204301, 'epoch': 0.35}
+{'loss': 0.9605, 'grad_norm': 2.498192310333252, 'learning_rate': 0.0002575024437927663, 'epoch': 0.35}
+{'loss': 1.601, 'grad_norm': 2.289841890335083, 'learning_rate': 0.0002574780058651026, 'epoch': 0.35}
+{'loss': 0.9655, 'grad_norm': 3.376864194869995, 'learning_rate': 0.0002574535679374389, 'epoch': 0.35}
+{'loss': 0.8568, 'grad_norm': 1.3867840766906738, 'learning_rate': 0.0002574291300097751, 'epoch': 0.35}
+{'loss': 0.9717, 'grad_norm': 2.4591383934020996, 'learning_rate': 0.00025740469208211143, 'epoch': 0.35}
+{'loss': 1.1796, 'grad_norm': 2.244575023651123, 'learning_rate': 0.0002573802541544477, 'epoch': 0.35}
+{'loss': 0.3882, 'grad_norm': 0.7717861533164978, 'learning_rate': 0.00025735581622678393, 'epoch': 0.35}
+{'loss': 0.2957, 'grad_norm': 0.5575485825538635, 'learning_rate': 0.00025733137829912024, 'epoch': 0.35}
+{'loss': 0.5051, 'grad_norm': 0.7117191553115845, 'learning_rate': 0.0002573069403714565, 'epoch': 0.35}
+{'loss': 0.3964, 'grad_norm': 0.5327755808830261, 'learning_rate': 0.00025728250244379274, 'epoch': 0.35}
+{'loss': 0.4411, 'grad_norm': 0.7493363618850708, 'learning_rate': 0.000257258064516129, 'epoch': 0.35}
+{'loss': 0.3451, 'grad_norm': 0.5741692185401917, 'learning_rate': 0.0002572336265884653, 'epoch': 0.35}
+{'loss': 0.3136, 'grad_norm': 0.6874625086784363, 'learning_rate': 0.00025720918866080155, 'epoch': 0.35}
+{'loss': 0.5177, 'grad_norm': 1.1320338249206543, 'learning_rate': 0.0002571847507331378, 'epoch': 0.35}
+{'loss': 0.3906, 'grad_norm': 1.1008598804473877, 'learning_rate': 0.0002571603128054741, 'epoch': 0.35}
+{'loss': 0.4661, 'grad_norm': 1.950392484664917, 'learning_rate': 0.00025713587487781036, 'epoch': 0.35}
+{'loss': 0.3923, 'grad_norm': 0.9202426075935364, 'learning_rate': 0.0002571114369501466, 'epoch': 0.35}
+{'loss': 0.5517, 'grad_norm': 1.0292601585388184, 'learning_rate': 0.00025708699902248286, 'epoch': 0.35}
+{'loss': 0.432, 'grad_norm': 0.9722676277160645, 'learning_rate': 0.0002570625610948191, 'epoch': 0.35}
+{'loss': 0.4121, 'grad_norm': 2.0393378734588623, 'learning_rate': 0.0002570381231671554, 'epoch': 0.35}
+{'loss': 0.571, 'grad_norm': 1.5482399463653564, 'learning_rate': 0.00025701368523949167, 'epoch': 0.35}
+{'loss': 0.6699, 'grad_norm': 1.6801763772964478, 'learning_rate': 0.0002569892473118279, 'epoch': 0.35}
+{'loss': 0.9005, 'grad_norm': 1.4256728887557983, 'learning_rate': 0.0002569648093841642, 'epoch': 0.35}
+{'loss': 0.5094, 'grad_norm': 1.3114697933197021, 'learning_rate': 0.0002569403714565005, 'epoch': 0.36}
+{'loss': 0.7312, 'grad_norm': 1.3349591493606567, 'learning_rate': 0.00025691593352883673, 'epoch': 0.36}
+{'loss': 0.6074, 'grad_norm': 1.4924392700195312, 'learning_rate': 0.000256891495601173, 'epoch': 0.36}
+{'loss': 0.9784, 'grad_norm': 1.610140323638916, 'learning_rate': 0.0002568670576735093, 'epoch': 0.36}
+{'loss': 0.738, 'grad_norm': 3.816253662109375, 'learning_rate': 0.00025684261974584554, 'epoch': 0.36}
+{'loss': 0.6391, 'grad_norm': 2.6387505531311035, 'learning_rate': 0.0002568181818181818, 'epoch': 0.36}
+{'loss': 1.1959, 'grad_norm': 2.228098154067993, 'learning_rate': 0.0002567937438905181, 'epoch': 0.36}
+{'loss': 0.9127, 'grad_norm': 2.4509754180908203, 'learning_rate': 0.00025676930596285434, 'epoch': 0.36}
+{'loss': 1.3263, 'grad_norm': 2.0629775524139404, 'learning_rate': 0.0002567448680351906, 'epoch': 0.36}
+{'loss': 1.176, 'grad_norm': 1.7958539724349976, 'learning_rate': 0.0002567204301075269, 'epoch': 0.36}
+{'loss': 0.7305, 'grad_norm': 1.8170077800750732, 'learning_rate': 0.0002566959921798631, 'epoch': 0.36}
+{'loss': 1.1738, 'grad_norm': 2.358222007751465, 'learning_rate': 0.0002566715542521994, 'epoch': 0.36}
+{'loss': 1.1321, 'grad_norm': 1.7362068891525269, 'learning_rate': 0.00025664711632453565, 'epoch': 0.36}
+{'loss': 1.4302, 'grad_norm': 1.7529265880584717, 'learning_rate': 0.0002566226783968719, 'epoch': 0.36}
+{'loss': 0.9679, 'grad_norm': 2.1040377616882324, 'learning_rate': 0.0002565982404692082, 'epoch': 0.36}
+{'loss': 0.93, 'grad_norm': 3.2066938877105713, 'learning_rate': 0.00025657380254154446, 'epoch': 0.36}
+{'loss': 1.1485, 'grad_norm': 1.855434536933899, 'learning_rate': 0.0002565493646138807, 'epoch': 0.36}
+{'loss': 1.2011, 'grad_norm': 1.5777641534805298, 'learning_rate': 0.00025652492668621696, 'epoch': 0.36}
+{'loss': 1.5703, 'grad_norm': 2.2551398277282715, 'learning_rate': 0.00025650048875855327, 'epoch': 0.36}
+{'loss': 1.4385, 'grad_norm': 2.300269365310669, 'learning_rate': 0.0002564760508308895, 'epoch': 0.36}
+{'loss': 1.1124, 'grad_norm': 1.81515634059906, 'learning_rate': 0.00025645161290322577, 'epoch': 0.36}
+{'loss': 1.3713, 'grad_norm': 1.9418299198150635, 'learning_rate': 0.0002564271749755621, 'epoch': 0.36}
+{'loss': 1.1791, 'grad_norm': 2.397949695587158, 'learning_rate': 0.00025640273704789833, 'epoch': 0.36}
+{'loss': 1.7395, 'grad_norm': 2.4161155223846436, 'learning_rate': 0.0002563782991202346, 'epoch': 0.36}
+{'loss': 0.9496, 'grad_norm': 2.1113173961639404, 'learning_rate': 0.0002563538611925709, 'epoch': 0.36}
+{'loss': 1.4005, 'grad_norm': 1.8949174880981445, 'learning_rate': 0.0002563294232649071, 'epoch': 0.36}
+{'loss': 1.3552, 'grad_norm': 3.1635854244232178, 'learning_rate': 0.0002563049853372434, 'epoch': 0.36}
+{'loss': 1.6418, 'grad_norm': 1.8511496782302856, 'learning_rate': 0.00025628054740957964, 'epoch': 0.36}
+{'loss': 0.8151, 'grad_norm': 2.0520694255828857, 'learning_rate': 0.0002562561094819159, 'epoch': 0.36}
+{'loss': 0.7499, 'grad_norm': 2.5839831829071045, 'learning_rate': 0.0002562316715542522, 'epoch': 0.36}
+{'loss': 0.8066, 'grad_norm': 1.368067979812622, 'learning_rate': 0.00025620723362658845, 'epoch': 0.36}
+{'loss': 1.376, 'grad_norm': 2.371605634689331, 'learning_rate': 0.0002561827956989247, 'epoch': 0.36}
+{'loss': 1.2232, 'grad_norm': 2.2831456661224365, 'learning_rate': 0.000256158357771261, 'epoch': 0.36}
+{'loss': 0.3244, 'grad_norm': 0.44485318660736084, 'learning_rate': 0.0002561339198435972, 'epoch': 0.36}
+{'loss': 0.5188, 'grad_norm': 0.7267107367515564, 'learning_rate': 0.0002561094819159335, 'epoch': 0.36}
+{'loss': 0.404, 'grad_norm': 0.7034205794334412, 'learning_rate': 0.00025608504398826976, 'epoch': 0.36}
+{'loss': 0.4213, 'grad_norm': 0.8780544996261597, 'learning_rate': 0.000256060606060606, 'epoch': 0.36}
+{'loss': 0.387, 'grad_norm': 0.5896530747413635, 'learning_rate': 0.0002560361681329423, 'epoch': 0.36}
+{'loss': 0.3646, 'grad_norm': 0.7313452959060669, 'learning_rate': 0.00025601173020527857, 'epoch': 0.36}
+{'loss': 0.4382, 'grad_norm': 0.7097526788711548, 'learning_rate': 0.0002559872922776148, 'epoch': 0.36}
+{'loss': 0.4068, 'grad_norm': 0.685795247554779, 'learning_rate': 0.0002559628543499511, 'epoch': 0.36}
+{'loss': 0.4213, 'grad_norm': 0.6047770380973816, 'learning_rate': 0.0002559384164222874, 'epoch': 0.36}
+ 18%|█▊        | 2310/12776 [23:49<2:02:07,  1.43it/s] 18%|█▊        | 2311/12776 [23:50<1:58:38,  1.47it/s]                                                       18%|█▊        | 2311/12776 [23:50<1:58:38,  1.47it/s] 18%|█▊        | 2312/12776 [23:51<1:51:02,  1.57it/s]                                                       18%|█▊        | 2312/12776 [23:51<1:51:02,  1.57it/s] 18%|█▊        | 2313/12776 [23:51<1:49:07,  1.60it/s]                                                       18%|█▊        | 2313/12776 [23:51<1:49:07,  1.60it/s] 18%|█▊        | 2314/12776 [23:52<1:41:58,  1.71it/s]                                                       18%|█▊        | 2314/12776 [23:52<1:41:58,  1.71it/s] 18%|█▊        | 2315/12776 [23:52<1:39:05,  1.76it/s]                                                       18%|█▊        | 2315/12776 [23:52<1:39:05,  1.76it/s] 18%|█▊        | 2316/12776 [23:53<1:32:32,  1.88it/s]                                                       18%|█▊        | 2316/12776 [23:53<1:32:32,  1.88it/s] 18%|█▊        | 2317/12776 [23:53<1:31:44,  1.90it/s]                                                       18%|█▊        | 2317/12776 [23:53<1:31:44,  1.90it/s] 18%|█▊        | 2318/12776 [23:54<1:25:49,  2.03it/s]                                                       18%|█▊        | 2318/12776 [23:54<1:25:49,  2.03it/s] 18%|█▊        | 2319/12776 [23:54<1:20:53,  2.15it/s]                                                       18%|█▊        | 2319/12776 [23:54<1:20:53,  2.15it/s] 18%|█▊        | 2320/12776 [23:54<1:23:25,  2.09it/s]                                                       18%|█▊        | 2320/12776 [23:54<1:23:25,  2.09it/s] 18%|█▊        | 2321/12776 [23:55<1:17:02,  2.26it/s]                                                       18%|█▊        | 2321/12776 [23:55<1:17:02,  2.26it/s] 18%|█▊        | 2322/12776 [23:55<1:12:09,  2.41it/s]                                                       18%|█▊        | 2322/12776 [23:55<1:12:09,  2.41it/s] 18%|█▊        | 2323/12776 [23:56<1:10:53,  2.46it/s]                                                       18%|█▊        | 2323/12776 [23:56<1:10:53,  2.46it/s] 18%|█▊        | 2324/12776 [23:56<1:06:38,  2.61it/s]                                                       18%|█▊        | 2324/12776 [23:56<1:06:38,  2.61it/s] 18%|█▊        | 2325/12776 [23:56<1:03:07,  2.76it/s]                                                       18%|█▊        | 2325/12776 [23:56<1:03:07,  2.76it/s] 18%|█▊        | 2326/12776 [23:57<1:02:19,  2.79it/s]                                                       18%|█▊        | 2326/12776 [23:57<1:02:19,  2.79it/s] 18%|█▊        | 2327/12776 [23:57<58:57,  2.95it/s]                                                       18%|█▊        | 2327/12776 [23:57<58:57,  2.95it/s] 18%|█▊        | 2328/12776 [23:57<55:48,  3.12it/s]                                                     18%|█▊        | 2328/12776 [23:57<55:48,  3.12it/s] 18%|█▊        | 2329/12776 [23:57<52:57,  3.29it/s]                                                     18%|█▊        | 2329/12776 [23:57<52:57,  3.29it/s] 18%|█▊        | 2330/12776 [23:58<53:37,  3.25it/s]                                                     18%|█▊        | 2330/12776 [23:58<53:37,  3.25it/s] 18%|█▊        | 2331/12776 [23:58<50:18,  3.46it/s]                                                     18%|█▊        | 2331/12776 [23:58<50:18,  3.46it/s] 18%|█▊        | 2332/12776 [23:58<47:57,  3.63it/s]                                                     18%|█▊        | 2332/12776 [23:58<47:57,  3.63it/s] 18%|█▊        | 2333/12776 [23:58<46:13,  3.76it/s]                                                     18%|█▊        | 2333/12776 [23:58<46:13,  3.76it/s] 18%|█▊        | 2334/12776 [23:59<44:49,  3.88it/s]                                                     18%|█▊        | 2334/12776 [23:59<44:49,  3.88it/s] 18%|█▊        | 2335/12776 [23:59<46:28,  3.74it/s]                                                     18%|█▊        | 2335/12776 [23:59<46:28,  3.74it/s] 18%|█▊        | 2336/12776 [23:59<44:14,  3.93it/s]                                                     18%|█▊        | 2336/12776 [23:59<44:14,  3.93it/s] 18%|█▊        | 2337/12776 [23:59<42:12,  4.12it/s]                                                     18%|█▊        | 2337/12776 [23:59<42:12,  4.12it/s] 18%|█▊        | 2338/12776 [24:00<40:35,  4.29it/s]                                                     18%|█▊        | 2338/12776 [24:00<40:35,  4.29it/s] 18%|█▊        | 2339/12776 [24:00<39:14,  4.43it/s]                                                     18%|█▊        | 2339/12776 [24:00<39:14,  4.43it/s] 18%|█▊        | 2340/12776 [24:00<43:20,  4.01it/s]                                                     18%|█▊        | 2340/12776 [24:00<43:20,  4.01it/s] 18%|█▊        | 2341/12776 [24:00<40:54,  4.25it/s]                                                     18%|█▊        | 2341/12776 [24:00<40:54,  4.25it/s] 18%|█▊        | 2342/12776 [24:01<39:07,  4.45it/s]                                                     18%|█▊        | 2342/12776 [24:01<39:07,  4.45it/s] 18%|█▊        | 2343/12776 [24:01<37:53,  4.59it/s]                                                     18%|█▊        | 2343/12776 [24:01<37:53,  4.59it/s] 18%|█▊        | 2344/12776 [24:01<36:49,  4.72it/s]                                                     18%|█▊        | 2344/12776 [24:01<36:49,  4.72it/s] 18%|█▊        | 2345/12776 [24:01<42:20,  4.11it/s]                                                     18%|█▊        | 2345/12776 [24:01<42:20,  4.11it/s] 18%|█▊        | 2346/12776 [24:01<39:46,  4.37it/s]                                                     18%|█▊        | 2346/12776 [24:01<39:46,  4.37it/s] 18%|█▊        | 2347/12776 [24:02<37:59,  4.57it/s]                                                     18%|█▊        | 2347/12776 [24:02<37:59,  4.57it/s] 18%|█▊        | 2348/12776 [24:02<36:22,  4.78it/s]                                                     18%|█▊        | 2348/12776 [24:02<36:22,  4.78it/s] 18%|█▊        | 2349/12776 [24:02<35:21,  4.91it/s]                                                     18%|█▊        | 2349/12776 [24:02<35:21,  4.91it/s] 18%|█▊        | 2350/12776 [24:03<59:32,  2.92it/s]                                                     18%|█▊        | 2350/12776 [24:03<59:32,  2.92it/s] 18%|█▊        | 2351/12776 [24:04<1:49:39,  1.58it/s]                                                       18%|█▊        | 2351/12776 [24:04<1:49:39,  1.58it/s] 18%|█▊        | 2352/12776 [24:05<2:06:48,  1.37it/s]                                                       18%|█▊        | 2352/12776 [24:05<2:06:48,  1.37it/s] 18%|█▊        | 2353/12776 [24:06<2:10:37,  1.33it/s]                                                       18%|█▊        | 2353/12776 [24:06<2:10:37,  1.33it/s] 18%|█▊        | 2354/12776 [24:07<2:11:03,  1.33it/s]                                                       18%|█▊        | 2354/12776 [24:07<2:11:03,  1.33it/s] 18%|█▊        | 2355/12776 [24:07<2:13:47,  1.30it/s]                                                       18%|█▊        | 2355/12776 [24:07<2:13:47,  1.30it/s] 18%|█▊        | 2356/12776 [24:08<2:11:14,  1.32it/s]                                                       18%|█▊        | 2356/12776 [24:08<2:11:14,  1.32it/s] 18%|█▊        | 2357/12776 [24:09<2:06:37,  1.37it/s]                                                       18%|█▊        | 2357/12776 [24:09<2:06:37,  1.37it/s] 18%|█▊        | 2358/12776 [24:09<2:08:00,  1.36it/s]                                                       18%|█▊        | 2358/12776 [24:09<2:08:00,  1.36it/s] 18%|█▊        | 2359/12776 [24:10<2:00:11,  1.44it/s]                                                       18%|█▊        | 2359/12776 [24:10<2:00:11,  1.44it/s] 18%|█▊        | 2360/12776 [24:11<1:56:38,  1.49it/s]                                                       18%|█▊        | 2360/12776 [24:11<1:56:38,  1.49it/s] 18%|█▊        | 2361/12776 [24:11<1:50:02,  1.58it/s]                                                       18%|█▊        | 2361/12776 [24:11<1:50:02,  1.58it/s] 18%|█▊        | 2362/12776 [24:12<1:48:54,  1.59it/s]                                                       18%|█▊        | 2362/12776 [24:12<1:48:54,  1.59it/s] 18%|█▊        | 2363/12776 [24:12<1:42:17,  1.70it/s]                                                       18%|█▊        | 2363/12776 [24:12<1:42:17,  1.70it/s] 19%|█▊        | 2364/12776 [24:13<1:38:31,  1.76it/s]                                                       19%|█▊        | 2364/12776 [24:13<1:38:31,  1.76it/s] 19%|█▊        | 2365/12776 [24:13<1:32:30,  1.88it/s]                                                       19%|█▊        | 2365/12776 [24:13<1:32:30,  1.88it/s] 19%|█▊        | 2366/12776 [24:14<1:30:52,  1.91it/s]                                                       19%|█▊        | 2366/12776 [24:14<1:30:52,  1.91it/s] 19%|█▊        | 2367/12776 [24:14<1:25:55,  2.02it/s]                                                       19%|█▊        | 2367/12776 [24:14<1:25:55,  2.02it/s] 19%|█▊        | 2368/12776 [24:15<1:21:25,  2.13it/s]                                                       19%|█▊        | 2368/12776 [24:15<1:21:25,  2.13it/s] 19%|█▊        | 2369/12776 [24:15<1:20:35,  2.15it/s]                                                       19%|█▊        | 2369/12776 [24:15<1:20:35,  2.15it/s] 19%|█▊        | 2370/12776 [24:15<1:16:19,  2.27it/s]                                                       19%|█▊        | 2370/12776 [24:15<1:16:19,  2.27it/s] 19%|█▊        | 2371/12776 [24:16<1:12:31,  2.39it/s]                                                       19%|█▊        | 2371/12776 [24:16<1:12:31,  2.39it/s] 19%|█▊        | 2372/12776 [24:16<1:13:29,  2.36it/s]                                                       19%|█▊        | 2372/12776 [24:16<1:13:29,  2.36it/s] 19%|█▊        | 2373/12776 [24:17<1:09:31,  2.49it/s]                                                       19%|█▊        | 2373/12776 [24:17<1:09:31,  2.49it/s] 19%|█▊        | 2374/12776 [24:17<1:06:05,  2.62it/s]                                                       19%|█▊        | 2374/12776 [24:17<1:06:05,  2.62it/s] 19%|█▊        | 2375/12776 [24:17<1:05:54,  2.63it/s]                                                       19%|█▊        | 2375/12776 [24:17<1:05:54,  2.63it/s] 19%|█▊        | 2376/12776 [24:18<1:01:50,  2.80it/s]                                                       19%|█▊        | 2376/12776 [24:18<1:01:50,  2.80it/s] 19%|█▊        | 2377/12776 [24:18<58:36,  2.96it/s]                                                       19%|█▊        | 2377/12776 [24:18<58:36,  2.96it/s] 19%|█▊        | 2378/12776 [24:18<1:00:12,  2.88it/s]                                                       19%|█▊        | 2378/12776 [24:18<1:00:12,  2.88it/s] 19%|█▊        | 2379/12776 [24:19<56:51,  3.05it/s]                                                       19%|█▊        | 2379/12776 [24:19<56:51,  3.05it/s] 19%|█▊        | 2380/12776 [24:19<53:53,  3.21it/s]                                                     19%|█▊        | 2380/12776 [24:19<53:53,  3.21it/s] 19%|█▊        | 2381/12776 [24:19<51:17,  3.38it/s]                                                     19%|█▊        | 2381/12776 [24:19<51:17,  3.38it/s] 19%|█▊        | 2382/12776 [24:19<52:40,  3.29it/s]                                                     19%|█▊        | 2382/12776 [24:19<52:40,  3.29it/s] 19%|█▊        | 2383/12776 [24:20<49:43,  3.48it/s]                                                     19%|█▊        | 2383/12776 [24:20<49:43,  3.48it/s] 19%|█▊        | 2384/12776 [24:20<47:19,  3.66it/s]                                                     19%|█▊        | 2384/12776 [24:20<47:19,  3.66it/s] 19%|█▊        | 2385/12776 [24:20<45:10,  3.83it/s]                                                     19%|█▊        | 2385/12776 [24:20<45:10,  3.83it/s] 19%|█▊        | 2386/12776 [24:20<43:36,  3.97it/s]                                                     19%|█▊        | 2386/12776 [24:20<43:36,  3.97it/s] 19%|█▊        | 2387/12776 [24:21<45:55,  3.77it/s]                                                     19%|█▊        | 2387/12776 [24:21<45:55,  3.77it/s] 19%|█▊        | 2388/12776 [24:21<43:18,  4.00it/s]                                                    {'loss': 3.0132, 'grad_norm': 13.313465118408203, 'learning_rate': 0.0002559139784946236, 'epoch': 0.36}
+{'loss': 0.396, 'grad_norm': 1.098598599433899, 'learning_rate': 0.0002558895405669599, 'epoch': 0.36}
+{'loss': 0.4162, 'grad_norm': 0.8045850396156311, 'learning_rate': 0.0002558651026392962, 'epoch': 0.36}
+{'loss': 0.7236, 'grad_norm': 1.3596242666244507, 'learning_rate': 0.00025584066471163243, 'epoch': 0.36}
+{'loss': 0.4784, 'grad_norm': 1.0989056825637817, 'learning_rate': 0.0002558162267839687, 'epoch': 0.36}
+{'loss': 0.9534, 'grad_norm': 5.293839931488037, 'learning_rate': 0.000255791788856305, 'epoch': 0.36}
+{'loss': 0.7846, 'grad_norm': 0.8783222436904907, 'learning_rate': 0.0002557673509286412, 'epoch': 0.36}
+{'loss': 0.5091, 'grad_norm': 1.2049531936645508, 'learning_rate': 0.0002557429130009775, 'epoch': 0.36}
+{'loss': 0.5985, 'grad_norm': 1.0293569564819336, 'learning_rate': 0.00025571847507331374, 'epoch': 0.36}
+{'loss': 0.7279, 'grad_norm': 1.8985306024551392, 'learning_rate': 0.00025569403714565, 'epoch': 0.36}
+{'loss': 0.6954, 'grad_norm': 1.2464323043823242, 'learning_rate': 0.0002556695992179863, 'epoch': 0.36}
+{'loss': 0.461, 'grad_norm': 0.8314011096954346, 'learning_rate': 0.00025564516129032255, 'epoch': 0.36}
+{'loss': 0.719, 'grad_norm': 1.2355091571807861, 'learning_rate': 0.0002556207233626588, 'epoch': 0.36}
+{'loss': 0.5286, 'grad_norm': 1.9257937669754028, 'learning_rate': 0.0002555962854349951, 'epoch': 0.36}
+{'loss': 0.709, 'grad_norm': 1.4777895212173462, 'learning_rate': 0.00025557184750733136, 'epoch': 0.36}
+{'loss': 0.936, 'grad_norm': 2.033358573913574, 'learning_rate': 0.0002555474095796676, 'epoch': 0.36}
+{'loss': 1.2012, 'grad_norm': 2.625703811645508, 'learning_rate': 0.00025552297165200386, 'epoch': 0.36}
+{'loss': 0.7433, 'grad_norm': 2.080801486968994, 'learning_rate': 0.00025549853372434017, 'epoch': 0.36}
+{'loss': 0.8065, 'grad_norm': 1.9644887447357178, 'learning_rate': 0.0002554740957966764, 'epoch': 0.36}
+{'loss': 1.1196, 'grad_norm': 3.320422410964966, 'learning_rate': 0.00025544965786901267, 'epoch': 0.36}
+{'loss': 0.8819, 'grad_norm': 1.20960533618927, 'learning_rate': 0.000255425219941349, 'epoch': 0.36}
+{'loss': 0.5572, 'grad_norm': 1.6307368278503418, 'learning_rate': 0.0002554007820136852, 'epoch': 0.36}
+{'loss': 1.4483, 'grad_norm': 1.9175331592559814, 'learning_rate': 0.0002553763440860215, 'epoch': 0.37}
+{'loss': 1.2187, 'grad_norm': 3.3411760330200195, 'learning_rate': 0.00025535190615835773, 'epoch': 0.37}
+{'loss': 1.3473, 'grad_norm': 2.1116483211517334, 'learning_rate': 0.000255327468230694, 'epoch': 0.37}
+{'loss': 0.8682, 'grad_norm': 3.1497905254364014, 'learning_rate': 0.0002553030303030303, 'epoch': 0.37}
+{'loss': 1.1501, 'grad_norm': 2.2446794509887695, 'learning_rate': 0.00025527859237536654, 'epoch': 0.37}
+{'loss': 1.2884, 'grad_norm': 1.977690577507019, 'learning_rate': 0.0002552541544477028, 'epoch': 0.37}
+{'loss': 1.3231, 'grad_norm': 2.598881721496582, 'learning_rate': 0.0002552297165200391, 'epoch': 0.37}
+{'loss': 0.86, 'grad_norm': 2.4923884868621826, 'learning_rate': 0.00025520527859237535, 'epoch': 0.37}
+{'loss': 1.0986, 'grad_norm': 2.453099250793457, 'learning_rate': 0.0002551808406647116, 'epoch': 0.37}
+{'loss': 1.8547, 'grad_norm': 3.555971384048462, 'learning_rate': 0.00025515640273704785, 'epoch': 0.37}
+{'loss': 1.1017, 'grad_norm': 1.9464694261550903, 'learning_rate': 0.00025513196480938415, 'epoch': 0.37}
+{'loss': 1.7284, 'grad_norm': 3.1742851734161377, 'learning_rate': 0.0002551075268817204, 'epoch': 0.37}
+{'loss': 1.1764, 'grad_norm': 1.5996363162994385, 'learning_rate': 0.00025508308895405666, 'epoch': 0.37}
+{'loss': 1.5985, 'grad_norm': 3.3336408138275146, 'learning_rate': 0.00025505865102639296, 'epoch': 0.37}
+{'loss': 1.2361, 'grad_norm': 1.3342739343643188, 'learning_rate': 0.0002550342130987292, 'epoch': 0.37}
+{'loss': 0.6936, 'grad_norm': 2.0389742851257324, 'learning_rate': 0.00025500977517106546, 'epoch': 0.37}
+{'loss': 0.6738, 'grad_norm': 1.7741936445236206, 'learning_rate': 0.00025498533724340177, 'epoch': 0.37}
+{'loss': 1.4255, 'grad_norm': 2.689279079437256, 'learning_rate': 0.00025496089931573797, 'epoch': 0.37}
+{'loss': 0.8265, 'grad_norm': 1.4633771181106567, 'learning_rate': 0.00025493646138807427, 'epoch': 0.37}
+{'loss': 0.3965, 'grad_norm': 0.6005833148956299, 'learning_rate': 0.0002549120234604105, 'epoch': 0.37}
+{'loss': 0.4102, 'grad_norm': 0.5253506898880005, 'learning_rate': 0.0002548875855327468, 'epoch': 0.37}
+{'loss': 0.2965, 'grad_norm': 0.582460880279541, 'learning_rate': 0.0002548631476050831, 'epoch': 0.37}
+{'loss': 0.3247, 'grad_norm': 0.8253412246704102, 'learning_rate': 0.00025483870967741933, 'epoch': 0.37}
+{'loss': 0.4133, 'grad_norm': 0.6425923705101013, 'learning_rate': 0.0002548142717497556, 'epoch': 0.37}
+{'loss': 0.4947, 'grad_norm': 1.085408091545105, 'learning_rate': 0.00025478983382209183, 'epoch': 0.37}
+{'loss': 0.6325, 'grad_norm': 1.145398497581482, 'learning_rate': 0.00025476539589442814, 'epoch': 0.37}
+{'loss': 0.4053, 'grad_norm': 0.7742732167243958, 'learning_rate': 0.0002547409579667644, 'epoch': 0.37}
+{'loss': 0.7099, 'grad_norm': 1.4050893783569336, 'learning_rate': 0.00025471652003910064, 'epoch': 0.37}
+{'loss': 0.4195, 'grad_norm': 0.5724520683288574, 'learning_rate': 0.00025469208211143695, 'epoch': 0.37}
+{'loss': 0.6613, 'grad_norm': 1.2148267030715942, 'learning_rate': 0.0002546676441837732, 'epoch': 0.37}
+{'loss': 0.5106, 'grad_norm': 1.6140097379684448, 'learning_rate': 0.00025464320625610945, 'epoch': 0.37}
+{'loss': 0.6509, 'grad_norm': 1.0580832958221436, 'learning_rate': 0.00025461876832844575, 'epoch': 0.37}
+{'loss': 0.4407, 'grad_norm': 1.0968374013900757, 'learning_rate': 0.00025459433040078195, 'epoch': 0.37}
+{'loss': 0.8746, 'grad_norm': 1.8869554996490479, 'learning_rate': 0.00025456989247311826, 'epoch': 0.37}
+{'loss': 0.4115, 'grad_norm': 1.1201070547103882, 'learning_rate': 0.0002545454545454545, 'epoch': 0.37}
+{'loss': 0.8105, 'grad_norm': 1.391060709953308, 'learning_rate': 0.00025452101661779076, 'epoch': 0.37}
+{'loss': 0.5989, 'grad_norm': 1.707555890083313, 'learning_rate': 0.00025449657869012707, 'epoch': 0.37}
+{'loss': 0.7763, 'grad_norm': 1.1574084758758545, 'learning_rate': 0.0002544721407624633, 'epoch': 0.37}
+{'loss': 1.369, 'grad_norm': 4.23545503616333, 'learning_rate': 0.00025444770283479957, 'epoch': 0.37}
+{'loss': 0.5819, 'grad_norm': 1.9482706785202026, 'learning_rate': 0.0002544232649071359, 'epoch': 0.37}
+{'loss': 1.0075, 'grad_norm': 1.2901068925857544, 'learning_rate': 0.0002543988269794721, 'epoch': 0.37}
+{'loss': 0.5711, 'grad_norm': 1.395151138305664, 'learning_rate': 0.0002543743890518084, 'epoch': 0.37}
+{'loss': 0.7202, 'grad_norm': 1.3401544094085693, 'learning_rate': 0.0002543499511241446, 'epoch': 0.37}
+{'loss': 0.7058, 'grad_norm': 0.9693850874900818, 'learning_rate': 0.00025432551319648093, 'epoch': 0.37}
+{'loss': 0.6258, 'grad_norm': 1.058971881866455, 'learning_rate': 0.0002543010752688172, 'epoch': 0.37}
+{'loss': 0.741, 'grad_norm': 2.7098138332366943, 'learning_rate': 0.00025427663734115343, 'epoch': 0.37}
+{'loss': 0.7096, 'grad_norm': 1.293344259262085, 'learning_rate': 0.00025425219941348974, 'epoch': 0.37}
+{'loss': 0.9657, 'grad_norm': 6.409755706787109, 'learning_rate': 0.000254227761485826, 'epoch': 0.37}
+{'loss': 0.8149, 'grad_norm': 3.3196206092834473, 'learning_rate': 0.00025420332355816224, 'epoch': 0.37}
+{'loss': 1.0022, 'grad_norm': 1.7499516010284424, 'learning_rate': 0.0002541788856304985, 'epoch': 0.37}
+{'loss': 0.8736, 'grad_norm': 2.6857728958129883, 'learning_rate': 0.00025415444770283475, 'epoch': 0.37}
+{'loss': 0.6507, 'grad_norm': 2.264000415802002, 'learning_rate': 0.00025413000977517105, 'epoch': 0.37}
+{'loss': 0.8562, 'grad_norm': 2.5130653381347656, 'learning_rate': 0.0002541055718475073, 'epoch': 0.37}
+{'loss': 1.6511, 'grad_norm': 6.680568218231201, 'learning_rate': 0.00025408113391984355, 'epoch': 0.37}
+{'loss': 1.1887, 'grad_norm': 3.2456836700439453, 'learning_rate': 0.00025405669599217986, 'epoch': 0.37}
+{'loss': 1.1949, 'grad_norm': 2.358682632446289, 'learning_rate': 0.0002540322580645161, 'epoch': 0.37}
+ 19%|█▊        | 2388/12776 [24:21<43:18,  4.00it/s] 19%|█▊        | 2389/12776 [24:21<41:22,  4.18it/s]                                                     19%|█▊        | 2389/12776 [24:21<41:22,  4.18it/s] 19%|█▊        | 2390/12776 [24:21<40:01,  4.33it/s]                                                     19%|█▊        | 2390/12776 [24:21<40:01,  4.33it/s] 19%|█▊        | 2391/12776 [24:22<38:51,  4.45it/s]                                                     19%|█▊        | 2391/12776 [24:22<38:51,  4.45it/s] 19%|█▊        | 2392/12776 [24:22<42:30,  4.07it/s]                                                     19%|█▊        | 2392/12776 [24:22<42:30,  4.07it/s] 19%|█▊        | 2393/12776 [24:22<40:26,  4.28it/s]                                                     19%|█▊        | 2393/12776 [24:22<40:26,  4.28it/s] 19%|█▊        | 2394/12776 [24:22<38:44,  4.47it/s]                                                     19%|█▊        | 2394/12776 [24:22<38:44,  4.47it/s] 19%|█▊        | 2395/12776 [24:22<37:27,  4.62it/s]                                                     19%|█▊        | 2395/12776 [24:22<37:27,  4.62it/s] 19%|█▉        | 2396/12776 [24:23<36:21,  4.76it/s]                                                     19%|█▉        | 2396/12776 [24:23<36:21,  4.76it/s] 19%|█▉        | 2397/12776 [24:23<40:52,  4.23it/s]                                                     19%|█▉        | 2397/12776 [24:23<40:52,  4.23it/s] 19%|█▉        | 2398/12776 [24:23<38:18,  4.51it/s]                                                     19%|█▉        | 2398/12776 [24:23<38:18,  4.51it/s] 19%|█▉        | 2399/12776 [24:23<36:33,  4.73it/s]                                                     19%|█▉        | 2399/12776 [24:23<36:33,  4.73it/s] 19%|█▉        | 2400/12776 [24:24<1:06:34,  2.60it/s]                                                       19%|█▉        | 2400/12776 [24:24<1:06:34,  2.60it/s]Saving model checkpoint to ./checkpoint-2400
+Configuration saved in ./checkpoint-2400/config.json
+Model weights saved in ./checkpoint-2400/model.safetensors
+Feature extractor saved in ./checkpoint-2400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-2400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-2400/special_tokens_map.json
+added tokens file saved in ./checkpoint-2400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-1200] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 19%|█▉        | 2401/12776 [24:31<6:41:27,  2.32s/it]                                                       19%|█▉        | 2401/12776 [24:31<6:41:27,  2.32s/it] 19%|█▉        | 2402/12776 [24:32<5:34:16,  1.93s/it]                                                       19%|█▉        | 2402/12776 [24:32<5:34:16,  1.93s/it] 19%|█▉        | 2403/12776 [24:33<4:38:42,  1.61s/it]                                                       19%|█▉        | 2403/12776 [24:33<4:38:42,  1.61s/it] 19%|█▉        | 2404/12776 [24:34<3:56:18,  1.37s/it]                                                       19%|█▉        | 2404/12776 [24:34<3:56:18,  1.37s/it] 19%|█▉        | 2405/12776 [24:35<3:29:45,  1.21s/it]                                                       19%|█▉        | 2405/12776 [24:35<3:29:45,  1.21s/it] 19%|█▉        | 2406/12776 [24:35<3:02:26,  1.06s/it]                                                       19%|█▉        | 2406/12776 [24:35<3:02:26,  1.06s/it] 19%|█▉        | 2407/12776 [24:36<2:40:18,  1.08it/s]                                                       19%|█▉        | 2407/12776 [24:36<2:40:18,  1.08it/s] 19%|█▉        | 2408/12776 [24:36<2:26:56,  1.18it/s]                                                       19%|█▉        | 2408/12776 [24:36<2:26:56,  1.18it/s] 19%|█▉        | 2409/12776 [24:37<2:11:54,  1.31it/s]                                                       19%|█▉        | 2409/12776 [24:37<2:11:54,  1.31it/s] 19%|█▉        | 2410/12776 [24:38<2:03:33,  1.40it/s]                                                       19%|█▉        | 2410/12776 [24:38<2:03:33,  1.40it/s] 19%|█▉        | 2411/12776 [24:38<1:53:26,  1.52it/s]                                                       19%|█▉        | 2411/12776 [24:38<1:53:26,  1.52it/s] 19%|█▉        | 2412/12776 [24:39<1:51:36,  1.55it/s]                                                       19%|█▉        | 2412/12776 [24:39<1:51:36,  1.55it/s] 19%|█▉        | 2413/12776 [24:39<1:42:23,  1.69it/s]                                                       19%|█▉        | 2413/12776 [24:39<1:42:23,  1.69it/s] 19%|█▉        | 2414/12776 [24:40<1:34:28,  1.83it/s]                                                       19%|█▉        | 2414/12776 [24:40<1:34:28,  1.83it/s] 19%|█▉        | 2415/12776 [24:40<1:30:07,  1.92it/s]                                                       19%|█▉        | 2415/12776 [24:40<1:30:07,  1.92it/s] 19%|█▉        | 2416/12776 [24:41<1:23:43,  2.06it/s]                                                       19%|█▉        | 2416/12776 [24:41<1:23:43,  2.06it/s] 19%|█▉        | 2417/12776 [24:41<1:24:56,  2.03it/s]                                                       19%|█▉        | 2417/12776 [24:41<1:24:56,  2.03it/s] 19%|█▉        | 2418/12776 [24:41<1:18:44,  2.19it/s]                                                       19%|█▉        | 2418/12776 [24:41<1:18:44,  2.19it/s] 19%|█▉        | 2419/12776 [24:42<1:13:45,  2.34it/s]                                                       19%|█▉        | 2419/12776 [24:42<1:13:45,  2.34it/s] 19%|█▉        | 2420/12776 [24:42<1:13:53,  2.34it/s]                                                       19%|█▉        | 2420/12776 [24:42<1:13:53,  2.34it/s] 19%|█▉        | 2421/12776 [24:43<1:08:44,  2.51it/s]                                                       19%|█▉        | 2421/12776 [24:43<1:08:44,  2.51it/s] 19%|█▉        | 2422/12776 [24:43<1:04:50,  2.66it/s]                                                       19%|█▉        | 2422/12776 [24:43<1:04:50,  2.66it/s] 19%|█▉        | 2423/12776 [24:43<1:01:38,  2.80it/s]                                                       19%|█▉        | 2423/12776 [24:43<1:01:38,  2.80it/s] 19%|█▉        | 2424/12776 [24:44<1:00:45,  2.84it/s]                                                       19%|█▉        | 2424/12776 [24:44<1:00:45,  2.84it/s] 19%|█▉        | 2425/12776 [24:44<58:33,  2.95it/s]                                                       19%|█▉        | 2425/12776 [24:44<58:33,  2.95it/s] 19%|█▉        | 2426/12776 [24:44<55:26,  3.11it/s]                                                     19%|█▉        | 2426/12776 [24:44<55:26,  3.11it/s] 19%|█▉        | 2427/12776 [24:44<56:19,  3.06it/s]                                                     19%|█▉        | 2427/12776 [24:44<56:19,  3.06it/s] 19%|█▉        | 2428/12776 [24:45<53:05,  3.25it/s]                                                     19%|█▉        | 2428/12776 [24:45<53:05,  3.25it/s] 19%|█▉        | 2429/12776 [24:45<50:24,  3.42it/s]                                                     19%|█▉        | 2429/12776 [24:45<50:24,  3.42it/s] 19%|█▉        | 2430/12776 [24:45<48:06,  3.58it/s]                                                     19%|█▉        | 2430/12776 [24:45<48:06,  3.58it/s] 19%|█▉        | 2431/12776 [24:46<52:24,  3.29it/s]                                                     19%|█▉        | 2431/12776 [24:46<52:24,  3.29it/s] 19%|█▉        | 2432/12776 [24:46<48:37,  3.54it/s]                                                     19%|█▉        | 2432/12776 [24:46<48:37,  3.54it/s] 19%|█▉        | 2433/12776 [24:46<46:01,  3.75it/s]                                                     19%|█▉        | 2433/12776 [24:46<46:01,  3.75it/s] 19%|█▉        | 2434/12776 [24:46<43:38,  3.95it/s]                                                     19%|█▉        | 2434/12776 [24:46<43:38,  3.95it/s] 19%|█▉        | 2435/12776 [24:46<41:40,  4.14it/s]                                                     19%|█▉        | 2435/12776 [24:46<41:40,  4.14it/s] 19%|█▉        | 2436/12776 [24:47<46:30,  3.71it/s]                                                     19%|█▉        | 2436/12776 [24:47<46:30,  3.71it/s] 19%|█▉        | 2437/12776 [24:47<42:54,  4.02it/s]                                                     19%|█▉        | 2437/12776 [24:47<42:54,  4.02it/s] 19%|█▉        | 2438/12776 [24:47<40:06,  4.30it/s]                                                     19%|█▉        | 2438/12776 [24:47<40:06,  4.30it/s] 19%|█▉        | 2439/12776 [24:47<37:53,  4.55it/s]                                                     19%|█▉        | 2439/12776 [24:47<37:53,  4.55it/s] 19%|█▉        | 2440/12776 [24:48<36:16,  4.75it/s]                                                     19%|█▉        | 2440/12776 [24:48<36:16,  4.75it/s] 19%|█▉        | 2441/12776 [24:48<38:42,  4.45it/s]                                                     19%|█▉        | 2441/12776 [24:48<38:42,  4.45it/s] 19%|█▉        | 2442/12776 [24:48<36:28,  4.72it/s]                                                     19%|█▉        | 2442/12776 [24:48<36:28,  4.72it/s] 19%|█▉        | 2443/12776 [24:48<34:52,  4.94it/s]                                                     19%|█▉        | 2443/12776 [24:48<34:52,  4.94it/s] 19%|█▉        | 2444/12776 [24:48<33:34,  5.13it/s]                                                     19%|█▉        | 2444/12776 [24:48<33:34,  5.13it/s] 19%|█▉        | 2445/12776 [24:49<32:32,  5.29it/s]                                                     19%|█▉        | 2445/12776 [24:49<32:32,  5.29it/s] 19%|█▉        | 2446/12776 [24:49<31:33,  5.45it/s]                                                     19%|█▉        | 2446/12776 [24:49<31:33,  5.45it/s] 19%|█▉        | 2447/12776 [24:49<35:34,  4.84it/s]                                                     19%|█▉        | 2447/12776 [24:49<35:34,  4.84it/s] 19%|█▉        | 2448/12776 [24:49<33:19,  5.17it/s]                                                     19%|█▉        | 2448/12776 [24:49<33:19,  5.17it/s] 19%|█▉        | 2449/12776 [24:49<31:44,  5.42it/s]                                                     19%|█▉        | 2449/12776 [24:49<31:44,  5.42it/s] 19%|█▉        | 2450/12776 [24:50<1:04:48,  2.66it/s]                                                       19%|█▉        | 2450/12776 [24:50<1:04:48,  2.66it/s] 19%|█▉        | 2451/12776 [24:52<1:59:48,  1.44it/s]                                                       19%|█▉        | 2451/12776 [24:52<1:59:48,  1.44it/s] 19%|█▉        | 2452/12776 [24:53<2:14:07,  1.28it/s]                                                       19%|█▉        | 2452/12776 [24:53<2:14:07,  1.28it/s] 19%|█▉        | 2453/12776 [24:53<2:19:42,  1.23it/s]                                                       19%|█▉        | 2453/12776 [24:53<2:19:42,  1.23it/s] 19%|█▉        | 2454/12776 [24:54<2:19:52,  1.23it/s]                                                       19%|█▉        | 2454/12776 [24:54<2:19:52,  1.23it/s] 19%|█▉        | 2455/12776 [24:55<2:23:16,  1.20it/s]                                                       19%|█▉        | 2455/12776 [24:55<2:23:16,  1.20it/s] 19%|█▉        | 2456/12776 [24:56<2:16:30,  1.26it/s]                                                       19%|█▉        | 2456/12776 [24:56<2:16:30,  1.26it/s] 19%|█▉        | 2457/12776 [24:57<2:09:12,  1.33it/s]                                                       19%|█▉        | 2457/12776 [24:57<2:09:12,  1.33it/s] 19%|█▉        | 2458/12776 [24:57<2:05:11,  1.37it/s]                                                       19%|█▉        | 2458/12776 [24:57<2:05:11,  1.37it/s] 19%|█▉        | 2459/12776 [24:58<1:57:49,  1.46it/s]                                                       19%|█▉        | 2459/12776 [24:58<1:57:49,  1.46it/s] 19%|█▉        | 2460/12776 [24:58<1:52:15,  1.53it/s]                                                       19%|█▉        | 2460/12776 [24:58<1:52:15,  1.53it/s] 19%|█▉        | 2461/12776 [24:59<1:44:56,  1.64it/s]                                                       19%|█▉        | 2461/12776 [24:59<1:44:56,  1.64it/s] 19%|█▉        | 2462/12776 [24:59<1:38:16,  1.75it/s]                                                       19%|█▉        | 2462/12776 [24:59<1:38:16,  1.75it/s] 19%|█▉        | 2463/12776 [25:00<1:32:49,  1.85it/s]                                                       19%|█▉        | 2463/12776 [25:00<1:32:49,  1.85it/s] 19%|█▉        | 2464/12776 [25:00<1:26:53,  1.98it/s]                                                       19%|█▉        | 2464/12776 [25:00<1:26:53,  1.98it/s] 19%|█▉        | 2465/12776 [25:01<1:24:09,  2.04it/s]                                                       19%|█▉        | 2465/12776 [25:01<1:24:09,  2.04it/s] 19%|█▉        | 2466/12776 [25:01<1:20:47,  2.13it/s]                                                      {'loss': 1.248, 'grad_norm': 1.589746117591858, 'learning_rate': 0.00025400782013685236, 'epoch': 0.37}
+{'loss': 1.4367, 'grad_norm': 3.969412326812744, 'learning_rate': 0.0002539833822091886, 'epoch': 0.37}
+{'loss': 0.6383, 'grad_norm': 1.8342620134353638, 'learning_rate': 0.0002539589442815249, 'epoch': 0.37}
+{'loss': 1.3961, 'grad_norm': 5.95599365234375, 'learning_rate': 0.00025393450635386117, 'epoch': 0.37}
+{'loss': 1.5589, 'grad_norm': 2.8930656909942627, 'learning_rate': 0.0002539100684261974, 'epoch': 0.37}
+{'loss': 1.4962, 'grad_norm': 3.1002862453460693, 'learning_rate': 0.0002538856304985337, 'epoch': 0.37}
+{'loss': 1.2726, 'grad_norm': 4.003942489624023, 'learning_rate': 0.00025386119257087, 'epoch': 0.37}
+{'loss': 1.2946, 'grad_norm': 3.102221965789795, 'learning_rate': 0.00025383675464320623, 'epoch': 0.37}
+{'loss': 1.1978, 'grad_norm': 2.444598913192749, 'learning_rate': 0.00025381231671554253, 'epoch': 0.38}
+{'loss': 0.7367, 'grad_norm': 2.2664988040924072, 'learning_rate': 0.00025378787878787873, 'epoch': 0.38}
+{'loss': 0.4199, 'grad_norm': 1.9962775707244873, 'learning_rate': 0.00025376344086021504, 'epoch': 0.38}
+{'loss': 1.2034, 'grad_norm': 2.8011834621429443, 'learning_rate': 0.0002537390029325513, 'epoch': 0.38}
+{'loss': 1.0624, 'grad_norm': 1.9701213836669922, 'learning_rate': 0.00025371456500488754, 'epoch': 0.38}
+{'loss': 0.3932, 'grad_norm': 0.7180734276771545, 'learning_rate': 0.00025369012707722384, 'epoch': 0.38}
+{'loss': 0.3901, 'grad_norm': 0.5675762295722961, 'learning_rate': 0.0002536656891495601, 'epoch': 0.38}
+{'loss': 0.4672, 'grad_norm': 0.6413562893867493, 'learning_rate': 0.00025364125122189635, 'epoch': 0.38}
+{'loss': 0.6919, 'grad_norm': 0.9517688155174255, 'learning_rate': 0.0002536168132942326, 'epoch': 0.38}
+{'loss': 0.451, 'grad_norm': 0.6282806396484375, 'learning_rate': 0.0002535923753665689, 'epoch': 0.38}
+{'loss': 0.4078, 'grad_norm': 0.590168833732605, 'learning_rate': 0.00025356793743890515, 'epoch': 0.38}
+{'loss': 0.4757, 'grad_norm': 0.7684517502784729, 'learning_rate': 0.0002535434995112414, 'epoch': 0.38}
+{'loss': 0.6356, 'grad_norm': 0.9231551289558411, 'learning_rate': 0.0002535190615835777, 'epoch': 0.38}
+{'loss': 0.4089, 'grad_norm': 0.8362495303153992, 'learning_rate': 0.00025349462365591396, 'epoch': 0.38}
+{'loss': 0.4651, 'grad_norm': 0.8577970862388611, 'learning_rate': 0.0002534701857282502, 'epoch': 0.38}
+{'loss': 0.6993, 'grad_norm': 1.4198194742202759, 'learning_rate': 0.0002534457478005865, 'epoch': 0.38}
+{'loss': 0.7051, 'grad_norm': 1.1189966201782227, 'learning_rate': 0.0002534213098729227, 'epoch': 0.38}
+{'loss': 0.4614, 'grad_norm': 0.8378294706344604, 'learning_rate': 0.000253396871945259, 'epoch': 0.38}
+{'loss': 0.4466, 'grad_norm': 0.8094697594642639, 'learning_rate': 0.0002533724340175953, 'epoch': 0.38}
+{'loss': 0.7824, 'grad_norm': 1.419098138809204, 'learning_rate': 0.0002533479960899315, 'epoch': 0.38}
+{'loss': 0.643, 'grad_norm': 2.8481099605560303, 'learning_rate': 0.00025332355816226783, 'epoch': 0.38}
+{'loss': 0.6802, 'grad_norm': 1.6152693033218384, 'learning_rate': 0.0002532991202346041, 'epoch': 0.38}
+{'loss': 0.9134, 'grad_norm': 1.3891299962997437, 'learning_rate': 0.00025327468230694033, 'epoch': 0.38}
+{'loss': 0.5214, 'grad_norm': 0.8565380573272705, 'learning_rate': 0.00025325024437927664, 'epoch': 0.38}
+{'loss': 0.6881, 'grad_norm': 1.4971650838851929, 'learning_rate': 0.0002532258064516129, 'epoch': 0.38}
+{'loss': 0.9213, 'grad_norm': 2.303706169128418, 'learning_rate': 0.00025320136852394914, 'epoch': 0.38}
+{'loss': 0.7242, 'grad_norm': 1.7516024112701416, 'learning_rate': 0.0002531769305962854, 'epoch': 0.38}
+{'loss': 0.829, 'grad_norm': 1.990359902381897, 'learning_rate': 0.0002531524926686217, 'epoch': 0.38}
+{'loss': 1.1958, 'grad_norm': 3.0393333435058594, 'learning_rate': 0.00025312805474095795, 'epoch': 0.38}
+{'loss': 0.7958, 'grad_norm': 1.408226728439331, 'learning_rate': 0.0002531036168132942, 'epoch': 0.38}
+{'loss': 1.0334, 'grad_norm': 4.30161190032959, 'learning_rate': 0.0002530791788856305, 'epoch': 0.38}
+{'loss': 0.8091, 'grad_norm': 1.420586109161377, 'learning_rate': 0.0002530547409579667, 'epoch': 0.38}
+{'loss': 0.5026, 'grad_norm': 1.1459065675735474, 'learning_rate': 0.000253030303030303, 'epoch': 0.38}
+{'loss': 0.8863, 'grad_norm': 1.717860460281372, 'learning_rate': 0.00025300586510263926, 'epoch': 0.38}
+{'loss': 0.926, 'grad_norm': 1.8267467021942139, 'learning_rate': 0.0002529814271749755, 'epoch': 0.38}
+{'loss': 1.062, 'grad_norm': 1.825463056564331, 'learning_rate': 0.0002529569892473118, 'epoch': 0.38}
+{'loss': 1.0772, 'grad_norm': 4.6027140617370605, 'learning_rate': 0.00025293255131964807, 'epoch': 0.38}
+{'loss': 1.2083, 'grad_norm': 1.6209831237792969, 'learning_rate': 0.0002529081133919843, 'epoch': 0.38}
+{'loss': 0.8559, 'grad_norm': 1.9430205821990967, 'learning_rate': 0.0002528836754643206, 'epoch': 0.38}
+{'loss': 0.9144, 'grad_norm': 5.019834041595459, 'learning_rate': 0.0002528592375366569, 'epoch': 0.38}
+{'loss': 0.5865, 'grad_norm': 1.813771367073059, 'learning_rate': 0.0002528347996089931, 'epoch': 0.38}
+{'loss': 1.0331, 'grad_norm': 6.178098201751709, 'learning_rate': 0.0002528103616813294, 'epoch': 0.38}
+{'loss': 1.4776, 'grad_norm': 2.6770286560058594, 'learning_rate': 0.0002527859237536657, 'epoch': 0.38}
+{'loss': 1.2574, 'grad_norm': 2.0814406871795654, 'learning_rate': 0.00025276148582600193, 'epoch': 0.38}
+{'loss': 1.2914, 'grad_norm': 2.1399121284484863, 'learning_rate': 0.0002527370478983382, 'epoch': 0.38}
+{'loss': 1.0346, 'grad_norm': 2.3518238067626953, 'learning_rate': 0.0002527126099706745, 'epoch': 0.38}
+{'loss': 1.0897, 'grad_norm': 2.9069864749908447, 'learning_rate': 0.00025268817204301074, 'epoch': 0.38}
+{'loss': 1.375, 'grad_norm': 10.248844146728516, 'learning_rate': 0.000252663734115347, 'epoch': 0.38}
+{'loss': 1.5689, 'grad_norm': 2.556163787841797, 'learning_rate': 0.00025263929618768324, 'epoch': 0.38}
+{'loss': 1.5573, 'grad_norm': 6.150379657745361, 'learning_rate': 0.0002526148582600195, 'epoch': 0.38}
+{'loss': 0.7524, 'grad_norm': 2.1215577125549316, 'learning_rate': 0.0002525904203323558, 'epoch': 0.38}
+{'loss': 1.0496, 'grad_norm': 1.778437852859497, 'learning_rate': 0.00025256598240469205, 'epoch': 0.38}
+{'loss': 0.676, 'grad_norm': 1.631048560142517, 'learning_rate': 0.0002525415444770283, 'epoch': 0.38}
+{'loss': 1.1262, 'grad_norm': 2.2763094902038574, 'learning_rate': 0.0002525171065493646, 'epoch': 0.38}
+{'loss': 1.5447, 'grad_norm': 1.9400389194488525, 'learning_rate': 0.00025249266862170086, 'epoch': 0.38}
+{'loss': 0.4302, 'grad_norm': 0.6042339205741882, 'learning_rate': 0.0002524682306940371, 'epoch': 0.38}
+{'loss': 0.4748, 'grad_norm': 0.6273007392883301, 'learning_rate': 0.00025244379276637336, 'epoch': 0.38}
+{'loss': 0.3699, 'grad_norm': 0.5941194891929626, 'learning_rate': 0.00025241935483870967, 'epoch': 0.38}
+{'loss': 0.5617, 'grad_norm': 0.6583676338195801, 'learning_rate': 0.0002523949169110459, 'epoch': 0.38}
+{'loss': 0.4727, 'grad_norm': 0.9693568348884583, 'learning_rate': 0.00025237047898338217, 'epoch': 0.38}
+{'loss': 0.4105, 'grad_norm': 0.7565197944641113, 'learning_rate': 0.0002523460410557185, 'epoch': 0.38}
+{'loss': 0.6106, 'grad_norm': 0.9457018375396729, 'learning_rate': 0.00025232160312805473, 'epoch': 0.38}
+{'loss': 0.4825, 'grad_norm': 0.8594042062759399, 'learning_rate': 0.000252297165200391, 'epoch': 0.38}
+{'loss': 0.4283, 'grad_norm': 0.9214059114456177, 'learning_rate': 0.0002522727272727273, 'epoch': 0.38}
+{'loss': 0.4448, 'grad_norm': 1.0324896574020386, 'learning_rate': 0.0002522482893450635, 'epoch': 0.39}
+{'loss': 0.4815, 'grad_norm': 0.8373879194259644, 'learning_rate': 0.0002522238514173998, 'epoch': 0.39}
+{'loss': 0.3629, 'grad_norm': 0.5907357931137085, 'learning_rate': 0.00025219941348973604, 'epoch': 0.39}
+{'loss': 0.4437, 'grad_norm': 0.8201125860214233, 'learning_rate': 0.0002521749755620723, 'epoch': 0.39}
+{'loss': 0.6879, 'grad_norm': 1.6409950256347656, 'learning_rate': 0.0002521505376344086, 'epoch': 0.39}
+{'loss': 0.8157, 'grad_norm': 1.9288908243179321, 'learning_rate': 0.00025212609970674485, 'epoch': 0.39}
+ 19%|█▉        | 2466/12776 [25:01<1:20:47,  2.13it/s] 19%|█▉        | 2467/12776 [25:02<1:17:13,  2.22it/s]                                                       19%|█▉        | 2467/12776 [25:02<1:17:13,  2.22it/s] 19%|█▉        | 2468/12776 [25:02<1:19:42,  2.16it/s]                                                       19%|█▉        | 2468/12776 [25:02<1:19:42,  2.16it/s] 19%|█▉        | 2469/12776 [25:02<1:15:13,  2.28it/s]                                                       19%|█▉        | 2469/12776 [25:02<1:15:13,  2.28it/s] 19%|█▉        | 2470/12776 [25:03<1:11:27,  2.40it/s]                                                       19%|█▉        | 2470/12776 [25:03<1:11:27,  2.40it/s] 19%|█▉        | 2471/12776 [25:03<1:12:31,  2.37it/s]                                                       19%|█▉        | 2471/12776 [25:03<1:12:31,  2.37it/s] 19%|█▉        | 2472/12776 [25:04<1:08:06,  2.52it/s]                                                       19%|█▉        | 2472/12776 [25:04<1:08:06,  2.52it/s] 19%|█▉        | 2473/12776 [25:04<1:04:35,  2.66it/s]                                                       19%|█▉        | 2473/12776 [25:04<1:04:35,  2.66it/s] 19%|█▉        | 2474/12776 [25:04<1:07:18,  2.55it/s]                                                       19%|█▉        | 2474/12776 [25:04<1:07:18,  2.55it/s] 19%|█▉        | 2475/12776 [25:05<1:02:41,  2.74it/s]                                                       19%|█▉        | 2475/12776 [25:05<1:02:41,  2.74it/s] 19%|█▉        | 2476/12776 [25:05<59:04,  2.91it/s]                                                       19%|█▉        | 2476/12776 [25:05<59:04,  2.91it/s] 19%|█▉        | 2477/12776 [25:05<1:00:09,  2.85it/s]                                                       19%|█▉        | 2477/12776 [25:05<1:00:09,  2.85it/s] 19%|█▉        | 2478/12776 [25:06<56:11,  3.05it/s]                                                       19%|█▉        | 2478/12776 [25:06<56:11,  3.05it/s] 19%|█▉        | 2479/12776 [25:06<52:57,  3.24it/s]                                                     19%|█▉        | 2479/12776 [25:06<52:57,  3.24it/s] 19%|█▉        | 2480/12776 [25:06<50:12,  3.42it/s]                                                     19%|█▉        | 2480/12776 [25:06<50:12,  3.42it/s] 19%|█▉        | 2481/12776 [25:06<52:54,  3.24it/s]                                                     19%|█▉        | 2481/12776 [25:06<52:54,  3.24it/s] 19%|█▉        | 2482/12776 [25:07<49:38,  3.46it/s]                                                     19%|█▉        | 2482/12776 [25:07<49:38,  3.46it/s] 19%|█▉        | 2483/12776 [25:07<47:03,  3.65it/s]                                                     19%|█▉        | 2483/12776 [25:07<47:03,  3.65it/s] 19%|█▉        | 2484/12776 [25:07<45:04,  3.80it/s]                                                     19%|█▉        | 2484/12776 [25:07<45:04,  3.80it/s] 19%|█▉        | 2485/12776 [25:07<43:23,  3.95it/s]                                                     19%|█▉        | 2485/12776 [25:07<43:23,  3.95it/s] 19%|█▉        | 2486/12776 [25:08<46:30,  3.69it/s]                                                     19%|█▉        | 2486/12776 [25:08<46:30,  3.69it/s] 19%|█▉        | 2487/12776 [25:08<44:08,  3.89it/s]                                                     19%|█▉        | 2487/12776 [25:08<44:08,  3.89it/s] 19%|█▉        | 2488/12776 [25:08<41:48,  4.10it/s]                                                     19%|█▉        | 2488/12776 [25:08<41:48,  4.10it/s] 19%|█▉        | 2489/12776 [25:08<39:54,  4.30it/s]                                                     19%|█▉        | 2489/12776 [25:08<39:54,  4.30it/s] 19%|█▉        | 2490/12776 [25:09<38:55,  4.40it/s]                                                     19%|█▉        | 2490/12776 [25:09<38:55,  4.40it/s] 19%|█▉        | 2491/12776 [25:09<41:57,  4.08it/s]                                                     19%|█▉        | 2491/12776 [25:09<41:57,  4.08it/s] 20%|█▉        | 2492/12776 [25:09<39:44,  4.31it/s]                                                     20%|█▉        | 2492/12776 [25:09<39:44,  4.31it/s] 20%|█▉        | 2493/12776 [25:09<38:01,  4.51it/s]                                                     20%|█▉        | 2493/12776 [25:09<38:01,  4.51it/s] 20%|█▉        | 2494/12776 [25:09<36:39,  4.68it/s]                                                     20%|█▉        | 2494/12776 [25:09<36:39,  4.68it/s] 20%|█▉        | 2495/12776 [25:10<35:44,  4.79it/s]                                                     20%|█▉        | 2495/12776 [25:10<35:44,  4.79it/s] 20%|█▉        | 2496/12776 [25:10<41:33,  4.12it/s]                                                     20%|█▉        | 2496/12776 [25:10<41:33,  4.12it/s] 20%|█▉        | 2497/12776 [25:10<38:55,  4.40it/s]                                                     20%|█▉        | 2497/12776 [25:10<38:55,  4.40it/s] 20%|█▉        | 2498/12776 [25:10<36:49,  4.65it/s]                                                     20%|█▉        | 2498/12776 [25:10<36:49,  4.65it/s] 20%|█▉        | 2499/12776 [25:10<35:15,  4.86it/s]                                                     20%|█▉        | 2499/12776 [25:10<35:15,  4.86it/s] 20%|█▉        | 2500/12776 [25:11<57:33,  2.98it/s]                                                     20%|█▉        | 2500/12776 [25:11<57:33,  2.98it/s] 20%|█▉        | 2501/12776 [25:12<1:47:30,  1.59it/s]                                                       20%|█▉        | 2501/12776 [25:12<1:47:30,  1.59it/s] 20%|█▉        | 2502/12776 [25:13<2:02:35,  1.40it/s]                                                       20%|█▉        | 2502/12776 [25:13<2:02:35,  1.40it/s] 20%|█▉        | 2503/12776 [25:14<2:10:18,  1.31it/s]                                                       20%|█▉        | 2503/12776 [25:14<2:10:18,  1.31it/s] 20%|█▉        | 2504/12776 [25:15<2:11:17,  1.30it/s]                                                       20%|█▉        | 2504/12776 [25:15<2:11:17,  1.30it/s] 20%|█▉        | 2505/12776 [25:16<2:17:37,  1.24it/s]                                                       20%|█▉        | 2505/12776 [25:16<2:17:37,  1.24it/s] 20%|█▉        | 2506/12776 [25:17<2:11:54,  1.30it/s]                                                       20%|█▉        | 2506/12776 [25:17<2:11:54,  1.30it/s] 20%|█▉        | 2507/12776 [25:17<2:06:03,  1.36it/s]                                                       20%|█▉        | 2507/12776 [25:17<2:06:03,  1.36it/s] 20%|█▉        | 2508/12776 [25:18<2:03:09,  1.39it/s]                                                       20%|█▉        | 2508/12776 [25:18<2:03:09,  1.39it/s] 20%|█▉        | 2509/12776 [25:19<1:57:05,  1.46it/s]                                                       20%|█▉        | 2509/12776 [25:19<1:57:05,  1.46it/s] 20%|█▉        | 2510/12776 [25:19<1:50:05,  1.55it/s]                                                       20%|█▉        | 2510/12776 [25:19<1:50:05,  1.55it/s] 20%|█▉        | 2511/12776 [25:20<1:43:55,  1.65it/s]                                                       20%|█▉        | 2511/12776 [25:20<1:43:55,  1.65it/s] 20%|█▉        | 2512/12776 [25:20<1:44:54,  1.63it/s]                                                       20%|█▉        | 2512/12776 [25:20<1:44:54,  1.63it/s] 20%|█▉        | 2513/12776 [25:21<1:37:17,  1.76it/s]                                                       20%|█▉        | 2513/12776 [25:21<1:37:17,  1.76it/s] 20%|█▉        | 2514/12776 [25:21<1:31:21,  1.87it/s]                                                       20%|█▉        | 2514/12776 [25:21<1:31:21,  1.87it/s] 20%|█▉        | 2515/12776 [25:22<1:29:10,  1.92it/s]                                                       20%|█▉        | 2515/12776 [25:22<1:29:10,  1.92it/s] 20%|█▉        | 2516/12776 [25:22<1:23:40,  2.04it/s]                                                       20%|█▉        | 2516/12776 [25:22<1:23:40,  2.04it/s] 20%|█▉        | 2517/12776 [25:23<1:23:24,  2.05it/s]                                                       20%|█▉        | 2517/12776 [25:23<1:23:24,  2.05it/s] 20%|█▉        | 2518/12776 [25:23<1:18:01,  2.19it/s]                                                       20%|█▉        | 2518/12776 [25:23<1:18:01,  2.19it/s] 20%|█▉        | 2519/12776 [25:23<1:12:57,  2.34it/s]                                                       20%|█▉        | 2519/12776 [25:23<1:12:57,  2.34it/s] 20%|█▉        | 2520/12776 [25:24<1:12:42,  2.35it/s]                                                       20%|█▉        | 2520/12776 [25:24<1:12:42,  2.35it/s] 20%|█▉        | 2521/12776 [25:24<1:08:32,  2.49it/s]                                                       20%|█▉        | 2521/12776 [25:24<1:08:32,  2.49it/s] 20%|█▉        | 2522/12776 [25:24<1:05:16,  2.62it/s]                                                       20%|█▉        | 2522/12776 [25:24<1:05:16,  2.62it/s] 20%|█▉        | 2523/12776 [25:25<1:07:26,  2.53it/s]                                                       20%|█▉        | 2523/12776 [25:25<1:07:26,  2.53it/s] 20%|█▉        | 2524/12776 [25:25<1:02:48,  2.72it/s]                                                       20%|█▉        | 2524/12776 [25:25<1:02:48,  2.72it/s] 20%|█▉        | 2525/12776 [25:25<59:08,  2.89it/s]                                                       20%|█▉        | 2525/12776 [25:25<59:08,  2.89it/s] 20%|█▉        | 2526/12776 [25:26<56:00,  3.05it/s]                                                     20%|█▉        | 2526/12776 [25:26<56:00,  3.05it/s] 20%|█▉        | 2527/12776 [25:26<57:57,  2.95it/s]                                                     20%|█▉        | 2527/12776 [25:26<57:57,  2.95it/s] 20%|█▉        | 2528/12776 [25:26<54:37,  3.13it/s]                                                     20%|█▉        | 2528/12776 [25:26<54:37,  3.13it/s] 20%|█▉        | 2529/12776 [25:27<51:50,  3.29it/s]                                                     20%|█▉        | 2529/12776 [25:27<51:50,  3.29it/s] 20%|█▉        | 2530/12776 [25:27<49:16,  3.47it/s]                                                     20%|█▉        | 2530/12776 [25:27<49:16,  3.47it/s] 20%|█▉        | 2531/12776 [25:27<50:58,  3.35it/s]                                                     20%|█▉        | 2531/12776 [25:27<50:58,  3.35it/s] 20%|█▉        | 2532/12776 [25:27<48:06,  3.55it/s]                                                     20%|█▉        | 2532/12776 [25:27<48:06,  3.55it/s] 20%|█▉        | 2533/12776 [25:28<45:50,  3.72it/s]                                                     20%|█▉        | 2533/12776 [25:28<45:50,  3.72it/s] 20%|█▉        | 2534/12776 [25:28<44:10,  3.86it/s]                                                     20%|█▉        | 2534/12776 [25:28<44:10,  3.86it/s] 20%|█▉        | 2535/12776 [25:28<46:10,  3.70it/s]                                                     20%|█▉        | 2535/12776 [25:28<46:10,  3.70it/s] 20%|█▉        | 2536/12776 [25:28<43:40,  3.91it/s]                                                     20%|█▉        | 2536/12776 [25:28<43:40,  3.91it/s] 20%|█▉        | 2537/12776 [25:29<41:38,  4.10it/s]                                                     20%|█▉        | 2537/12776 [25:29<41:38,  4.10it/s] 20%|█▉        | 2538/12776 [25:29<39:55,  4.27it/s]                                                     20%|█▉        | 2538/12776 [25:29<39:55,  4.27it/s] 20%|█▉        | 2539/12776 [25:29<38:38,  4.42it/s]                                                     20%|█▉        | 2539/12776 [25:29<38:38,  4.42it/s] 20%|█▉        | 2540/12776 [25:29<41:23,  4.12it/s]                                                     20%|█▉        | 2540/12776 [25:29<41:23,  4.12it/s] 20%|█▉        | 2541/12776 [25:30<39:31,  4.32it/s]                                                     20%|█▉        | 2541/12776 [25:30<39:31,  4.32it/s] 20%|█▉        | 2542/12776 [25:30<38:01,  4.49it/s]                                                     20%|█▉        | 2542/12776 [25:30<38:01,  4.49it/s] 20%|█▉        | 2543/12776 [25:30<36:44,  4.64it/s]                                                     20%|█▉        | 2543/12776 [25:30<36:44,  4.64it/s] 20%|█▉        | 2544/12776 [25:30<35:50,  4.76it/s]                                                    {'loss': 0.746, 'grad_norm': 1.1502376794815063, 'learning_rate': 0.0002521016617790811, 'epoch': 0.39}
+{'loss': 0.4716, 'grad_norm': 1.607825756072998, 'learning_rate': 0.0002520772238514174, 'epoch': 0.39}
+{'loss': 0.486, 'grad_norm': 1.3579707145690918, 'learning_rate': 0.00025205278592375365, 'epoch': 0.39}
+{'loss': 0.6688, 'grad_norm': 1.369374394416809, 'learning_rate': 0.0002520283479960899, 'epoch': 0.39}
+{'loss': 0.4558, 'grad_norm': 0.9472814798355103, 'learning_rate': 0.00025200391006842616, 'epoch': 0.39}
+{'loss': 0.7678, 'grad_norm': 1.0959384441375732, 'learning_rate': 0.00025197947214076246, 'epoch': 0.39}
+{'loss': 0.8826, 'grad_norm': 1.783223032951355, 'learning_rate': 0.0002519550342130987, 'epoch': 0.39}
+{'loss': 0.3746, 'grad_norm': 0.7266517281532288, 'learning_rate': 0.00025193059628543496, 'epoch': 0.39}
+{'loss': 0.7786, 'grad_norm': 1.6912201642990112, 'learning_rate': 0.00025190615835777127, 'epoch': 0.39}
+{'loss': 1.0013, 'grad_norm': 2.1097683906555176, 'learning_rate': 0.00025188172043010747, 'epoch': 0.39}
+{'loss': 0.9238, 'grad_norm': 2.697650671005249, 'learning_rate': 0.00025185728250244377, 'epoch': 0.39}
+{'loss': 0.613, 'grad_norm': 1.2909862995147705, 'learning_rate': 0.00025183284457478, 'epoch': 0.39}
+{'loss': 0.698, 'grad_norm': 2.043966293334961, 'learning_rate': 0.0002518084066471163, 'epoch': 0.39}
+{'loss': 0.8166, 'grad_norm': 1.7088145017623901, 'learning_rate': 0.0002517839687194526, 'epoch': 0.39}
+{'loss': 0.5839, 'grad_norm': 1.015830397605896, 'learning_rate': 0.00025175953079178883, 'epoch': 0.39}
+{'loss': 0.7847, 'grad_norm': 1.558098316192627, 'learning_rate': 0.0002517350928641251, 'epoch': 0.39}
+{'loss': 0.6532, 'grad_norm': 1.4762747287750244, 'learning_rate': 0.0002517106549364614, 'epoch': 0.39}
+{'loss': 0.9531, 'grad_norm': 2.2666919231414795, 'learning_rate': 0.00025168621700879764, 'epoch': 0.39}
+{'loss': 1.3798, 'grad_norm': 3.644498586654663, 'learning_rate': 0.0002516617790811339, 'epoch': 0.39}
+{'loss': 0.6577, 'grad_norm': 1.4934632778167725, 'learning_rate': 0.00025163734115347014, 'epoch': 0.39}
+{'loss': 1.0388, 'grad_norm': 2.9513094425201416, 'learning_rate': 0.00025161290322580645, 'epoch': 0.39}
+{'loss': 1.1773, 'grad_norm': 2.517242670059204, 'learning_rate': 0.0002515884652981427, 'epoch': 0.39}
+{'loss': 1.2082, 'grad_norm': 2.8088479042053223, 'learning_rate': 0.00025156402737047895, 'epoch': 0.39}
+{'loss': 1.6824, 'grad_norm': 2.6290640830993652, 'learning_rate': 0.00025153958944281526, 'epoch': 0.39}
+{'loss': 1.1155, 'grad_norm': 1.979951024055481, 'learning_rate': 0.0002515151515151515, 'epoch': 0.39}
+{'loss': 1.1699, 'grad_norm': 3.4212875366210938, 'learning_rate': 0.00025149071358748776, 'epoch': 0.39}
+{'loss': 1.516, 'grad_norm': 2.455173969268799, 'learning_rate': 0.000251466275659824, 'epoch': 0.39}
+{'loss': 1.3543, 'grad_norm': 3.4120118618011475, 'learning_rate': 0.00025144183773216026, 'epoch': 0.39}
+{'loss': 1.1935, 'grad_norm': 1.8049226999282837, 'learning_rate': 0.00025141739980449657, 'epoch': 0.39}
+{'loss': 1.9788, 'grad_norm': 2.091107130050659, 'learning_rate': 0.0002513929618768328, 'epoch': 0.39}
+{'loss': 0.7783, 'grad_norm': 1.4167218208312988, 'learning_rate': 0.00025136852394916907, 'epoch': 0.39}
+{'loss': 0.9097, 'grad_norm': 2.366852045059204, 'learning_rate': 0.0002513440860215054, 'epoch': 0.39}
+{'loss': 0.9851, 'grad_norm': 1.7226179838180542, 'learning_rate': 0.0002513196480938416, 'epoch': 0.39}
+{'loss': 1.281, 'grad_norm': 1.691483974456787, 'learning_rate': 0.0002512952101661779, 'epoch': 0.39}
+{'loss': 1.495, 'grad_norm': 2.1025145053863525, 'learning_rate': 0.00025127077223851413, 'epoch': 0.39}
+{'loss': 0.4291, 'grad_norm': 0.49316316843032837, 'learning_rate': 0.00025124633431085043, 'epoch': 0.39}
+{'loss': 0.3186, 'grad_norm': 0.47687578201293945, 'learning_rate': 0.0002512218963831867, 'epoch': 0.39}
+{'loss': 0.4185, 'grad_norm': 0.6680770516395569, 'learning_rate': 0.00025119745845552294, 'epoch': 0.39}
+{'loss': 0.3147, 'grad_norm': 0.6595609188079834, 'learning_rate': 0.00025117302052785924, 'epoch': 0.39}
+{'loss': 0.3578, 'grad_norm': 0.8479176759719849, 'learning_rate': 0.0002511485826001955, 'epoch': 0.39}
+{'loss': 0.3305, 'grad_norm': 0.6460444331169128, 'learning_rate': 0.00025112414467253174, 'epoch': 0.39}
+{'loss': 0.3328, 'grad_norm': 0.7635279297828674, 'learning_rate': 0.00025109970674486805, 'epoch': 0.39}
+{'loss': 0.3137, 'grad_norm': 0.6837889552116394, 'learning_rate': 0.00025107526881720425, 'epoch': 0.39}
+{'loss': 1.145, 'grad_norm': 3.4760055541992188, 'learning_rate': 0.00025105083088954055, 'epoch': 0.39}
+{'loss': 0.4839, 'grad_norm': 0.936640739440918, 'learning_rate': 0.0002510263929618768, 'epoch': 0.39}
+{'loss': 0.239, 'grad_norm': 0.6478950381278992, 'learning_rate': 0.00025100195503421305, 'epoch': 0.39}
+{'loss': 0.5681, 'grad_norm': 1.125580906867981, 'learning_rate': 0.00025097751710654936, 'epoch': 0.39}
+{'loss': 0.7442, 'grad_norm': 1.4228874444961548, 'learning_rate': 0.0002509530791788856, 'epoch': 0.39}
+{'loss': 0.6783, 'grad_norm': 1.260831356048584, 'learning_rate': 0.00025092864125122186, 'epoch': 0.39}
+{'loss': 0.426, 'grad_norm': 1.2701672315597534, 'learning_rate': 0.0002509042033235581, 'epoch': 0.39}
+{'loss': 0.4279, 'grad_norm': 0.7969043850898743, 'learning_rate': 0.0002508797653958944, 'epoch': 0.39}
+{'loss': 0.8547, 'grad_norm': 2.375305414199829, 'learning_rate': 0.00025085532746823067, 'epoch': 0.39}
+{'loss': 0.5921, 'grad_norm': 1.0514167547225952, 'learning_rate': 0.0002508308895405669, 'epoch': 0.39}
+{'loss': 0.792, 'grad_norm': 1.659162998199463, 'learning_rate': 0.0002508064516129032, 'epoch': 0.39}
+{'loss': 0.8341, 'grad_norm': 1.6312371492385864, 'learning_rate': 0.0002507820136852395, 'epoch': 0.39}
+{'loss': 0.6725, 'grad_norm': 1.0578227043151855, 'learning_rate': 0.00025075757575757573, 'epoch': 0.39}
+{'loss': 0.753, 'grad_norm': 1.3463261127471924, 'learning_rate': 0.00025073313782991203, 'epoch': 0.39}
+{'loss': 0.6932, 'grad_norm': 1.1711524724960327, 'learning_rate': 0.00025070869990224823, 'epoch': 0.39}
+{'loss': 0.8229, 'grad_norm': 1.7872284650802612, 'learning_rate': 0.00025068426197458454, 'epoch': 0.4}
+{'loss': 1.1885, 'grad_norm': 1.4881881475448608, 'learning_rate': 0.0002506598240469208, 'epoch': 0.4}
+{'loss': 0.9122, 'grad_norm': 2.591526508331299, 'learning_rate': 0.00025063538611925704, 'epoch': 0.4}
+{'loss': 1.1493, 'grad_norm': 2.517484188079834, 'learning_rate': 0.00025061094819159334, 'epoch': 0.4}
+{'loss': 0.7883, 'grad_norm': 1.369888424873352, 'learning_rate': 0.0002505865102639296, 'epoch': 0.4}
+{'loss': 0.5566, 'grad_norm': 1.5327993631362915, 'learning_rate': 0.00025056207233626585, 'epoch': 0.4}
+{'loss': 0.8489, 'grad_norm': 2.494425058364868, 'learning_rate': 0.00025053763440860215, 'epoch': 0.4}
+{'loss': 0.8051, 'grad_norm': 1.5433497428894043, 'learning_rate': 0.0002505131964809384, 'epoch': 0.4}
+{'loss': 1.0817, 'grad_norm': 3.3626677989959717, 'learning_rate': 0.00025048875855327465, 'epoch': 0.4}
+{'loss': 0.9022, 'grad_norm': 2.291754722595215, 'learning_rate': 0.0002504643206256109, 'epoch': 0.4}
+{'loss': 1.4655, 'grad_norm': 2.296135425567627, 'learning_rate': 0.0002504398826979472, 'epoch': 0.4}
+{'loss': 0.9305, 'grad_norm': 3.5448482036590576, 'learning_rate': 0.00025041544477028346, 'epoch': 0.4}
+{'loss': 1.3307, 'grad_norm': 2.3300135135650635, 'learning_rate': 0.0002503910068426197, 'epoch': 0.4}
+{'loss': 0.7833, 'grad_norm': 1.741479516029358, 'learning_rate': 0.000250366568914956, 'epoch': 0.4}
+{'loss': 1.0903, 'grad_norm': 4.329046249389648, 'learning_rate': 0.00025034213098729227, 'epoch': 0.4}
+{'loss': 1.4296, 'grad_norm': 5.144536972045898, 'learning_rate': 0.0002503176930596285, 'epoch': 0.4}
+{'loss': 1.2994, 'grad_norm': 2.033552408218384, 'learning_rate': 0.0002502932551319648, 'epoch': 0.4}
+{'loss': 1.3316, 'grad_norm': 2.1732325553894043, 'learning_rate': 0.000250268817204301, 'epoch': 0.4}
+{'loss': 1.187, 'grad_norm': 4.526739597320557, 'learning_rate': 0.00025024437927663733, 'epoch': 0.4}
+{'loss': 0.8175, 'grad_norm': 1.4033749103546143, 'learning_rate': 0.0002502199413489736, 'epoch': 0.4}
+ 20%|█▉        | 2544/12776 [25:30<35:50,  4.76it/s] 20%|█▉        | 2545/12776 [25:30<41:05,  4.15it/s]                                                     20%|█▉        | 2545/12776 [25:30<41:05,  4.15it/s] 20%|█▉        | 2546/12776 [25:31<38:40,  4.41it/s]                                                     20%|█▉        | 2546/12776 [25:31<38:40,  4.41it/s] 20%|█▉        | 2547/12776 [25:31<36:51,  4.63it/s]                                                     20%|█▉        | 2547/12776 [25:31<36:51,  4.63it/s] 20%|█▉        | 2548/12776 [25:31<35:13,  4.84it/s]                                                     20%|█▉        | 2548/12776 [25:31<35:13,  4.84it/s] 20%|█▉        | 2549/12776 [25:31<34:04,  5.00it/s]                                                     20%|█▉        | 2549/12776 [25:31<34:04,  5.00it/s] 20%|█▉        | 2550/12776 [25:32<1:03:28,  2.68it/s]                                                       20%|█▉        | 2550/12776 [25:32<1:03:28,  2.68it/s] 20%|█▉        | 2551/12776 [25:34<2:04:27,  1.37it/s]                                                       20%|█▉        | 2551/12776 [25:34<2:04:27,  1.37it/s] 20%|█▉        | 2552/12776 [25:34<2:17:00,  1.24it/s]                                                       20%|█▉        | 2552/12776 [25:34<2:17:00,  1.24it/s] 20%|█▉        | 2553/12776 [25:35<2:22:02,  1.20it/s]                                                       20%|█▉        | 2553/12776 [25:35<2:22:02,  1.20it/s] 20%|█▉        | 2554/12776 [25:36<2:21:36,  1.20it/s]                                                       20%|█▉        | 2554/12776 [25:36<2:21:36,  1.20it/s] 20%|█▉        | 2555/12776 [25:37<2:17:57,  1.23it/s]                                                       20%|█▉        | 2555/12776 [25:37<2:17:57,  1.23it/s] 20%|██        | 2556/12776 [25:38<2:18:07,  1.23it/s]                                                       20%|██        | 2556/12776 [25:38<2:18:07,  1.23it/s] 20%|██        | 2557/12776 [25:39<2:13:32,  1.28it/s]                                                       20%|██        | 2557/12776 [25:39<2:13:32,  1.28it/s] 20%|██        | 2558/12776 [25:39<2:06:44,  1.34it/s]                                                       20%|██        | 2558/12776 [25:39<2:06:44,  1.34it/s] 20%|██        | 2559/12776 [25:40<2:10:02,  1.31it/s]                                                       20%|██        | 2559/12776 [25:40<2:10:02,  1.31it/s] 20%|██        | 2560/12776 [25:41<2:00:24,  1.41it/s]                                                       20%|██        | 2560/12776 [25:41<2:00:24,  1.41it/s] 20%|██        | 2561/12776 [25:41<1:55:05,  1.48it/s]                                                       20%|██        | 2561/12776 [25:41<1:55:05,  1.48it/s] 20%|██        | 2562/12776 [25:42<1:46:38,  1.60it/s]                                                       20%|██        | 2562/12776 [25:42<1:46:38,  1.60it/s] 20%|██        | 2563/12776 [25:42<1:44:40,  1.63it/s]                                                       20%|██        | 2563/12776 [25:42<1:44:40,  1.63it/s] 20%|██        | 2564/12776 [25:43<1:36:16,  1.77it/s]                                                       20%|██        | 2564/12776 [25:43<1:36:16,  1.77it/s] 20%|██        | 2565/12776 [25:43<1:37:16,  1.75it/s]                                                       20%|██        | 2565/12776 [25:43<1:37:16,  1.75it/s] 20%|██        | 2566/12776 [25:44<1:29:43,  1.90it/s]                                                       20%|██        | 2566/12776 [25:44<1:29:43,  1.90it/s] 20%|██        | 2567/12776 [25:44<1:30:05,  1.89it/s]                                                       20%|██        | 2567/12776 [25:44<1:30:05,  1.89it/s] 20%|██        | 2568/12776 [25:45<1:22:46,  2.06it/s]                                                       20%|██        | 2568/12776 [25:45<1:22:46,  2.06it/s] 20%|██        | 2569/12776 [25:45<1:16:43,  2.22it/s]                                                       20%|██        | 2569/12776 [25:45<1:16:43,  2.22it/s] 20%|██        | 2570/12776 [25:45<1:14:40,  2.28it/s]                                                       20%|██        | 2570/12776 [25:45<1:14:40,  2.28it/s] 20%|██        | 2571/12776 [25:46<1:09:50,  2.44it/s]                                                       20%|██        | 2571/12776 [25:46<1:09:50,  2.44it/s] 20%|██        | 2572/12776 [25:46<1:05:59,  2.58it/s]                                                       20%|██        | 2572/12776 [25:46<1:05:59,  2.58it/s] 20%|██        | 2573/12776 [25:47<1:09:59,  2.43it/s]                                                       20%|██        | 2573/12776 [25:47<1:09:59,  2.43it/s] 20%|██        | 2574/12776 [25:47<1:04:40,  2.63it/s]                                                       20%|██        | 2574/12776 [25:47<1:04:40,  2.63it/s] 20%|██        | 2575/12776 [25:47<1:00:33,  2.81it/s]                                                       20%|██        | 2575/12776 [25:47<1:00:33,  2.81it/s] 20%|██        | 2576/12776 [25:47<57:32,  2.95it/s]                                                       20%|██        | 2576/12776 [25:47<57:32,  2.95it/s] 20%|██        | 2577/12776 [25:48<59:22,  2.86it/s]                                                     20%|██        | 2577/12776 [25:48<59:22,  2.86it/s] 20%|██        | 2578/12776 [25:48<56:08,  3.03it/s]                                                     20%|██        | 2578/12776 [25:48<56:08,  3.03it/s] 20%|██        | 2579/12776 [25:48<53:32,  3.17it/s]                                                     20%|██        | 2579/12776 [25:48<53:32,  3.17it/s] 20%|██        | 2580/12776 [25:49<51:13,  3.32it/s]                                                     20%|██        | 2580/12776 [25:49<51:13,  3.32it/s] 20%|██        | 2581/12776 [25:49<51:09,  3.32it/s]                                                     20%|██        | 2581/12776 [25:49<51:09,  3.32it/s] 20%|██        | 2582/12776 [25:49<48:46,  3.48it/s]                                                     20%|██        | 2582/12776 [25:49<48:46,  3.48it/s] 20%|██        | 2583/12776 [25:49<46:56,  3.62it/s]                                                     20%|██        | 2583/12776 [25:49<46:56,  3.62it/s] 20%|██        | 2584/12776 [25:50<45:28,  3.73it/s]                                                     20%|██        | 2584/12776 [25:50<45:28,  3.73it/s] 20%|██        | 2585/12776 [25:50<49:15,  3.45it/s]                                                     20%|██        | 2585/12776 [25:50<49:15,  3.45it/s] 20%|██        | 2586/12776 [25:50<46:29,  3.65it/s]                                                     20%|██        | 2586/12776 [25:50<46:29,  3.65it/s] 20%|██        | 2587/12776 [25:51<44:18,  3.83it/s]                                                     20%|██        | 2587/12776 [25:51<44:18,  3.83it/s] 20%|██        | 2588/12776 [25:51<42:26,  4.00it/s]                                                     20%|██        | 2588/12776 [25:51<42:26,  4.00it/s] 20%|██        | 2589/12776 [25:51<45:46,  3.71it/s]                                                     20%|██        | 2589/12776 [25:51<45:46,  3.71it/s] 20%|██        | 2590/12776 [25:51<42:42,  3.97it/s]                                                     20%|██        | 2590/12776 [25:51<42:42,  3.97it/s] 20%|██        | 2591/12776 [25:51<40:25,  4.20it/s]                                                     20%|██        | 2591/12776 [25:51<40:25,  4.20it/s] 20%|██        | 2592/12776 [25:52<38:40,  4.39it/s]                                                     20%|██        | 2592/12776 [25:52<38:40,  4.39it/s] 20%|██        | 2593/12776 [25:52<37:21,  4.54it/s]                                                     20%|██        | 2593/12776 [25:52<37:21,  4.54it/s] 20%|██        | 2594/12776 [25:52<41:47,  4.06it/s]                                                     20%|██        | 2594/12776 [25:52<41:47,  4.06it/s] 20%|██        | 2595/12776 [25:52<39:16,  4.32it/s]                                                     20%|██        | 2595/12776 [25:52<39:16,  4.32it/s] 20%|██        | 2596/12776 [25:53<37:20,  4.54it/s]                                                     20%|██        | 2596/12776 [25:53<37:20,  4.54it/s] 20%|██        | 2597/12776 [25:53<35:50,  4.73it/s]                                                     20%|██        | 2597/12776 [25:53<35:50,  4.73it/s] 20%|██        | 2598/12776 [25:53<34:43,  4.88it/s]                                                     20%|██        | 2598/12776 [25:53<34:43,  4.88it/s] 20%|██        | 2599/12776 [25:53<33:42,  5.03it/s]                                                     20%|██        | 2599/12776 [25:53<33:42,  5.03it/s] 20%|██        | 2600/12776 [25:54<1:00:08,  2.82it/s]                                                       20%|██        | 2600/12776 [25:54<1:00:08,  2.82it/s] 20%|██        | 2601/12776 [25:55<1:58:52,  1.43it/s]                                                       20%|██        | 2601/12776 [25:55<1:58:52,  1.43it/s] 20%|██        | 2602/12776 [25:56<2:11:37,  1.29it/s]                                                       20%|██        | 2602/12776 [25:56<2:11:37,  1.29it/s] 20%|██        | 2603/12776 [25:57<2:16:54,  1.24it/s]                                                       20%|██        | 2603/12776 [25:57<2:16:54,  1.24it/s] 20%|██        | 2604/12776 [25:58<2:17:52,  1.23it/s]                                                       20%|██        | 2604/12776 [25:58<2:17:52,  1.23it/s] 20%|██        | 2605/12776 [25:59<2:14:15,  1.26it/s]                                                       20%|██        | 2605/12776 [25:59<2:14:15,  1.26it/s] 20%|██        | 2606/12776 [25:59<2:10:01,  1.30it/s]                                                       20%|██        | 2606/12776 [25:59<2:10:01,  1.30it/s] 20%|██        | 2607/12776 [26:00<2:03:39,  1.37it/s]                                                       20%|██        | 2607/12776 [26:00<2:03:39,  1.37it/s] 20%|██        | 2608/12776 [26:01<1:57:56,  1.44it/s]                                                       20%|██        | 2608/12776 [26:01<1:57:56,  1.44it/s] 20%|██        | 2609/12776 [26:01<1:51:13,  1.52it/s]                                                       20%|██        | 2609/12776 [26:01<1:51:13,  1.52it/s] 20%|██        | 2610/12776 [26:02<1:45:31,  1.61it/s]                                                       20%|██        | 2610/12776 [26:02<1:45:31,  1.61it/s] 20%|██        | 2611/12776 [26:02<1:40:07,  1.69it/s]                                                       20%|██        | 2611/12776 [26:02<1:40:07,  1.69it/s] 20%|██        | 2612/12776 [26:03<1:39:58,  1.69it/s]                                                       20%|██        | 2612/12776 [26:03<1:39:58,  1.69it/s] 20%|██        | 2613/12776 [26:03<1:33:08,  1.82it/s]                                                       20%|██        | 2613/12776 [26:03<1:33:08,  1.82it/s] 20%|██        | 2614/12776 [26:04<1:36:01,  1.76it/s]                                                       20%|██        | 2614/12776 [26:04<1:36:01,  1.76it/s] 20%|██        | 2615/12776 [26:04<1:28:56,  1.90it/s]                                                       20%|██        | 2615/12776 [26:04<1:28:56,  1.90it/s] 20%|██        | 2616/12776 [26:05<1:28:20,  1.92it/s]                                                       20%|██        | 2616/12776 [26:05<1:28:20,  1.92it/s] 20%|██        | 2617/12776 [26:05<1:21:41,  2.07it/s]                                                       20%|██        | 2617/12776 [26:05<1:21:41,  2.07it/s] 20%|██        | 2618/12776 [26:06<1:16:32,  2.21it/s]                                                       20%|██        | 2618/12776 [26:06<1:16:32,  2.21it/s] 20%|██        | 2619/12776 [26:06<1:13:40,  2.30it/s]                                                       20%|██        | 2619/12776 [26:06<1:13:40,  2.30it/s] 21%|██        | 2620/12776 [26:06<1:09:21,  2.44it/s]                                                       21%|██        | 2620/12776 [26:06<1:09:21,  2.44it/s] 21%|██        | 2621/12776 [26:07<1:05:45,  2.57it/s]                                                       21%|██        | 2621/12776 [26:07<1:05:45,  2.57it/s] 21%|██        | 2622/12776 [26:07<1:08:36,  2.47it/s]                                                      {'loss': 1.6048, 'grad_norm': 3.880048990249634, 'learning_rate': 0.00025019550342130983, 'epoch': 0.4}
+{'loss': 1.3564, 'grad_norm': 2.1508264541625977, 'learning_rate': 0.00025017106549364614, 'epoch': 0.4}
+{'loss': 0.6449, 'grad_norm': 1.6752632856369019, 'learning_rate': 0.0002501466275659824, 'epoch': 0.4}
+{'loss': 1.7772, 'grad_norm': 3.97983980178833, 'learning_rate': 0.00025012218963831864, 'epoch': 0.4}
+{'loss': 1.109, 'grad_norm': 2.7829971313476562, 'learning_rate': 0.0002500977517106549, 'epoch': 0.4}
+{'loss': 0.4101, 'grad_norm': 1.2870094776153564, 'learning_rate': 0.0002500733137829912, 'epoch': 0.4}
+{'loss': 1.1036, 'grad_norm': 2.9641854763031006, 'learning_rate': 0.00025004887585532745, 'epoch': 0.4}
+{'loss': 0.3884, 'grad_norm': 0.6355754137039185, 'learning_rate': 0.0002500244379276637, 'epoch': 0.4}
+{'loss': 0.4459, 'grad_norm': 0.7654311656951904, 'learning_rate': 0.00025, 'epoch': 0.4}
+{'loss': 0.3962, 'grad_norm': 0.6441348791122437, 'learning_rate': 0.00024997556207233626, 'epoch': 0.4}
+{'loss': 0.3226, 'grad_norm': 0.8454661965370178, 'learning_rate': 0.0002499511241446725, 'epoch': 0.4}
+{'loss': 0.3463, 'grad_norm': 0.7234505414962769, 'learning_rate': 0.0002499266862170088, 'epoch': 0.4}
+{'loss': 0.2914, 'grad_norm': 0.7027657628059387, 'learning_rate': 0.000249902248289345, 'epoch': 0.4}
+{'loss': 0.3581, 'grad_norm': 0.9003955721855164, 'learning_rate': 0.0002498778103616813, 'epoch': 0.4}
+{'loss': 0.6009, 'grad_norm': 0.8857301473617554, 'learning_rate': 0.00024985337243401757, 'epoch': 0.4}
+{'loss': 0.407, 'grad_norm': 0.8024937510490417, 'learning_rate': 0.0002498289345063538, 'epoch': 0.4}
+{'loss': 0.5759, 'grad_norm': 1.1914548873901367, 'learning_rate': 0.0002498044965786901, 'epoch': 0.4}
+{'loss': 0.4644, 'grad_norm': 1.0876412391662598, 'learning_rate': 0.0002497800586510264, 'epoch': 0.4}
+{'loss': 0.327, 'grad_norm': 0.8005790710449219, 'learning_rate': 0.0002497556207233626, 'epoch': 0.4}
+{'loss': 0.6214, 'grad_norm': 0.8703690767288208, 'learning_rate': 0.0002497311827956989, 'epoch': 0.4}
+{'loss': 0.4908, 'grad_norm': 1.3815338611602783, 'learning_rate': 0.0002497067448680352, 'epoch': 0.4}
+{'loss': 0.4925, 'grad_norm': 3.5727379322052, 'learning_rate': 0.00024968230694037143, 'epoch': 0.4}
+{'loss': 0.661, 'grad_norm': 1.0240099430084229, 'learning_rate': 0.0002496578690127077, 'epoch': 0.4}
+{'loss': 0.5323, 'grad_norm': 1.9735991954803467, 'learning_rate': 0.000249633431085044, 'epoch': 0.4}
+{'loss': 0.5383, 'grad_norm': 1.2893562316894531, 'learning_rate': 0.00024960899315738024, 'epoch': 0.4}
+{'loss': 0.6309, 'grad_norm': 0.8876537680625916, 'learning_rate': 0.0002495845552297165, 'epoch': 0.4}
+{'loss': 0.5627, 'grad_norm': 1.0564181804656982, 'learning_rate': 0.0002495601173020528, 'epoch': 0.4}
+{'loss': 0.6418, 'grad_norm': 1.0052175521850586, 'learning_rate': 0.000249535679374389, 'epoch': 0.4}
+{'loss': 0.4353, 'grad_norm': 0.7646985054016113, 'learning_rate': 0.0002495112414467253, 'epoch': 0.4}
+{'loss': 0.5096, 'grad_norm': 1.8414807319641113, 'learning_rate': 0.00024948680351906155, 'epoch': 0.4}
+{'loss': 0.6357, 'grad_norm': 1.9422529935836792, 'learning_rate': 0.0002494623655913978, 'epoch': 0.4}
+{'loss': 0.5801, 'grad_norm': 2.9409773349761963, 'learning_rate': 0.0002494379276637341, 'epoch': 0.4}
+{'loss': 1.0987, 'grad_norm': 1.64481782913208, 'learning_rate': 0.00024941348973607036, 'epoch': 0.4}
+{'loss': 0.7097, 'grad_norm': 2.1421914100646973, 'learning_rate': 0.0002493890518084066, 'epoch': 0.4}
+{'loss': 0.8432, 'grad_norm': 1.6423346996307373, 'learning_rate': 0.0002493646138807429, 'epoch': 0.4}
+{'loss': 1.2652, 'grad_norm': 4.2599005699157715, 'learning_rate': 0.00024934017595307917, 'epoch': 0.4}
+{'loss': 1.0066, 'grad_norm': 2.666485071182251, 'learning_rate': 0.0002493157380254154, 'epoch': 0.4}
+{'loss': 1.0087, 'grad_norm': 3.4429917335510254, 'learning_rate': 0.00024929130009775167, 'epoch': 0.4}
+{'loss': 1.2341, 'grad_norm': 3.632044553756714, 'learning_rate': 0.000249266862170088, 'epoch': 0.4}
+{'loss': 0.939, 'grad_norm': 1.675054907798767, 'learning_rate': 0.00024924242424242423, 'epoch': 0.4}
+{'loss': 1.3942, 'grad_norm': 2.2389726638793945, 'learning_rate': 0.0002492179863147605, 'epoch': 0.4}
+{'loss': 0.9123, 'grad_norm': 1.801397442817688, 'learning_rate': 0.0002491935483870968, 'epoch': 0.4}
+{'loss': 1.5424, 'grad_norm': 3.3526382446289062, 'learning_rate': 0.000249169110459433, 'epoch': 0.4}
+{'loss': 0.7216, 'grad_norm': 1.4892598390579224, 'learning_rate': 0.0002491446725317693, 'epoch': 0.4}
+{'loss': 1.1292, 'grad_norm': 2.2398314476013184, 'learning_rate': 0.00024912023460410554, 'epoch': 0.41}
+{'loss': 1.1889, 'grad_norm': 7.038582801818848, 'learning_rate': 0.0002490957966764418, 'epoch': 0.41}
+{'loss': 1.0995, 'grad_norm': 2.2800958156585693, 'learning_rate': 0.0002490713587487781, 'epoch': 0.41}
+{'loss': 1.5452, 'grad_norm': 2.6573739051818848, 'learning_rate': 0.00024904692082111435, 'epoch': 0.41}
+{'loss': 1.134, 'grad_norm': 2.7160732746124268, 'learning_rate': 0.0002490224828934506, 'epoch': 0.41}
+{'loss': 1.2917, 'grad_norm': 3.265036106109619, 'learning_rate': 0.0002489980449657869, 'epoch': 0.41}
+{'loss': 1.5418, 'grad_norm': 3.3533453941345215, 'learning_rate': 0.00024897360703812315, 'epoch': 0.41}
+{'loss': 1.3963, 'grad_norm': 1.740634560585022, 'learning_rate': 0.0002489491691104594, 'epoch': 0.41}
+{'loss': 0.9535, 'grad_norm': 5.385989665985107, 'learning_rate': 0.00024892473118279566, 'epoch': 0.41}
+{'loss': 1.1681, 'grad_norm': 2.453171730041504, 'learning_rate': 0.00024890029325513196, 'epoch': 0.41}
+{'loss': 0.795, 'grad_norm': 2.1263811588287354, 'learning_rate': 0.0002488758553274682, 'epoch': 0.41}
+{'loss': 0.9264, 'grad_norm': 2.388746738433838, 'learning_rate': 0.00024885141739980446, 'epoch': 0.41}
+{'loss': 0.8049, 'grad_norm': 1.1103012561798096, 'learning_rate': 0.00024882697947214077, 'epoch': 0.41}
+{'loss': 0.365, 'grad_norm': 0.601675271987915, 'learning_rate': 0.000248802541544477, 'epoch': 0.41}
+{'loss': 0.3728, 'grad_norm': 0.5282478332519531, 'learning_rate': 0.00024877810361681327, 'epoch': 0.41}
+{'loss': 0.3763, 'grad_norm': 0.5197291374206543, 'learning_rate': 0.0002487536656891495, 'epoch': 0.41}
+{'loss': 0.4424, 'grad_norm': 1.0491033792495728, 'learning_rate': 0.0002487292277614858, 'epoch': 0.41}
+{'loss': 0.5056, 'grad_norm': 1.314231038093567, 'learning_rate': 0.0002487047898338221, 'epoch': 0.41}
+{'loss': 0.3068, 'grad_norm': 0.6822581887245178, 'learning_rate': 0.00024868035190615833, 'epoch': 0.41}
+{'loss': 0.4735, 'grad_norm': 1.021838665008545, 'learning_rate': 0.0002486559139784946, 'epoch': 0.41}
+{'loss': 0.4265, 'grad_norm': 0.770111083984375, 'learning_rate': 0.0002486314760508309, 'epoch': 0.41}
+{'loss': 0.4441, 'grad_norm': 0.6351111531257629, 'learning_rate': 0.00024860703812316714, 'epoch': 0.41}
+{'loss': 0.8832, 'grad_norm': 1.667920470237732, 'learning_rate': 0.0002485826001955034, 'epoch': 0.41}
+{'loss': 0.4595, 'grad_norm': 0.9965152740478516, 'learning_rate': 0.00024855816226783964, 'epoch': 0.41}
+{'loss': 0.7611, 'grad_norm': 1.3201706409454346, 'learning_rate': 0.00024853372434017595, 'epoch': 0.41}
+{'loss': 0.4856, 'grad_norm': 0.8612977266311646, 'learning_rate': 0.0002485092864125122, 'epoch': 0.41}
+{'loss': 0.9506, 'grad_norm': 2.6356048583984375, 'learning_rate': 0.00024848484848484845, 'epoch': 0.41}
+{'loss': 0.4625, 'grad_norm': 1.1005220413208008, 'learning_rate': 0.00024846041055718476, 'epoch': 0.41}
+{'loss': 0.747, 'grad_norm': 1.277478814125061, 'learning_rate': 0.000248435972629521, 'epoch': 0.41}
+{'loss': 0.5756, 'grad_norm': 1.216950535774231, 'learning_rate': 0.00024841153470185726, 'epoch': 0.41}
+{'loss': 0.7352, 'grad_norm': 1.0903007984161377, 'learning_rate': 0.00024838709677419356, 'epoch': 0.41}
+{'loss': 0.6177, 'grad_norm': 1.3664308786392212, 'learning_rate': 0.00024836265884652976, 'epoch': 0.41}
+{'loss': 0.7601, 'grad_norm': 1.4325757026672363, 'learning_rate': 0.00024833822091886607, 'epoch': 0.41}
+{'loss': 0.7936, 'grad_norm': 1.858127236366272, 'learning_rate': 0.0002483137829912023, 'epoch': 0.41}
+ 21%|██        | 2622/12776 [26:07<1:08:36,  2.47it/s] 21%|██        | 2623/12776 [26:08<1:04:44,  2.61it/s]                                                       21%|██        | 2623/12776 [26:08<1:04:44,  2.61it/s] 21%|██        | 2624/12776 [26:08<1:00:57,  2.78it/s]                                                       21%|██        | 2624/12776 [26:08<1:00:57,  2.78it/s] 21%|██        | 2625/12776 [26:08<57:52,  2.92it/s]                                                       21%|██        | 2625/12776 [26:08<57:52,  2.92it/s] 21%|██        | 2626/12776 [26:09<57:22,  2.95it/s]                                                     21%|██        | 2626/12776 [26:09<57:22,  2.95it/s] 21%|██        | 2627/12776 [26:09<54:29,  3.10it/s]                                                     21%|██        | 2627/12776 [26:09<54:29,  3.10it/s] 21%|██        | 2628/12776 [26:09<52:26,  3.23it/s]                                                     21%|██        | 2628/12776 [26:09<52:26,  3.23it/s] 21%|██        | 2629/12776 [26:09<50:25,  3.35it/s]                                                     21%|██        | 2629/12776 [26:09<50:25,  3.35it/s] 21%|██        | 2630/12776 [26:10<50:44,  3.33it/s]                                                     21%|██        | 2630/12776 [26:10<50:44,  3.33it/s] 21%|██        | 2631/12776 [26:10<48:31,  3.49it/s]                                                     21%|██        | 2631/12776 [26:10<48:31,  3.49it/s] 21%|██        | 2632/12776 [26:10<46:28,  3.64it/s]                                                     21%|██        | 2632/12776 [26:10<46:28,  3.64it/s] 21%|██        | 2633/12776 [26:10<44:56,  3.76it/s]                                                     21%|██        | 2633/12776 [26:10<44:56,  3.76it/s] 21%|██        | 2634/12776 [26:11<49:06,  3.44it/s]                                                     21%|██        | 2634/12776 [26:11<49:06,  3.44it/s] 21%|██        | 2635/12776 [26:11<46:07,  3.66it/s]                                                     21%|██        | 2635/12776 [26:11<46:07,  3.66it/s] 21%|██        | 2636/12776 [26:11<43:41,  3.87it/s]                                                     21%|██        | 2636/12776 [26:11<43:41,  3.87it/s] 21%|██        | 2637/12776 [26:11<41:39,  4.06it/s]                                                     21%|██        | 2637/12776 [26:11<41:39,  4.06it/s] 21%|██        | 2638/12776 [26:12<45:14,  3.73it/s]                                                     21%|██        | 2638/12776 [26:12<45:14,  3.73it/s] 21%|██        | 2639/12776 [26:12<42:13,  4.00it/s]                                                     21%|██        | 2639/12776 [26:12<42:13,  4.00it/s] 21%|██        | 2640/12776 [26:12<40:10,  4.20it/s]                                                     21%|██        | 2640/12776 [26:12<40:10,  4.20it/s] 21%|██        | 2641/12776 [26:12<38:34,  4.38it/s]                                                     21%|██        | 2641/12776 [26:12<38:34,  4.38it/s] 21%|██        | 2642/12776 [26:13<37:19,  4.52it/s]                                                     21%|██        | 2642/12776 [26:13<37:19,  4.52it/s] 21%|██        | 2643/12776 [26:13<41:49,  4.04it/s]                                                     21%|██        | 2643/12776 [26:13<41:49,  4.04it/s] 21%|██        | 2644/12776 [26:13<39:16,  4.30it/s]                                                     21%|██        | 2644/12776 [26:13<39:16,  4.30it/s] 21%|██        | 2645/12776 [26:13<37:27,  4.51it/s]                                                     21%|██        | 2645/12776 [26:13<37:27,  4.51it/s] 21%|██        | 2646/12776 [26:14<36:00,  4.69it/s]                                                     21%|██        | 2646/12776 [26:14<36:00,  4.69it/s] 21%|██        | 2647/12776 [26:14<34:50,  4.85it/s]                                                     21%|██        | 2647/12776 [26:14<34:50,  4.85it/s] 21%|██        | 2648/12776 [26:14<33:43,  5.00it/s]                                                     21%|██        | 2648/12776 [26:14<33:43,  5.00it/s] 21%|██        | 2649/12776 [26:14<38:19,  4.40it/s]                                                     21%|██        | 2649/12776 [26:14<38:19,  4.40it/s] 21%|██        | 2650/12776 [26:15<1:01:25,  2.75it/s]                                                       21%|██        | 2650/12776 [26:15<1:01:25,  2.75it/s] 21%|██        | 2651/12776 [26:16<1:46:54,  1.58it/s]                                                       21%|██        | 2651/12776 [26:16<1:46:54,  1.58it/s] 21%|██        | 2652/12776 [26:17<2:01:18,  1.39it/s]                                                       21%|██        | 2652/12776 [26:17<2:01:18,  1.39it/s] 21%|██        | 2653/12776 [26:18<2:12:36,  1.27it/s]                                                       21%|██        | 2653/12776 [26:18<2:12:36,  1.27it/s] 21%|██        | 2654/12776 [26:19<2:10:37,  1.29it/s]                                                       21%|██        | 2654/12776 [26:19<2:10:37,  1.29it/s] 21%|██        | 2655/12776 [26:19<2:07:51,  1.32it/s]                                                       21%|██        | 2655/12776 [26:19<2:07:51,  1.32it/s] 21%|██        | 2656/12776 [26:20<2:06:09,  1.34it/s]                                                       21%|██        | 2656/12776 [26:20<2:06:09,  1.34it/s] 21%|██        | 2657/12776 [26:21<2:06:38,  1.33it/s]                                                       21%|██        | 2657/12776 [26:21<2:06:38,  1.33it/s] 21%|██        | 2658/12776 [26:22<1:59:19,  1.41it/s]                                                       21%|██        | 2658/12776 [26:22<1:59:19,  1.41it/s] 21%|██        | 2659/12776 [26:22<1:52:21,  1.50it/s]                                                       21%|██        | 2659/12776 [26:22<1:52:21,  1.50it/s] 21%|██        | 2660/12776 [26:23<1:46:08,  1.59it/s]                                                       21%|██        | 2660/12776 [26:23<1:46:08,  1.59it/s] 21%|██        | 2661/12776 [26:23<1:46:23,  1.58it/s]                                                       21%|██        | 2661/12776 [26:23<1:46:23,  1.58it/s] 21%|██        | 2662/12776 [26:24<1:39:30,  1.69it/s]                                                       21%|██        | 2662/12776 [26:24<1:39:30,  1.69it/s] 21%|██        | 2663/12776 [26:24<1:32:46,  1.82it/s]                                                       21%|██        | 2663/12776 [26:24<1:32:46,  1.82it/s] 21%|██        | 2664/12776 [26:25<1:28:23,  1.91it/s]                                                       21%|██        | 2664/12776 [26:25<1:28:23,  1.91it/s] 21%|██        | 2665/12776 [26:25<1:23:39,  2.01it/s]                                                       21%|██        | 2665/12776 [26:25<1:23:39,  2.01it/s] 21%|██        | 2666/12776 [26:26<1:21:22,  2.07it/s]                                                       21%|██        | 2666/12776 [26:26<1:21:22,  2.07it/s] 21%|██        | 2667/12776 [26:26<1:17:08,  2.18it/s]                                                       21%|██        | 2667/12776 [26:26<1:17:08,  2.18it/s] 21%|██        | 2668/12776 [26:26<1:13:25,  2.29it/s]                                                       21%|██        | 2668/12776 [26:26<1:13:25,  2.29it/s] 21%|██        | 2669/12776 [26:27<1:12:21,  2.33it/s]                                                       21%|██        | 2669/12776 [26:27<1:12:21,  2.33it/s] 21%|██        | 2670/12776 [26:27<1:08:23,  2.46it/s]                                                       21%|██        | 2670/12776 [26:27<1:08:23,  2.46it/s] 21%|██        | 2671/12776 [26:27<1:05:28,  2.57it/s]                                                       21%|██        | 2671/12776 [26:27<1:05:28,  2.57it/s] 21%|██        | 2672/12776 [26:28<1:07:55,  2.48it/s]                                                       21%|██        | 2672/12776 [26:28<1:07:55,  2.48it/s] 21%|██        | 2673/12776 [26:28<1:04:22,  2.62it/s]                                                       21%|██        | 2673/12776 [26:28<1:04:22,  2.62it/s] 21%|██        | 2674/12776 [26:29<1:01:31,  2.74it/s]                                                       21%|██        | 2674/12776 [26:29<1:01:31,  2.74it/s] 21%|██        | 2675/12776 [26:29<59:05,  2.85it/s]                                                       21%|██        | 2675/12776 [26:29<59:05,  2.85it/s] 21%|██        | 2676/12776 [26:29<57:48,  2.91it/s]                                                     21%|██        | 2676/12776 [26:29<57:48,  2.91it/s] 21%|██        | 2677/12776 [26:30<55:14,  3.05it/s]                                                     21%|██        | 2677/12776 [26:30<55:14,  3.05it/s] 21%|██        | 2678/12776 [26:30<52:56,  3.18it/s]                                                     21%|██        | 2678/12776 [26:30<52:56,  3.18it/s] 21%|██        | 2679/12776 [26:30<57:57,  2.90it/s]                                                     21%|██        | 2679/12776 [26:30<57:57,  2.90it/s] 21%|██        | 2680/12776 [26:30<54:07,  3.11it/s]                                                     21%|██        | 2680/12776 [26:30<54:07,  3.11it/s] 21%|██        | 2681/12776 [26:31<51:13,  3.28it/s]                                                     21%|██        | 2681/12776 [26:31<51:13,  3.28it/s] 21%|██        | 2682/12776 [26:31<48:38,  3.46it/s]                                                     21%|██        | 2682/12776 [26:31<48:38,  3.46it/s] 21%|██        | 2683/12776 [26:31<52:09,  3.22it/s]                                                     21%|██        | 2683/12776 [26:31<52:09,  3.22it/s] 21%|██        | 2684/12776 [26:32<49:00,  3.43it/s]                                                     21%|██        | 2684/12776 [26:32<49:00,  3.43it/s] 21%|██        | 2685/12776 [26:32<46:24,  3.62it/s]                                                     21%|██        | 2685/12776 [26:32<46:24,  3.62it/s] 21%|██        | 2686/12776 [26:32<44:14,  3.80it/s]                                                     21%|██        | 2686/12776 [26:32<44:14,  3.80it/s] 21%|██        | 2687/12776 [26:32<42:27,  3.96it/s]                                                     21%|██        | 2687/12776 [26:32<42:27,  3.96it/s] 21%|██        | 2688/12776 [26:33<44:03,  3.82it/s]                                                     21%|██        | 2688/12776 [26:33<44:03,  3.82it/s] 21%|██        | 2689/12776 [26:33<41:33,  4.04it/s]                                                     21%|██        | 2689/12776 [26:33<41:33,  4.04it/s] 21%|██        | 2690/12776 [26:33<39:49,  4.22it/s]                                                     21%|██        | 2690/12776 [26:33<39:49,  4.22it/s] 21%|██        | 2691/12776 [26:33<38:24,  4.38it/s]                                                     21%|██        | 2691/12776 [26:33<38:24,  4.38it/s] 21%|██        | 2692/12776 [26:33<37:15,  4.51it/s]                                                     21%|██        | 2692/12776 [26:33<37:15,  4.51it/s] 21%|██        | 2693/12776 [26:34<39:38,  4.24it/s]                                                     21%|██        | 2693/12776 [26:34<39:38,  4.24it/s] 21%|██        | 2694/12776 [26:34<37:49,  4.44it/s]                                                     21%|██        | 2694/12776 [26:34<37:49,  4.44it/s] 21%|██        | 2695/12776 [26:34<36:27,  4.61it/s]                                                     21%|██        | 2695/12776 [26:34<36:27,  4.61it/s] 21%|██        | 2696/12776 [26:34<35:21,  4.75it/s]                                                     21%|██        | 2696/12776 [26:34<35:21,  4.75it/s] 21%|██        | 2697/12776 [26:34<34:25,  4.88it/s]                                                     21%|██        | 2697/12776 [26:34<34:25,  4.88it/s] 21%|██        | 2698/12776 [26:35<35:08,  4.78it/s]                                                     21%|██        | 2698/12776 [26:35<35:08,  4.78it/s] 21%|██        | 2699/12776 [26:35<33:50,  4.96it/s]                                                     21%|██        | 2699/12776 [26:35<33:50,  4.96it/s] 21%|██        | 2700/12776 [26:36<1:02:58,  2.67it/s]                                                      {'loss': 0.9523, 'grad_norm': 3.4538111686706543, 'learning_rate': 0.00024828934506353857, 'epoch': 0.41}
+{'loss': 0.8096, 'grad_norm': 1.925356388092041, 'learning_rate': 0.0002482649071358749, 'epoch': 0.41}
+{'loss': 0.9204, 'grad_norm': 1.6007717847824097, 'learning_rate': 0.0002482404692082111, 'epoch': 0.41}
+{'loss': 0.9734, 'grad_norm': 2.12320613861084, 'learning_rate': 0.0002482160312805474, 'epoch': 0.41}
+{'loss': 1.0131, 'grad_norm': 1.3324270248413086, 'learning_rate': 0.0002481915933528837, 'epoch': 0.41}
+{'loss': 0.9945, 'grad_norm': 1.9677183628082275, 'learning_rate': 0.00024816715542521993, 'epoch': 0.41}
+{'loss': 0.813, 'grad_norm': 1.487841010093689, 'learning_rate': 0.0002481427174975562, 'epoch': 0.41}
+{'loss': 0.8284, 'grad_norm': 2.215928554534912, 'learning_rate': 0.00024811827956989244, 'epoch': 0.41}
+{'loss': 0.9693, 'grad_norm': 2.1060638427734375, 'learning_rate': 0.00024809384164222874, 'epoch': 0.41}
+{'loss': 0.5589, 'grad_norm': 1.2802146673202515, 'learning_rate': 0.000248069403714565, 'epoch': 0.41}
+{'loss': 0.9417, 'grad_norm': 2.4846386909484863, 'learning_rate': 0.00024804496578690124, 'epoch': 0.41}
+{'loss': 0.523, 'grad_norm': 1.465616226196289, 'learning_rate': 0.00024802052785923755, 'epoch': 0.41}
+{'loss': 0.7951, 'grad_norm': 1.689579963684082, 'learning_rate': 0.00024799608993157375, 'epoch': 0.41}
+{'loss': 1.4816, 'grad_norm': 2.5636866092681885, 'learning_rate': 0.00024797165200391005, 'epoch': 0.41}
+{'loss': 1.4351, 'grad_norm': 2.3543753623962402, 'learning_rate': 0.0002479472140762463, 'epoch': 0.41}
+{'loss': 1.5062, 'grad_norm': 2.550464630126953, 'learning_rate': 0.00024792277614858255, 'epoch': 0.41}
+{'loss': 1.5573, 'grad_norm': 2.0879738330841064, 'learning_rate': 0.00024789833822091886, 'epoch': 0.41}
+{'loss': 1.5449, 'grad_norm': 2.426447629928589, 'learning_rate': 0.0002478739002932551, 'epoch': 0.41}
+{'loss': 1.1526, 'grad_norm': 2.501875877380371, 'learning_rate': 0.00024784946236559136, 'epoch': 0.41}
+{'loss': 1.4765, 'grad_norm': 3.55033540725708, 'learning_rate': 0.00024782502443792767, 'epoch': 0.41}
+{'loss': 1.6887, 'grad_norm': 4.644171237945557, 'learning_rate': 0.0002478005865102639, 'epoch': 0.41}
+{'loss': 1.3512, 'grad_norm': 2.9000136852264404, 'learning_rate': 0.00024777614858260017, 'epoch': 0.41}
+{'loss': 1.1566, 'grad_norm': 2.809741735458374, 'learning_rate': 0.0002477517106549364, 'epoch': 0.41}
+{'loss': 1.4035, 'grad_norm': 2.349257230758667, 'learning_rate': 0.0002477272727272727, 'epoch': 0.41}
+{'loss': 1.3006, 'grad_norm': 3.1145989894866943, 'learning_rate': 0.000247702834799609, 'epoch': 0.41}
+{'loss': 1.3462, 'grad_norm': 1.8506964445114136, 'learning_rate': 0.00024767839687194523, 'epoch': 0.41}
+{'loss': 0.7869, 'grad_norm': 1.8519948720932007, 'learning_rate': 0.00024765395894428153, 'epoch': 0.41}
+{'loss': 1.4319, 'grad_norm': 1.9720544815063477, 'learning_rate': 0.0002476295210166178, 'epoch': 0.41}
+{'loss': 2.3146, 'grad_norm': 3.180431365966797, 'learning_rate': 0.00024760508308895404, 'epoch': 0.41}
+{'loss': 0.4092, 'grad_norm': 0.8477596640586853, 'learning_rate': 0.0002475806451612903, 'epoch': 0.41}
+{'loss': 0.3916, 'grad_norm': 0.5798821449279785, 'learning_rate': 0.00024755620723362654, 'epoch': 0.42}
+{'loss': 0.3177, 'grad_norm': 0.638412356376648, 'learning_rate': 0.00024753176930596284, 'epoch': 0.42}
+{'loss': 0.3968, 'grad_norm': 0.5856349468231201, 'learning_rate': 0.0002475073313782991, 'epoch': 0.42}
+{'loss': 0.3886, 'grad_norm': 0.852562665939331, 'learning_rate': 0.00024748289345063535, 'epoch': 0.42}
+{'loss': 0.4342, 'grad_norm': 0.7616521716117859, 'learning_rate': 0.00024745845552297165, 'epoch': 0.42}
+{'loss': 0.4359, 'grad_norm': 0.7631804943084717, 'learning_rate': 0.0002474340175953079, 'epoch': 0.42}
+{'loss': 0.3401, 'grad_norm': 1.0951449871063232, 'learning_rate': 0.00024740957966764416, 'epoch': 0.42}
+{'loss': 0.3564, 'grad_norm': 0.5410890579223633, 'learning_rate': 0.0002473851417399804, 'epoch': 0.42}
+{'loss': 0.5835, 'grad_norm': 0.9513525366783142, 'learning_rate': 0.0002473607038123167, 'epoch': 0.42}
+{'loss': 0.5584, 'grad_norm': 1.1458948850631714, 'learning_rate': 0.00024733626588465296, 'epoch': 0.42}
+{'loss': 0.4138, 'grad_norm': 1.0255465507507324, 'learning_rate': 0.0002473118279569892, 'epoch': 0.42}
+{'loss': 0.7455, 'grad_norm': 2.739941120147705, 'learning_rate': 0.0002472873900293255, 'epoch': 0.42}
+{'loss': 0.6217, 'grad_norm': 1.1839171648025513, 'learning_rate': 0.00024726295210166177, 'epoch': 0.42}
+{'loss': 0.5179, 'grad_norm': 1.1585807800292969, 'learning_rate': 0.000247238514173998, 'epoch': 0.42}
+{'loss': 0.5774, 'grad_norm': 1.2621674537658691, 'learning_rate': 0.00024721407624633433, 'epoch': 0.42}
+{'loss': 0.6508, 'grad_norm': 1.1701115369796753, 'learning_rate': 0.0002471896383186705, 'epoch': 0.42}
+{'loss': 0.809, 'grad_norm': 1.1155378818511963, 'learning_rate': 0.00024716520039100683, 'epoch': 0.42}
+{'loss': 0.5816, 'grad_norm': 1.2293123006820679, 'learning_rate': 0.0002471407624633431, 'epoch': 0.42}
+{'loss': 0.6369, 'grad_norm': 1.4187663793563843, 'learning_rate': 0.00024711632453567933, 'epoch': 0.42}
+{'loss': 0.715, 'grad_norm': 1.2454630136489868, 'learning_rate': 0.00024709188660801564, 'epoch': 0.42}
+{'loss': 0.5052, 'grad_norm': 1.0575640201568604, 'learning_rate': 0.0002470674486803519, 'epoch': 0.42}
+{'loss': 0.8882, 'grad_norm': 1.1785027980804443, 'learning_rate': 0.00024704301075268814, 'epoch': 0.42}
+{'loss': 1.3041, 'grad_norm': 2.9896340370178223, 'learning_rate': 0.0002470185728250244, 'epoch': 0.42}
+{'loss': 0.7306, 'grad_norm': 2.0894863605499268, 'learning_rate': 0.0002469941348973607, 'epoch': 0.42}
+{'loss': 0.846, 'grad_norm': 1.6486024856567383, 'learning_rate': 0.00024696969696969695, 'epoch': 0.42}
+{'loss': 0.8831, 'grad_norm': 2.0100576877593994, 'learning_rate': 0.0002469452590420332, 'epoch': 0.42}
+{'loss': 0.8862, 'grad_norm': 2.7158892154693604, 'learning_rate': 0.0002469208211143695, 'epoch': 0.42}
+{'loss': 1.0893, 'grad_norm': 2.027427911758423, 'learning_rate': 0.00024689638318670576, 'epoch': 0.42}
+{'loss': 0.9038, 'grad_norm': 1.634826421737671, 'learning_rate': 0.000246871945259042, 'epoch': 0.42}
+{'loss': 0.6251, 'grad_norm': 1.4327385425567627, 'learning_rate': 0.0002468475073313783, 'epoch': 0.42}
+{'loss': 0.9759, 'grad_norm': 2.376344919204712, 'learning_rate': 0.0002468230694037145, 'epoch': 0.42}
+{'loss': 1.1159, 'grad_norm': 2.1796486377716064, 'learning_rate': 0.0002467986314760508, 'epoch': 0.42}
+{'loss': 1.0303, 'grad_norm': 2.264559507369995, 'learning_rate': 0.00024677419354838707, 'epoch': 0.42}
+{'loss': 1.2559, 'grad_norm': 3.5693933963775635, 'learning_rate': 0.0002467497556207233, 'epoch': 0.42}
+{'loss': 1.3884, 'grad_norm': 1.9500762224197388, 'learning_rate': 0.0002467253176930596, 'epoch': 0.42}
+{'loss': 1.1449, 'grad_norm': 1.6748422384262085, 'learning_rate': 0.0002467008797653959, 'epoch': 0.42}
+{'loss': 0.8418, 'grad_norm': 1.6067205667495728, 'learning_rate': 0.0002466764418377321, 'epoch': 0.42}
+{'loss': 0.8212, 'grad_norm': 1.628675937652588, 'learning_rate': 0.00024665200391006843, 'epoch': 0.42}
+{'loss': 1.5843, 'grad_norm': 2.2137458324432373, 'learning_rate': 0.0002466275659824047, 'epoch': 0.42}
+{'loss': 1.4545, 'grad_norm': 2.58758282661438, 'learning_rate': 0.00024660312805474093, 'epoch': 0.42}
+{'loss': 1.8048, 'grad_norm': 2.075272560119629, 'learning_rate': 0.0002465786901270772, 'epoch': 0.42}
+{'loss': 1.3897, 'grad_norm': 2.119429588317871, 'learning_rate': 0.0002465542521994135, 'epoch': 0.42}
+{'loss': 1.8498, 'grad_norm': 1.7578437328338623, 'learning_rate': 0.00024652981427174974, 'epoch': 0.42}
+{'loss': 1.3389, 'grad_norm': 1.8442625999450684, 'learning_rate': 0.000246505376344086, 'epoch': 0.42}
+{'loss': 0.8474, 'grad_norm': 1.9508384466171265, 'learning_rate': 0.0002464809384164223, 'epoch': 0.42}
+{'loss': 0.8622, 'grad_norm': 1.3713496923446655, 'learning_rate': 0.00024645650048875855, 'epoch': 0.42}
+{'loss': 0.6771, 'grad_norm': 2.2516250610351562, 'learning_rate': 0.0002464320625610948, 'epoch': 0.42}
+{'loss': 1.3575, 'grad_norm': 2.1349215507507324, 'learning_rate': 0.00024640762463343105, 'epoch': 0.42}
+ 21%|██        | 2700/12776 [26:36<1:02:58,  2.67it/s] 21%|██        | 2701/12776 [26:37<2:02:57,  1.37it/s]                                                       21%|██        | 2701/12776 [26:37<2:02:57,  1.37it/s] 21%|██        | 2702/12776 [26:38<2:19:45,  1.20it/s]                                                       21%|██        | 2702/12776 [26:38<2:19:45,  1.20it/s] 21%|██        | 2703/12776 [26:39<2:25:31,  1.15it/s]                                                       21%|██        | 2703/12776 [26:39<2:25:31,  1.15it/s] 21%|██        | 2704/12776 [26:40<2:21:18,  1.19it/s]                                                       21%|██        | 2704/12776 [26:40<2:21:18,  1.19it/s] 21%|██        | 2705/12776 [26:41<2:18:39,  1.21it/s]                                                       21%|██        | 2705/12776 [26:41<2:18:39,  1.21it/s] 21%|██        | 2706/12776 [26:42<2:13:29,  1.26it/s]                                                       21%|██        | 2706/12776 [26:42<2:13:29,  1.26it/s] 21%|██        | 2707/12776 [26:42<2:05:49,  1.33it/s]                                                       21%|██        | 2707/12776 [26:42<2:05:49,  1.33it/s] 21%|██        | 2708/12776 [26:43<2:06:14,  1.33it/s]                                                       21%|██        | 2708/12776 [26:43<2:06:14,  1.33it/s] 21%|██        | 2709/12776 [26:44<1:57:22,  1.43it/s]                                                       21%|██        | 2709/12776 [26:44<1:57:22,  1.43it/s] 21%|██        | 2710/12776 [26:44<1:53:16,  1.48it/s]                                                       21%|██        | 2710/12776 [26:44<1:53:16,  1.48it/s] 21%|██        | 2711/12776 [26:45<1:46:18,  1.58it/s]                                                       21%|██        | 2711/12776 [26:45<1:46:18,  1.58it/s] 21%|██        | 2712/12776 [26:45<1:44:59,  1.60it/s]                                                       21%|██        | 2712/12776 [26:45<1:44:59,  1.60it/s] 21%|██        | 2713/12776 [26:46<1:38:06,  1.71it/s]                                                       21%|██        | 2713/12776 [26:46<1:38:06,  1.71it/s] 21%|██        | 2714/12776 [26:46<1:36:30,  1.74it/s]                                                       21%|██        | 2714/12776 [26:46<1:36:30,  1.74it/s] 21%|██▏       | 2715/12776 [26:47<1:29:44,  1.87it/s]                                                       21%|██▏       | 2715/12776 [26:47<1:29:44,  1.87it/s] 21%|██▏       | 2716/12776 [26:47<1:28:25,  1.90it/s]                                                       21%|██▏       | 2716/12776 [26:47<1:28:25,  1.90it/s] 21%|██▏       | 2717/12776 [26:48<1:22:33,  2.03it/s]                                                       21%|██▏       | 2717/12776 [26:48<1:22:33,  2.03it/s] 21%|██▏       | 2718/12776 [26:48<1:17:42,  2.16it/s]                                                       21%|██▏       | 2718/12776 [26:48<1:17:42,  2.16it/s] 21%|██▏       | 2719/12776 [26:49<1:19:34,  2.11it/s]                                                       21%|██▏       | 2719/12776 [26:49<1:19:34,  2.11it/s] 21%|██▏       | 2720/12776 [26:49<1:14:09,  2.26it/s]                                                       21%|██▏       | 2720/12776 [26:49<1:14:09,  2.26it/s] 21%|██▏       | 2721/12776 [26:49<1:09:28,  2.41it/s]                                                       21%|██▏       | 2721/12776 [26:49<1:09:28,  2.41it/s] 21%|██▏       | 2722/12776 [26:50<1:09:27,  2.41it/s]                                                       21%|██▏       | 2722/12776 [26:50<1:09:27,  2.41it/s] 21%|██▏       | 2723/12776 [26:50<1:05:28,  2.56it/s]                                                       21%|██▏       | 2723/12776 [26:50<1:05:28,  2.56it/s] 21%|██▏       | 2724/12776 [26:50<1:02:12,  2.69it/s]                                                       21%|██▏       | 2724/12776 [26:50<1:02:12,  2.69it/s] 21%|██▏       | 2725/12776 [26:51<1:00:56,  2.75it/s]                                                       21%|██▏       | 2725/12776 [26:51<1:00:56,  2.75it/s] 21%|██▏       | 2726/12776 [26:51<57:36,  2.91it/s]                                                       21%|██▏       | 2726/12776 [26:51<57:36,  2.91it/s] 21%|██▏       | 2727/12776 [26:51<54:55,  3.05it/s]                                                     21%|██▏       | 2727/12776 [26:51<54:55,  3.05it/s] 21%|██▏       | 2728/12776 [26:52<53:09,  3.15it/s]                                                     21%|██▏       | 2728/12776 [26:52<53:09,  3.15it/s] 21%|██▏       | 2729/12776 [26:52<58:05,  2.88it/s]                                                     21%|██▏       | 2729/12776 [26:52<58:05,  2.88it/s] 21%|██▏       | 2730/12776 [26:52<54:07,  3.09it/s]                                                     21%|██▏       | 2730/12776 [26:52<54:07,  3.09it/s] 21%|██▏       | 2731/12776 [26:53<50:25,  3.32it/s]                                                     21%|██▏       | 2731/12776 [26:53<50:25,  3.32it/s] 21%|██▏       | 2732/12776 [26:53<47:52,  3.50it/s]                                                     21%|██▏       | 2732/12776 [26:53<47:52,  3.50it/s] 21%|██▏       | 2733/12776 [26:53<50:20,  3.32it/s]                                                     21%|██▏       | 2733/12776 [26:53<50:20,  3.32it/s] 21%|██▏       | 2734/12776 [26:53<47:20,  3.54it/s]                                                     21%|██▏       | 2734/12776 [26:53<47:20,  3.54it/s] 21%|██▏       | 2735/12776 [26:54<44:46,  3.74it/s]                                                     21%|██▏       | 2735/12776 [26:54<44:46,  3.74it/s] 21%|██▏       | 2736/12776 [26:54<42:47,  3.91it/s]                                                     21%|██▏       | 2736/12776 [26:54<42:47,  3.91it/s] 21%|██▏       | 2737/12776 [26:54<46:04,  3.63it/s]                                                     21%|██▏       | 2737/12776 [26:54<46:04,  3.63it/s] 21%|██▏       | 2738/12776 [26:54<43:01,  3.89it/s]                                                     21%|██▏       | 2738/12776 [26:54<43:01,  3.89it/s] 21%|██▏       | 2739/12776 [26:55<40:44,  4.11it/s]                                                     21%|██▏       | 2739/12776 [26:55<40:44,  4.11it/s] 21%|██▏       | 2740/12776 [26:55<39:11,  4.27it/s]                                                     21%|██▏       | 2740/12776 [26:55<39:11,  4.27it/s] 21%|██▏       | 2741/12776 [26:55<37:55,  4.41it/s]                                                     21%|██▏       | 2741/12776 [26:55<37:55,  4.41it/s] 21%|██▏       | 2742/12776 [26:55<40:04,  4.17it/s]                                                     21%|██▏       | 2742/12776 [26:55<40:04,  4.17it/s] 21%|██▏       | 2743/12776 [26:55<38:05,  4.39it/s]                                                     21%|██▏       | 2743/12776 [26:55<38:05,  4.39it/s] 21%|██▏       | 2744/12776 [26:56<36:38,  4.56it/s]                                                     21%|██▏       | 2744/12776 [26:56<36:38,  4.56it/s] 21%|██▏       | 2745/12776 [26:56<35:30,  4.71it/s]                                                     21%|██▏       | 2745/12776 [26:56<35:30,  4.71it/s] 21%|██▏       | 2746/12776 [26:56<34:41,  4.82it/s]                                                     21%|██▏       | 2746/12776 [26:56<34:41,  4.82it/s] 22%|██▏       | 2747/12776 [26:56<33:51,  4.94it/s]                                                     22%|██▏       | 2747/12776 [26:56<33:51,  4.94it/s] 22%|██▏       | 2748/12776 [26:56<36:27,  4.58it/s]                                                     22%|██▏       | 2748/12776 [26:56<36:27,  4.58it/s] 22%|██▏       | 2749/12776 [26:57<34:46,  4.81it/s]                                                     22%|██▏       | 2749/12776 [26:57<34:46,  4.81it/s] 22%|██▏       | 2750/12776 [26:58<1:06:52,  2.50it/s]                                                       22%|██▏       | 2750/12776 [26:58<1:06:52,  2.50it/s] 22%|██▏       | 2751/12776 [26:59<2:05:04,  1.34it/s]                                                       22%|██▏       | 2751/12776 [26:59<2:05:04,  1.34it/s] 22%|██▏       | 2752/12776 [27:00<2:15:44,  1.23it/s]                                                       22%|██▏       | 2752/12776 [27:00<2:15:44,  1.23it/s] 22%|██▏       | 2753/12776 [27:01<2:26:26,  1.14it/s]                                                       22%|██▏       | 2753/12776 [27:01<2:26:26,  1.14it/s] 22%|██▏       | 2754/12776 [27:02<2:21:27,  1.18it/s]                                                       22%|██▏       | 2754/12776 [27:02<2:21:27,  1.18it/s] 22%|██▏       | 2755/12776 [27:03<2:17:26,  1.22it/s]                                                       22%|██▏       | 2755/12776 [27:03<2:17:26,  1.22it/s] 22%|██▏       | 2756/12776 [27:03<2:12:11,  1.26it/s]                                                       22%|██▏       | 2756/12776 [27:03<2:12:11,  1.26it/s] 22%|██▏       | 2757/12776 [27:04<2:05:41,  1.33it/s]                                                       22%|██▏       | 2757/12776 [27:04<2:05:41,  1.33it/s] 22%|██▏       | 2758/12776 [27:05<2:06:05,  1.32it/s]                                                       22%|██▏       | 2758/12776 [27:05<2:06:05,  1.32it/s] 22%|██▏       | 2759/12776 [27:05<1:58:50,  1.40it/s]                                                       22%|██▏       | 2759/12776 [27:05<1:58:50,  1.40it/s] 22%|██▏       | 2760/12776 [27:06<1:53:16,  1.47it/s]                                                       22%|██▏       | 2760/12776 [27:06<1:53:16,  1.47it/s] 22%|██▏       | 2761/12776 [27:07<1:46:50,  1.56it/s]                                                       22%|██▏       | 2761/12776 [27:07<1:46:50,  1.56it/s] 22%|██▏       | 2762/12776 [27:07<1:43:07,  1.62it/s]                                                       22%|██▏       | 2762/12776 [27:07<1:43:07,  1.62it/s] 22%|██▏       | 2763/12776 [27:08<1:37:20,  1.71it/s]                                                       22%|██▏       | 2763/12776 [27:08<1:37:20,  1.71it/s] 22%|██▏       | 2764/12776 [27:08<1:35:59,  1.74it/s]                                                       22%|██▏       | 2764/12776 [27:08<1:35:59,  1.74it/s] 22%|██▏       | 2765/12776 [27:09<1:29:30,  1.86it/s]                                                       22%|██▏       | 2765/12776 [27:09<1:29:30,  1.86it/s] 22%|██▏       | 2766/12776 [27:09<1:28:28,  1.89it/s]                                                       22%|██▏       | 2766/12776 [27:09<1:28:28,  1.89it/s] 22%|██▏       | 2767/12776 [27:10<1:22:37,  2.02it/s]                                                       22%|██▏       | 2767/12776 [27:10<1:22:37,  2.02it/s] 22%|██▏       | 2768/12776 [27:10<1:17:55,  2.14it/s]                                                       22%|██▏       | 2768/12776 [27:10<1:17:55,  2.14it/s] 22%|██▏       | 2769/12776 [27:10<1:20:19,  2.08it/s]                                                       22%|██▏       | 2769/12776 [27:10<1:20:19,  2.08it/s] 22%|██▏       | 2770/12776 [27:11<1:15:19,  2.21it/s]                                                       22%|██▏       | 2770/12776 [27:11<1:15:19,  2.21it/s] 22%|██▏       | 2771/12776 [27:11<1:10:38,  2.36it/s]                                                       22%|██▏       | 2771/12776 [27:11<1:10:38,  2.36it/s] 22%|██▏       | 2772/12776 [27:12<1:10:32,  2.36it/s]                                                       22%|██▏       | 2772/12776 [27:12<1:10:32,  2.36it/s] 22%|██▏       | 2773/12776 [27:12<1:06:13,  2.52it/s]                                                       22%|██▏       | 2773/12776 [27:12<1:06:13,  2.52it/s] 22%|██▏       | 2774/12776 [27:12<1:02:51,  2.65it/s]                                                       22%|██▏       | 2774/12776 [27:12<1:02:51,  2.65it/s] 22%|██▏       | 2775/12776 [27:13<1:05:18,  2.55it/s]                                                       22%|██▏       | 2775/12776 [27:13<1:05:18,  2.55it/s] 22%|██▏       | 2776/12776 [27:13<1:00:59,  2.73it/s]                                                       22%|██▏       | 2776/12776 [27:13<1:00:59,  2.73it/s] 22%|██▏       | 2777/12776 [27:13<57:51,  2.88it/s]                                                       22%|██▏       | 2777/12776 [27:13<57:51,  2.88it/s] 22%|██▏       | 2778/12776 [27:14<58:44,  2.84it/s]                                                    {'loss': 1.4243, 'grad_norm': 3.8896472454071045, 'learning_rate': 0.0002463831867057673, 'epoch': 0.42}
+{'loss': 0.3883, 'grad_norm': 0.8927620053291321, 'learning_rate': 0.0002463587487781036, 'epoch': 0.42}
+{'loss': 0.5873, 'grad_norm': 0.9689701199531555, 'learning_rate': 0.00024633431085043986, 'epoch': 0.42}
+{'loss': 0.276, 'grad_norm': 0.4493149220943451, 'learning_rate': 0.0002463098729227761, 'epoch': 0.42}
+{'loss': 0.3299, 'grad_norm': 0.5155251026153564, 'learning_rate': 0.0002462854349951124, 'epoch': 0.42}
+{'loss': 0.3684, 'grad_norm': 0.7345922589302063, 'learning_rate': 0.00024626099706744867, 'epoch': 0.42}
+{'loss': 0.4542, 'grad_norm': 0.633811354637146, 'learning_rate': 0.0002462365591397849, 'epoch': 0.42}
+{'loss': 0.4833, 'grad_norm': 0.9088723063468933, 'learning_rate': 0.00024621212121212117, 'epoch': 0.42}
+{'loss': 0.461, 'grad_norm': 1.057075023651123, 'learning_rate': 0.0002461876832844575, 'epoch': 0.42}
+{'loss': 0.3916, 'grad_norm': 0.996385395526886, 'learning_rate': 0.00024616324535679373, 'epoch': 0.42}
+{'loss': 0.4329, 'grad_norm': 1.1317174434661865, 'learning_rate': 0.00024613880742913, 'epoch': 0.42}
+{'loss': 0.3573, 'grad_norm': 0.8854171633720398, 'learning_rate': 0.0002461143695014663, 'epoch': 0.42}
+{'loss': 0.6152, 'grad_norm': 1.3034350872039795, 'learning_rate': 0.00024608993157380254, 'epoch': 0.42}
+{'loss': 0.5314, 'grad_norm': 0.8485596776008606, 'learning_rate': 0.0002460654936461388, 'epoch': 0.42}
+{'loss': 0.4594, 'grad_norm': 1.0428731441497803, 'learning_rate': 0.0002460410557184751, 'epoch': 0.42}
+{'loss': 0.4709, 'grad_norm': 1.12111496925354, 'learning_rate': 0.0002460166177908113, 'epoch': 0.43}
+{'loss': 0.5229, 'grad_norm': 1.1140966415405273, 'learning_rate': 0.0002459921798631476, 'epoch': 0.43}
+{'loss': 0.5124, 'grad_norm': 0.8085500597953796, 'learning_rate': 0.00024596774193548385, 'epoch': 0.43}
+{'loss': 0.6329, 'grad_norm': 1.5303175449371338, 'learning_rate': 0.0002459433040078201, 'epoch': 0.43}
+{'loss': 0.6225, 'grad_norm': 1.0877023935317993, 'learning_rate': 0.0002459188660801564, 'epoch': 0.43}
+{'loss': 0.5107, 'grad_norm': 1.3418067693710327, 'learning_rate': 0.00024589442815249265, 'epoch': 0.43}
+{'loss': 0.6342, 'grad_norm': 1.9135111570358276, 'learning_rate': 0.0002458699902248289, 'epoch': 0.43}
+{'loss': 0.5549, 'grad_norm': 1.0036910772323608, 'learning_rate': 0.00024584555229716516, 'epoch': 0.43}
+{'loss': 0.6197, 'grad_norm': 1.8668835163116455, 'learning_rate': 0.00024582111436950146, 'epoch': 0.43}
+{'loss': 0.6798, 'grad_norm': 2.176187515258789, 'learning_rate': 0.0002457966764418377, 'epoch': 0.43}
+{'loss': 1.0341, 'grad_norm': 1.4894177913665771, 'learning_rate': 0.00024577223851417396, 'epoch': 0.43}
+{'loss': 0.8838, 'grad_norm': 2.1725120544433594, 'learning_rate': 0.00024574780058651027, 'epoch': 0.43}
+{'loss': 0.5326, 'grad_norm': 1.8074668645858765, 'learning_rate': 0.0002457233626588465, 'epoch': 0.43}
+{'loss': 0.6351, 'grad_norm': 2.6459243297576904, 'learning_rate': 0.00024569892473118277, 'epoch': 0.43}
+{'loss': 0.9502, 'grad_norm': 1.6028372049331665, 'learning_rate': 0.0002456744868035191, 'epoch': 0.43}
+{'loss': 1.3844, 'grad_norm': 2.833115816116333, 'learning_rate': 0.0002456500488758553, 'epoch': 0.43}
+{'loss': 1.0888, 'grad_norm': 1.2911146879196167, 'learning_rate': 0.0002456256109481916, 'epoch': 0.43}
+{'loss': 1.0575, 'grad_norm': 1.904571771621704, 'learning_rate': 0.00024560117302052783, 'epoch': 0.43}
+{'loss': 1.5075, 'grad_norm': 2.163661241531372, 'learning_rate': 0.0002455767350928641, 'epoch': 0.43}
+{'loss': 1.4266, 'grad_norm': 5.065518379211426, 'learning_rate': 0.0002455522971652004, 'epoch': 0.43}
+{'loss': 1.0812, 'grad_norm': 1.9618192911148071, 'learning_rate': 0.00024552785923753664, 'epoch': 0.43}
+{'loss': 1.0698, 'grad_norm': 2.5747668743133545, 'learning_rate': 0.0002455034213098729, 'epoch': 0.43}
+{'loss': 0.883, 'grad_norm': 2.1782901287078857, 'learning_rate': 0.0002454789833822092, 'epoch': 0.43}
+{'loss': 1.4407, 'grad_norm': 5.35093879699707, 'learning_rate': 0.00024545454545454545, 'epoch': 0.43}
+{'loss': 1.0695, 'grad_norm': 2.183814287185669, 'learning_rate': 0.0002454301075268817, 'epoch': 0.43}
+{'loss': 1.0842, 'grad_norm': 2.1607635021209717, 'learning_rate': 0.00024540566959921795, 'epoch': 0.43}
+{'loss': 1.4593, 'grad_norm': 2.969998598098755, 'learning_rate': 0.00024538123167155426, 'epoch': 0.43}
+{'loss': 0.8874, 'grad_norm': 1.807499647140503, 'learning_rate': 0.0002453567937438905, 'epoch': 0.43}
+{'loss': 1.5912, 'grad_norm': 1.7791450023651123, 'learning_rate': 0.00024533235581622676, 'epoch': 0.43}
+{'loss': 1.0056, 'grad_norm': 1.3194993734359741, 'learning_rate': 0.00024530791788856306, 'epoch': 0.43}
+{'loss': 1.4255, 'grad_norm': 2.4051010608673096, 'learning_rate': 0.00024528347996089926, 'epoch': 0.43}
+{'loss': 1.2331, 'grad_norm': 3.256500482559204, 'learning_rate': 0.00024525904203323557, 'epoch': 0.43}
+{'loss': 0.845, 'grad_norm': 2.3121085166931152, 'learning_rate': 0.0002452346041055718, 'epoch': 0.43}
+{'loss': 0.851, 'grad_norm': 2.94036865234375, 'learning_rate': 0.00024521016617790807, 'epoch': 0.43}
+{'loss': 0.5205, 'grad_norm': 2.072420597076416, 'learning_rate': 0.0002451857282502444, 'epoch': 0.43}
+{'loss': 0.9723, 'grad_norm': 2.3945791721343994, 'learning_rate': 0.0002451612903225806, 'epoch': 0.43}
+{'loss': 0.3922, 'grad_norm': 0.6630196571350098, 'learning_rate': 0.0002451368523949169, 'epoch': 0.43}
+{'loss': 0.4675, 'grad_norm': 0.6978053450584412, 'learning_rate': 0.0002451124144672532, 'epoch': 0.43}
+{'loss': 0.3968, 'grad_norm': 0.570740818977356, 'learning_rate': 0.00024508797653958943, 'epoch': 0.43}
+{'loss': 0.3249, 'grad_norm': 0.5810028314590454, 'learning_rate': 0.0002450635386119257, 'epoch': 0.43}
+{'loss': 0.4897, 'grad_norm': 1.180005431175232, 'learning_rate': 0.00024503910068426194, 'epoch': 0.43}
+{'loss': 0.3243, 'grad_norm': 0.7174784541130066, 'learning_rate': 0.00024501466275659824, 'epoch': 0.43}
+{'loss': 0.5946, 'grad_norm': 0.7873983979225159, 'learning_rate': 0.0002449902248289345, 'epoch': 0.43}
+{'loss': 0.4663, 'grad_norm': 0.7748199105262756, 'learning_rate': 0.00024496578690127074, 'epoch': 0.43}
+{'loss': 0.3024, 'grad_norm': 0.571399450302124, 'learning_rate': 0.00024494134897360705, 'epoch': 0.43}
+{'loss': 0.5797, 'grad_norm': 0.838038980960846, 'learning_rate': 0.0002449169110459433, 'epoch': 0.43}
+{'loss': 0.4022, 'grad_norm': 0.9804670214653015, 'learning_rate': 0.00024489247311827955, 'epoch': 0.43}
+{'loss': 0.3703, 'grad_norm': 0.7213824391365051, 'learning_rate': 0.0002448680351906158, 'epoch': 0.43}
+{'loss': 0.3412, 'grad_norm': 1.013265609741211, 'learning_rate': 0.00024484359726295205, 'epoch': 0.43}
+{'loss': 0.8917, 'grad_norm': 1.2366046905517578, 'learning_rate': 0.00024481915933528836, 'epoch': 0.43}
+{'loss': 0.3694, 'grad_norm': 0.8313443660736084, 'learning_rate': 0.0002447947214076246, 'epoch': 0.43}
+{'loss': 0.7908, 'grad_norm': 1.9965428113937378, 'learning_rate': 0.00024477028347996086, 'epoch': 0.43}
+{'loss': 0.5598, 'grad_norm': 1.3897510766983032, 'learning_rate': 0.00024474584555229717, 'epoch': 0.43}
+{'loss': 0.5628, 'grad_norm': 1.1880437135696411, 'learning_rate': 0.0002447214076246334, 'epoch': 0.43}
+{'loss': 0.9813, 'grad_norm': 3.2662549018859863, 'learning_rate': 0.00024469696969696967, 'epoch': 0.43}
+{'loss': 0.5714, 'grad_norm': 1.4377611875534058, 'learning_rate': 0.0002446725317693059, 'epoch': 0.43}
+{'loss': 1.2337, 'grad_norm': 2.769331693649292, 'learning_rate': 0.0002446480938416422, 'epoch': 0.43}
+{'loss': 0.5278, 'grad_norm': 1.2153619527816772, 'learning_rate': 0.0002446236559139785, 'epoch': 0.43}
+{'loss': 0.7495, 'grad_norm': 2.0345957279205322, 'learning_rate': 0.00024459921798631473, 'epoch': 0.43}
+{'loss': 0.6209, 'grad_norm': 1.450657844543457, 'learning_rate': 0.00024457478005865103, 'epoch': 0.43}
+{'loss': 0.6324, 'grad_norm': 1.648074746131897, 'learning_rate': 0.0002445503421309873, 'epoch': 0.43}
+{'loss': 0.754, 'grad_norm': 1.3603814840316772, 'learning_rate': 0.00024452590420332354, 'epoch': 0.43}
+{'loss': 0.73, 'grad_norm': 3.350707769393921, 'learning_rate': 0.00024450146627565984, 'epoch': 0.43}
+ 22%|██▏       | 2778/12776 [27:14<58:44,  2.84it/s] 22%|██▏       | 2779/12776 [27:14<55:14,  3.02it/s]                                                     22%|██▏       | 2779/12776 [27:14<55:14,  3.02it/s] 22%|██▏       | 2780/12776 [27:14<52:15,  3.19it/s]                                                     22%|██▏       | 2780/12776 [27:14<52:15,  3.19it/s] 22%|██▏       | 2781/12776 [27:14<49:49,  3.34it/s]                                                     22%|██▏       | 2781/12776 [27:14<49:49,  3.34it/s] 22%|██▏       | 2782/12776 [27:15<50:22,  3.31it/s]                                                     22%|██▏       | 2782/12776 [27:15<50:22,  3.31it/s] 22%|██▏       | 2783/12776 [27:15<47:49,  3.48it/s]                                                     22%|██▏       | 2783/12776 [27:15<47:49,  3.48it/s] 22%|██▏       | 2784/12776 [27:15<45:38,  3.65it/s]                                                     22%|██▏       | 2784/12776 [27:15<45:38,  3.65it/s] 22%|██▏       | 2785/12776 [27:16<43:42,  3.81it/s]                                                     22%|██▏       | 2785/12776 [27:16<43:42,  3.81it/s] 22%|██▏       | 2786/12776 [27:16<41:59,  3.96it/s]                                                     22%|██▏       | 2786/12776 [27:16<41:59,  3.96it/s] 22%|██▏       | 2787/12776 [27:16<43:19,  3.84it/s]                                                     22%|██▏       | 2787/12776 [27:16<43:19,  3.84it/s] 22%|██▏       | 2788/12776 [27:16<41:03,  4.05it/s]                                                     22%|██▏       | 2788/12776 [27:16<41:03,  4.05it/s] 22%|██▏       | 2789/12776 [27:16<39:14,  4.24it/s]                                                     22%|██▏       | 2789/12776 [27:16<39:14,  4.24it/s] 22%|██▏       | 2790/12776 [27:17<37:57,  4.38it/s]                                                     22%|██▏       | 2790/12776 [27:17<37:57,  4.38it/s] 22%|██▏       | 2791/12776 [27:17<36:50,  4.52it/s]                                                     22%|██▏       | 2791/12776 [27:17<36:50,  4.52it/s] 22%|██▏       | 2792/12776 [27:17<40:23,  4.12it/s]                                                     22%|██▏       | 2792/12776 [27:17<40:23,  4.12it/s] 22%|██▏       | 2793/12776 [27:17<38:14,  4.35it/s]                                                     22%|██▏       | 2793/12776 [27:17<38:14,  4.35it/s] 22%|██▏       | 2794/12776 [27:18<36:38,  4.54it/s]                                                     22%|██▏       | 2794/12776 [27:18<36:38,  4.54it/s] 22%|██▏       | 2795/12776 [27:18<35:28,  4.69it/s]                                                     22%|██▏       | 2795/12776 [27:18<35:28,  4.69it/s] 22%|██▏       | 2796/12776 [27:18<34:28,  4.83it/s]                                                     22%|██▏       | 2796/12776 [27:18<34:28,  4.83it/s] 22%|██▏       | 2797/12776 [27:18<40:18,  4.13it/s]                                                     22%|██▏       | 2797/12776 [27:18<40:18,  4.13it/s] 22%|██▏       | 2798/12776 [27:18<37:27,  4.44it/s]                                                     22%|██▏       | 2798/12776 [27:18<37:27,  4.44it/s] 22%|██▏       | 2799/12776 [27:19<35:25,  4.69it/s]                                                     22%|██▏       | 2799/12776 [27:19<35:25,  4.69it/s] 22%|██▏       | 2800/12776 [27:19<1:03:56,  2.60it/s]                                                       22%|██▏       | 2800/12776 [27:19<1:03:56,  2.60it/s]Saving model checkpoint to ./checkpoint-2800
+Configuration saved in ./checkpoint-2800/config.json
+Model weights saved in ./checkpoint-2800/model.safetensors
+Feature extractor saved in ./checkpoint-2800/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-2800/tokenizer_config.json
+Special tokens file saved in ./checkpoint-2800/special_tokens_map.json
+added tokens file saved in ./checkpoint-2800/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-1600] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 22%|██▏       | 2801/12776 [27:26<6:15:24,  2.26s/it]                                                       22%|██▏       | 2801/12776 [27:26<6:15:24,  2.26s/it] 22%|██▏       | 2802/12776 [27:27<5:10:00,  1.86s/it]                                                       22%|██▏       | 2802/12776 [27:27<5:10:00,  1.86s/it] 22%|██▏       | 2803/12776 [27:28<4:17:43,  1.55s/it]                                                       22%|██▏       | 2803/12776 [27:28<4:17:43,  1.55s/it] 22%|██▏       | 2804/12776 [27:29<3:37:55,  1.31s/it]                                                       22%|██▏       | 2804/12776 [27:29<3:37:55,  1.31s/it] 22%|██▏       | 2805/12776 [27:29<3:08:05,  1.13s/it]                                                       22%|██▏       | 2805/12776 [27:29<3:08:05,  1.13s/it] 22%|██▏       | 2806/12776 [27:30<2:49:03,  1.02s/it]                                                       22%|██▏       | 2806/12776 [27:30<2:49:03,  1.02s/it] 22%|██▏       | 2807/12776 [27:31<2:35:48,  1.07it/s]                                                       22%|██▏       | 2807/12776 [27:31<2:35:48,  1.07it/s] 22%|██▏       | 2808/12776 [27:31<2:19:51,  1.19it/s]                                                       22%|██▏       | 2808/12776 [27:31<2:19:51,  1.19it/s] 22%|██▏       | 2809/12776 [27:32<2:07:22,  1.30it/s]                                                       22%|██▏       | 2809/12776 [27:32<2:07:22,  1.30it/s] 22%|██▏       | 2810/12776 [27:33<1:55:49,  1.43it/s]                                                       22%|██▏       | 2810/12776 [27:33<1:55:49,  1.43it/s] 22%|██▏       | 2811/12776 [27:33<1:49:34,  1.52it/s]                                                       22%|██▏       | 2811/12776 [27:33<1:49:34,  1.52it/s] 22%|██▏       | 2812/12776 [27:34<1:41:02,  1.64it/s]                                                       22%|██▏       | 2812/12776 [27:34<1:41:02,  1.64it/s] 22%|██▏       | 2813/12776 [27:34<1:33:48,  1.77it/s]                                                       22%|██▏       | 2813/12776 [27:34<1:33:48,  1.77it/s] 22%|██▏       | 2814/12776 [27:35<1:30:18,  1.84it/s]                                                       22%|██▏       | 2814/12776 [27:35<1:30:18,  1.84it/s] 22%|██▏       | 2815/12776 [27:35<1:23:51,  1.98it/s]                                                       22%|██▏       | 2815/12776 [27:35<1:23:51,  1.98it/s] 22%|██▏       | 2816/12776 [27:35<1:21:41,  2.03it/s]                                                       22%|██▏       | 2816/12776 [27:35<1:21:41,  2.03it/s] 22%|██▏       | 2817/12776 [27:36<1:16:22,  2.17it/s]                                                       22%|██▏       | 2817/12776 [27:36<1:16:22,  2.17it/s] 22%|██▏       | 2818/12776 [27:36<1:12:03,  2.30it/s]                                                       22%|██▏       | 2818/12776 [27:36<1:12:03,  2.30it/s] 22%|██▏       | 2819/12776 [27:37<1:09:17,  2.39it/s]                                                       22%|██▏       | 2819/12776 [27:37<1:09:17,  2.39it/s] 22%|██▏       | 2820/12776 [27:37<1:04:55,  2.56it/s]                                                       22%|██▏       | 2820/12776 [27:37<1:04:55,  2.56it/s] 22%|██▏       | 2821/12776 [27:37<1:01:28,  2.70it/s]                                                       22%|██▏       | 2821/12776 [27:37<1:01:28,  2.70it/s] 22%|██▏       | 2822/12776 [27:38<58:44,  2.82it/s]                                                       22%|██▏       | 2822/12776 [27:38<58:44,  2.82it/s] 22%|██▏       | 2823/12776 [27:38<59:20,  2.80it/s]                                                     22%|██▏       | 2823/12776 [27:38<59:20,  2.80it/s] 22%|██▏       | 2824/12776 [27:38<56:07,  2.96it/s]                                                     22%|██▏       | 2824/12776 [27:38<56:07,  2.96it/s] 22%|██▏       | 2825/12776 [27:38<53:11,  3.12it/s]                                                     22%|██▏       | 2825/12776 [27:38<53:11,  3.12it/s] 22%|██▏       | 2826/12776 [27:39<54:52,  3.02it/s]                                                     22%|██▏       | 2826/12776 [27:39<54:52,  3.02it/s] 22%|██▏       | 2827/12776 [27:39<51:34,  3.22it/s]                                                     22%|██▏       | 2827/12776 [27:39<51:34,  3.22it/s] 22%|██▏       | 2828/12776 [27:39<48:47,  3.40it/s]                                                     22%|██▏       | 2828/12776 [27:39<48:47,  3.40it/s] 22%|██▏       | 2829/12776 [27:40<46:26,  3.57it/s]                                                     22%|██▏       | 2829/12776 [27:40<46:26,  3.57it/s] 22%|██▏       | 2830/12776 [27:40<50:59,  3.25it/s]                                                     22%|██▏       | 2830/12776 [27:40<50:59,  3.25it/s] 22%|██▏       | 2831/12776 [27:40<47:14,  3.51it/s]                                                     22%|██▏       | 2831/12776 [27:40<47:14,  3.51it/s] 22%|██▏       | 2832/12776 [27:40<44:07,  3.76it/s]                                                     22%|██▏       | 2832/12776 [27:40<44:07,  3.76it/s] 22%|██▏       | 2833/12776 [27:41<41:46,  3.97it/s]                                                     22%|██▏       | 2833/12776 [27:41<41:46,  3.97it/s] 22%|██▏       | 2834/12776 [27:41<39:55,  4.15it/s]                                                     22%|██▏       | 2834/12776 [27:41<39:55,  4.15it/s] 22%|██▏       | 2835/12776 [27:41<41:24,  4.00it/s]                                                     22%|██▏       | 2835/12776 [27:41<41:24,  4.00it/s] 22%|██▏       | 2836/12776 [27:41<39:04,  4.24it/s]                                                     22%|██▏       | 2836/12776 [27:41<39:04,  4.24it/s] 22%|██▏       | 2837/12776 [27:42<37:13,  4.45it/s]                                                     22%|██▏       | 2837/12776 [27:42<37:13,  4.45it/s] 22%|██▏       | 2838/12776 [27:42<35:50,  4.62it/s]                                                     22%|██▏       | 2838/12776 [27:42<35:50,  4.62it/s] 22%|██▏       | 2839/12776 [27:42<34:26,  4.81it/s]                                                     22%|██▏       | 2839/12776 [27:42<34:26,  4.81it/s] 22%|██▏       | 2840/12776 [27:42<33:30,  4.94it/s]                                                     22%|██▏       | 2840/12776 [27:42<33:30,  4.94it/s] 22%|██▏       | 2841/12776 [27:42<37:38,  4.40it/s]                                                     22%|██▏       | 2841/12776 [27:42<37:38,  4.40it/s] 22%|██▏       | 2842/12776 [27:43<35:23,  4.68it/s]                                                     22%|██▏       | 2842/12776 [27:43<35:23,  4.68it/s] 22%|██▏       | 2843/12776 [27:43<33:35,  4.93it/s]                                                     22%|██▏       | 2843/12776 [27:43<33:35,  4.93it/s] 22%|██▏       | 2844/12776 [27:43<32:15,  5.13it/s]                                                     22%|██▏       | 2844/12776 [27:43<32:15,  5.13it/s] 22%|██▏       | 2845/12776 [27:43<31:27,  5.26it/s]                                                     22%|██▏       | 2845/12776 [27:43<31:27,  5.26it/s] 22%|██▏       | 2846/12776 [27:43<30:36,  5.41it/s]                                                     22%|██▏       | 2846/12776 [27:43<30:36,  5.41it/s] 22%|██▏       | 2847/12776 [27:44<32:57,  5.02it/s]                                                     22%|██▏       | 2847/12776 [27:44<32:57,  5.02it/s] 22%|██▏       | 2848/12776 [27:44<31:12,  5.30it/s]                                                     22%|██▏       | 2848/12776 [27:44<31:12,  5.30it/s] 22%|██▏       | 2849/12776 [27:44<30:00,  5.51it/s]                                                     22%|██▏       | 2849/12776 [27:44<30:00,  5.51it/s] 22%|██▏       | 2850/12776 [27:45<59:18,  2.79it/s]                                                     22%|██▏       | 2850/12776 [27:45<59:18,  2.79it/s] 22%|██▏       | 2851/12776 [27:46<1:55:11,  1.44it/s]                                                       22%|██▏       | 2851/12776 [27:46<1:55:11,  1.44it/s] 22%|██▏       | 2852/12776 [27:47<2:16:11,  1.21it/s]                                                       22%|██▏       | 2852/12776 [27:47<2:16:11,  1.21it/s] 22%|██▏       | 2853/12776 [27:48<2:18:38,  1.19it/s]                                                       22%|██▏       | 2853/12776 [27:48<2:18:38,  1.19it/s] 22%|██▏       | 2854/12776 [27:49<2:16:43,  1.21it/s]                                                       22%|██▏       | 2854/12776 [27:49<2:16:43,  1.21it/s] 22%|██▏       | 2855/12776 [27:50<2:12:02,  1.25it/s]                                                      {'loss': 0.9358, 'grad_norm': 2.1769869327545166, 'learning_rate': 0.00024447702834799604, 'epoch': 0.43}
+{'loss': 0.6444, 'grad_norm': 1.703329086303711, 'learning_rate': 0.00024445259042033235, 'epoch': 0.44}
+{'loss': 0.7654, 'grad_norm': 2.654348611831665, 'learning_rate': 0.0002444281524926686, 'epoch': 0.44}
+{'loss': 0.9084, 'grad_norm': 1.5147393941879272, 'learning_rate': 0.00024440371456500485, 'epoch': 0.44}
+{'loss': 1.033, 'grad_norm': 2.269134759902954, 'learning_rate': 0.00024437927663734115, 'epoch': 0.44}
+{'loss': 1.0404, 'grad_norm': 1.4680894613265991, 'learning_rate': 0.0002443548387096774, 'epoch': 0.44}
+{'loss': 1.4057, 'grad_norm': 1.8765166997909546, 'learning_rate': 0.00024433040078201366, 'epoch': 0.44}
+{'loss': 0.7339, 'grad_norm': 1.2388074398040771, 'learning_rate': 0.00024430596285434996, 'epoch': 0.44}
+{'loss': 0.8482, 'grad_norm': 1.540697693824768, 'learning_rate': 0.0002442815249266862, 'epoch': 0.44}
+{'loss': 0.9921, 'grad_norm': 2.0315816402435303, 'learning_rate': 0.00024425708699902246, 'epoch': 0.44}
+{'loss': 1.6373, 'grad_norm': 2.7498373985290527, 'learning_rate': 0.0002442326490713587, 'epoch': 0.44}
+{'loss': 1.6076, 'grad_norm': 2.1657485961914062, 'learning_rate': 0.000244208211143695, 'epoch': 0.44}
+{'loss': 1.1333, 'grad_norm': 3.260566234588623, 'learning_rate': 0.00024418377321603127, 'epoch': 0.44}
+{'loss': 0.9482, 'grad_norm': 2.2284598350524902, 'learning_rate': 0.0002441593352883675, 'epoch': 0.44}
+{'loss': 1.0199, 'grad_norm': 2.1188924312591553, 'learning_rate': 0.0002441348973607038, 'epoch': 0.44}
+{'loss': 1.8085, 'grad_norm': 5.20643949508667, 'learning_rate': 0.00024411045943304005, 'epoch': 0.44}
+{'loss': 1.2478, 'grad_norm': 4.37985372543335, 'learning_rate': 0.00024408602150537633, 'epoch': 0.44}
+{'loss': 1.2622, 'grad_norm': 1.3027615547180176, 'learning_rate': 0.0002440615835777126, 'epoch': 0.44}
+{'loss': 0.8131, 'grad_norm': 2.000991106033325, 'learning_rate': 0.00024403714565004886, 'epoch': 0.44}
+{'loss': 0.7618, 'grad_norm': 2.1807901859283447, 'learning_rate': 0.00024401270772238514, 'epoch': 0.44}
+{'loss': 0.6396, 'grad_norm': 2.6867918968200684, 'learning_rate': 0.0002439882697947214, 'epoch': 0.44}
+{'loss': 1.2688, 'grad_norm': 1.687910795211792, 'learning_rate': 0.00024396383186705764, 'epoch': 0.44}
+{'loss': 1.1152, 'grad_norm': 2.035445213317871, 'learning_rate': 0.00024393939393939392, 'epoch': 0.44}
+{'loss': 0.6478, 'grad_norm': 1.4942790269851685, 'learning_rate': 0.0002439149560117302, 'epoch': 0.44}
+{'loss': 0.257, 'grad_norm': 0.5058338642120361, 'learning_rate': 0.00024389051808406645, 'epoch': 0.44}
+{'loss': 0.3152, 'grad_norm': 0.5301083922386169, 'learning_rate': 0.00024386608015640273, 'epoch': 0.44}
+{'loss': 1.2555, 'grad_norm': 3.6920485496520996, 'learning_rate': 0.000243841642228739, 'epoch': 0.44}
+{'loss': 0.4101, 'grad_norm': 0.8687499761581421, 'learning_rate': 0.00024381720430107523, 'epoch': 0.44}
+{'loss': 0.4841, 'grad_norm': 0.5601373314857483, 'learning_rate': 0.0002437927663734115, 'epoch': 0.44}
+{'loss': 0.3959, 'grad_norm': 0.5990231037139893, 'learning_rate': 0.0002437683284457478, 'epoch': 0.44}
+{'loss': 0.5858, 'grad_norm': 0.8136112093925476, 'learning_rate': 0.00024374389051808404, 'epoch': 0.44}
+{'loss': 0.4747, 'grad_norm': 0.8082829713821411, 'learning_rate': 0.00024371945259042032, 'epoch': 0.44}
+{'loss': 0.4398, 'grad_norm': 0.751895010471344, 'learning_rate': 0.00024369501466275657, 'epoch': 0.44}
+{'loss': 0.3338, 'grad_norm': 0.7629719972610474, 'learning_rate': 0.00024367057673509285, 'epoch': 0.44}
+{'loss': 0.5572, 'grad_norm': 0.9700342416763306, 'learning_rate': 0.00024364613880742912, 'epoch': 0.44}
+{'loss': 0.5201, 'grad_norm': 1.136712670326233, 'learning_rate': 0.00024362170087976535, 'epoch': 0.44}
+{'loss': 0.5493, 'grad_norm': 0.9968575239181519, 'learning_rate': 0.00024359726295210163, 'epoch': 0.44}
+{'loss': 0.5117, 'grad_norm': 1.318066120147705, 'learning_rate': 0.0002435728250244379, 'epoch': 0.44}
+{'loss': 0.6577, 'grad_norm': 1.940822720527649, 'learning_rate': 0.00024354838709677416, 'epoch': 0.44}
+{'loss': 0.7105, 'grad_norm': 1.8296147584915161, 'learning_rate': 0.00024352394916911043, 'epoch': 0.44}
+{'loss': 0.8139, 'grad_norm': 2.1246185302734375, 'learning_rate': 0.0002434995112414467, 'epoch': 0.44}
+{'loss': 0.6666, 'grad_norm': 1.4536235332489014, 'learning_rate': 0.00024347507331378296, 'epoch': 0.44}
+{'loss': 0.7701, 'grad_norm': 1.9893560409545898, 'learning_rate': 0.00024345063538611924, 'epoch': 0.44}
+{'loss': 0.4967, 'grad_norm': 2.051508903503418, 'learning_rate': 0.0002434261974584555, 'epoch': 0.44}
+{'loss': 0.7787, 'grad_norm': 3.648225784301758, 'learning_rate': 0.00024340175953079175, 'epoch': 0.44}
+{'loss': 0.5775, 'grad_norm': 1.885310411453247, 'learning_rate': 0.00024337732160312802, 'epoch': 0.44}
+{'loss': 0.9215, 'grad_norm': 2.8299691677093506, 'learning_rate': 0.0002433528836754643, 'epoch': 0.44}
+{'loss': 0.8729, 'grad_norm': 1.727311134338379, 'learning_rate': 0.00024332844574780055, 'epoch': 0.44}
+{'loss': 0.7647, 'grad_norm': 3.334357738494873, 'learning_rate': 0.00024330400782013683, 'epoch': 0.44}
+{'loss': 0.808, 'grad_norm': 2.5358595848083496, 'learning_rate': 0.0002432795698924731, 'epoch': 0.44}
+{'loss': 0.8237, 'grad_norm': 1.919858694076538, 'learning_rate': 0.00024325513196480933, 'epoch': 0.44}
+{'loss': 0.7848, 'grad_norm': 2.2071852684020996, 'learning_rate': 0.0002432306940371456, 'epoch': 0.44}
+{'loss': 0.8957, 'grad_norm': 2.1200320720672607, 'learning_rate': 0.0002432062561094819, 'epoch': 0.44}
+{'loss': 1.4184, 'grad_norm': 2.553398609161377, 'learning_rate': 0.00024318181818181814, 'epoch': 0.44}
+{'loss': 1.1927, 'grad_norm': 1.989119529724121, 'learning_rate': 0.00024315738025415442, 'epoch': 0.44}
+{'loss': 0.7261, 'grad_norm': 1.7680295705795288, 'learning_rate': 0.0002431329423264907, 'epoch': 0.44}
+{'loss': 1.2965, 'grad_norm': 3.1489715576171875, 'learning_rate': 0.00024310850439882695, 'epoch': 0.44}
+{'loss': 0.9612, 'grad_norm': 2.7406511306762695, 'learning_rate': 0.00024308406647116323, 'epoch': 0.44}
+{'loss': 1.1864, 'grad_norm': 2.736985921859741, 'learning_rate': 0.0002430596285434995, 'epoch': 0.44}
+{'loss': 1.6187, 'grad_norm': 2.205655097961426, 'learning_rate': 0.00024303519061583573, 'epoch': 0.44}
+{'loss': 0.9562, 'grad_norm': 1.7284249067306519, 'learning_rate': 0.000243010752688172, 'epoch': 0.44}
+{'loss': 1.3006, 'grad_norm': 2.0134189128875732, 'learning_rate': 0.0002429863147605083, 'epoch': 0.44}
+{'loss': 1.5716, 'grad_norm': 2.37020206451416, 'learning_rate': 0.00024296187683284454, 'epoch': 0.44}
+{'loss': 1.1951, 'grad_norm': 2.1553592681884766, 'learning_rate': 0.00024293743890518082, 'epoch': 0.44}
+{'loss': 0.8878, 'grad_norm': 2.4819650650024414, 'learning_rate': 0.0002429130009775171, 'epoch': 0.44}
+{'loss': 1.4834, 'grad_norm': 2.196474552154541, 'learning_rate': 0.00024288856304985335, 'epoch': 0.45}
+{'loss': 1.1742, 'grad_norm': 3.71829891204834, 'learning_rate': 0.00024286412512218963, 'epoch': 0.45}
+{'loss': 1.3603, 'grad_norm': 2.282411575317383, 'learning_rate': 0.00024283968719452588, 'epoch': 0.45}
+{'loss': 0.7373, 'grad_norm': 1.549609899520874, 'learning_rate': 0.00024281524926686213, 'epoch': 0.45}
+{'loss': 0.435, 'grad_norm': 0.8783185482025146, 'learning_rate': 0.0002427908113391984, 'epoch': 0.45}
+{'loss': 1.096, 'grad_norm': 4.473077774047852, 'learning_rate': 0.00024276637341153468, 'epoch': 0.45}
+{'loss': 0.9034, 'grad_norm': 1.7083899974822998, 'learning_rate': 0.00024274193548387094, 'epoch': 0.45}
+{'loss': 0.8184, 'grad_norm': 1.6015340089797974, 'learning_rate': 0.00024271749755620721, 'epoch': 0.45}
+{'loss': 0.3711, 'grad_norm': 0.5668478608131409, 'learning_rate': 0.0002426930596285435, 'epoch': 0.45}
+{'loss': 0.284, 'grad_norm': 0.603583812713623, 'learning_rate': 0.00024266862170087972, 'epoch': 0.45}
+{'loss': 0.2688, 'grad_norm': 0.8786858916282654, 'learning_rate': 0.000242644183773216, 'epoch': 0.45}
+{'loss': 0.3223, 'grad_norm': 0.5734667778015137, 'learning_rate': 0.00024261974584555227, 'epoch': 0.45}
+{'loss': 0.44, 'grad_norm': 0.926365077495575, 'learning_rate': 0.00024259530791788852, 'epoch': 0.45}
+ 22%|██▏       | 2855/12776 [27:50<2:12:02,  1.25it/s] 22%|██▏       | 2856/12776 [27:50<2:08:04,  1.29it/s]                                                       22%|██▏       | 2856/12776 [27:50<2:08:04,  1.29it/s] 22%|██▏       | 2857/12776 [27:51<2:02:53,  1.35it/s]                                                       22%|██���       | 2857/12776 [27:51<2:02:53,  1.35it/s] 22%|██▏       | 2858/12776 [27:52<2:02:10,  1.35it/s]                                                       22%|██▏       | 2858/12776 [27:52<2:02:10,  1.35it/s] 22%|██▏       | 2859/12776 [27:52<1:56:05,  1.42it/s]                                                       22%|██▏       | 2859/12776 [27:52<1:56:05,  1.42it/s] 22%|██▏       | 2860/12776 [27:53<1:51:16,  1.49it/s]                                                       22%|██▏       | 2860/12776 [27:53<1:51:16,  1.49it/s] 22%|██▏       | 2861/12776 [27:54<1:45:03,  1.57it/s]                                                       22%|██▏       | 2861/12776 [27:54<1:45:03,  1.57it/s] 22%|██▏       | 2862/12776 [27:54<1:38:55,  1.67it/s]                                                       22%|██▏       | 2862/12776 [27:54<1:38:55,  1.67it/s] 22%|██▏       | 2863/12776 [27:55<1:32:46,  1.78it/s]                                                       22%|██▏       | 2863/12776 [27:55<1:32:46,  1.78it/s] 22%|██▏       | 2864/12776 [27:55<1:32:05,  1.79it/s]                                                       22%|██▏       | 2864/12776 [27:55<1:32:05,  1.79it/s] 22%|██▏       | 2865/12776 [27:55<1:25:20,  1.94it/s]                                                       22%|██▏       | 2865/12776 [27:55<1:25:20,  1.94it/s] 22%|██▏       | 2866/12776 [27:56<1:27:32,  1.89it/s]                                                       22%|██▏       | 2866/12776 [27:56<1:27:32,  1.89it/s] 22%|██▏       | 2867/12776 [27:56<1:20:50,  2.04it/s]                                                       22%|██▏       | 2867/12776 [27:56<1:20:50,  2.04it/s] 22%|██▏       | 2868/12776 [27:57<1:15:46,  2.18it/s]                                                       22%|██▏       | 2868/12776 [27:57<1:15:46,  2.18it/s] 22%|██▏       | 2869/12776 [27:57<1:19:17,  2.08it/s]                                                       22%|██▏       | 2869/12776 [27:57<1:19:17,  2.08it/s] 22%|██▏       | 2870/12776 [27:58<1:13:10,  2.26it/s]                                                       22%|██▏       | 2870/12776 [27:58<1:13:10,  2.26it/s] 22%|██▏       | 2871/12776 [27:58<1:08:28,  2.41it/s]                                                       22%|██▏       | 2871/12776 [27:58<1:08:28,  2.41it/s] 22%|██▏       | 2872/12776 [27:58<1:09:06,  2.39it/s]                                                       22%|██▏       | 2872/12776 [27:58<1:09:06,  2.39it/s] 22%|██▏       | 2873/12776 [27:59<1:04:10,  2.57it/s]                                                       22%|██▏       | 2873/12776 [27:59<1:04:10,  2.57it/s] 22%|██▏       | 2874/12776 [27:59<1:00:15,  2.74it/s]                                                       22%|██▏       | 2874/12776 [27:59<1:00:15,  2.74it/s] 23%|██▎       | 2875/12776 [27:59<1:00:20,  2.74it/s]                                                       23%|██▎       | 2875/12776 [27:59<1:00:20,  2.74it/s] 23%|██▎       | 2876/12776 [28:00<56:22,  2.93it/s]                                                       23%|██▎       | 2876/12776 [28:00<56:22,  2.93it/s] 23%|██▎       | 2877/12776 [28:00<52:50,  3.12it/s]                                                     23%|██▎       | 2877/12776 [28:00<52:50,  3.12it/s] 23%|██▎       | 2878/12776 [28:00<50:00,  3.30it/s]                                                     23%|██▎       | 2878/12776 [28:00<50:00,  3.30it/s] 23%|██▎       | 2879/12776 [28:01<50:57,  3.24it/s]                                                     23%|██▎       | 2879/12776 [28:01<50:57,  3.24it/s] 23%|██▎       | 2880/12776 [28:01<49:15,  3.35it/s]                                                     23%|██▎       | 2880/12776 [28:01<49:15,  3.35it/s] 23%|██▎       | 2881/12776 [28:01<47:53,  3.44it/s]                                                     23%|██▎       | 2881/12776 [28:01<47:53,  3.44it/s] 23%|██▎       | 2882/12776 [28:01<46:39,  3.53it/s]                                                     23%|██▎       | 2882/12776 [28:01<46:39,  3.53it/s] 23%|██▎       | 2883/12776 [28:02<48:29,  3.40it/s]                                                     23%|██▎       | 2883/12776 [28:02<48:29,  3.40it/s] 23%|██▎       | 2884/12776 [28:02<46:10,  3.57it/s]                                                     23%|██▎       | 2884/12776 [28:02<46:10,  3.57it/s] 23%|██▎       | 2885/12776 [28:02<44:30,  3.70it/s]                                                     23%|██▎       | 2885/12776 [28:02<44:30,  3.70it/s] 23%|██▎       | 2886/12776 [28:02<42:59,  3.83it/s]                                                     23%|██▎       | 2886/12776 [28:02<42:59,  3.83it/s] 23%|██▎       | 2887/12776 [28:03<41:36,  3.96it/s]                                                     23%|██▎       | 2887/12776 [28:03<41:36,  3.96it/s] 23%|██▎       | 2888/12776 [28:03<43:55,  3.75it/s]                                                     23%|██▎       | 2888/12776 [28:03<43:55,  3.75it/s] 23%|██▎       | 2889/12776 [28:03<41:22,  3.98it/s]                                                     23%|██▎       | 2889/12776 [28:03<41:22,  3.98it/s] 23%|██▎       | 2890/12776 [28:03<39:27,  4.18it/s]                                                     23%|██▎       | 2890/12776 [28:03<39:27,  4.18it/s] 23%|██▎       | 2891/12776 [28:04<37:49,  4.36it/s]                                                     23%|██▎       | 2891/12776 [28:04<37:49,  4.36it/s] 23%|██▎       | 2892/12776 [28:04<36:40,  4.49it/s]                                                     23%|██▎       | 2892/12776 [28:04<36:40,  4.49it/s] 23%|██▎       | 2893/12776 [28:04<39:05,  4.21it/s]                                                     23%|██▎       | 2893/12776 [28:04<39:05,  4.21it/s] 23%|██▎       | 2894/12776 [28:04<37:16,  4.42it/s]                                                     23%|██▎       | 2894/12776 [28:04<37:16,  4.42it/s] 23%|██▎       | 2895/12776 [28:05<35:57,  4.58it/s]                                                     23%|██▎       | 2895/12776 [28:05<35:57,  4.58it/s] 23%|██▎       | 2896/12776 [28:05<34:54,  4.72it/s]                                                     23%|██▎       | 2896/12776 [28:05<34:54,  4.72it/s] 23%|██▎       | 2897/12776 [28:05<34:05,  4.83it/s]                                                     23%|██▎       | 2897/12776 [28:05<34:05,  4.83it/s] 23%|██▎       | 2898/12776 [28:05<38:46,  4.25it/s]                                                     23%|██▎       | 2898/12776 [28:05<38:46,  4.25it/s] 23%|██▎       | 2899/12776 [28:05<36:21,  4.53it/s]                                                     23%|██▎       | 2899/12776 [28:05<36:21,  4.53it/s] 23%|██▎       | 2900/12776 [28:06<59:21,  2.77it/s]                                                     23%|██▎       | 2900/12776 [28:06<59:21,  2.77it/s] 23%|██▎       | 2901/12776 [28:07<1:50:11,  1.49it/s]                                                       23%|██▎       | 2901/12776 [28:07<1:50:11,  1.49it/s] 23%|██▎       | 2902/12776 [28:09<2:10:06,  1.26it/s]                                                       23%|██▎       | 2902/12776 [28:09<2:10:06,  1.26it/s] 23%|██▎       | 2903/12776 [28:09<2:13:32,  1.23it/s]                                                       23%|██▎       | 2903/12776 [28:09<2:13:32,  1.23it/s] 23%|██▎       | 2904/12776 [28:10<2:13:01,  1.24it/s]                                                       23%|██▎       | 2904/12776 [28:10<2:13:01,  1.24it/s] 23%|██▎       | 2905/12776 [28:11<2:09:38,  1.27it/s]                                                       23%|██▎       | 2905/12776 [28:11<2:09:38,  1.27it/s] 23%|██▎       | 2906/12776 [28:12<2:06:43,  1.30it/s]                                                       23%|██▎       | 2906/12776 [28:12<2:06:43,  1.30it/s] 23%|██▎       | 2907/12776 [28:12<2:03:37,  1.33it/s]                                                       23%|██▎       | 2907/12776 [28:12<2:03:37,  1.33it/s] 23%|██▎       | 2908/12776 [28:13<2:03:44,  1.33it/s]                                                       23%|██▎       | 2908/12776 [28:13<2:03:44,  1.33it/s] 23%|██▎       | 2909/12776 [28:14<1:58:05,  1.39it/s]                                                       23%|██▎       | 2909/12776 [28:14<1:58:05,  1.39it/s] 23%|██▎       | 2910/12776 [28:14<1:52:29,  1.46it/s]                                                       23%|██▎       | 2910/12776 [28:14<1:52:29,  1.46it/s] 23%|██▎       | 2911/12776 [28:15<1:46:43,  1.54it/s]                                                       23%|██▎       | 2911/12776 [28:15<1:46:43,  1.54it/s] 23%|██▎       | 2912/12776 [28:16<1:44:31,  1.57it/s]                                                       23%|██▎       | 2912/12776 [28:16<1:44:31,  1.57it/s] 23%|██▎       | 2913/12776 [28:16<1:39:22,  1.65it/s]                                                       23%|██▎       | 2913/12776 [28:16<1:39:22,  1.65it/s] 23%|██▎       | 2914/12776 [28:17<1:39:30,  1.65it/s]                                                       23%|██▎       | 2914/12776 [28:17<1:39:30,  1.65it/s] 23%|██▎       | 2915/12776 [28:17<1:34:15,  1.74it/s]                                                       23%|██▎       | 2915/12776 [28:17<1:34:15,  1.74it/s] 23%|██▎       | 2916/12776 [28:18<1:29:22,  1.84it/s]                                                       23%|██▎       | 2916/12776 [28:18<1:29:22,  1.84it/s] 23%|██▎       | 2917/12776 [28:18<1:25:08,  1.93it/s]                                                       23%|██▎       | 2917/12776 [28:18<1:25:08,  1.93it/s] 23%|██▎       | 2918/12776 [28:19<1:20:49,  2.03it/s]                                                       23%|██▎       | 2918/12776 [28:19<1:20:49,  2.03it/s] 23%|██▎       | 2919/12776 [28:19<1:19:07,  2.08it/s]                                                       23%|██▎       | 2919/12776 [28:19<1:19:07,  2.08it/s] 23%|██▎       | 2920/12776 [28:19<1:15:15,  2.18it/s]                                                       23%|██▎       | 2920/12776 [28:19<1:15:15,  2.18it/s] 23%|██▎       | 2921/12776 [28:20<1:11:53,  2.28it/s]                                                       23%|██▎       | 2921/12776 [28:20<1:11:53,  2.28it/s] 23%|██▎       | 2922/12776 [28:20<1:10:13,  2.34it/s]                                                       23%|██▎       | 2922/12776 [28:20<1:10:13,  2.34it/s] 23%|██▎       | 2923/12776 [28:21<1:06:21,  2.47it/s]                                                       23%|██▎       | 2923/12776 [28:21<1:06:21,  2.47it/s] 23%|██▎       | 2924/12776 [28:21<1:03:14,  2.60it/s]                                                       23%|██▎       | 2924/12776 [28:21<1:03:14,  2.60it/s] 23%|██▎       | 2925/12776 [28:21<1:05:35,  2.50it/s]                                                       23%|██▎       | 2925/12776 [28:21<1:05:35,  2.50it/s] 23%|██▎       | 2926/12776 [28:22<1:01:43,  2.66it/s]                                                       23%|██▎       | 2926/12776 [28:22<1:01:43,  2.66it/s] 23%|██▎       | 2927/12776 [28:22<58:20,  2.81it/s]                                                       23%|██▎       | 2927/12776 [28:22<58:20,  2.81it/s] 23%|██▎       | 2928/12776 [28:22<55:43,  2.95it/s]                                                     23%|██▎       | 2928/12776 [28:22<55:43,  2.95it/s] 23%|██▎       | 2929/12776 [28:23<56:36,  2.90it/s]                                                     23%|██▎       | 2929/12776 [28:23<56:36,  2.90it/s] 23%|██▎       | 2930/12776 [28:23<53:48,  3.05it/s]                                                     23%|██▎       | 2930/12776 [28:23<53:48,  3.05it/s] 23%|██▎       | 2931/12776 [28:23<51:12,  3.20it/s]                                                     23%|██▎       | 2931/12776 [28:23<51:12,  3.20it/s] 23%|██▎       | 2932/12776 [28:23<49:04,  3.34it/s]                                                     23%|██▎       | 2932/12776 [28:23<49:04,  3.34it/s] 23%|██▎       | 2933/12776 [28:24<49:48,  3.29it/s]                                                    {'loss': 0.3443, 'grad_norm': 0.661888062953949, 'learning_rate': 0.0002425708699902248, 'epoch': 0.45}
+{'loss': 0.4118, 'grad_norm': 0.6842008233070374, 'learning_rate': 0.00024254643206256108, 'epoch': 0.45}
+{'loss': 0.3634, 'grad_norm': 0.7839653491973877, 'learning_rate': 0.00024252199413489733, 'epoch': 0.45}
+{'loss': 0.6146, 'grad_norm': 1.1856542825698853, 'learning_rate': 0.0002424975562072336, 'epoch': 0.45}
+{'loss': 0.4388, 'grad_norm': 1.2870744466781616, 'learning_rate': 0.0002424731182795699, 'epoch': 0.45}
+{'loss': 0.5828, 'grad_norm': 0.8525822162628174, 'learning_rate': 0.0002424486803519061, 'epoch': 0.45}
+{'loss': 0.4072, 'grad_norm': 0.8361726403236389, 'learning_rate': 0.0002424242424242424, 'epoch': 0.45}
+{'loss': 0.7043, 'grad_norm': 3.588017225265503, 'learning_rate': 0.00024239980449657867, 'epoch': 0.45}
+{'loss': 0.6288, 'grad_norm': 0.9456626176834106, 'learning_rate': 0.00024237536656891492, 'epoch': 0.45}
+{'loss': 0.3946, 'grad_norm': 0.8043878078460693, 'learning_rate': 0.0002423509286412512, 'epoch': 0.45}
+{'loss': 0.5556, 'grad_norm': 1.2611615657806396, 'learning_rate': 0.00024232649071358748, 'epoch': 0.45}
+{'loss': 0.6662, 'grad_norm': 1.1336512565612793, 'learning_rate': 0.00024230205278592373, 'epoch': 0.45}
+{'loss': 0.4993, 'grad_norm': 1.0719926357269287, 'learning_rate': 0.00024227761485826, 'epoch': 0.45}
+{'loss': 0.5825, 'grad_norm': 1.3766525983810425, 'learning_rate': 0.00024225317693059626, 'epoch': 0.45}
+{'loss': 0.5897, 'grad_norm': 1.647732138633728, 'learning_rate': 0.0002422287390029325, 'epoch': 0.45}
+{'loss': 0.8135, 'grad_norm': 2.6894371509552, 'learning_rate': 0.0002422043010752688, 'epoch': 0.45}
+{'loss': 0.6992, 'grad_norm': 2.326774835586548, 'learning_rate': 0.00024217986314760507, 'epoch': 0.45}
+{'loss': 0.4537, 'grad_norm': 0.996810257434845, 'learning_rate': 0.00024215542521994132, 'epoch': 0.45}
+{'loss': 0.6694, 'grad_norm': 1.5761237144470215, 'learning_rate': 0.0002421309872922776, 'epoch': 0.45}
+{'loss': 0.9042, 'grad_norm': 3.633239507675171, 'learning_rate': 0.00024210654936461387, 'epoch': 0.45}
+{'loss': 1.0301, 'grad_norm': 2.0842907428741455, 'learning_rate': 0.0002420821114369501, 'epoch': 0.45}
+{'loss': 0.6357, 'grad_norm': 1.3994956016540527, 'learning_rate': 0.00024205767350928638, 'epoch': 0.45}
+{'loss': 0.6302, 'grad_norm': 1.4533970355987549, 'learning_rate': 0.00024203323558162266, 'epoch': 0.45}
+{'loss': 0.5899, 'grad_norm': 1.189319133758545, 'learning_rate': 0.0002420087976539589, 'epoch': 0.45}
+{'loss': 0.931, 'grad_norm': 3.1489529609680176, 'learning_rate': 0.00024198435972629519, 'epoch': 0.45}
+{'loss': 0.4861, 'grad_norm': 1.8433654308319092, 'learning_rate': 0.00024195992179863146, 'epoch': 0.45}
+{'loss': 0.6557, 'grad_norm': 3.465445041656494, 'learning_rate': 0.00024193548387096771, 'epoch': 0.45}
+{'loss': 1.3114, 'grad_norm': 2.037400484085083, 'learning_rate': 0.000241911045943304, 'epoch': 0.45}
+{'loss': 0.81, 'grad_norm': 1.4593843221664429, 'learning_rate': 0.00024188660801564027, 'epoch': 0.45}
+{'loss': 0.6537, 'grad_norm': 2.352095365524292, 'learning_rate': 0.0002418621700879765, 'epoch': 0.45}
+{'loss': 1.0747, 'grad_norm': 2.0871784687042236, 'learning_rate': 0.00024183773216031277, 'epoch': 0.45}
+{'loss': 0.8365, 'grad_norm': 3.217430591583252, 'learning_rate': 0.00024181329423264905, 'epoch': 0.45}
+{'loss': 0.7699, 'grad_norm': 3.391012668609619, 'learning_rate': 0.0002417888563049853, 'epoch': 0.45}
+{'loss': 1.3047, 'grad_norm': 4.206727981567383, 'learning_rate': 0.00024176441837732158, 'epoch': 0.45}
+{'loss': 0.8944, 'grad_norm': 2.0663089752197266, 'learning_rate': 0.00024173998044965786, 'epoch': 0.45}
+{'loss': 2.0434, 'grad_norm': 3.9211676120758057, 'learning_rate': 0.0002417155425219941, 'epoch': 0.45}
+{'loss': 1.4715, 'grad_norm': 1.6745312213897705, 'learning_rate': 0.00024169110459433036, 'epoch': 0.45}
+{'loss': 0.9836, 'grad_norm': 2.4523682594299316, 'learning_rate': 0.00024166666666666664, 'epoch': 0.45}
+{'loss': 1.4415, 'grad_norm': 1.776292085647583, 'learning_rate': 0.0002416422287390029, 'epoch': 0.45}
+{'loss': 1.3364, 'grad_norm': 2.345961570739746, 'learning_rate': 0.00024161779081133917, 'epoch': 0.45}
+{'loss': 1.2469, 'grad_norm': 2.3358938694000244, 'learning_rate': 0.00024159335288367545, 'epoch': 0.45}
+{'loss': 0.6862, 'grad_norm': 2.8481311798095703, 'learning_rate': 0.0002415689149560117, 'epoch': 0.45}
+{'loss': 0.8734, 'grad_norm': 3.2002439498901367, 'learning_rate': 0.00024154447702834798, 'epoch': 0.45}
+{'loss': 1.1847, 'grad_norm': 3.30496883392334, 'learning_rate': 0.00024152003910068426, 'epoch': 0.45}
+{'loss': 1.1897, 'grad_norm': 2.5524821281433105, 'learning_rate': 0.00024149560117302048, 'epoch': 0.45}
+{'loss': 0.5016, 'grad_norm': 0.7851200103759766, 'learning_rate': 0.00024147116324535676, 'epoch': 0.45}
+{'loss': 0.4045, 'grad_norm': 0.8391879796981812, 'learning_rate': 0.00024144672531769304, 'epoch': 0.45}
+{'loss': 0.4671, 'grad_norm': 1.0512319803237915, 'learning_rate': 0.0002414222873900293, 'epoch': 0.45}
+{'loss': 0.4057, 'grad_norm': 0.7638587951660156, 'learning_rate': 0.00024139784946236557, 'epoch': 0.45}
+{'loss': 0.3474, 'grad_norm': 0.5966224670410156, 'learning_rate': 0.00024137341153470185, 'epoch': 0.45}
+{'loss': 0.259, 'grad_norm': 0.6843605041503906, 'learning_rate': 0.0002413489736070381, 'epoch': 0.45}
+{'loss': 0.5368, 'grad_norm': 0.8733233213424683, 'learning_rate': 0.00024132453567937438, 'epoch': 0.46}
+{'loss': 0.4997, 'grad_norm': 0.8052546977996826, 'learning_rate': 0.00024130009775171065, 'epoch': 0.46}
+{'loss': 0.4456, 'grad_norm': 0.7011566162109375, 'learning_rate': 0.00024127565982404688, 'epoch': 0.46}
+{'loss': 0.4917, 'grad_norm': 2.446593999862671, 'learning_rate': 0.00024125122189638316, 'epoch': 0.46}
+{'loss': 0.4428, 'grad_norm': 0.5837518572807312, 'learning_rate': 0.00024122678396871943, 'epoch': 0.46}
+{'loss': 0.4092, 'grad_norm': 0.7179192900657654, 'learning_rate': 0.00024120234604105569, 'epoch': 0.46}
+{'loss': 0.4673, 'grad_norm': 0.9555642008781433, 'learning_rate': 0.00024117790811339196, 'epoch': 0.46}
+{'loss': 0.3904, 'grad_norm': 0.891258955001831, 'learning_rate': 0.00024115347018572824, 'epoch': 0.46}
+{'loss': 0.4525, 'grad_norm': 0.9778416752815247, 'learning_rate': 0.0002411290322580645, 'epoch': 0.46}
+{'loss': 0.3054, 'grad_norm': 1.230553150177002, 'learning_rate': 0.00024110459433040074, 'epoch': 0.46}
+{'loss': 0.3882, 'grad_norm': 1.3313859701156616, 'learning_rate': 0.00024108015640273702, 'epoch': 0.46}
+{'loss': 0.6825, 'grad_norm': 1.3717501163482666, 'learning_rate': 0.00024105571847507327, 'epoch': 0.46}
+{'loss': 0.5989, 'grad_norm': 2.073382616043091, 'learning_rate': 0.00024103128054740955, 'epoch': 0.46}
+{'loss': 0.5252, 'grad_norm': 1.1343777179718018, 'learning_rate': 0.00024100684261974583, 'epoch': 0.46}
+{'loss': 0.4679, 'grad_norm': 1.8335095643997192, 'learning_rate': 0.00024098240469208208, 'epoch': 0.46}
+{'loss': 0.6568, 'grad_norm': 2.010408639907837, 'learning_rate': 0.00024095796676441836, 'epoch': 0.46}
+{'loss': 0.545, 'grad_norm': 1.3354651927947998, 'learning_rate': 0.00024093352883675464, 'epoch': 0.46}
+{'loss': 0.9399, 'grad_norm': 2.0071985721588135, 'learning_rate': 0.00024090909090909086, 'epoch': 0.46}
+{'loss': 1.0217, 'grad_norm': 3.2481436729431152, 'learning_rate': 0.00024088465298142714, 'epoch': 0.46}
+{'loss': 0.5155, 'grad_norm': 2.576655387878418, 'learning_rate': 0.00024086021505376342, 'epoch': 0.46}
+{'loss': 0.6947, 'grad_norm': 1.2466546297073364, 'learning_rate': 0.00024083577712609967, 'epoch': 0.46}
+{'loss': 0.6762, 'grad_norm': 1.3399920463562012, 'learning_rate': 0.00024081133919843595, 'epoch': 0.46}
+{'loss': 0.8415, 'grad_norm': 1.9821996688842773, 'learning_rate': 0.00024078690127077223, 'epoch': 0.46}
+{'loss': 0.7196, 'grad_norm': 2.0268774032592773, 'learning_rate': 0.00024076246334310848, 'epoch': 0.46}
+{'loss': 0.7176, 'grad_norm': 1.9509254693984985, 'learning_rate': 0.00024073802541544476, 'epoch': 0.46}
+{'loss': 0.9125, 'grad_norm': 1.5287137031555176, 'learning_rate': 0.00024071358748778104, 'epoch': 0.46}
+ 23%|██▎       | 2933/12776 [28:24<49:48,  3.29it/s] 23%|██▎       | 2934/12776 [28:24<47:58,  3.42it/s]                                                     23%|██▎       | 2934/12776 [28:24<47:58,  3.42it/s] 23%|██▎       | 2935/12776 [28:24<45:48,  3.58it/s]                                                     23%|██▎       | 2935/12776 [28:24<45:48,  3.58it/s] 23%|██▎       | 2936/12776 [28:25<44:07,  3.72it/s]                                                     23%|██▎       | 2936/12776 [28:25<44:07,  3.72it/s] 23%|██▎       | 2937/12776 [28:25<47:43,  3.44it/s]                                                     23%|██▎       | 2937/12776 [28:25<47:43,  3.44it/s] 23%|██▎       | 2938/12776 [28:25<44:46,  3.66it/s]                                                     23%|██▎       | 2938/12776 [28:25<44:46,  3.66it/s] 23%|██▎       | 2939/12776 [28:25<42:21,  3.87it/s]                                                     23%|██▎       | 2939/12776 [28:25<42:21,  3.87it/s] 23%|██▎       | 2940/12776 [28:26<40:11,  4.08it/s]                                                     23%|██▎       | 2940/12776 [28:26<40:11,  4.08it/s] 23%|██▎       | 2941/12776 [28:26<44:05,  3.72it/s]                                                     23%|██▎       | 2941/12776 [28:26<44:05,  3.72it/s] 23%|██▎       | 2942/12776 [28:26<41:21,  3.96it/s]                                                     23%|██▎       | 2942/12776 [28:26<41:21,  3.96it/s] 23%|██▎       | 2943/12776 [28:26<39:13,  4.18it/s]                                                     23%|██▎       | 2943/12776 [28:26<39:13,  4.18it/s] 23%|██▎       | 2944/12776 [28:27<37:30,  4.37it/s]                                                     23%|██▎       | 2944/12776 [28:27<37:30,  4.37it/s] 23%|██▎       | 2945/12776 [28:27<36:05,  4.54it/s]                                                     23%|██▎       | 2945/12776 [28:27<36:05,  4.54it/s] 23%|██▎       | 2946/12776 [28:27<39:58,  4.10it/s]                                                     23%|██▎       | 2946/12776 [28:27<39:58,  4.10it/s] 23%|██▎       | 2947/12776 [28:27<37:28,  4.37it/s]                                                     23%|██▎       | 2947/12776 [28:27<37:28,  4.37it/s] 23%|██▎       | 2948/12776 [28:27<35:26,  4.62it/s]                                                     23%|██▎       | 2948/12776 [28:27<35:26,  4.62it/s] 23%|██▎       | 2949/12776 [28:28<33:52,  4.83it/s]                                                     23%|██▎       | 2949/12776 [28:28<33:52,  4.83it/s] 23%|██▎       | 2950/12776 [28:28<58:42,  2.79it/s]                                                     23%|██▎       | 2950/12776 [28:28<58:42,  2.79it/s] 23%|██▎       | 2951/12776 [28:30<1:52:38,  1.45it/s]                                                       23%|██▎       | 2951/12776 [28:30<1:52:38,  1.45it/s] 23%|██▎       | 2952/12776 [28:31<2:12:52,  1.23it/s]                                                       23%|██▎       | 2952/12776 [28:31<2:12:52,  1.23it/s] 23%|██▎       | 2953/12776 [28:32<2:15:32,  1.21it/s]                                                       23%|██▎       | 2953/12776 [28:32<2:15:32,  1.21it/s] 23%|██▎       | 2954/12776 [28:33<2:14:08,  1.22it/s]                                                       23%|██▎       | 2954/12776 [28:33<2:14:08,  1.22it/s] 23%|██▎       | 2955/12776 [28:33<2:11:00,  1.25it/s]                                                       23%|██▎       | 2955/12776 [28:33<2:11:00,  1.25it/s] 23%|██▎       | 2956/12776 [28:34<2:06:29,  1.29it/s]                                                       23%|██▎       | 2956/12776 [28:34<2:06:29,  1.29it/s] 23%|██▎       | 2957/12776 [28:35<2:01:18,  1.35it/s]                                                       23%|██▎       | 2957/12776 [28:35<2:01:18,  1.35it/s] 23%|██▎       | 2958/12776 [28:35<2:01:45,  1.34it/s]                                                       23%|██▎       | 2958/12776 [28:35<2:01:45,  1.34it/s] 23%|██▎       | 2959/12776 [28:36<1:54:30,  1.43it/s]                                                       23%|██▎       | 2959/12776 [28:36<1:54:30,  1.43it/s] 23%|██▎       | 2960/12776 [28:37<1:49:41,  1.49it/s]                                                       23%|██▎       | 2960/12776 [28:37<1:49:41,  1.49it/s] 23%|██▎       | 2961/12776 [28:37<1:43:05,  1.59it/s]                                                       23%|██▎       | 2961/12776 [28:37<1:43:05,  1.59it/s] 23%|██▎       | 2962/12776 [28:38<1:39:27,  1.64it/s]                                                       23%|██▎       | 2962/12776 [28:38<1:39:27,  1.64it/s] 23%|██▎       | 2963/12776 [28:38<1:31:53,  1.78it/s]                                                       23%|██▎       | 2963/12776 [28:38<1:31:53,  1.78it/s] 23%|██▎       | 2964/12776 [28:39<1:32:54,  1.76it/s]                                                       23%|██▎       | 2964/12776 [28:39<1:32:54,  1.76it/s] 23%|██▎       | 2965/12776 [28:39<1:25:42,  1.91it/s]                                                       23%|██▎       | 2965/12776 [28:39<1:25:42,  1.91it/s] 23%|██▎       | 2966/12776 [28:40<1:27:47,  1.86it/s]                                                       23%|██▎       | 2966/12776 [28:40<1:27:47,  1.86it/s] 23%|██▎       | 2967/12776 [28:40<1:20:37,  2.03it/s]                                                       23%|██▎       | 2967/12776 [28:40<1:20:37,  2.03it/s] 23%|██▎       | 2968/12776 [28:40<1:15:10,  2.17it/s]                                                       23%|██▎       | 2968/12776 [28:40<1:15:10,  2.17it/s] 23%|██▎       | 2969/12776 [28:41<1:12:43,  2.25it/s]                                                       23%|██▎       | 2969/12776 [28:41<1:12:43,  2.25it/s] 23%|██▎       | 2970/12776 [28:41<1:08:08,  2.40it/s]                                                       23%|██▎       | 2970/12776 [28:41<1:08:08,  2.40it/s] 23%|██▎       | 2971/12776 [28:42<1:04:18,  2.54it/s]                                                       23%|██▎       | 2971/12776 [28:42<1:04:18,  2.54it/s] 23%|██▎       | 2972/12776 [28:42<1:06:45,  2.45it/s]                                                       23%|██▎       | 2972/12776 [28:42<1:06:45,  2.45it/s] 23%|██▎       | 2973/12776 [28:42<1:02:14,  2.62it/s]                                                       23%|██▎       | 2973/12776 [28:42<1:02:14,  2.62it/s] 23%|██▎       | 2974/12776 [28:43<58:24,  2.80it/s]                                                       23%|██▎       | 2974/12776 [28:43<58:24,  2.80it/s] 23%|██▎       | 2975/12776 [28:43<55:29,  2.94it/s]                                                     23%|██▎       | 2975/12776 [28:43<55:29,  2.94it/s] 23%|██▎       | 2976/12776 [28:43<56:52,  2.87it/s]                                                     23%|██▎       | 2976/12776 [28:43<56:52,  2.87it/s] 23%|██▎       | 2977/12776 [28:44<53:39,  3.04it/s]                                                     23%|██▎       | 2977/12776 [28:44<53:39,  3.04it/s] 23%|██▎       | 2978/12776 [28:44<51:03,  3.20it/s]                                                     23%|██▎       | 2978/12776 [28:44<51:03,  3.20it/s] 23%|██▎       | 2979/12776 [28:44<48:51,  3.34it/s]                                                     23%|██▎       | 2979/12776 [28:44<48:51,  3.34it/s] 23%|██▎       | 2980/12776 [28:44<49:58,  3.27it/s]                                                     23%|██▎       | 2980/12776 [28:44<49:58,  3.27it/s] 23%|██▎       | 2981/12776 [28:45<47:19,  3.45it/s]                                                     23%|██▎       | 2981/12776 [28:45<47:19,  3.45it/s] 23%|██▎       | 2982/12776 [28:45<45:20,  3.60it/s]                                                     23%|██▎       | 2982/12776 [28:45<45:20,  3.60it/s] 23%|██▎       | 2983/12776 [28:45<43:35,  3.74it/s]                                                     23%|██▎       | 2983/12776 [28:45<43:35,  3.74it/s] 23%|██▎       | 2984/12776 [28:45<44:09,  3.70it/s]                                                     23%|██▎       | 2984/12776 [28:45<44:09,  3.70it/s] 23%|██▎       | 2985/12776 [28:46<42:10,  3.87it/s]                                                     23%|██▎       | 2985/12776 [28:46<42:10,  3.87it/s] 23%|██▎       | 2986/12776 [28:46<40:21,  4.04it/s]                                                     23%|██▎       | 2986/12776 [28:46<40:21,  4.04it/s] 23%|██▎       | 2987/12776 [28:46<38:50,  4.20it/s]                                                     23%|██▎       | 2987/12776 [28:46<38:50,  4.20it/s] 23%|██▎       | 2988/12776 [28:46<37:36,  4.34it/s]                                                     23%|██▎       | 2988/12776 [28:46<37:36,  4.34it/s] 23%|██▎       | 2989/12776 [28:47<38:59,  4.18it/s]                                                     23%|██▎       | 2989/12776 [28:47<38:59,  4.18it/s] 23%|██▎       | 2990/12776 [28:47<37:44,  4.32it/s]                                                     23%|██▎       | 2990/12776 [28:47<37:44,  4.32it/s] 23%|██▎       | 2991/12776 [28:47<36:40,  4.45it/s]                                                     23%|██▎       | 2991/12776 [28:47<36:40,  4.45it/s] 23%|██▎       | 2992/12776 [28:47<35:40,  4.57it/s]                                                     23%|██▎       | 2992/12776 [28:47<35:40,  4.57it/s] 23%|██▎       | 2993/12776 [28:47<34:53,  4.67it/s]                                                     23%|██▎       | 2993/12776 [28:47<34:53,  4.67it/s] 23%|██▎       | 2994/12776 [28:48<39:38,  4.11it/s]                                                     23%|██▎       | 2994/12776 [28:48<39:38,  4.11it/s] 23%|██▎       | 2995/12776 [28:48<37:24,  4.36it/s]                                                     23%|██▎       | 2995/12776 [28:48<37:24,  4.36it/s] 23%|██▎       | 2996/12776 [28:48<35:44,  4.56it/s]                                                     23%|██▎       | 2996/12776 [28:48<35:44,  4.56it/s] 23%|██▎       | 2997/12776 [28:48<34:25,  4.73it/s]                                                     23%|██▎       | 2997/12776 [28:48<34:25,  4.73it/s] 23%|██▎       | 2998/12776 [28:49<33:10,  4.91it/s]                                                     23%|██▎       | 2998/12776 [28:49<33:10,  4.91it/s] 23%|██▎       | 2999/12776 [28:49<32:14,  5.05it/s]                                                     23%|██▎       | 2999/12776 [28:49<32:14,  5.05it/s] 23%|██▎       | 3000/12776 [28:49<55:10,  2.95it/s]                                                     23%|██▎       | 3000/12776 [28:49<55:10,  2.95it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.7953, 'grad_norm': 2.1340198516845703, 'learning_rate': 0.00024068914956011726, 'epoch': 0.46}
+{'loss': 0.9377, 'grad_norm': 1.94748055934906, 'learning_rate': 0.00024066471163245354, 'epoch': 0.46}
+{'loss': 0.9098, 'grad_norm': 1.8639601469039917, 'learning_rate': 0.00024064027370478982, 'epoch': 0.46}
+{'loss': 1.1671, 'grad_norm': 1.7481262683868408, 'learning_rate': 0.00024061583577712607, 'epoch': 0.46}
+{'loss': 1.5963, 'grad_norm': 2.4911282062530518, 'learning_rate': 0.00024059139784946235, 'epoch': 0.46}
+{'loss': 0.8903, 'grad_norm': 3.1671178340911865, 'learning_rate': 0.00024056695992179862, 'epoch': 0.46}
+{'loss': 1.2117, 'grad_norm': 2.2914581298828125, 'learning_rate': 0.00024054252199413488, 'epoch': 0.46}
+{'loss': 1.4033, 'grad_norm': 2.8251593112945557, 'learning_rate': 0.00024051808406647113, 'epoch': 0.46}
+{'loss': 0.9579, 'grad_norm': 2.4043309688568115, 'learning_rate': 0.0002404936461388074, 'epoch': 0.46}
+{'loss': 0.833, 'grad_norm': 2.894033908843994, 'learning_rate': 0.00024046920821114366, 'epoch': 0.46}
+{'loss': 1.7688, 'grad_norm': 2.439354658126831, 'learning_rate': 0.00024044477028347994, 'epoch': 0.46}
+{'loss': 1.5622, 'grad_norm': 3.430408000946045, 'learning_rate': 0.00024042033235581621, 'epoch': 0.46}
+{'loss': 1.3692, 'grad_norm': 1.8975231647491455, 'learning_rate': 0.00024039589442815246, 'epoch': 0.46}
+{'loss': 1.1524, 'grad_norm': 1.8430036306381226, 'learning_rate': 0.00024037145650048874, 'epoch': 0.46}
+{'loss': 0.8915, 'grad_norm': 3.267629861831665, 'learning_rate': 0.00024034701857282502, 'epoch': 0.46}
+{'loss': 1.2047, 'grad_norm': 2.6302919387817383, 'learning_rate': 0.00024032258064516125, 'epoch': 0.46}
+{'loss': 1.3952, 'grad_norm': 2.9128334522247314, 'learning_rate': 0.00024029814271749752, 'epoch': 0.46}
+{'loss': 1.4002, 'grad_norm': 1.4925674200057983, 'learning_rate': 0.0002402737047898338, 'epoch': 0.46}
+{'loss': 0.3536, 'grad_norm': 0.5663087964057922, 'learning_rate': 0.00024024926686217005, 'epoch': 0.46}
+{'loss': 0.4303, 'grad_norm': 0.5711171627044678, 'learning_rate': 0.00024022482893450633, 'epoch': 0.46}
+{'loss': 0.4472, 'grad_norm': 0.7608370184898376, 'learning_rate': 0.0002402003910068426, 'epoch': 0.46}
+{'loss': 0.4728, 'grad_norm': 0.5618414282798767, 'learning_rate': 0.00024017595307917886, 'epoch': 0.46}
+{'loss': 0.4194, 'grad_norm': 0.5996711254119873, 'learning_rate': 0.00024015151515151514, 'epoch': 0.46}
+{'loss': 0.4361, 'grad_norm': 0.6966267228126526, 'learning_rate': 0.00024012707722385142, 'epoch': 0.46}
+{'loss': 0.3586, 'grad_norm': 0.6041519045829773, 'learning_rate': 0.00024010263929618764, 'epoch': 0.46}
+{'loss': 0.3958, 'grad_norm': 0.6143047213554382, 'learning_rate': 0.00024007820136852392, 'epoch': 0.46}
+{'loss': 0.4222, 'grad_norm': 0.727200448513031, 'learning_rate': 0.0002400537634408602, 'epoch': 0.46}
+{'loss': 0.3269, 'grad_norm': 0.7016879320144653, 'learning_rate': 0.00024002932551319645, 'epoch': 0.46}
+{'loss': 0.4065, 'grad_norm': 1.3324402570724487, 'learning_rate': 0.00024000488758553273, 'epoch': 0.46}
+{'loss': 0.4625, 'grad_norm': 0.8815485835075378, 'learning_rate': 0.000239980449657869, 'epoch': 0.46}
+{'loss': 0.5971, 'grad_norm': 1.0616453886032104, 'learning_rate': 0.00023995601173020523, 'epoch': 0.46}
+{'loss': 0.4368, 'grad_norm': 0.8224049806594849, 'learning_rate': 0.0002399315738025415, 'epoch': 0.46}
+{'loss': 0.5855, 'grad_norm': 1.420790195465088, 'learning_rate': 0.0002399071358748778, 'epoch': 0.46}
+{'loss': 0.6298, 'grad_norm': 1.2988914251327515, 'learning_rate': 0.00023988269794721404, 'epoch': 0.46}
+{'loss': 0.6326, 'grad_norm': 2.2684569358825684, 'learning_rate': 0.00023985826001955032, 'epoch': 0.46}
+{'loss': 0.7841, 'grad_norm': 2.2783150672912598, 'learning_rate': 0.0002398338220918866, 'epoch': 0.46}
+{'loss': 0.7821, 'grad_norm': 1.206459879875183, 'learning_rate': 0.00023980938416422285, 'epoch': 0.46}
+{'loss': 1.0646, 'grad_norm': 3.7730114459991455, 'learning_rate': 0.00023978494623655913, 'epoch': 0.46}
+{'loss': 0.452, 'grad_norm': 1.3835830688476562, 'learning_rate': 0.0002397605083088954, 'epoch': 0.47}
+{'loss': 0.9363, 'grad_norm': 2.1477081775665283, 'learning_rate': 0.00023973607038123163, 'epoch': 0.47}
+{'loss': 0.5786, 'grad_norm': 1.1559685468673706, 'learning_rate': 0.0002397116324535679, 'epoch': 0.47}
+{'loss': 0.5976, 'grad_norm': 2.111488103866577, 'learning_rate': 0.00023968719452590418, 'epoch': 0.47}
+{'loss': 0.5426, 'grad_norm': 1.2530162334442139, 'learning_rate': 0.00023966275659824044, 'epoch': 0.47}
+{'loss': 1.0584, 'grad_norm': 4.318511962890625, 'learning_rate': 0.00023963831867057671, 'epoch': 0.47}
+{'loss': 0.5268, 'grad_norm': 1.5593619346618652, 'learning_rate': 0.000239613880742913, 'epoch': 0.47}
+{'loss': 0.9127, 'grad_norm': 1.6333059072494507, 'learning_rate': 0.00023958944281524924, 'epoch': 0.47}
+{'loss': 1.2134, 'grad_norm': 1.942104697227478, 'learning_rate': 0.00023956500488758552, 'epoch': 0.47}
+{'loss': 1.0458, 'grad_norm': 3.2156219482421875, 'learning_rate': 0.00023954056695992177, 'epoch': 0.47}
+{'loss': 0.7571, 'grad_norm': 1.9920734167099, 'learning_rate': 0.00023951612903225802, 'epoch': 0.47}
+{'loss': 0.7142, 'grad_norm': 4.246744632720947, 'learning_rate': 0.0002394916911045943, 'epoch': 0.47}
+{'loss': 0.8802, 'grad_norm': 1.2510920763015747, 'learning_rate': 0.00023946725317693058, 'epoch': 0.47}
+{'loss': 0.9898, 'grad_norm': 1.446738839149475, 'learning_rate': 0.00023944281524926683, 'epoch': 0.47}
+{'loss': 1.2076, 'grad_norm': 2.1608774662017822, 'learning_rate': 0.0002394183773216031, 'epoch': 0.47}
+{'loss': 0.8113, 'grad_norm': 2.4812753200531006, 'learning_rate': 0.0002393939393939394, 'epoch': 0.47}
+{'loss': 1.0752, 'grad_norm': 2.0435714721679688, 'learning_rate': 0.0002393695014662756, 'epoch': 0.47}
+{'loss': 1.6354, 'grad_norm': 3.043316602706909, 'learning_rate': 0.0002393450635386119, 'epoch': 0.47}
+{'loss': 1.0435, 'grad_norm': 3.5616087913513184, 'learning_rate': 0.00023932062561094817, 'epoch': 0.47}
+{'loss': 1.6489, 'grad_norm': 3.942680835723877, 'learning_rate': 0.00023929618768328442, 'epoch': 0.47}
+{'loss': 1.0482, 'grad_norm': 1.8026114702224731, 'learning_rate': 0.0002392717497556207, 'epoch': 0.47}
+{'loss': 1.5673, 'grad_norm': 3.0630009174346924, 'learning_rate': 0.00023924731182795698, 'epoch': 0.47}
+{'loss': 1.3448, 'grad_norm': 2.128530740737915, 'learning_rate': 0.00023922287390029323, 'epoch': 0.47}
+{'loss': 1.4261, 'grad_norm': 2.782782554626465, 'learning_rate': 0.0002391984359726295, 'epoch': 0.47}
+{'loss': 1.6046, 'grad_norm': 2.1580779552459717, 'learning_rate': 0.00023917399804496579, 'epoch': 0.47}
+{'loss': 0.9642, 'grad_norm': 1.6903536319732666, 'learning_rate': 0.000239149560117302, 'epoch': 0.47}
+{'loss': 0.7686, 'grad_norm': 3.0075113773345947, 'learning_rate': 0.0002391251221896383, 'epoch': 0.47}
+{'loss': 1.2032, 'grad_norm': 1.6930934190750122, 'learning_rate': 0.00023910068426197457, 'epoch': 0.47}
+{'loss': 1.0076, 'grad_norm': 2.7833595275878906, 'learning_rate': 0.00023907624633431082, 'epoch': 0.47}
+{'loss': 0.9728, 'grad_norm': 1.994335651397705, 'learning_rate': 0.0002390518084066471, 'epoch': 0.47}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:07,  6.07it/s][A
+  0%|          | 3/774 [00:00<02:49,  4.54it/s][A
+  1%|          | 4/774 [00:00<03:16,  3.93it/s][A
+  1%|          | 5/774 [00:01<03:15,  3.94it/s][A
+  1%|          | 6/774 [00:01<03:26,  3.71it/s][A
+  1%|          | 7/774 [00:01<03:25,  3.74it/s][A
+  1%|          | 8/774 [00:02<03:27,  3.69it/s][A
+  1%|          | 9/774 [00:02<03:16,  3.90it/s][A
+  1%|▏         | 10/774 [00:02<03:16,  3.90it/s][A
+  1%|▏         | 11/774 [00:02<03:31,  3.60it/s][A
+  2%|▏         | 12/774 [00:03<03:19,  3.82it/s][A
+  2%|▏         | 13/774 [00:03<03:11,  3.97it/s][A
+  2%|▏         | 14/774 [00:03<03:23,  3.73it/s][A
+  2%|▏         | 15/774 [00:03<03:40,  3.44it/s][A
+  2%|▏         | 16/774 [00:04<03:38,  3.47it/s][A
+  2%|▏         | 17/774 [00:04<03:15,  3.87it/s][A
+  2%|▏         | 18/774 [00:04<03:07,  4.03it/s][A
+  2%|▏         | 19/774 [00:04<03:17,  3.83it/s][A
+  3%|▎         | 20/774 [00:05<03:12,  3.92it/s][A
+  3%|▎         | 21/774 [00:05<03:16,  3.82it/s][A
+  3%|▎         | 22/774 [00:05<03:22,  3.72it/s][A
+  3%|▎         | 23/774 [00:06<03:33,  3.52it/s][A
+  3%|▎         | 24/774 [00:06<03:30,  3.56it/s][A
+  3%|▎         | 25/774 [00:06<03:38,  3.42it/s][A
+  3%|▎         | 26/774 [00:06<03:37,  3.44it/s][A
+  3%|▎         | 27/774 [00:07<03:36,  3.46it/s][A
+  4%|▎         | 28/774 [00:07<03:41,  3.37it/s][A
+  4%|▎         | 29/774 [00:07<03:45,  3.30it/s][A
+  4%|▍         | 30/774 [00:08<03:32,  3.49it/s][A
+  4%|▍         | 31/774 [00:08<03:32,  3.49it/s][A
+  4%|▍         | 32/774 [00:08<04:21,  2.84it/s][A
+  4%|▍         | 33/774 [00:09<04:04,  3.03it/s][A
+  4%|▍         | 34/774 [00:09<03:48,  3.24it/s][A
+  5%|▍         | 35/774 [00:09<03:51,  3.20it/s][A
+  5%|▍         | 36/774 [00:10<03:50,  3.21it/s][A
+  5%|▍         | 37/774 [00:10<03:49,  3.21it/s][A
+  5%|▍         | 38/774 [00:10<03:39,  3.36it/s][A
+  5%|▌         | 39/774 [00:10<03:22,  3.63it/s][A
+  5%|▌         | 40/774 [00:11<03:27,  3.55it/s][A
+  5%|▌         | 41/774 [00:11<03:24,  3.58it/s][A
+  5%|▌         | 42/774 [00:11<03:13,  3.79it/s][A
+  6%|▌         | 43/774 [00:11<03:25,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:28,  3.50it/s][A
+  6%|▌         | 45/774 [00:12<03:17,  3.69it/s][A
+  6%|▌         | 46/774 [00:12<03:02,  4.00it/s][A
+  6%|▌         | 47/774 [00:12<02:50,  4.27it/s][A
+  6%|▌         | 48/774 [00:13<02:51,  4.23it/s][A
+  6%|▋         | 49/774 [00:13<02:53,  4.17it/s][A
+  6%|▋         | 50/774 [00:13<02:57,  4.09it/s][A
+  7%|▋         | 51/774 [00:13<02:59,  4.04it/s][A
+  7%|▋         | 52/774 [00:14<02:57,  4.08it/s][A
+  7%|▋         | 53/774 [00:14<03:03,  3.92it/s][A
+  7%|▋         | 54/774 [00:14<03:08,  3.82it/s][A
+  7%|▋         | 55/774 [00:15<03:18,  3.63it/s][A
+  7%|▋         | 56/774 [00:15<03:17,  3.63it/s][A
+  7%|▋         | 57/774 [00:15<03:22,  3.54it/s][A
+  7%|▋         | 58/774 [00:15<03:22,  3.53it/s][A
+  8%|▊         | 59/774 [00:16<03:07,  3.82it/s][A
+  8%|▊         | 60/774 [00:16<02:53,  4.12it/s][A
+  8%|▊         | 61/774 [00:16<02:31,  4.70it/s][A
+  8%|▊         | 62/774 [00:16<02:30,  4.74it/s][A
+  8%|▊         | 63/774 [00:16<02:59,  3.97it/s][A
+  8%|▊         | 64/774 [00:17<02:49,  4.20it/s][A
+  8%|▊         | 65/774 [00:17<02:50,  4.17it/s][A
+  9%|▊         | 66/774 [00:17<02:47,  4.23it/s][A
+  9%|▊         | 67/774 [00:17<02:40,  4.42it/s][A
+  9%|▉         | 68/774 [00:18<02:36,  4.51it/s][A
+  9%|▉         | 69/774 [00:18<02:28,  4.76it/s][A
+  9%|▉         | 70/774 [00:18<02:36,  4.50it/s][A
+  9%|▉         | 71/774 [00:18<02:31,  4.64it/s][A
+  9%|▉         | 72/774 [00:18<02:42,  4.31it/s][A
+  9%|▉         | 73/774 [00:19<02:52,  4.07it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.92it/s][A
+ 10%|▉         | 75/774 [00:19<03:04,  3.79it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.86it/s][A
+ 10%|▉         | 77/774 [00:20<03:12,  3.61it/s][A
+ 10%|█         | 78/774 [00:20<02:53,  4.01it/s][A
+ 10%|█         | 79/774 [00:20<02:41,  4.29it/s][A
+ 10%|█         | 80/774 [00:20<02:38,  4.38it/s][A
+ 10%|█         | 81/774 [00:21<02:16,  5.06it/s][A
+ 11%|█         | 82/774 [00:21<02:17,  5.03it/s][A
+ 11%|█         | 83/774 [00:21<02:20,  4.93it/s][A
+ 11%|█         | 84/774 [00:21<02:26,  4.70it/s][A
+ 11%|█         | 85/774 [00:22<02:35,  4.42it/s][A
+ 11%|█         | 86/774 [00:22<02:42,  4.23it/s][A
+ 11%|█         | 87/774 [00:22<02:43,  4.19it/s][A
+ 11%|█▏        | 88/774 [00:22<02:31,  4.53it/s][A
+ 11%|█▏        | 89/774 [00:22<02:25,  4.70it/s][A
+ 12%|█▏        | 90/774 [00:23<02:34,  4.44it/s][A
+ 12%|█▏        | 91/774 [00:23<02:48,  4.04it/s][A
+ 12%|█▏        | 92/774 [00:23<03:01,  3.75it/s][A
+ 12%|█▏        | 93/774 [00:24<02:57,  3.83it/s][A
+ 12%|█▏        | 94/774 [00:24<03:01,  3.74it/s][A
+ 12%|█▏        | 95/774 [00:24<03:00,  3.76it/s][A
+ 12%|█▏        | 96/774 [00:24<02:56,  3.84it/s][A
+ 13%|█▎        | 97/774 [00:24<02:39,  4.25it/s][A
+ 13%|█▎        | 98/774 [00:25<02:33,  4.41it/s][A
+ 13%|█▎        | 99/774 [00:25<02:45,  4.07it/s][A
+ 13%|█▎        | 100/774 [00:25<02:58,  3.78it/s][A
+ 13%|█▎        | 101/774 [00:26<03:03,  3.67it/s][A
+ 13%|█▎        | 102/774 [00:26<03:15,  3.43it/s][A
+ 13%|█▎        | 103/774 [00:26<03:17,  3.39it/s][A
+ 13%|█▎        | 104/774 [00:26<03:16,  3.41it/s][A
+ 14%|█▎        | 105/774 [00:27<03:14,  3.44it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.12it/s][A
+ 14%|█▍        | 107/774 [00:28<03:46,  2.95it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.06it/s][A
+ 14%|█▍        | 109/774 [00:28<03:34,  3.09it/s][A
+ 14%|█▍        | 110/774 [00:28<03:24,  3.25it/s][A
+ 14%|█▍        | 111/774 [00:29<03:23,  3.26it/s][A
+ 14%|█▍        | 112/774 [00:29<03:12,  3.44it/s][A
+ 15%|█▍        | 113/774 [00:29<03:18,  3.34it/s][A
+ 15%|█▍        | 114/774 [00:30<03:22,  3.27it/s][A
+ 15%|█▍        | 115/774 [00:30<03:15,  3.37it/s][A
+ 15%|█▍        | 116/774 [00:30<03:00,  3.64it/s][A
+ 15%|█▌        | 117/774 [00:30<03:06,  3.51it/s][A
+ 15%|█▌        | 118/774 [00:31<03:05,  3.53it/s][A
+ 15%|█▌        | 119/774 [00:31<02:58,  3.68it/s][A
+ 16%|█▌        | 120/774 [00:31<03:08,  3.48it/s][A
+ 16%|█▌        | 121/774 [00:32<03:02,  3.57it/s][A
+ 16%|█▌        | 122/774 [00:32<03:08,  3.47it/s][A
+ 16%|█▌        | 123/774 [00:32<02:59,  3.63it/s][A
+ 16%|█▌        | 124/774 [00:32<03:00,  3.60it/s][A
+ 16%|█▌        | 125/774 [00:33<03:00,  3.59it/s][A
+ 16%|█▋        | 126/774 [00:33<03:08,  3.43it/s][A
+ 16%|█▋        | 127/774 [00:33<03:19,  3.25it/s][A
+ 17%|█▋        | 128/774 [00:34<03:09,  3.41it/s][A
+ 17%|█▋        | 129/774 [00:34<03:09,  3.40it/s][A
+ 17%|█▋        | 130/774 [00:34<03:16,  3.27it/s][A
+ 17%|█▋        | 131/774 [00:34<03:07,  3.43it/s][A
+ 17%|█▋        | 132/774 [00:35<03:08,  3.41it/s][A
+ 17%|█▋        | 133/774 [00:35<03:03,  3.50it/s][A
+ 17%|█▋        | 134/774 [00:35<03:03,  3.49it/s][A
+ 17%|█▋        | 135/774 [00:36<03:19,  3.20it/s][A
+ 18%|█▊        | 136/774 [00:36<03:27,  3.08it/s][A
+ 18%|█▊        | 137/774 [00:36<03:27,  3.07it/s][A
+ 18%|█▊        | 138/774 [00:37<03:23,  3.13it/s][A
+ 18%|█▊        | 139/774 [00:37<03:23,  3.11it/s][A
+ 18%|█▊        | 140/774 [00:37<03:19,  3.18it/s][A
+ 18%|█▊        | 141/774 [00:38<03:12,  3.30it/s][A
+ 18%|█▊        | 142/774 [00:38<03:22,  3.13it/s][A
+ 18%|█▊        | 143/774 [00:38<03:18,  3.17it/s][A
+ 19%|█▊        | 144/774 [00:39<03:06,  3.37it/s][A
+ 19%|█▊        | 145/774 [00:39<03:00,  3.49it/s][A
+ 19%|█▉        | 146/774 [00:39<02:49,  3.71it/s][A
+ 19%|█▉        | 147/774 [00:39<02:40,  3.91it/s][A
+ 19%|█▉        | 148/774 [00:40<02:50,  3.68it/s][A
+ 19%|█▉        | 149/774 [00:40<03:04,  3.40it/s][A
+ 19%|█▉        | 150/774 [00:40<03:06,  3.34it/s][A
+ 20%|█▉        | 151/774 [00:40<02:56,  3.53it/s][A
+ 20%|█▉        | 152/774 [00:41<02:47,  3.70it/s][A
+ 20%|█▉        | 153/774 [00:41<02:55,  3.55it/s][A
+ 20%|█▉        | 154/774 [00:41<02:50,  3.64it/s][A
+ 20%|██        | 155/774 [00:42<02:47,  3.70it/s][A
+ 20%|██        | 156/774 [00:42<02:41,  3.83it/s][A
+ 20%|██        | 157/774 [00:42<02:33,  4.01it/s][A
+ 20%|██        | 158/774 [00:42<02:38,  3.89it/s][A
+ 21%|██        | 159/774 [00:43<02:39,  3.84it/s][A
+ 21%|██        | 160/774 [00:43<02:32,  4.03it/s][A
+ 21%|██        | 161/774 [00:43<02:42,  3.78it/s][A
+ 21%|██        | 162/774 [00:43<02:47,  3.64it/s][A
+ 21%|██        | 163/774 [00:44<02:46,  3.67it/s][A
+ 21%|██        | 164/774 [00:44<02:40,  3.80it/s][A
+ 21%|██▏       | 165/774 [00:44<02:38,  3.83it/s][A
+ 21%|██▏       | 166/774 [00:44<02:41,  3.76it/s][A
+ 22%|██▏       | 167/774 [00:45<02:44,  3.69it/s][A
+ 22%|██▏       | 168/774 [00:45<02:36,  3.88it/s][A
+ 22%|██▏       | 169/774 [00:45<02:28,  4.06it/s][A
+ 22%|██▏       | 170/774 [00:45<02:37,  3.83it/s][A
+ 22%|██▏       | 171/774 [00:46<02:48,  3.58it/s][A
+ 22%|██▏       | 172/774 [00:46<02:55,  3.43it/s][A
+ 22%|██▏       | 173/774 [00:46<02:51,  3.50it/s][A
+ 22%|██▏       | 174/774 [00:47<02:44,  3.64it/s][A
+ 23%|██▎       | 175/774 [00:47<02:45,  3.62it/s][A
+ 23%|██▎       | 176/774 [00:47<02:38,  3.77it/s][A
+ 23%|██▎       | 177/774 [00:47<02:51,  3.47it/s][A
+ 23%|██▎       | 178/774 [00:48<02:36,  3.82it/s][A
+ 23%|██▎       | 179/774 [00:48<02:21,  4.21it/s][A
+ 23%|██▎       | 180/774 [00:48<02:16,  4.36it/s][A
+ 23%|██▎       | 181/774 [00:48<02:19,  4.25it/s][A
+ 24%|██▎       | 182/774 [00:49<02:23,  4.13it/s][A
+ 24%|██▎       | 183/774 [00:49<02:24,  4.09it/s][A
+ 24%|██▍       | 184/774 [00:49<02:34,  3.81it/s][A
+ 24%|██▍       | 185/774 [00:49<02:43,  3.60it/s][A
+ 24%|██▍       | 186/774 [00:50<02:41,  3.64it/s][A
+ 24%|██▍       | 187/774 [00:50<02:35,  3.78it/s][A
+ 24%|██▍       | 188/774 [00:50<02:32,  3.83it/s][A
+ 24%|██▍       | 189/774 [00:50<02:31,  3.87it/s][A
+ 25%|██▍       | 190/774 [00:51<02:26,  3.99it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.83it/s][A
+ 25%|██▍       | 192/774 [00:51<02:36,  3.72it/s][A
+ 25%|██▍       | 193/774 [00:52<02:39,  3.63it/s][A
+ 25%|██▌       | 194/774 [00:52<02:48,  3.44it/s][A
+ 25%|██▌       | 195/774 [00:52<02:56,  3.27it/s][A
+ 25%|██▌       | 196/774 [00:52<02:56,  3.27it/s][A
+ 25%|██▌       | 197/774 [00:53<02:54,  3.31it/s][A
+ 26%|██▌       | 198/774 [00:53<02:44,  3.50it/s][A
+ 26%|██▌       | 199/774 [00:53<02:45,  3.47it/s][A
+ 26%|██▌       | 200/774 [00:54<02:40,  3.58it/s][A
+ 26%|██▌       | 201/774 [00:54<02:37,  3.63it/s][A
+ 26%|██▌       | 202/774 [00:54<02:34,  3.70it/s][A
+ 26%|██▌       | 203/774 [00:54<02:26,  3.89it/s][A
+ 26%|██▋       | 204/774 [00:55<02:30,  3.78it/s][A
+ 26%|██▋       | 205/774 [00:55<02:39,  3.56it/s][A
+ 27%|██▋       | 206/774 [00:55<02:35,  3.66it/s][A
+ 27%|██▋       | 207/774 [00:55<02:34,  3.68it/s][A
+ 27%|██▋       | 208/774 [00:56<02:34,  3.65it/s][A
+ 27%|██▋       | 209/774 [00:56<02:32,  3.71it/s][A
+ 27%|██▋       | 210/774 [00:56<02:31,  3.72it/s][A
+ 27%|██▋       | 211/774 [00:57<02:28,  3.80it/s][A
+ 27%|██▋       | 212/774 [00:57<02:17,  4.09it/s][A
+ 28%|██▊       | 213/774 [00:57<02:02,  4.58it/s][A
+ 28%|██▊       | 214/774 [00:57<02:03,  4.53it/s][A
+ 28%|██▊       | 215/774 [00:57<02:03,  4.53it/s][A
+ 28%|██▊       | 216/774 [00:58<02:02,  4.56it/s][A
+ 28%|██▊       | 217/774 [00:58<02:06,  4.42it/s][A
+ 28%|██▊       | 218/774 [00:58<02:12,  4.21it/s][A
+ 28%|██▊       | 219/774 [00:58<02:20,  3.96it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.98it/s][A
+ 29%|██▊       | 221/774 [00:59<02:24,  3.82it/s][A
+ 29%|██▊       | 222/774 [00:59<02:33,  3.59it/s][A
+ 29%|██▉       | 223/774 [01:00<02:50,  3.23it/s][A
+ 29%|██▉       | 224/774 [01:00<03:00,  3.05it/s][A
+ 29%|██▉       | 225/774 [01:00<03:11,  2.86it/s][A
+ 29%|██▉       | 226/774 [01:01<03:14,  2.82it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.85it/s][A
+ 29%|██▉       | 228/774 [01:01<03:04,  2.96it/s][A
+ 30%|██▉       | 229/774 [01:02<03:18,  2.75it/s][A
+ 30%|██▉       | 230/774 [01:02<03:04,  2.95it/s][A
+ 30%|██▉       | 231/774 [01:02<03:01,  2.99it/s][A
+ 30%|██▉       | 232/774 [01:03<02:53,  3.13it/s][A
+ 30%|███       | 233/774 [01:03<03:07,  2.88it/s][A
+ 30%|███       | 234/774 [01:03<03:11,  2.82it/s][A
+ 30%|███       | 235/774 [01:04<03:10,  2.83it/s][A
+ 30%|███       | 236/774 [01:04<03:13,  2.78it/s][A
+ 31%|███       | 237/774 [01:05<03:10,  2.82it/s][A
+ 31%|███       | 238/774 [01:05<03:01,  2.95it/s][A
+ 31%|███       | 239/774 [01:05<02:59,  2.98it/s][A
+ 31%|███       | 240/774 [01:05<02:58,  2.98it/s][A
+ 31%|███       | 241/774 [01:06<03:02,  2.92it/s][A
+ 31%|███▏      | 242/774 [01:06<03:13,  2.75it/s][A
+ 31%|███▏      | 243/774 [01:07<03:23,  2.61it/s][A
+ 32%|███▏      | 244/774 [01:07<03:17,  2.68it/s][A
+ 32%|███▏      | 245/774 [01:07<03:09,  2.79it/s][A
+ 32%|███▏      | 246/774 [01:08<03:08,  2.80it/s][A
+ 32%|███▏      | 247/774 [01:08<03:46,  2.32it/s][A
+ 32%|███▏      | 248/774 [01:09<03:51,  2.27it/s][A
+ 32%|███▏      | 249/774 [01:09<03:27,  2.52it/s][A
+ 32%|███▏      | 250/774 [01:09<03:20,  2.61it/s][A
+ 32%|███▏      | 251/774 [01:10<03:18,  2.63it/s][A
+ 33%|███▎      | 252/774 [01:10<03:13,  2.70it/s][A
+ 33%|███▎      | 253/774 [01:10<03:11,  2.71it/s][A
+ 33%|███▎      | 254/774 [01:11<03:07,  2.78it/s][A
+ 33%|███▎      | 255/774 [01:11<03:02,  2.84it/s][A
+ 33%|███▎      | 256/774 [01:11<02:58,  2.91it/s][A
+ 33%|███▎      | 257/774 [01:12<02:56,  2.94it/s][A
+ 33%|███▎      | 258/774 [01:12<02:41,  3.20it/s][A
+ 33%|███▎      | 259/774 [01:12<02:24,  3.58it/s][A
+ 34%|███▎      | 260/774 [01:13<02:23,  3.58it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.44it/s][A
+ 34%|███▍      | 262/774 [01:13<02:14,  3.82it/s][A
+ 34%|███▍      | 263/774 [01:13<02:06,  4.04it/s][A
+ 34%|███▍      | 264/774 [01:14<02:14,  3.79it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.93it/s][A
+ 34%|███▍      | 266/774 [01:14<02:03,  4.11it/s][A
+ 34%|███▍      | 267/774 [01:14<02:02,  4.15it/s][A
+ 35%|███▍      | 268/774 [01:15<02:09,  3.92it/s][A
+ 35%|███▍      | 269/774 [01:15<02:13,  3.79it/s][A
+ 35%|███▍      | 270/774 [01:15<02:19,  3.62it/s][A
+ 35%|███▌      | 271/774 [01:15<02:15,  3.70it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  4.00it/s][A
+ 35%|███▌      | 273/774 [01:16<02:01,  4.11it/s][A
+ 35%|███▌      | 274/774 [01:16<02:06,  3.96it/s][A
+ 36%|███▌      | 275/774 [01:16<01:59,  4.16it/s][A
+ 36%|███▌      | 276/774 [01:17<01:53,  4.38it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.23it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.15it/s][A
+ 36%|███▌      | 279/774 [01:17<01:52,  4.39it/s][A
+ 36%|███▌      | 280/774 [01:17<01:54,  4.30it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.92it/s][A
+ 36%|███▋      | 282/774 [01:18<02:17,  3.59it/s][A
+ 37%|███▋      | 283/774 [01:18<02:11,  3.74it/s][A
+ 37%|███▋      | 284/774 [01:19<02:12,  3.71it/s][A
+ 37%|███▋      | 285/774 [01:19<02:05,  3.91it/s][A
+ 37%|███▋      | 286/774 [01:19<02:00,  4.05it/s][A
+ 37%|███▋      | 287/774 [01:19<02:11,  3.70it/s][A
+ 37%|███▋      | 288/774 [01:20<02:15,  3.58it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.64it/s][A
+ 37%|███▋      | 290/774 [01:20<02:09,  3.73it/s][A
+ 38%|███▊      | 291/774 [01:20<02:08,  3.75it/s][A
+ 38%|███▊      | 292/774 [01:21<02:04,  3.86it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.19it/s][A
+ 38%|███▊      | 294/774 [01:21<01:51,  4.31it/s][A
+ 38%|███▊      | 295/774 [01:21<01:49,  4.37it/s][A
+ 38%|███▊      | 296/774 [01:22<01:44,  4.58it/s][A
+ 38%|███▊      | 297/774 [01:22<01:38,  4.83it/s][A
+ 39%|███▊      | 298/774 [01:22<01:42,  4.64it/s][A
+ 39%|███▊      | 299/774 [01:22<01:46,  4.45it/s][A
+ 39%|███▉      | 300/774 [01:22<01:53,  4.17it/s][A
+ 39%|███▉      | 301/774 [01:23<01:46,  4.45it/s][A
+ 39%|███▉      | 302/774 [01:23<01:40,  4.70it/s][A
+ 39%|███▉      | 303/774 [01:23<01:36,  4.87it/s][A
+ 39%|███▉      | 304/774 [01:23<01:26,  5.46it/s][A
+ 39%|███▉      | 305/774 [01:23<01:26,  5.44it/s][A
+ 40%|███▉      | 306/774 [01:24<01:38,  4.75it/s][A
+ 40%|███▉      | 307/774 [01:24<01:43,  4.51it/s][A
+ 40%|███▉      | 308/774 [01:24<01:38,  4.71it/s][A
+ 40%|███▉      | 309/774 [01:24<01:38,  4.73it/s][A
+ 40%|████      | 310/774 [01:25<01:43,  4.47it/s][A
+ 40%|████      | 311/774 [01:25<01:42,  4.53it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.65it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.63it/s][A
+ 41%|████      | 314/774 [01:25<01:40,  4.57it/s][A
+ 41%|████      | 315/774 [01:26<01:49,  4.19it/s][A
+ 41%|████      | 316/774 [01:26<01:40,  4.55it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.89it/s][A
+ 41%|████      | 318/774 [01:26<01:37,  4.70it/s][A
+ 41%|████      | 319/774 [01:26<01:39,  4.59it/s][A
+ 41%|████▏     | 320/774 [01:27<01:40,  4.54it/s][A
+ 41%|████▏     | 321/774 [01:27<01:32,  4.88it/s][A
+ 42%|████▏     | 322/774 [01:27<01:27,  5.19it/s][A
+ 42%|████▏     | 323/774 [01:27<01:18,  5.76it/s][A
+ 42%|████▏     | 324/774 [01:27<01:25,  5.28it/s][A
+ 42%|████▏     | 325/774 [01:28<01:29,  5.02it/s][A
+ 42%|████▏     | 326/774 [01:28<01:25,  5.21it/s][A
+ 42%|████▏     | 327/774 [01:28<01:29,  5.00it/s][A
+ 42%|████▏     | 328/774 [01:28<01:27,  5.12it/s][A
+ 43%|████▎     | 329/774 [01:28<01:35,  4.67it/s][A
+ 43%|████▎     | 330/774 [01:29<01:31,  4.88it/s][A
+ 43%|████▎     | 331/774 [01:29<01:22,  5.39it/s][A
+ 43%|████▎     | 332/774 [01:29<01:20,  5.52it/s][A
+ 43%|████▎     | 333/774 [01:29<01:23,  5.31it/s][A
+ 43%|████▎     | 334/774 [01:29<01:26,  5.06it/s][A
+ 43%|████▎     | 335/774 [01:30<01:27,  5.03it/s][A
+ 43%|████▎     | 336/774 [01:30<01:26,  5.06it/s][A
+ 44%|████▎     | 337/774 [01:30<01:20,  5.46it/s][A
+ 44%|████▎     | 338/774 [01:30<01:14,  5.82it/s][A
+ 44%|████▍     | 339/774 [01:30<01:10,  6.19it/s][A
+ 44%|████▍     | 340/774 [01:30<01:10,  6.17it/s][A
+ 44%|████▍     | 341/774 [01:31<01:27,  4.93it/s][A
+ 44%|████▍     | 342/774 [01:31<01:37,  4.45it/s][A
+ 44%|████▍     | 343/774 [01:31<01:36,  4.44it/s][A
+ 44%|████▍     | 344/774 [01:31<01:41,  4.25it/s][A
+ 45%|████▍     | 345/774 [01:32<01:44,  4.10it/s][A
+ 45%|████▍     | 346/774 [01:32<01:46,  4.01it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.12it/s][A
+ 45%|████▍     | 348/774 [01:32<01:38,  4.33it/s][A
+ 45%|████▌     | 349/774 [01:33<01:34,  4.49it/s][A
+ 45%|████▌     | 350/774 [01:33<01:37,  4.34it/s][A
+ 45%|████▌     | 351/774 [01:33<01:37,  4.32it/s][A
+ 45%|████▌     | 352/774 [01:33<01:33,  4.50it/s][A
+ 46%|████▌     | 353/774 [01:33<01:33,  4.51it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.51it/s][A
+ 46%|████▌     | 355/774 [01:34<01:38,  4.26it/s][A
+ 46%|████▌     | 356/774 [01:34<01:48,  3.87it/s][A
+ 46%|████▌     | 357/774 [01:35<02:04,  3.36it/s][A
+ 46%|████▋     | 358/774 [01:35<02:08,  3.23it/s][A
+ 46%|████▋     | 359/774 [01:35<02:07,  3.24it/s][A
+ 47%|████▋     | 360/774 [01:36<02:07,  3.24it/s][A
+ 47%|████▋     | 361/774 [01:36<02:01,  3.41it/s][A
+ 47%|████▋     | 362/774 [01:36<02:07,  3.23it/s][A
+ 47%|████▋     | 363/774 [01:37<02:06,  3.24it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.20it/s][A
+ 47%|████▋     | 365/774 [01:37<02:04,  3.30it/s][A
+ 47%|████▋     | 366/774 [01:37<01:55,  3.54it/s][A
+ 47%|████▋     | 367/774 [01:38<01:50,  3.69it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.77it/s][A
+ 48%|████▊     | 369/774 [01:38<01:54,  3.53it/s][A
+ 48%|████▊     | 370/774 [01:39<02:09,  3.13it/s][A
+ 48%|████▊     | 371/774 [01:39<02:00,  3.34it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.33it/s][A
+ 48%|████▊     | 373/774 [01:39<01:58,  3.39it/s][A
+ 48%|████▊     | 374/774 [01:40<01:55,  3.46it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.45it/s][A
+ 49%|████▊     | 376/774 [01:40<02:00,  3.31it/s][A
+ 49%|████▊     | 377/774 [01:41<02:11,  3.01it/s][A
+ 49%|████▉     | 378/774 [01:41<02:12,  2.98it/s][A
+ 49%|████▉     | 379/774 [01:41<02:03,  3.20it/s][A
+ 49%|████▉     | 380/774 [01:42<01:53,  3.48it/s][A
+ 49%|████▉     | 381/774 [01:42<01:44,  3.77it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.87it/s][A
+ 49%|████▉     | 383/774 [01:42<01:39,  3.94it/s][A
+ 50%|████▉     | 384/774 [01:43<01:46,  3.66it/s][A
+ 50%|████▉     | 385/774 [01:43<01:54,  3.39it/s][A
+ 50%|████▉     | 386/774 [01:43<01:47,  3.60it/s][A
+ 50%|█████     | 387/774 [01:43<01:41,  3.81it/s][A
+ 50%|█████     | 388/774 [01:44<01:47,  3.60it/s][A
+ 50%|█████     | 389/774 [01:44<01:43,  3.71it/s][A
+ 50%|█████     | 390/774 [01:44<01:56,  3.29it/s][A
+ 51%|█████     | 391/774 [01:45<01:58,  3.24it/s][A
+ 51%|█████     | 392/774 [01:45<01:48,  3.51it/s][A
+ 51%|█████     | 393/774 [01:45<01:40,  3.79it/s][A
+ 51%|█████     | 394/774 [01:45<01:39,  3.82it/s][A
+ 51%|█████     | 395/774 [01:46<01:47,  3.53it/s][A
+ 51%|█████     | 396/774 [01:46<01:44,  3.60it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:48,  3.49it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:42,  3.67it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:43,  3.64it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:35,  3.90it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:32,  4.03it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:31,  4.05it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:35,  3.88it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:41,  3.64it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:38,  3.76it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:40,  3.66it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:45,  3.48it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:41,  3.60it/s][A
+ 53%|█████▎    | 409/774 [01:49<01:38,  3.71it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.66it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:39,  3.66it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:40,  3.60it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:38,  3.66it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:36,  3.75it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:24,  4.23it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:24,  4.24it/s][A
+ 54%|█████▍    | 417/774 [01:51<01:23,  4.28it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.62it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:31,  3.86it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:36,  3.68it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:36,  3.67it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:36,  3.67it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:37,  3.61it/s][A
+ 55%|█████▍    | 424/774 [01:53<01:34,  3.69it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:23,  4.19it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:17,  4.49it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:14,  4.69it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:16,  4.54it/s][A
+ 55%|█████▌    | 429/774 [01:54<01:18,  4.39it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:22,  4.17it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:35,  3.59it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:34,  3.62it/s][A
+ 56%|█████▌    | 433/774 [01:55<01:27,  3.90it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:22,  4.12it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:21,  4.16it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:23,  4.06it/s][A
+ 56%|█████▋    | 437/774 [01:56<01:19,  4.21it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:16,  4.42it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:19,  4.23it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:23,  4.02it/s][A
+ 57%|█████▋    | 441/774 [01:57<01:27,  3.81it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:28,  3.74it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:26,  3.82it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:24,  3.90it/s][A
+ 57%|█████▋    | 445/774 [01:58<01:24,  3.88it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:22,  3.97it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:21,  4.02it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:14,  4.38it/s][A
+ 58%|█████▊    | 449/774 [01:59<01:15,  4.33it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:18,  4.15it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.26it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:12,  4.46it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:10,  4.53it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:17,  4.15it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:21,  3.91it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:25,  3.71it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:19,  3.98it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:19,  3.98it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:17,  4.06it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:23,  3.78it/s][A
+ 60%|█████▉    | 461/774 [02:02<01:29,  3.48it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:27,  3.58it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:23,  3.72it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:23,  3.70it/s][A
+ 60%|██████    | 465/774 [02:03<01:15,  4.09it/s][A
+ 60%|██████    | 466/774 [02:04<01:12,  4.23it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.46it/s][A
+ 60%|██████    | 468/774 [02:04<01:09,  4.41it/s][A
+ 61%|██████    | 469/774 [02:04<01:02,  4.85it/s][A
+ 61%|██████    | 470/774 [02:04<01:00,  5.03it/s][A
+ 61%|██████    | 471/774 [02:05<01:02,  4.84it/s][A
+ 61%|██████    | 472/774 [02:05<01:07,  4.48it/s][A
+ 61%|██████    | 473/774 [02:05<01:10,  4.29it/s][A
+ 61%|██████    | 474/774 [02:05<01:07,  4.43it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:09,  4.30it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.83it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:32,  3.22it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:32,  3.18it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:30,  3.25it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:27,  3.36it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:28,  3.31it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:26,  3.39it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:23,  3.47it/s][A
+ 63%|██████▎   | 484/774 [02:08<01:25,  3.40it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:27,  3.31it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:23,  3.46it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:24,  3.38it/s][A
+ 63%|██████▎   | 488/774 [02:10<01:22,  3.46it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:17,  3.66it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:17,  3.65it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:16,  3.70it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.60it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:19,  3.56it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:17,  3.60it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:17,  3.58it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:23,  3.35it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.31it/s][A
+ 64%|██████▍   | 498/774 [02:12<01:22,  3.36it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:20,  3.43it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:17,  3.52it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:14,  3.65it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:14,  3.67it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:19,  3.40it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:21,  3.29it/s][A
+ 65%|██████▌   | 505/774 [02:14<01:19,  3.40it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:18,  3.40it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:23,  3.19it/s][A
+ 66%|██████▌   | 508/774 [02:15<01:21,  3.27it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:19,  3.32it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.42it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:12,  3.61it/s][A
+ 66%|██████▌   | 512/774 [02:16<01:11,  3.69it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.51it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:17<01:22,  3.15it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:16,  3.37it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.66it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:07,  3.78it/s][A
+ 67%|██████▋   | 519/774 [02:18<01:09,  3.65it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:09,  3.66it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.77it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:03,  3.98it/s][A
+ 68%|██████▊   | 523/774 [02:19<01:01,  4.06it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:05,  3.80it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:07,  3.71it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:09,  3.56it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:11,  3.46it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.50it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:07,  3.63it/s][A
+ 68%|██████▊   | 530/774 [02:21<01:06,  3.69it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:05,  3.73it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:02,  3.84it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.04it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:56,  4.24it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:59,  4.05it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.88it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:01,  3.86it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.60it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:04,  3.62it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.63it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:02,  3.75it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.73it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.64it/s][A
+ 70%|███████   | 544/774 [02:25<01:03,  3.62it/s][A
+ 70%|███████   | 545/774 [02:25<01:01,  3.75it/s][A
+ 71%|███████   | 546/774 [02:26<00:57,  3.96it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.10it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.16it/s][A
+ 71%|███████   | 549/774 [02:26<00:55,  4.08it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.84it/s][A
+ 71%|███████   | 551/774 [02:27<01:00,  3.66it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:04,  3.46it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:08,  3.24it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:06,  3.30it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:06,  3.30it/s][A
+ 72%|███████▏  | 556/774 [02:28<01:02,  3.47it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:06,  3.26it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:00,  3.57it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:55,  3.84it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.55it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.73it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.02it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:49,  4.22it/s][A
+ 73%|███████▎  | 564/774 [02:30<00:51,  4.04it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:54,  3.86it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:49,  4.17it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:45,  4.50it/s][A
+ 73%|███████▎  | 568/774 [02:31<00:47,  4.36it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.26it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.24it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:52,  3.89it/s][A
+ 74%|███████▍  | 572/774 [02:32<00:54,  3.71it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:54,  3.72it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.82it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:51,  3.85it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:56,  3.47it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:54,  3.58it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:53,  3.63it/s][A
+ 75%|███████▍  | 579/774 [02:34<00:56,  3.47it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.49it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:54,  3.51it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:53,  3.61it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:51,  3.74it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.77it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:52,  3.60it/s][A
+ 76%|███████▌  | 586/774 [02:36<00:52,  3.56it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.63it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.71it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:48,  3.81it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.07it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:46,  3.93it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.67it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:50,  3.61it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.59it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:54,  3.31it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:56,  3.15it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.14it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.06it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:58,  3.01it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:57,  3.01it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:58,  2.98it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:58,  2.96it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:56,  3.00it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.96it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:56,  3.01it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:57,  2.91it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:56,  2.94it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.97it/s][A
+ 79%|███████▉  | 611/774 [02:44<00:59,  2.73it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.62it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:57,  2.81it/s][A
+ 79%|███████▉  | 614/774 [02:45<00:55,  2.89it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.04it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.09it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:50,  3.11it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:47,  3.27it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.43it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.46it/s][A
+ 80%|████████  | 621/774 [02:47<00:41,  3.73it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.99it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.94it/s][A
+ 81%|████████  | 624/774 [02:48<00:43,  3.48it/s][A
+ 81%|████████  | 625/774 [02:48<00:43,  3.44it/s][A
+ 81%|████████  | 626/774 [02:49<00:45,  3.25it/s][A
+ 81%|████████  | 627/774 [02:49<00:47,  3.10it/s][A
+ 81%|████████  | 628/774 [02:50<00:46,  3.11it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:46,  3.11it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:43,  3.31it/s][A
+ 82%|████████▏ | 631/774 [02:50<00:41,  3.45it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:40,  3.48it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:41,  3.39it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:41,  3.37it/s][A
+ 82%|████████▏ | 635/774 [02:52<00:40,  3.45it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:40,  3.41it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:39,  3.48it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:39,  3.47it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.07it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:50,  2.66it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.71it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:46,  2.87it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.88it/s][A
+ 83%|████████▎ | 644/774 [02:55<00:41,  3.11it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:37,  3.43it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:34,  3.66it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.92it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:30,  4.07it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.10it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.35it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:28,  4.28it/s][A
+ 84%|████████▍ | 652/774 [02:56<00:29,  4.15it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.87it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.10it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:26,  4.47it/s][A
+ 85%|████████▍ | 656/774 [02:57<00:27,  4.29it/s][A
+ 85%|████████▍ | 657/774 [02:58<00:26,  4.48it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.26it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.90it/s][A
+ 85%|████████▌ | 660/774 [02:58<00:30,  3.79it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.72it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.89it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.66it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.65it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:27,  3.93it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:24,  4.34it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.60it/s][A
+ 86%|████████▋ | 668/774 [03:00<00:23,  4.44it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.17it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.30it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.92it/s][A
+ 87%|████████▋ | 672/774 [03:01<00:25,  3.99it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:24,  4.08it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.05it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.26it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:22,  4.45it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:21,  4.42it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.46it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.20it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.16it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:21,  4.43it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:21,  4.28it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:23,  3.86it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:24,  3.71it/s][A
+ 89%|████████▊ | 685/774 [03:05<00:24,  3.58it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:23,  3.72it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:22,  3.95it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  3.94it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:20,  4.11it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.24it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:19,  4.35it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.42it/s][A
+ 90%|████████▉ | 693/774 [03:06<00:18,  4.47it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.19it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.85it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.95it/s][A
+ 90%|█████████ | 697/774 [03:07<00:19,  3.95it/s][A
+ 90%|█████████ | 698/774 [03:08<00:17,  4.40it/s][A
+ 90%|█████████ | 699/774 [03:08<00:15,  4.74it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.33it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.39it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.39it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.29it/s][A
+ 91%|█████████ | 704/774 [03:09<00:17,  4.04it/s][A
+ 91%|█████████ | 705/774 [03:09<00:15,  4.33it/s][A
+ 91%|█████████ | 706/774 [03:09<00:14,  4.55it/s][A
+ 91%|█████████▏| 707/774 [03:10<00:14,  4.51it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  4.79it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.72it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.64it/s][A
+ 92%|█████████▏| 711/774 [03:10<00:13,  4.82it/s][A
+ 92%|█████████▏| 712/774 [03:11<00:12,  5.04it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.84it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:13,  4.59it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.70it/s][A
+ 93%|█████████▎| 716/774 [03:11<00:11,  5.17it/s][A
+ 93%|█████████▎| 717/774 [03:12<00:11,  5.11it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:12,  4.61it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:12,  4.53it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.94it/s][A
+ 93%|█████████▎| 721/774 [03:12<00:10,  5.20it/s][A
+ 93%|█████████▎| 722/774 [03:13<00:09,  5.61it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.40it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.35it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.49it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.56it/s][A
+ 94%|█████████▍| 727/774 [03:14<00:08,  5.33it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.77it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.06it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.35it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:08,  5.36it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.52it/s][A
+ 95%|█████████▍| 733/774 [03:15<00:07,  5.57it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.54it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.68it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.77it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.72it/s][A
+ 95%|█████████▌| 738/774 [03:16<00:06,  5.55it/s][A
+ 95%|█████████▌| 739/774 [03:16<00:06,  5.58it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.45it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.11it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:06,  5.29it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.62it/s][A
+ 96%|█████████▌| 744/774 [03:17<00:05,  5.41it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.51it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.89it/s][A
+ 97%|█████████▋| 747/774 [03:18<00:06,  4.09it/s][A
+ 97%|█████████▋| 748/774 [03:18<00:06,  4.30it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.60it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.32it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.52it/s][A
+ 97%|█████████▋| 752/774 [03:19<00:04,  4.46it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.74it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.37it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.67it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.54it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.35it/s][A
+ 98%|█████████▊| 758/774 [03:20<00:03,  5.23it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.46it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.44it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.88it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:02,  5.93it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.26it/s][A
+ 99%|█████████▊| 764/774 [03:21<00:01,  6.36it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.23it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.33it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.48it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.46it/s][A
+ 99%|█████████▉| 769/774 [03:22<00:00,  5.21it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.08it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.37it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  5.07it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.88it/s][A                                                    
+                                                 [A 23%|██▎       | 3000/12776 [32:15<55:10,  2.95it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.88it/s][A
+                                                 [A 23%|██▎       | 3001/12776 [32:17<169:24:11, 62.39s/it]                                                         23%|██▎       | 3001/12776 [32:17<169:24:11, 62.39s/it] 23%|██▎       | 3002/12776 [32:17<119:19:48, 43.95s/it]                                                         23%|██▎       | 3002/12776 [32:17<119:19:48, 43.95s/it] 24%|██▎       | 3003/12776 [32:18<84:12:29, 31.02s/it]                                                         24%|██▎       | 3003/12776 [32:18<84:12:29, 31.02s/it] 24%|██▎       | 3004/12776 [32:19<59:36:36, 21.96s/it]                                                        24%|██▎       | 3004/12776 [32:19<59:36:36, 21.96s/it] 24%|██▎       | 3005/12776 [32:20<42:24:12, 15.62s/it]                                                        24%|██▎       | 3005/12776 [32:20<42:24:12, 15.62s/it] 24%|██▎       | 3006/12776 [32:21<30:13:35, 11.14s/it]                                                        24%|██▎       | 3006/12776 [32:21<30:13:35, 11.14s/it] 24%|██▎       | 3007/12776 [32:21<21:46:13,  8.02s/it]                                                        24%|██▎       | 3007/12776 [32:21<21:46:13,  8.02s/it] 24%|██▎       | 3008/12776 [32:22<15:44:10,  5.80s/it]                                                        24%|██▎       | 3008/12776 [32:22<15:44:10,  5.80s/it] 24%|██▎       | 3009/12776 [32:23<11:30:11,  4.24s/it]                                                        24%|██▎       | 3009/12776 [32:23<11:30:11,  4.24s/it] 24%|██▎       | 3010/12776 [32:23<8:30:22,  3.14s/it]                                                        24%|██▎       | 3010/12776 [32:23<8:30:22,  3.14s/it] 24%|██▎       | 3011/12776 [32:24<6:25:45,  2.37s/it]                                                       24%|██▎       | 3011/12776 [32:24<6:25:45,  2.37s/it] 24%|██▎       | 3012/12776 [32:24<4:54:15,  1.81s/it]                                                       24%|██▎       | 3012/12776 [32:24<4:54:15,  1.81s/it] 24%|██▎       | 3013/12776 [32:25<3:52:56,  1.43s/it]                                                       24%|██▎       | 3013/12776 [32:25<3:52:56,  1.43s/it] 24%|██▎       | 3014/12776 [32:25<3:04:43,  1.14s/it]                                                       24%|██▎       | 3014/12776 [32:25<3:04:43,  1.14s/it] 24%|██▎       | 3015/12776 [32:26<2:34:12,  1.05it/s]                                                       24%|██▎       | 3015/12776 [32:26<2:34:12,  1.05it/s] 24%|██▎       | 3016/12776 [32:26<2:07:48,  1.27it/s]                                                       24%|██▎       | 3016/12776 [32:26<2:07:48,  1.27it/s] 24%|██▎       | 3017/12776 [32:27<1:48:47,  1.50it/s]                                                       24%|██▎       | 3017/12776 [32:27<1:48:47,  1.50it/s] 24%|██▎       | 3018/12776 [32:27<1:41:59,  1.59it/s]                                                       24%|██▎       | 3018/12776 [32:27<1:41:59,  1.59it/s] 24%|██▎       | 3019/12776 [32:27<1:29:27,  1.82it/s]                                                       24%|██▎       | 3019/12776 [32:27<1:29:27,  1.82it/s] 24%|██▎       | 3020/12776 [32:28<1:19:48,  2.04it/s]                                                       24%|██▎       | 3020/12776 [32:28<1:19:48,  2.04it/s] 24%|██▎       | 3021/12776 [32:28<1:15:17,  2.16it/s]                                                       24%|██▎       | 3021/12776 [32:28<1:15:17,  2.16it/s] 24%|██▎       | 3022/12776 [32:29<1:08:58,  2.36it/s]                                                       24%|██▎       | 3022/12776 [32:29<1:08:58,  2.36it/s] 24%|██▎       | 3023/12776 [32:29<1:04:29,  2.52it/s]                                                       24%|██▎       | 3023/12776 [32:29<1:04:29,  2.52it/s] 24%|██▎       | 3024/12776 [32:29<1:07:37,  2.40it/s]                                                       24%|██▎       | 3024/12776 [32:29<1:07:37,  2.40it/s] 24%|██▎       | 3025/12776 [32:30<1:02:33,  2.60it/s]                                                       24%|██▎       | 3025/12776 [32:30<1:02:33,  2.60it/s] 24%|██▎       | 3026/12776 [32:30<58:27,  2.78it/s]                                                       24%|██▎       | 3026/12776 [32:30<58:27,  2.78it/s] 24%|██▎       | 3027/12776 [32:30<1:00:26,  2.69it/s]                                                       24%|██▎       | 3027/12776 [32:30<1:00:26,  2.69it/s] 24%|██▎       | 3028/12776 [32:31<55:55,  2.90it/s]                                                       24%|██▎       | 3028/12776 [32:31<55:55,  2.90it/s] 24%|██▎       | 3029/12776 [32:31<52:17,  3.11it/s]                                                     24%|██▎       | 3029/12776 [32:31<52:17,  3.11it/s] 24%|██▎       | 3030/12776 [32:31<49:30,  3.28it/s]                                                     24%|██▎       | 3030/12776 [32:31<49:30,  3.28it/s] 24%|██▎       | 3031/12776 [32:32<50:20,  3.23it/s]                                                     24%|██▎       | 3031/12776 [32:32<50:20,  3.23it/s] 24%|██▎       | 3032/12776 [32:32<47:32,  3.42it/s]                                                     24%|██▎       | 3032/12776 [32:32<47:32,  3.42it/s] 24%|██▎       | 3033/12776 [32:32<45:10,  3.59it/s]                                                     24%|██▎       | 3033/12776 [32:32<45:10,  3.59it/s] 24%|██▎       | 3034/12776 [32:32<45:54,  3.54it/s]                                                     24%|██▎       | 3034/12776 [32:32<45:54,  3.54it/s] 24%|██▍       | 3035/12776 [32:33<48:12,  3.37it/s]                                                     24%|██▍       | 3035/12776 [32:33<48:12,  3.37it/s] 24%|██▍       | 3036/12776 [32:33<45:04,  3.60it/s]                                                     24%|██▍       | 3036/12776 [32:33<45:04,  3.60it/s] 24%|██▍       | 3037/12776 [32:33<42:25,  3.83it/s]                                                     24%|██▍       | 3037/12776 [32:33<42:25,  3.83it/s] 24%|██▍       | 3038/12776 [32:33<40:15,  4.03it/s]                                                     24%|██▍       | 3038/12776 [32:33<40:15,  4.03it/s] 24%|██▍       | 3039/12776 [32:34<38:29,  4.22it/s]                                                     24%|██▍       | 3039/12776 [32:34<38:29,  4.22it/s] 24%|██▍       | 3040/12776 [32:34<38:44,  4.19it/s]                                                     24%|██▍       | 3040/12776 [32:34<38:44,  4.19it/s] 24%|██▍       | 3041/12776 [32:34<37:09,  4.37it/s]                                                     24%|██▍       | 3041/12776 [32:34<37:09,  4.37it/s] 24%|██▍       | 3042/12776 [32:34<35:50,  4.53it/s]                                                     24%|██▍       | 3042/12776 [32:34<35:50,  4.53it/s] 24%|██▍       | 3043/12776 [32:34<34:45,  4.67it/s]                                                     24%|██▍       | 3043/12776 [32:34<34:45,  4.67it/s] 24%|██▍       | 3044/12776 [32:35<33:52,  4.79it/s]                                                     24%|██▍       | 3044/12776 [32:35<33:52,  4.79it/s] 24%|██▍       | 3045/12776 [32:35<34:17,  4.73it/s]                                                     24%|██▍       | 3045/12776 [32:35<34:17,  4.73it/s] 24%|██▍       | 3046/12776 [32:35<35:11,  4.61it/s]                                                     24%|██▍       | 3046/12776 [32:35<35:11,  4.61it/s] 24%|██▍       | 3047/12776 [32:35<34:00,  4.77it/s]                                                     24%|██▍       | 3047/12776 [32:35<34:00,  4.77it/s] 24%|██▍       | 3048/12776 [32:35<32:45,  4.95it/s]                                                     24%|██▍       | 3048/12776 [32:35<32:45,  4.95it/s] 24%|██▍       | 3049/12776 [32:36<31:47,  5.10it/s]                                                     24%|██▍       | 3049/12776 [32:36<31:47,  5.10it/s] 24%|██▍       | 3050/12776 [32:36<54:33,  2.97it/s]                                                     24%|██▍       | 3050/12776 [32:36<54:33,  2.97it/s] 24%|██▍       | 3051/12776 [32:38<1:42:13,  1.59it/s]                                                       24%|██▍       | 3051/12776 [32:38<1:42:13,  1.59it/s] 24%|██▍       | 3052/12776 [32:38<1:57:37,  1.38it/s]                                                       24%|██▍       | 3052/12776 [32:38<1:57:37,  1.38it/s] 24%|██▍       | 3053/12776 [32:39<2:10:43,  1.24it/s]                                                       24%|██▍       | 3053/12776 [32:39<2:10:43,  1.24it/s] 24%|██▍       | 3054/12776 [32:40<2:09:51,  1.25it/s]                                                       24%|██▍       | 3054/12776 [32:40<2:09:51,  1.25it/s] 24%|██▍       | 3055/12776 [32:41<2:08:17,  1.26it/s]                                                       24%|██▍       | 3055/12776 [32:41<2:08:17,  1.26it/s] 24%|██▍       | 3056/12776 [32:42<2:05:25,  1.29it/s]                                                       24%|██▍       | 3056/12776 [32:42<2:05:25,  1.29it/s] 24%|██▍       | 3057/12776 [32:42<1:59:49,  1.35it/s]                                                       24%|██▍       | 3057/12776 [32:42<1:59:49,  1.35it/s] 24%|██▍       | 3058/12776 [32:43<1:59:02,  1.36it/s]                                                       24%|██▍       | 3058/12776 [32:43<1:59:02,  1.36it/s] 24%|██▍       | 3059/12776 [32:44<1:51:38,  1.45it/s]                                                       24%|██▍       | 3059/12776 [32:44<1:51:38,  1.45it/s] 24%|██▍       | 3060/12776 [32:44<1:48:43,  1.49it/s]                                                       24%|██▍       | 3060/12776 [32:44<1:48:43,  1.49it/s] 24%|██▍       | 3061/12776 [32:45<1:41:44,  1.59it/s]                                                       24%|██▍       | 3061/12776 [32:45<1:41:44,  1.59it/s] 24%|██▍       | 3062/12776 [32:46<1:39:56,  1.62it/s]                                                       24%|██▍       | 3062/12776 [32:46<1:39:56,  1.62it/s] 24%|██▍       | 3063/12776 [32:46<1:33:06,  1.74it/s]                                                       24%|██▍       | 3063/12776 [32:46<1:33:06,  1.74it/s] 24%|██▍       | 3064/12776 [32:47<1:30:54,  1.78it/s]                                                       24%|██▍       | 3064/12776 [32:47<1:30:54,  1.78it/s] 24%|██▍       | 3065/12776 [32:47<1:25:02,  1.90it/s]                                                       24%|██▍       | 3065/12776 [32:47<1:25:02,  1.90it/s] 24%|██▍       | 3066/12776 [32:47<1:24:38,  1.91it/s]                                                       24%|██▍       | 3066/12776 [32:47<1:24:38,  1.91it/s] 24%|██▍       | 3067/12776 [32:48<1:18:59,  2.05it/s]                                                       24%|██▍       | 3067/12776 [32:48<1:18:59,  2.05it/s] 24%|██▍       | 3068/12776 [32:48<1:14:39,  2.17it/s]                                                       24%|██▍       | 3068/12776 [32:48<1:14:39,  2.17it/s] 24%|██▍       | 3069/12776 [32:49<1:16:28,  2.12it/s]                                                       24%|██▍       | 3069/12776 [32:49<1:16:28,  2.12it/s] 24%|██▍       | 3070/12776 [32:49<1:11:08,  2.27it/s]                                                       24%|██▍       | 3070/12776 [32:49<1:11:08,  2.27it/s] 24%|██▍       | 3071/12776 [32:49<1:06:24,  2.44it/s]                                                       24%|██▍       | 3071/12776 [32:49<1:06:24,  2.44it/s] 24%|██▍       | 3072/12776 [32:50<1:05:57,  2.45it/s]                                                       24%|██▍       | 3072/12776 [32:50<1:05:57,  2.45it/s] 24%|██▍       | 3073/12776 [32:50<1:02:10,  2.60it/s]                                                       24%|██▍       | 3073/12776 [32:50<1:02:10,  2.60it/s] 24%|██▍       | 3074/12776 [32:51<59:07,  2.73it/s]                                                       24%|██▍       | 3074/12776 [32:51<59:07,  2.73it/s] 24%|██▍       | 3075/12776 [32:51<58:10,  2.78it/s]                                                     24%|██▍       | 3075/12776 [32:51<58:10,  2.78it/s] 24%|██▍       | 3076/12776 [32:51<55:12,  2.93it/s]                                                     24%|██▍       | 3076/12776 [32:51<55:12,  2.93it/s] 24%|██▍       | 3077/12776 [32:51<52:50,  3.06it/s]                                                    {'eval_loss': 0.6461995244026184, 'eval_wer': 0.4029793654504112, 'eval_runtime': 205.814, 'eval_samples_per_second': 60.166, 'eval_steps_per_second': 3.761, 'epoch': 0.47}
+{'loss': 0.2968, 'grad_norm': 0.5218053460121155, 'learning_rate': 0.00023902737047898337, 'epoch': 0.47}
+{'loss': 0.363, 'grad_norm': 0.8626418113708496, 'learning_rate': 0.00023900293255131963, 'epoch': 0.47}
+{'loss': 0.3505, 'grad_norm': 0.6451046466827393, 'learning_rate': 0.0002389784946236559, 'epoch': 0.47}
+{'loss': 0.3542, 'grad_norm': 1.0618706941604614, 'learning_rate': 0.00023895405669599216, 'epoch': 0.47}
+{'loss': 0.4272, 'grad_norm': 0.6961786150932312, 'learning_rate': 0.0002389296187683284, 'epoch': 0.47}
+{'loss': 0.3825, 'grad_norm': 0.6767577528953552, 'learning_rate': 0.00023890518084066469, 'epoch': 0.47}
+{'loss': 0.3995, 'grad_norm': 0.8151198029518127, 'learning_rate': 0.00023888074291300096, 'epoch': 0.47}
+{'loss': 0.5605, 'grad_norm': 1.8509833812713623, 'learning_rate': 0.00023885630498533722, 'epoch': 0.47}
+{'loss': 0.4549, 'grad_norm': 0.6328085660934448, 'learning_rate': 0.0002388318670576735, 'epoch': 0.47}
+{'loss': 0.5468, 'grad_norm': 1.1745184659957886, 'learning_rate': 0.00023880742913000977, 'epoch': 0.47}
+{'loss': 0.6493, 'grad_norm': 1.2673914432525635, 'learning_rate': 0.000238782991202346, 'epoch': 0.47}
+{'loss': 0.3245, 'grad_norm': 0.6951119899749756, 'learning_rate': 0.00023875855327468227, 'epoch': 0.47}
+{'loss': 0.4878, 'grad_norm': 1.6163688898086548, 'learning_rate': 0.00023873411534701855, 'epoch': 0.47}
+{'loss': 0.4346, 'grad_norm': 1.3467936515808105, 'learning_rate': 0.0002387096774193548, 'epoch': 0.47}
+{'loss': 0.6272, 'grad_norm': 0.9435334205627441, 'learning_rate': 0.00023868523949169108, 'epoch': 0.47}
+{'loss': 0.6137, 'grad_norm': 1.2836298942565918, 'learning_rate': 0.00023866080156402736, 'epoch': 0.47}
+{'loss': 0.5952, 'grad_norm': 1.1199934482574463, 'learning_rate': 0.0002386363636363636, 'epoch': 0.47}
+{'loss': 0.6759, 'grad_norm': 1.2082144021987915, 'learning_rate': 0.0002386119257086999, 'epoch': 0.47}
+{'loss': 0.4364, 'grad_norm': 0.8209431171417236, 'learning_rate': 0.00023858748778103617, 'epoch': 0.47}
+{'loss': 0.7077, 'grad_norm': 1.712845802307129, 'learning_rate': 0.0002385630498533724, 'epoch': 0.47}
+{'loss': 0.7401, 'grad_norm': 2.234483242034912, 'learning_rate': 0.00023853861192570867, 'epoch': 0.47}
+{'loss': 1.2215, 'grad_norm': 2.4135541915893555, 'learning_rate': 0.00023851417399804495, 'epoch': 0.47}
+{'loss': 0.8349, 'grad_norm': 1.4404560327529907, 'learning_rate': 0.0002384897360703812, 'epoch': 0.47}
+{'loss': 1.2226, 'grad_norm': 2.8168258666992188, 'learning_rate': 0.00023846529814271748, 'epoch': 0.47}
+{'loss': 0.7827, 'grad_norm': 2.413266181945801, 'learning_rate': 0.00023844086021505376, 'epoch': 0.47}
+{'loss': 0.9485, 'grad_norm': 1.8607373237609863, 'learning_rate': 0.00023841642228739, 'epoch': 0.47}
+{'loss': 0.6483, 'grad_norm': 2.2613914012908936, 'learning_rate': 0.0002383919843597263, 'epoch': 0.47}
+{'loss': 0.9358, 'grad_norm': 1.8476412296295166, 'learning_rate': 0.00023836754643206254, 'epoch': 0.47}
+{'loss': 1.1602, 'grad_norm': 5.6817851066589355, 'learning_rate': 0.0002383431085043988, 'epoch': 0.47}
+{'loss': 0.5406, 'grad_norm': 1.4337266683578491, 'learning_rate': 0.00023831867057673507, 'epoch': 0.47}
+{'loss': 0.6749, 'grad_norm': 1.3688822984695435, 'learning_rate': 0.00023829423264907135, 'epoch': 0.47}
+{'loss': 0.6256, 'grad_norm': 1.8527987003326416, 'learning_rate': 0.0002382697947214076, 'epoch': 0.47}
+{'loss': 0.8839, 'grad_norm': 2.1295547485351562, 'learning_rate': 0.00023824535679374388, 'epoch': 0.47}
+{'loss': 1.5188, 'grad_norm': 7.561639308929443, 'learning_rate': 0.00023822091886608015, 'epoch': 0.47}
+{'loss': 1.0713, 'grad_norm': 2.218928575515747, 'learning_rate': 0.00023819648093841638, 'epoch': 0.48}
+{'loss': 0.9386, 'grad_norm': 1.8199081420898438, 'learning_rate': 0.00023817204301075266, 'epoch': 0.48}
+{'loss': 1.3609, 'grad_norm': 2.755200147628784, 'learning_rate': 0.00023814760508308893, 'epoch': 0.48}
+{'loss': 0.9989, 'grad_norm': 2.402803897857666, 'learning_rate': 0.00023812316715542519, 'epoch': 0.48}
+{'loss': 1.2102, 'grad_norm': 2.486337184906006, 'learning_rate': 0.00023809872922776146, 'epoch': 0.48}
+{'loss': 1.1949, 'grad_norm': 2.5728185176849365, 'learning_rate': 0.00023807429130009774, 'epoch': 0.48}
+{'loss': 1.2828, 'grad_norm': 2.1376590728759766, 'learning_rate': 0.000238049853372434, 'epoch': 0.48}
+{'loss': 2.2113, 'grad_norm': 1.944007158279419, 'learning_rate': 0.00023802541544477027, 'epoch': 0.48}
+{'loss': 0.9221, 'grad_norm': 3.550218343734741, 'learning_rate': 0.00023800097751710655, 'epoch': 0.48}
+{'loss': 1.6563, 'grad_norm': 3.1462228298187256, 'learning_rate': 0.00023797653958944277, 'epoch': 0.48}
+{'loss': 1.7042, 'grad_norm': 4.181352615356445, 'learning_rate': 0.00023795210166177905, 'epoch': 0.48}
+{'loss': 1.0479, 'grad_norm': 2.818664312362671, 'learning_rate': 0.00023792766373411533, 'epoch': 0.48}
+{'loss': 0.4576, 'grad_norm': 1.8017176389694214, 'learning_rate': 0.00023790322580645158, 'epoch': 0.48}
+{'loss': 1.3568, 'grad_norm': 3.6153724193573, 'learning_rate': 0.00023787878787878786, 'epoch': 0.48}
+{'loss': 1.0744, 'grad_norm': 2.630364418029785, 'learning_rate': 0.00023785434995112414, 'epoch': 0.48}
+{'loss': 1.015, 'grad_norm': 2.6814486980438232, 'learning_rate': 0.0002378299120234604, 'epoch': 0.48}
+{'loss': 0.4242, 'grad_norm': 0.47881409525871277, 'learning_rate': 0.00023780547409579664, 'epoch': 0.48}
+{'loss': 0.3254, 'grad_norm': 0.47194862365722656, 'learning_rate': 0.00023778103616813292, 'epoch': 0.48}
+{'loss': 0.5085, 'grad_norm': 1.0471508502960205, 'learning_rate': 0.00023775659824046917, 'epoch': 0.48}
+{'loss': 0.5577, 'grad_norm': 0.921794593334198, 'learning_rate': 0.00023773216031280545, 'epoch': 0.48}
+{'loss': 0.3387, 'grad_norm': 0.5158277750015259, 'learning_rate': 0.00023770772238514173, 'epoch': 0.48}
+{'loss': 0.3237, 'grad_norm': 0.7019078135490417, 'learning_rate': 0.00023768328445747798, 'epoch': 0.48}
+{'loss': 0.2525, 'grad_norm': 0.46547719836235046, 'learning_rate': 0.00023765884652981426, 'epoch': 0.48}
+{'loss': 0.5739, 'grad_norm': 1.2258836030960083, 'learning_rate': 0.00023763440860215054, 'epoch': 0.48}
+{'loss': 0.3721, 'grad_norm': 0.8825581073760986, 'learning_rate': 0.00023760997067448676, 'epoch': 0.48}
+{'loss': 0.3145, 'grad_norm': 0.7724276185035706, 'learning_rate': 0.00023758553274682304, 'epoch': 0.48}
+{'loss': 0.716, 'grad_norm': 6.942146301269531, 'learning_rate': 0.00023756109481915932, 'epoch': 0.48}
+{'loss': 0.4538, 'grad_norm': 0.8972859978675842, 'learning_rate': 0.00023753665689149557, 'epoch': 0.48}
+{'loss': 0.6098, 'grad_norm': 1.213957667350769, 'learning_rate': 0.00023751221896383185, 'epoch': 0.48}
+{'loss': 0.5618, 'grad_norm': 1.26592218875885, 'learning_rate': 0.00023748778103616813, 'epoch': 0.48}
+{'loss': 0.5241, 'grad_norm': 1.0946046113967896, 'learning_rate': 0.00023746334310850438, 'epoch': 0.48}
+{'loss': 0.5417, 'grad_norm': 1.3569597005844116, 'learning_rate': 0.00023743890518084065, 'epoch': 0.48}
+{'loss': 0.7591, 'grad_norm': 1.2722363471984863, 'learning_rate': 0.00023741446725317693, 'epoch': 0.48}
+{'loss': 0.7028, 'grad_norm': 2.4486443996429443, 'learning_rate': 0.00023739002932551316, 'epoch': 0.48}
+{'loss': 0.7086, 'grad_norm': 1.7454280853271484, 'learning_rate': 0.00023736559139784944, 'epoch': 0.48}
+{'loss': 0.7306, 'grad_norm': 1.4875236749649048, 'learning_rate': 0.00023734115347018571, 'epoch': 0.48}
+{'loss': 0.6128, 'grad_norm': 1.795547366142273, 'learning_rate': 0.00023731671554252197, 'epoch': 0.48}
+{'loss': 0.5872, 'grad_norm': 1.589440107345581, 'learning_rate': 0.00023729227761485824, 'epoch': 0.48}
+{'loss': 0.7743, 'grad_norm': 2.098665714263916, 'learning_rate': 0.00023726783968719452, 'epoch': 0.48}
+{'loss': 0.8935, 'grad_norm': 1.5738600492477417, 'learning_rate': 0.00023724340175953077, 'epoch': 0.48}
+{'loss': 0.9405, 'grad_norm': 1.6582045555114746, 'learning_rate': 0.00023721896383186702, 'epoch': 0.48}
+{'loss': 0.7447, 'grad_norm': 1.581145167350769, 'learning_rate': 0.0002371945259042033, 'epoch': 0.48}
+ 24%|██▍       | 3077/12776 [32:51<52:50,  3.06it/s] 24%|██▍       | 3078/12776 [32:52<51:06,  3.16it/s]                                                     24%|██▍       | 3078/12776 [32:52<51:06,  3.16it/s] 24%|██▍       | 3079/12776 [32:52<53:52,  3.00it/s]                                                     24%|██▍       | 3079/12776 [32:52<53:52,  3.00it/s] 24%|██▍       | 3080/12776 [32:52<50:34,  3.20it/s]                                                     24%|██▍       | 3080/12776 [32:52<50:34,  3.20it/s] 24%|██▍       | 3081/12776 [32:53<48:05,  3.36it/s]                                                     24%|██▍       | 3081/12776 [32:53<48:05,  3.36it/s] 24%|██▍       | 3082/12776 [32:53<45:50,  3.52it/s]                                                     24%|██▍       | 3082/12776 [32:53<45:50,  3.52it/s] 24%|██▍       | 3083/12776 [32:53<49:00,  3.30it/s]                                                     24%|██▍       | 3083/12776 [32:53<49:00,  3.30it/s] 24%|██▍       | 3084/12776 [32:53<45:59,  3.51it/s]                                                     24%|██▍       | 3084/12776 [32:54<45:59,  3.51it/s] 24%|██▍       | 3085/12776 [32:54<43:25,  3.72it/s]                                                     24%|██▍       | 3085/12776 [32:54<43:25,  3.72it/s] 24%|██▍       | 3086/12776 [32:54<41:17,  3.91it/s]                                                     24%|██▍       | 3086/12776 [32:54<41:17,  3.91it/s] 24%|██▍       | 3087/12776 [32:54<43:45,  3.69it/s]                                                     24%|██▍       | 3087/12776 [32:54<43:45,  3.69it/s] 24%|██▍       | 3088/12776 [32:54<40:55,  3.95it/s]                                                     24%|██▍       | 3088/12776 [32:54<40:55,  3.95it/s] 24%|██▍       | 3089/12776 [32:55<38:44,  4.17it/s]                                                     24%|██▍       | 3089/12776 [32:55<38:44,  4.17it/s] 24%|██▍       | 3090/12776 [32:55<37:18,  4.33it/s]                                                     24%|██▍       | 3090/12776 [32:55<37:18,  4.33it/s] 24%|██▍       | 3091/12776 [32:55<36:05,  4.47it/s]                                                     24%|██▍       | 3091/12776 [32:55<36:05,  4.47it/s] 24%|██▍       | 3092/12776 [32:55<39:47,  4.06it/s]                                                     24%|██▍       | 3092/12776 [32:55<39:47,  4.06it/s] 24%|██▍       | 3093/12776 [32:56<37:33,  4.30it/s]                                                     24%|██▍       | 3093/12776 [32:56<37:33,  4.30it/s] 24%|██▍       | 3094/12776 [32:56<33:50,  4.77it/s]                                                     24%|██▍       | 3094/12776 [32:56<33:50,  4.77it/s] 24%|██▍       | 3095/12776 [32:56<32:07,  5.02it/s]                                                     24%|██▍       | 3095/12776 [32:56<32:07,  5.02it/s] 24%|██▍       | 3096/12776 [32:56<31:48,  5.07it/s]                                                     24%|██▍       | 3096/12776 [32:56<31:48,  5.07it/s] 24%|██▍       | 3097/12776 [32:56<31:30,  5.12it/s]                                                     24%|██▍       | 3097/12776 [32:56<31:30,  5.12it/s] 24%|██▍       | 3098/12776 [32:57<36:53,  4.37it/s]                                                     24%|██▍       | 3098/12776 [32:57<36:53,  4.37it/s] 24%|██▍       | 3099/12776 [32:57<34:45,  4.64it/s]                                                     24%|██▍       | 3099/12776 [32:57<34:45,  4.64it/s] 24%|██▍       | 3100/12776 [32:58<58:16,  2.77it/s]                                                     24%|██▍       | 3100/12776 [32:58<58:16,  2.77it/s] 24%|██▍       | 3101/12776 [32:59<1:56:55,  1.38it/s]                                                       24%|██▍       | 3101/12776 [32:59<1:56:55,  1.38it/s] 24%|██▍       | 3102/12776 [33:00<2:06:09,  1.28it/s]                                                       24%|██▍       | 3102/12776 [33:00<2:06:09,  1.28it/s] 24%|██▍       | 3103/12776 [33:01<2:07:53,  1.26it/s]                                                       24%|██▍       | 3103/12776 [33:01<2:07:53,  1.26it/s] 24%|██▍       | 3104/12776 [33:02<2:05:16,  1.29it/s]                                                       24%|██▍       | 3104/12776 [33:02<2:05:16,  1.29it/s] 24%|██▍       | 3105/12776 [33:02<2:01:18,  1.33it/s]                                                       24%|██▍       | 3105/12776 [33:02<2:01:18,  1.33it/s] 24%|██▍       | 3106/12776 [33:03<1:57:02,  1.38it/s]                                                       24%|██▍       | 3106/12776 [33:03<1:57:02,  1.38it/s] 24%|██▍       | 3107/12776 [33:04<1:59:21,  1.35it/s]                                                       24%|██▍       | 3107/12776 [33:04<1:59:21,  1.35it/s] 24%|██▍       | 3108/12776 [33:04<1:53:08,  1.42it/s]                                                       24%|██▍       | 3108/12776 [33:04<1:53:08,  1.42it/s] 24%|██▍       | 3109/12776 [33:05<1:48:18,  1.49it/s]                                                       24%|██▍       | 3109/12776 [33:05<1:48:18,  1.49it/s] 24%|██▍       | 3110/12776 [33:05<1:42:15,  1.58it/s]                                                       24%|██▍       | 3110/12776 [33:05<1:42:15,  1.58it/s] 24%|██▍       | 3111/12776 [33:06<1:39:13,  1.62it/s]                                                       24%|██▍       | 3111/12776 [33:06<1:39:13,  1.62it/s] 24%|██▍       | 3112/12776 [33:07<1:33:28,  1.72it/s]                                                       24%|██▍       | 3112/12776 [33:07<1:33:28,  1.72it/s] 24%|██▍       | 3113/12776 [33:07<1:31:29,  1.76it/s]                                                       24%|██▍       | 3113/12776 [33:07<1:31:29,  1.76it/s] 24%|██▍       | 3114/12776 [33:08<1:25:39,  1.88it/s]                                                       24%|██▍       | 3114/12776 [33:08<1:25:39,  1.88it/s] 24%|██▍       | 3115/12776 [33:08<1:26:05,  1.87it/s]                                                       24%|██▍       | 3115/12776 [33:08<1:26:05,  1.87it/s] 24%|██▍       | 3116/12776 [33:08<1:20:22,  2.00it/s]                                                       24%|██▍       | 3116/12776 [33:08<1:20:22,  2.00it/s] 24%|██▍       | 3117/12776 [33:09<1:15:47,  2.12it/s]                                                       24%|██▍       | 3117/12776 [33:09<1:15:47,  2.12it/s] 24%|██▍       | 3118/12776 [33:09<1:17:09,  2.09it/s]                                                       24%|██▍       | 3118/12776 [33:09<1:17:09,  2.09it/s] 24%|██▍       | 3119/12776 [33:10<1:12:38,  2.22it/s]                                                       24%|██▍       | 3119/12776 [33:10<1:12:38,  2.22it/s] 24%|██▍       | 3120/12776 [33:10<1:08:30,  2.35it/s]                                                       24%|██▍       | 3120/12776 [33:10<1:08:30,  2.35it/s] 24%|██▍       | 3121/12776 [33:11<1:08:31,  2.35it/s]                                                       24%|██▍       | 3121/12776 [33:11<1:08:31,  2.35it/s] 24%|██▍       | 3122/12776 [33:11<1:04:08,  2.51it/s]                                                       24%|██▍       | 3122/12776 [33:11<1:04:08,  2.51it/s] 24%|██▍       | 3123/12776 [33:11<1:01:05,  2.63it/s]                                                       24%|██▍       | 3123/12776 [33:11<1:01:05,  2.63it/s] 24%|██▍       | 3124/12776 [33:12<1:01:31,  2.61it/s]                                                       24%|██▍       | 3124/12776 [33:12<1:01:31,  2.61it/s] 24%|██▍       | 3125/12776 [33:12<58:37,  2.74it/s]                                                       24%|██▍       | 3125/12776 [33:12<58:37,  2.74it/s] 24%|██▍       | 3126/12776 [33:12<55:42,  2.89it/s]                                                     24%|██▍       | 3126/12776 [33:12<55:42,  2.89it/s] 24%|██▍       | 3127/12776 [33:13<56:24,  2.85it/s]                                                     24%|██▍       | 3127/12776 [33:13<56:24,  2.85it/s] 24%|██▍       | 3128/12776 [33:13<53:22,  3.01it/s]                                                     24%|██▍       | 3128/12776 [33:13<53:22,  3.01it/s] 24%|██▍       | 3129/12776 [33:13<50:44,  3.17it/s]                                                     24%|██▍       | 3129/12776 [33:13<50:44,  3.17it/s] 24%|██▍       | 3130/12776 [33:13<50:23,  3.19it/s]                                                     24%|██▍       | 3130/12776 [33:13<50:23,  3.19it/s] 25%|██▍       | 3131/12776 [33:14<56:17,  2.86it/s]                                                     25%|██▍       | 3131/12776 [33:14<56:17,  2.86it/s] 25%|██▍       | 3132/12776 [33:14<51:50,  3.10it/s]                                                     25%|██▍       | 3132/12776 [33:14<51:50,  3.10it/s] 25%|██▍       | 3133/12776 [33:14<48:58,  3.28it/s]                                                     25%|██▍       | 3133/12776 [33:14<48:58,  3.28it/s] 25%|██▍       | 3134/12776 [33:15<46:17,  3.47it/s]                                                     25%|██▍       | 3134/12776 [33:15<46:17,  3.47it/s] 25%|██▍       | 3135/12776 [33:15<49:04,  3.27it/s]                                                     25%|██▍       | 3135/12776 [33:15<49:04,  3.27it/s] 25%|██▍       | 3136/12776 [33:15<45:43,  3.51it/s]                                                     25%|██▍       | 3136/12776 [33:15<45:43,  3.51it/s] 25%|██▍       | 3137/12776 [33:15<43:16,  3.71it/s]                                                     25%|██▍       | 3137/12776 [33:15<43:16,  3.71it/s] 25%|██▍       | 3138/12776 [33:16<41:11,  3.90it/s]                                                     25%|██▍       | 3138/12776 [33:16<41:11,  3.90it/s] 25%|██▍       | 3139/12776 [33:16<41:58,  3.83it/s]                                                     25%|██▍       | 3139/12776 [33:16<41:58,  3.83it/s] 25%|██▍       | 3140/12776 [33:16<39:38,  4.05it/s]                                                     25%|██▍       | 3140/12776 [33:16<39:38,  4.05it/s] 25%|██▍       | 3141/12776 [33:16<38:08,  4.21it/s]                                                     25%|██▍       | 3141/12776 [33:16<38:08,  4.21it/s] 25%|██▍       | 3142/12776 [33:17<37:30,  4.28it/s]                                                     25%|██▍       | 3142/12776 [33:17<37:30,  4.28it/s] 25%|██▍       | 3143/12776 [33:17<36:54,  4.35it/s]                                                     25%|██▍       | 3143/12776 [33:17<36:54,  4.35it/s] 25%|██▍       | 3144/12776 [33:17<39:04,  4.11it/s]                                                     25%|██▍       | 3144/12776 [33:17<39:04,  4.11it/s] 25%|██▍       | 3145/12776 [33:17<37:06,  4.33it/s]                                                     25%|██▍       | 3145/12776 [33:17<37:06,  4.33it/s] 25%|██▍       | 3146/12776 [33:18<35:28,  4.52it/s]                                                     25%|██▍       | 3146/12776 [33:18<35:28,  4.52it/s] 25%|██▍       | 3147/12776 [33:18<35:02,  4.58it/s]                                                     25%|██▍       | 3147/12776 [33:18<35:02,  4.58it/s] 25%|██▍       | 3148/12776 [33:18<34:14,  4.69it/s]                                                     25%|██▍       | 3148/12776 [33:18<34:14,  4.69it/s] 25%|██▍       | 3149/12776 [33:18<38:02,  4.22it/s]                                                     25%|██▍       | 3149/12776 [33:18<38:02,  4.22it/s] 25%|██▍       | 3150/12776 [33:19<1:05:22,  2.45it/s]                                                       25%|██▍       | 3150/12776 [33:19<1:05:22,  2.45it/s] 25%|██▍       | 3151/12776 [33:21<2:00:28,  1.33it/s]                                                       25%|██▍       | 3151/12776 [33:21<2:00:28,  1.33it/s] 25%|██▍       | 3152/12776 [33:22<2:17:25,  1.17it/s]                                                       25%|██▍       | 3152/12776 [33:22<2:17:25,  1.17it/s] 25%|██▍       | 3153/12776 [33:23<2:19:14,  1.15it/s]                                                       25%|██▍       | 3153/12776 [33:23<2:19:14,  1.15it/s] 25%|██▍       | 3154/12776 [33:23<2:17:04,  1.17it/s]                                                       25%|██▍       | 3154/12776 [33:23<2:17:04,  1.17it/s] 25%|██▍       | 3155/12776 [33:24<2:17:27,  1.17it/s]                                                      {'loss': 0.5348, 'grad_norm': 1.4934779405593872, 'learning_rate': 0.00023717008797653955, 'epoch': 0.48}
+{'loss': 0.9434, 'grad_norm': 1.6471208333969116, 'learning_rate': 0.00023714565004887583, 'epoch': 0.48}
+{'loss': 0.8578, 'grad_norm': 2.7180702686309814, 'learning_rate': 0.0002371212121212121, 'epoch': 0.48}
+{'loss': 0.7375, 'grad_norm': 1.2886486053466797, 'learning_rate': 0.00023709677419354836, 'epoch': 0.48}
+{'loss': 1.0982, 'grad_norm': 2.2205405235290527, 'learning_rate': 0.00023707233626588464, 'epoch': 0.48}
+{'loss': 0.9257, 'grad_norm': 3.079237461090088, 'learning_rate': 0.00023704789833822092, 'epoch': 0.48}
+{'loss': 1.1405, 'grad_norm': 2.3623974323272705, 'learning_rate': 0.00023702346041055714, 'epoch': 0.48}
+{'loss': 0.6099, 'grad_norm': 1.2514792680740356, 'learning_rate': 0.00023699902248289342, 'epoch': 0.48}
+{'loss': 0.82, 'grad_norm': 1.4864928722381592, 'learning_rate': 0.0002369745845552297, 'epoch': 0.48}
+{'loss': 1.0889, 'grad_norm': 1.9229499101638794, 'learning_rate': 0.00023695014662756595, 'epoch': 0.48}
+{'loss': 0.9904, 'grad_norm': 2.564469575881958, 'learning_rate': 0.00023692570869990223, 'epoch': 0.48}
+{'loss': 1.594, 'grad_norm': 3.783491611480713, 'learning_rate': 0.0002369012707722385, 'epoch': 0.48}
+{'loss': 0.8637, 'grad_norm': 2.4869840145111084, 'learning_rate': 0.00023687683284457476, 'epoch': 0.48}
+{'loss': 1.1697, 'grad_norm': 2.5603604316711426, 'learning_rate': 0.00023685239491691104, 'epoch': 0.48}
+{'loss': 2.2811, 'grad_norm': 3.291196823120117, 'learning_rate': 0.00023682795698924732, 'epoch': 0.48}
+{'loss': 1.0696, 'grad_norm': 1.8274985551834106, 'learning_rate': 0.00023680351906158354, 'epoch': 0.48}
+{'loss': 0.9953, 'grad_norm': 1.265817403793335, 'learning_rate': 0.00023677908113391982, 'epoch': 0.48}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.00023677908113391982, 'epoch': 0.48}
+{'loss': 1.2792, 'grad_norm': 3.3963782787323, 'learning_rate': 0.0002367546432062561, 'epoch': 0.48}
+{'loss': 0.7411, 'grad_norm': 1.3875449895858765, 'learning_rate': 0.00023673020527859235, 'epoch': 0.48}
+{'loss': 0.9312, 'grad_norm': 3.187683582305908, 'learning_rate': 0.00023670576735092863, 'epoch': 0.48}
+{'loss': 1.432, 'grad_norm': 3.2056405544281006, 'learning_rate': 0.0002366813294232649, 'epoch': 0.48}
+{'loss': 0.9695, 'grad_norm': 4.896152019500732, 'learning_rate': 0.00023665689149560116, 'epoch': 0.49}
+{'loss': 1.3928, 'grad_norm': 2.358058452606201, 'learning_rate': 0.0002366324535679374, 'epoch': 0.49}
+{'loss': 0.3465, 'grad_norm': 0.6357040405273438, 'learning_rate': 0.00023660801564027369, 'epoch': 0.49}
+{'loss': 0.4029, 'grad_norm': 0.48957034945487976, 'learning_rate': 0.00023658357771260994, 'epoch': 0.49}
+{'loss': 0.3123, 'grad_norm': 0.7128604054450989, 'learning_rate': 0.00023655913978494621, 'epoch': 0.49}
+{'loss': 0.3385, 'grad_norm': 0.5227282643318176, 'learning_rate': 0.0002365347018572825, 'epoch': 0.49}
+{'loss': 0.3359, 'grad_norm': 0.5651166439056396, 'learning_rate': 0.00023651026392961874, 'epoch': 0.49}
+{'loss': 0.4309, 'grad_norm': 0.8126301169395447, 'learning_rate': 0.00023648582600195502, 'epoch': 0.49}
+{'loss': 0.4009, 'grad_norm': 1.4748018980026245, 'learning_rate': 0.0002364613880742913, 'epoch': 0.49}
+{'loss': 0.4647, 'grad_norm': 1.2960294485092163, 'learning_rate': 0.00023643695014662753, 'epoch': 0.49}
+{'loss': 0.4079, 'grad_norm': 1.0774697065353394, 'learning_rate': 0.0002364125122189638, 'epoch': 0.49}
+{'loss': 0.3332, 'grad_norm': 1.326656460762024, 'learning_rate': 0.00023638807429130008, 'epoch': 0.49}
+{'loss': 0.4756, 'grad_norm': 0.907184362411499, 'learning_rate': 0.00023636363636363633, 'epoch': 0.49}
+{'loss': 0.3881, 'grad_norm': 0.7261903882026672, 'learning_rate': 0.0002363391984359726, 'epoch': 0.49}
+{'loss': 0.4808, 'grad_norm': 1.3236558437347412, 'learning_rate': 0.0002363147605083089, 'epoch': 0.49}
+{'loss': 0.4554, 'grad_norm': 1.082720160484314, 'learning_rate': 0.00023629032258064514, 'epoch': 0.49}
+{'loss': 0.7315, 'grad_norm': 1.2888387441635132, 'learning_rate': 0.00023626588465298142, 'epoch': 0.49}
+{'loss': 0.8247, 'grad_norm': 1.2054266929626465, 'learning_rate': 0.0002362414467253177, 'epoch': 0.49}
+{'loss': 0.4858, 'grad_norm': 0.8966947197914124, 'learning_rate': 0.00023621700879765392, 'epoch': 0.49}
+{'loss': 0.7636, 'grad_norm': 2.668046712875366, 'learning_rate': 0.0002361925708699902, 'epoch': 0.49}
+{'loss': 0.7442, 'grad_norm': 2.526010513305664, 'learning_rate': 0.00023616813294232648, 'epoch': 0.49}
+{'loss': 0.6907, 'grad_norm': 1.6158323287963867, 'learning_rate': 0.00023614369501466273, 'epoch': 0.49}
+{'loss': 0.773, 'grad_norm': 1.133370041847229, 'learning_rate': 0.000236119257086999, 'epoch': 0.49}
+{'loss': 0.9368, 'grad_norm': 1.4854271411895752, 'learning_rate': 0.0002360948191593353, 'epoch': 0.49}
+{'loss': 0.9644, 'grad_norm': 2.469548463821411, 'learning_rate': 0.0002360703812316715, 'epoch': 0.49}
+{'loss': 0.8894, 'grad_norm': 1.3509788513183594, 'learning_rate': 0.0002360459433040078, 'epoch': 0.49}
+{'loss': 0.6317, 'grad_norm': 1.1007205247879028, 'learning_rate': 0.00023602150537634407, 'epoch': 0.49}
+{'loss': 0.8419, 'grad_norm': 1.8395272493362427, 'learning_rate': 0.00023599706744868032, 'epoch': 0.49}
+{'loss': 1.069, 'grad_norm': 2.4748945236206055, 'learning_rate': 0.0002359726295210166, 'epoch': 0.49}
+{'loss': 0.9937, 'grad_norm': 2.535205602645874, 'learning_rate': 0.00023594819159335288, 'epoch': 0.49}
+{'loss': 0.6853, 'grad_norm': 1.4972389936447144, 'learning_rate': 0.00023592375366568913, 'epoch': 0.49}
+{'loss': 0.6998, 'grad_norm': 1.4133657217025757, 'learning_rate': 0.0002358993157380254, 'epoch': 0.49}
+{'loss': 1.6381, 'grad_norm': 2.480767250061035, 'learning_rate': 0.00023587487781036168, 'epoch': 0.49}
+{'loss': 0.7757, 'grad_norm': 2.512920618057251, 'learning_rate': 0.0002358504398826979, 'epoch': 0.49}
+{'loss': 1.1028, 'grad_norm': 2.0442888736724854, 'learning_rate': 0.00023582600195503419, 'epoch': 0.49}
+{'loss': 1.0315, 'grad_norm': 2.166085720062256, 'learning_rate': 0.00023580156402737046, 'epoch': 0.49}
+{'loss': 0.9559, 'grad_norm': 2.705939292907715, 'learning_rate': 0.00023577712609970672, 'epoch': 0.49}
+{'loss': 0.719, 'grad_norm': 1.7874341011047363, 'learning_rate': 0.000235752688172043, 'epoch': 0.49}
+{'loss': 0.8894, 'grad_norm': 2.1287143230438232, 'learning_rate': 0.00023572825024437927, 'epoch': 0.49}
+{'loss': 1.2279, 'grad_norm': 2.1680641174316406, 'learning_rate': 0.00023570381231671552, 'epoch': 0.49}
+{'loss': 1.1262, 'grad_norm': 3.1743721961975098, 'learning_rate': 0.0002356793743890518, 'epoch': 0.49}
+{'loss': 1.4381, 'grad_norm': 2.6501054763793945, 'learning_rate': 0.00023565493646138805, 'epoch': 0.49}
+{'loss': 0.9156, 'grad_norm': 2.002523422241211, 'learning_rate': 0.0002356304985337243, 'epoch': 0.49}
+{'loss': 1.7951, 'grad_norm': 3.3505101203918457, 'learning_rate': 0.00023560606060606058, 'epoch': 0.49}
+{'loss': 0.957, 'grad_norm': 2.2706339359283447, 'learning_rate': 0.00023558162267839686, 'epoch': 0.49}
+{'loss': 1.1053, 'grad_norm': 1.9583147764205933, 'learning_rate': 0.0002355571847507331, 'epoch': 0.49}
+{'loss': 1.3319, 'grad_norm': 1.6880264282226562, 'learning_rate': 0.0002355327468230694, 'epoch': 0.49}
+{'loss': 1.1314, 'grad_norm': 2.1848926544189453, 'learning_rate': 0.00023550830889540567, 'epoch': 0.49}
+{'loss': 1.2468, 'grad_norm': 2.028449773788452, 'learning_rate': 0.0002354838709677419, 'epoch': 0.49}
+{'loss': 0.7077, 'grad_norm': 1.6452484130859375, 'learning_rate': 0.00023545943304007817, 'epoch': 0.49}
+{'loss': 0.7043, 'grad_norm': 1.4108058214187622, 'learning_rate': 0.00023543499511241445, 'epoch': 0.49}
+{'loss': 0.8504, 'grad_norm': 3.064290761947632, 'learning_rate': 0.0002354105571847507, 'epoch': 0.49}
+{'loss': 0.3642, 'grad_norm': 0.6606544256210327, 'learning_rate': 0.00023538611925708698, 'epoch': 0.49}
+{'loss': 0.3771, 'grad_norm': 0.7985259890556335, 'learning_rate': 0.00023536168132942326, 'epoch': 0.49}
+{'loss': 0.2973, 'grad_norm': 0.8636269569396973, 'learning_rate': 0.0002353372434017595, 'epoch': 0.49}
+{'loss': 0.3708, 'grad_norm': 0.8866095542907715, 'learning_rate': 0.0002353128054740958, 'epoch': 0.49}
+ 25%|██▍       | 3155/12776 [33:24<2:17:27,  1.17it/s] 25%|██▍       | 3156/12776 [33:25<2:15:09,  1.19it/s]                                                       25%|██▍       | 3156/12776 [33:25<2:15:09,  1.19it/s] 25%|██▍       | 3157/12776 [33:26<2:06:28,  1.27it/s]                                                       25%|██▍       | 3157/12776 [33:26<2:06:28,  1.27it/s] 25%|██▍       | 3158/12776 [33:27<2:04:31,  1.29it/s]                                                       25%|██▍       | 3158/12776 [33:27<2:04:31,  1.29it/s] 25%|██▍       | 3159/12776 [33:27<1:55:27,  1.39it/s]                                                       25%|██▍       | 3159/12776 [33:27<1:55:27,  1.39it/s] 25%|██▍       | 3160/12776 [33:28<1:50:17,  1.45it/s]                                                       25%|██▍       | 3160/12776 [33:28<1:50:17,  1.45it/s] 25%|██▍       | 3161/12776 [33:28<1:43:14,  1.55it/s]                                                       25%|██▍       | 3161/12776 [33:28<1:43:14,  1.55it/s] 25%|██▍       | 3162/12776 [33:29<1:40:11,  1.60it/s]                                                       25%|██▍       | 3162/12776 [33:29<1:40:11,  1.60it/s] 25%|██▍       | 3163/12776 [33:29<1:33:40,  1.71it/s]                                                       25%|██▍       | 3163/12776 [33:29<1:33:40,  1.71it/s] 25%|██▍       | 3164/12776 [33:30<1:32:36,  1.73it/s]                                                       25%|██▍       | 3164/12776 [33:30<1:32:36,  1.73it/s] 25%|██▍       | 3165/12776 [33:30<1:25:43,  1.87it/s]                                                       25%|██▍       | 3165/12776 [33:30<1:25:43,  1.87it/s] 25%|██▍       | 3166/12776 [33:31<1:24:26,  1.90it/s]                                                       25%|██▍       | 3166/12776 [33:31<1:24:26,  1.90it/s] 25%|██▍       | 3167/12776 [33:31<1:18:19,  2.04it/s]                                                       25%|██▍       | 3167/12776 [33:31<1:18:19,  2.04it/s] 25%|██▍       | 3168/12776 [33:32<1:13:27,  2.18it/s]                                                       25%|██▍       | 3168/12776 [33:32<1:13:27,  2.18it/s] 25%|██▍       | 3169/12776 [33:32<1:09:45,  2.30it/s]                                                       25%|██▍       | 3169/12776 [33:32<1:09:45,  2.30it/s] 25%|██▍       | 3170/12776 [33:32<1:05:57,  2.43it/s]                                                       25%|██▍       | 3170/12776 [33:32<1:05:57,  2.43it/s] 25%|██▍       | 3171/12776 [33:33<1:02:55,  2.54it/s]                                                       25%|██▍       | 3171/12776 [33:33<1:02:55,  2.54it/s] 25%|██▍       | 3172/12776 [33:33<1:05:09,  2.46it/s]                                                       25%|██▍       | 3172/12776 [33:33<1:05:09,  2.46it/s] 25%|██▍       | 3173/12776 [33:33<1:01:29,  2.60it/s]                                                       25%|██▍       | 3173/12776 [33:33<1:01:29,  2.60it/s] 25%|██▍       | 3174/12776 [33:34<58:28,  2.74it/s]                                                       25%|██▍       | 3174/12776 [33:34<58:28,  2.74it/s] 25%|██▍       | 3175/12776 [33:34<55:40,  2.87it/s]                                                     25%|██▍       | 3175/12776 [33:34<55:40,  2.87it/s] 25%|██▍       | 3176/12776 [33:34<54:42,  2.92it/s]                                                     25%|██▍       | 3176/12776 [33:34<54:42,  2.92it/s] 25%|██▍       | 3177/12776 [33:35<52:02,  3.07it/s]                                                     25%|██▍       | 3177/12776 [33:35<52:02,  3.07it/s] 25%|██▍       | 3178/12776 [33:35<49:54,  3.21it/s]                                                     25%|██▍       | 3178/12776 [33:35<49:54,  3.21it/s] 25%|██▍       | 3179/12776 [33:35<48:04,  3.33it/s]                                                     25%|██▍       | 3179/12776 [33:35<48:04,  3.33it/s] 25%|██▍       | 3180/12776 [33:36<47:18,  3.38it/s]                                                     25%|██▍       | 3180/12776 [33:36<47:18,  3.38it/s] 25%|██▍       | 3181/12776 [33:36<45:14,  3.53it/s]                                                     25%|██▍       | 3181/12776 [33:36<45:14,  3.53it/s] 25%|██▍       | 3182/12776 [33:36<43:57,  3.64it/s]                                                     25%|██▍       | 3182/12776 [33:36<43:57,  3.64it/s] 25%|██▍       | 3183/12776 [33:36<42:34,  3.76it/s]                                                     25%|██▍       | 3183/12776 [33:36<42:34,  3.76it/s] 25%|██▍       | 3184/12776 [33:37<47:55,  3.34it/s]                                                     25%|██▍       | 3184/12776 [33:37<47:55,  3.34it/s] 25%|██▍       | 3185/12776 [33:37<44:54,  3.56it/s]                                                     25%|██▍       | 3185/12776 [33:37<44:54,  3.56it/s] 25%|██▍       | 3186/12776 [33:37<43:31,  3.67it/s]                                                     25%|██▍       | 3186/12776 [33:37<43:31,  3.67it/s] 25%|██▍       | 3187/12776 [33:37<41:25,  3.86it/s]                                                     25%|██▍       | 3187/12776 [33:37<41:25,  3.86it/s] 25%|██▍       | 3188/12776 [33:38<42:46,  3.74it/s]                                                     25%|██▍       | 3188/12776 [33:38<42:46,  3.74it/s] 25%|██▍       | 3189/12776 [33:38<40:10,  3.98it/s]                                                     25%|██▍       | 3189/12776 [33:38<40:10,  3.98it/s] 25%|██▍       | 3190/12776 [33:38<38:14,  4.18it/s]                                                     25%|██▍       | 3190/12776 [33:38<38:14,  4.18it/s] 25%|██▍       | 3191/12776 [33:38<36:41,  4.35it/s]                                                     25%|██▍       | 3191/12776 [33:38<36:41,  4.35it/s] 25%|██▍       | 3192/12776 [33:39<35:29,  4.50it/s]                                                     25%|██▍       | 3192/12776 [33:39<35:29,  4.50it/s] 25%|██▍       | 3193/12776 [33:39<37:51,  4.22it/s]                                                     25%|██▍       | 3193/12776 [33:39<37:51,  4.22it/s] 25%|██▌       | 3194/12776 [33:39<36:11,  4.41it/s]                                                     25%|██▌       | 3194/12776 [33:39<36:11,  4.41it/s] 25%|██▌       | 3195/12776 [33:39<34:48,  4.59it/s]                                                     25%|██▌       | 3195/12776 [33:39<34:48,  4.59it/s] 25%|██▌       | 3196/12776 [33:39<33:47,  4.72it/s]                                                     25%|██▌       | 3196/12776 [33:39<33:47,  4.72it/s] 25%|██▌       | 3197/12776 [33:40<32:57,  4.84it/s]                                                     25%|██▌       | 3197/12776 [33:40<32:57,  4.84it/s] 25%|██▌       | 3198/12776 [33:40<32:06,  4.97it/s]                                                     25%|██▌       | 3198/12776 [33:40<32:06,  4.97it/s] 25%|██▌       | 3199/12776 [33:40<34:38,  4.61it/s]                                                     25%|██▌       | 3199/12776 [33:40<34:38,  4.61it/s] 25%|██▌       | 3200/12776 [33:41<58:30,  2.73it/s]                                                     25%|██▌       | 3200/12776 [33:41<58:30,  2.73it/s]Saving model checkpoint to ./checkpoint-3200
+Configuration saved in ./checkpoint-3200/config.json
+Model weights saved in ./checkpoint-3200/model.safetensors
+Feature extractor saved in ./checkpoint-3200/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-3200/tokenizer_config.json
+Special tokens file saved in ./checkpoint-3200/special_tokens_map.json
+added tokens file saved in ./checkpoint-3200/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-2000] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 25%|██▌       | 3201/12776 [33:47<6:01:27,  2.27s/it]                                                       25%|██▌       | 3201/12776 [33:47<6:01:27,  2.27s/it] 25%|██▌       | 3202/12776 [33:48<5:00:57,  1.89s/it]                                                       25%|██▌       | 3202/12776 [33:48<5:00:57,  1.89s/it] 25%|██▌       | 3203/12776 [33:49<4:14:38,  1.60s/it]                                                       25%|██▌       | 3203/12776 [33:49<4:14:38,  1.60s/it] 25%|██▌       | 3204/12776 [33:50<3:38:41,  1.37s/it]                                                       25%|██▌       | 3204/12776 [33:50<3:38:41,  1.37s/it] 25%|██▌       | 3205/12776 [33:51<3:05:42,  1.16s/it]                                                       25%|██▌       | 3205/12776 [33:51<3:05:42,  1.16s/it] 25%|██▌       | 3206/12776 [33:52<2:45:26,  1.04s/it]                                                       25%|██▌       | 3206/12776 [33:52<2:45:26,  1.04s/it] 25%|██▌       | 3207/12776 [33:52<2:24:03,  1.11it/s]                                                       25%|██▌       | 3207/12776 [33:52<2:24:03,  1.11it/s] 25%|██▌       | 3208/12776 [33:53<2:09:43,  1.23it/s]                                                       25%|██▌       | 3208/12776 [33:53<2:09:43,  1.23it/s] 25%|██▌       | 3209/12776 [33:53<1:55:22,  1.38it/s]                                                       25%|██▌       | 3209/12776 [33:53<1:55:22,  1.38it/s] 25%|██▌       | 3210/12776 [33:54<1:47:10,  1.49it/s]                                                       25%|██▌       | 3210/12776 [33:54<1:47:10,  1.49it/s] 25%|██▌       | 3211/12776 [33:54<1:37:34,  1.63it/s]                                                       25%|██▌       | 3211/12776 [33:54<1:37:34,  1.63it/s] 25%|██▌       | 3212/12776 [33:55<1:31:15,  1.75it/s]                                                       25%|██▌       | 3212/12776 [33:55<1:31:15,  1.75it/s] 25%|██▌       | 3213/12776 [33:55<1:23:56,  1.90it/s]                                                       25%|██▌       | 3213/12776 [33:55<1:23:56,  1.90it/s] 25%|██▌       | 3214/12776 [33:56<1:18:06,  2.04it/s]                                                       25%|██▌       | 3214/12776 [33:56<1:18:06,  2.04it/s] 25%|██▌       | 3215/12776 [33:56<1:18:49,  2.02it/s]                                                       25%|██▌       | 3215/12776 [33:56<1:18:49,  2.02it/s] 25%|██▌       | 3216/12776 [33:57<1:12:52,  2.19it/s]                                                       25%|██▌       | 3216/12776 [33:57<1:12:52,  2.19it/s] 25%|██▌       | 3217/12776 [33:57<1:08:16,  2.33it/s]                                                       25%|██▌       | 3217/12776 [33:57<1:08:16,  2.33it/s] 25%|██▌       | 3218/12776 [33:57<1:08:06,  2.34it/s]                                                       25%|██▌       | 3218/12776 [33:57<1:08:06,  2.34it/s] 25%|██▌       | 3219/12776 [33:58<1:03:47,  2.50it/s]                                                       25%|██▌       | 3219/12776 [33:58<1:03:47,  2.50it/s] 25%|██▌       | 3220/12776 [33:58<1:00:00,  2.65it/s]                                                       25%|██▌       | 3220/12776 [33:58<1:00:00,  2.65it/s] 25%|██▌       | 3221/12776 [33:58<59:08,  2.69it/s]                                                       25%|██▌       | 3221/12776 [33:58<59:08,  2.69it/s] 25%|██▌       | 3222/12776 [33:59<56:02,  2.84it/s]                                                     25%|██▌       | 3222/12776 [33:59<56:02,  2.84it/s] 25%|██▌       | 3223/12776 [33:59<53:35,  2.97it/s]                                                     25%|██▌       | 3223/12776 [33:59<53:35,  2.97it/s] 25%|██▌       | 3224/12776 [33:59<50:56,  3.12it/s]                                                     25%|██▌       | 3224/12776 [33:59<50:56,  3.12it/s] 25%|██▌       | 3225/12776 [34:00<53:12,  2.99it/s]                                                     25%|██▌       | 3225/12776 [34:00<53:12,  2.99it/s] 25%|██▌       | 3226/12776 [34:00<50:07,  3.18it/s]                                                     25%|██▌       | 3226/12776 [34:00<50:07,  3.18it/s] 25%|██▌       | 3227/12776 [34:00<47:24,  3.36it/s]                                                     25%|██▌       | 3227/12776 [34:00<47:24,  3.36it/s] 25%|██▌       | 3228/12776 [34:00<45:16,  3.52it/s]                                                     25%|██▌       | 3228/12776 [34:00<45:16,  3.52it/s] 25%|██▌       | 3229/12776 [34:01<48:52,  3.26it/s]                                                     25%|██▌       | 3229/12776 [34:01<48:52,  3.26it/s] 25%|██▌       | 3230/12776 [34:01<45:42,  3.48it/s]                                                     25%|██▌       | 3230/12776 [34:01<45:42,  3.48it/s] 25%|██▌       | 3231/12776 [34:01<42:59,  3.70it/s]                                                     25%|██▌       | 3231/12776 [34:01<42:59,  3.70it/s] 25%|██▌       | 3232/12776 [34:01<40:50,  3.90it/s]                                                     25%|██▌       | 3232/12776 [34:01<40:50,  3.90it/s] 25%|██▌       | 3233/12776 [34:02<43:53,  3.62it/s]                                                    {'loss': 0.3621, 'grad_norm': 0.6658779978752136, 'learning_rate': 0.00023528836754643207, 'epoch': 0.49}
+{'loss': 0.2645, 'grad_norm': 0.5546128749847412, 'learning_rate': 0.0002352639296187683, 'epoch': 0.49}
+{'loss': 0.4566, 'grad_norm': 0.9286956191062927, 'learning_rate': 0.00023523949169110457, 'epoch': 0.49}
+{'loss': 0.3843, 'grad_norm': 1.118327021598816, 'learning_rate': 0.00023521505376344085, 'epoch': 0.49}
+{'loss': 0.306, 'grad_norm': 1.691359281539917, 'learning_rate': 0.0002351906158357771, 'epoch': 0.49}
+{'loss': 0.5754, 'grad_norm': 1.1028999090194702, 'learning_rate': 0.00023516617790811338, 'epoch': 0.49}
+{'loss': 0.4509, 'grad_norm': 0.7714839577674866, 'learning_rate': 0.00023514173998044965, 'epoch': 0.49}
+{'loss': 0.3924, 'grad_norm': 0.7560392022132874, 'learning_rate': 0.0002351173020527859, 'epoch': 0.49}
+{'loss': 0.6857, 'grad_norm': 0.8828386664390564, 'learning_rate': 0.00023509286412512218, 'epoch': 0.5}
+{'loss': 0.671, 'grad_norm': 2.5070831775665283, 'learning_rate': 0.00023506842619745844, 'epoch': 0.5}
+{'loss': 0.5964, 'grad_norm': 1.2520339488983154, 'learning_rate': 0.0002350439882697947, 'epoch': 0.5}
+{'loss': 0.5922, 'grad_norm': 0.9470450282096863, 'learning_rate': 0.00023501955034213096, 'epoch': 0.5}
+{'loss': 0.6786, 'grad_norm': 0.9970950484275818, 'learning_rate': 0.00023499511241446724, 'epoch': 0.5}
+{'loss': 0.6266, 'grad_norm': 1.0567865371704102, 'learning_rate': 0.0002349706744868035, 'epoch': 0.5}
+{'loss': 0.3637, 'grad_norm': 0.6511049866676331, 'learning_rate': 0.00023494623655913977, 'epoch': 0.5}
+{'loss': 0.538, 'grad_norm': 1.0755113363265991, 'learning_rate': 0.00023492179863147605, 'epoch': 0.5}
+{'loss': 0.8472, 'grad_norm': 1.8359781503677368, 'learning_rate': 0.00023489736070381228, 'epoch': 0.5}
+{'loss': 0.5028, 'grad_norm': 0.943196713924408, 'learning_rate': 0.00023487292277614855, 'epoch': 0.5}
+{'loss': 0.6239, 'grad_norm': 2.1813058853149414, 'learning_rate': 0.00023484848484848483, 'epoch': 0.5}
+{'loss': 0.544, 'grad_norm': 1.8853085041046143, 'learning_rate': 0.00023482404692082108, 'epoch': 0.5}
+{'loss': 0.7816, 'grad_norm': 1.5820690393447876, 'learning_rate': 0.00023479960899315736, 'epoch': 0.5}
+{'loss': 0.5894, 'grad_norm': 1.5415321588516235, 'learning_rate': 0.00023477517106549364, 'epoch': 0.5}
+{'loss': 0.8592, 'grad_norm': 1.8562960624694824, 'learning_rate': 0.0002347507331378299, 'epoch': 0.5}
+{'loss': 1.0993, 'grad_norm': 2.251617908477783, 'learning_rate': 0.00023472629521016617, 'epoch': 0.5}
+{'loss': 0.9617, 'grad_norm': 3.1134870052337646, 'learning_rate': 0.00023470185728250245, 'epoch': 0.5}
+{'loss': 0.7751, 'grad_norm': 2.8295364379882812, 'learning_rate': 0.00023467741935483867, 'epoch': 0.5}
+{'loss': 0.638, 'grad_norm': 2.4342901706695557, 'learning_rate': 0.00023465298142717495, 'epoch': 0.5}
+{'loss': 1.0993, 'grad_norm': 2.19533371925354, 'learning_rate': 0.00023462854349951123, 'epoch': 0.5}
+{'loss': 1.1031, 'grad_norm': 2.2838051319122314, 'learning_rate': 0.00023460410557184748, 'epoch': 0.5}
+{'loss': 1.0324, 'grad_norm': 2.4727444648742676, 'learning_rate': 0.00023457966764418376, 'epoch': 0.5}
+{'loss': 1.0483, 'grad_norm': 2.0869252681732178, 'learning_rate': 0.00023455522971652004, 'epoch': 0.5}
+{'loss': 0.8777, 'grad_norm': 2.266732931137085, 'learning_rate': 0.0002345307917888563, 'epoch': 0.5}
+{'loss': 1.1718, 'grad_norm': 2.169137954711914, 'learning_rate': 0.00023450635386119257, 'epoch': 0.5}
+{'loss': 1.1595, 'grad_norm': 2.2153878211975098, 'learning_rate': 0.00023448191593352882, 'epoch': 0.5}
+{'loss': 1.2587, 'grad_norm': 2.4565927982330322, 'learning_rate': 0.00023445747800586507, 'epoch': 0.5}
+{'loss': 0.9619, 'grad_norm': 1.9531141519546509, 'learning_rate': 0.00023443304007820135, 'epoch': 0.5}
+{'loss': 1.2016, 'grad_norm': 3.107656478881836, 'learning_rate': 0.00023440860215053763, 'epoch': 0.5}
+{'loss': 1.7411, 'grad_norm': 2.456040143966675, 'learning_rate': 0.00023438416422287388, 'epoch': 0.5}
+{'loss': 1.5113, 'grad_norm': 2.5410706996917725, 'learning_rate': 0.00023435972629521016, 'epoch': 0.5}
+{'loss': 1.2314, 'grad_norm': 1.8548656702041626, 'learning_rate': 0.00023433528836754643, 'epoch': 0.5}
+{'loss': 0.9746, 'grad_norm': 2.335625648498535, 'learning_rate': 0.00023431085043988266, 'epoch': 0.5}
+{'loss': 0.6818, 'grad_norm': 1.950925350189209, 'learning_rate': 0.00023428641251221894, 'epoch': 0.5}
+{'loss': 1.3223, 'grad_norm': 2.8675320148468018, 'learning_rate': 0.00023426197458455521, 'epoch': 0.5}
+{'loss': 0.8275, 'grad_norm': 7.516252040863037, 'learning_rate': 0.00023423753665689147, 'epoch': 0.5}
+{'loss': 0.8734, 'grad_norm': 3.5480682849884033, 'learning_rate': 0.00023421309872922774, 'epoch': 0.5}
+{'loss': 0.9164, 'grad_norm': 1.8720403909683228, 'learning_rate': 0.00023418866080156402, 'epoch': 0.5}
+{'loss': 0.3279, 'grad_norm': 0.5714183449745178, 'learning_rate': 0.00023416422287390027, 'epoch': 0.5}
+{'loss': 0.4663, 'grad_norm': 0.5906309485435486, 'learning_rate': 0.00023413978494623655, 'epoch': 0.5}
+{'loss': 0.3761, 'grad_norm': 0.49747294187545776, 'learning_rate': 0.00023411534701857283, 'epoch': 0.5}
+{'loss': 0.3538, 'grad_norm': 0.6094551086425781, 'learning_rate': 0.00023409090909090905, 'epoch': 0.5}
+{'loss': 0.259, 'grad_norm': 0.549359142780304, 'learning_rate': 0.00023406647116324533, 'epoch': 0.5}
+{'loss': 0.3294, 'grad_norm': 0.6701558232307434, 'learning_rate': 0.0002340420332355816, 'epoch': 0.5}
+{'loss': 0.3143, 'grad_norm': 0.6899073719978333, 'learning_rate': 0.00023401759530791786, 'epoch': 0.5}
+{'loss': 0.4073, 'grad_norm': 0.5624547004699707, 'learning_rate': 0.00023399315738025414, 'epoch': 0.5}
+{'loss': 0.5554, 'grad_norm': 1.0639121532440186, 'learning_rate': 0.00023396871945259042, 'epoch': 0.5}
+{'loss': 1.0342, 'grad_norm': 1.0245072841644287, 'learning_rate': 0.00023394428152492667, 'epoch': 0.5}
+{'loss': 0.3688, 'grad_norm': 0.8923421502113342, 'learning_rate': 0.00023391984359726292, 'epoch': 0.5}
+{'loss': 0.6074, 'grad_norm': 1.4009363651275635, 'learning_rate': 0.0002338954056695992, 'epoch': 0.5}
+{'loss': 0.5968, 'grad_norm': 1.919011116027832, 'learning_rate': 0.00023387096774193545, 'epoch': 0.5}
+{'loss': 0.5708, 'grad_norm': 1.0158270597457886, 'learning_rate': 0.00023384652981427173, 'epoch': 0.5}
+{'loss': 0.7346, 'grad_norm': 1.5985990762710571, 'learning_rate': 0.000233822091886608, 'epoch': 0.5}
+{'loss': 0.5624, 'grad_norm': 1.1101726293563843, 'learning_rate': 0.00023379765395894426, 'epoch': 0.5}
+{'loss': 0.4229, 'grad_norm': 1.8938688039779663, 'learning_rate': 0.00023377321603128054, 'epoch': 0.5}
+{'loss': 0.6207, 'grad_norm': 0.9973450303077698, 'learning_rate': 0.00023374877810361682, 'epoch': 0.5}
+{'loss': 0.4491, 'grad_norm': 1.006881833076477, 'learning_rate': 0.00023372434017595304, 'epoch': 0.5}
+{'loss': 0.6227, 'grad_norm': 1.4391586780548096, 'learning_rate': 0.00023369990224828932, 'epoch': 0.5}
+{'loss': 0.5966, 'grad_norm': 1.5591343641281128, 'learning_rate': 0.0002336754643206256, 'epoch': 0.5}
+{'loss': 0.5168, 'grad_norm': 1.0018322467803955, 'learning_rate': 0.00023365102639296185, 'epoch': 0.5}
+{'loss': 0.6349, 'grad_norm': 1.1433907747268677, 'learning_rate': 0.00023362658846529813, 'epoch': 0.5}
+{'loss': 0.9885, 'grad_norm': 1.683487057685852, 'learning_rate': 0.0002336021505376344, 'epoch': 0.5}
+{'loss': 0.9242, 'grad_norm': 1.9962869882583618, 'learning_rate': 0.00023357771260997066, 'epoch': 0.5}
+{'loss': 1.2199, 'grad_norm': 2.4794111251831055, 'learning_rate': 0.00023355327468230693, 'epoch': 0.51}
+{'loss': 0.4802, 'grad_norm': 1.2889044284820557, 'learning_rate': 0.0002335288367546432, 'epoch': 0.51}
+{'loss': 0.9971, 'grad_norm': 1.9017808437347412, 'learning_rate': 0.00023350439882697944, 'epoch': 0.51}
+{'loss': 0.6098, 'grad_norm': 2.0930917263031006, 'learning_rate': 0.00023347996089931572, 'epoch': 0.51}
+{'loss': 1.0619, 'grad_norm': 2.39898943901062, 'learning_rate': 0.000233455522971652, 'epoch': 0.51}
+{'loss': 0.9109, 'grad_norm': 2.192613124847412, 'learning_rate': 0.00023343108504398824, 'epoch': 0.51}
+{'loss': 0.8734, 'grad_norm': 2.8761684894561768, 'learning_rate': 0.00023340664711632452, 'epoch': 0.51}
+ 25%|██▌       | 3233/12776 [34:02<43:53,  3.62it/s] 25%|██▌       | 3234/12776 [34:02<40:59,  3.88it/s]                                                     25%|██▌       | 3234/12776 [34:02<40:59,  3.88it/s] 25%|██▌       | 3235/12776 [34:02<38:49,  4.10it/s]                                                     25%|██▌       | 3235/12776 [34:02<38:49,  4.10it/s] 25%|██▌       | 3236/12776 [34:02<36:59,  4.30it/s]                                                     25%|██▌       | 3236/12776 [34:02<36:59,  4.30it/s] 25%|██▌       | 3237/12776 [34:03<35:41,  4.45it/s]                                                     25%|██▌       | 3237/12776 [34:03<35:41,  4.45it/s] 25%|██▌       | 3238/12776 [34:03<39:10,  4.06it/s]                                                     25%|██▌       | 3238/12776 [34:03<39:10,  4.06it/s] 25%|██▌       | 3239/12776 [34:03<36:23,  4.37it/s]                                                     25%|██▌       | 3239/12776 [34:03<36:23,  4.37it/s] 25%|██▌       | 3240/12776 [34:03<34:29,  4.61it/s]                                                     25%|██▌       | 3240/12776 [34:03<34:29,  4.61it/s] 25%|██▌       | 3241/12776 [34:03<33:03,  4.81it/s]                                                     25%|██▌       | 3241/12776 [34:03<33:03,  4.81it/s] 25%|██▌       | 3242/12776 [34:04<31:49,  4.99it/s]                                                     25%|██▌       | 3242/12776 [34:04<31:49,  4.99it/s] 25%|██▌       | 3243/12776 [34:04<30:51,  5.15it/s]                                                     25%|██▌       | 3243/12776 [34:04<30:51,  5.15it/s] 25%|██▌       | 3244/12776 [34:04<36:38,  4.34it/s]                                                     25%|██▌       | 3244/12776 [34:04<36:38,  4.34it/s] 25%|██▌       | 3245/12776 [34:04<34:00,  4.67it/s]                                                     25%|██▌       | 3245/12776 [34:04<34:00,  4.67it/s] 25%|██▌       | 3246/12776 [34:04<31:55,  4.97it/s]                                                     25%|██▌       | 3246/12776 [34:04<31:55,  4.97it/s] 25%|██▌       | 3247/12776 [34:05<30:34,  5.20it/s]                                                     25%|██▌       | 3247/12776 [34:05<30:34,  5.20it/s] 25%|██▌       | 3248/12776 [34:05<29:10,  5.44it/s]                                                     25%|██▌       | 3248/12776 [34:05<29:10,  5.44it/s] 25%|██▌       | 3249/12776 [34:05<28:06,  5.65it/s]                                                     25%|██▌       | 3249/12776 [34:05<28:06,  5.65it/s] 25%|██▌       | 3250/12776 [34:06<54:14,  2.93it/s]                                                     25%|██▌       | 3250/12776 [34:06<54:14,  2.93it/s] 25%|██▌       | 3251/12776 [34:07<1:49:05,  1.46it/s]                                                       25%|██▌       | 3251/12776 [34:07<1:49:05,  1.46it/s] 25%|██▌       | 3252/12776 [34:08<1:59:33,  1.33it/s]                                                       25%|██▌       | 3252/12776 [34:08<1:59:33,  1.33it/s] 25%|██▌       | 3253/12776 [34:09<2:07:04,  1.25it/s]                                                       25%|██▌       | 3253/12776 [34:09<2:07:04,  1.25it/s] 25%|██▌       | 3254/12776 [34:10<2:07:30,  1.24it/s]                                                       25%|██▌       | 3254/12776 [34:10<2:07:30,  1.24it/s] 25%|██▌       | 3255/12776 [34:11<2:02:20,  1.30it/s]                                                       25%|██▌       | 3255/12776 [34:11<2:02:20,  1.30it/s] 25%|██▌       | 3256/12776 [34:11<2:01:32,  1.31it/s]                                                       25%|██▌       | 3256/12776 [34:11<2:01:32,  1.31it/s] 25%|██▌       | 3257/12776 [34:12<1:54:26,  1.39it/s]                                                       25%|██▌       | 3257/12776 [34:12<1:54:26,  1.39it/s] 26%|██▌       | 3258/12776 [34:13<1:48:20,  1.46it/s]                                                       26%|██▌       | 3258/12776 [34:13<1:48:20,  1.46it/s] 26%|██▌       | 3259/12776 [34:13<1:41:04,  1.57it/s]                                                       26%|██▌       | 3259/12776 [34:13<1:41:04,  1.57it/s] 26%|██▌       | 3260/12776 [34:14<1:38:02,  1.62it/s]                                                       26%|██▌       | 3260/12776 [34:14<1:38:02,  1.62it/s] 26%|██▌       | 3261/12776 [34:14<1:32:02,  1.72it/s]                                                       26%|██▌       | 3261/12776 [34:14<1:32:02,  1.72it/s] 26%|██▌       | 3262/12776 [34:15<1:31:57,  1.72it/s]                                                       26%|██▌       | 3262/12776 [34:15<1:31:57,  1.72it/s] 26%|██▌       | 3263/12776 [34:15<1:25:11,  1.86it/s]                                                       26%|██▌       | 3263/12776 [34:15<1:25:11,  1.86it/s] 26%|██▌       | 3264/12776 [34:16<1:25:08,  1.86it/s]                                                       26%|██▌       | 3264/12776 [34:16<1:25:08,  1.86it/s] 26%|██▌       | 3265/12776 [34:16<1:18:34,  2.02it/s]                                                       26%|██▌       | 3265/12776 [34:16<1:18:34,  2.02it/s] 26%|██▌       | 3266/12776 [34:16<1:12:54,  2.17it/s]                                                       26%|██▌       | 3266/12776 [34:16<1:12:54,  2.17it/s] 26%|██▌       | 3267/12776 [34:17<1:09:48,  2.27it/s]                                                       26%|██▌       | 3267/12776 [34:17<1:09:48,  2.27it/s] 26%|██▌       | 3268/12776 [34:17<1:05:38,  2.41it/s]                                                       26%|██▌       | 3268/12776 [34:17<1:05:38,  2.41it/s] 26%|██▌       | 3269/12776 [34:18<1:02:17,  2.54it/s]                                                       26%|██▌       | 3269/12776 [34:18<1:02:17,  2.54it/s] 26%|██▌       | 3270/12776 [34:18<1:04:27,  2.46it/s]                                                       26%|██▌       | 3270/12776 [34:18<1:04:27,  2.46it/s] 26%|██▌       | 3271/12776 [34:18<1:00:07,  2.63it/s]                                                       26%|██▌       | 3271/12776 [34:18<1:00:07,  2.63it/s] 26%|██▌       | 3272/12776 [34:19<56:39,  2.80it/s]                                                       26%|██▌       | 3272/12776 [34:19<56:39,  2.80it/s] 26%|██▌       | 3273/12776 [34:19<54:01,  2.93it/s]                                                     26%|██▌       | 3273/12776 [34:19<54:01,  2.93it/s] 26%|██▌       | 3274/12776 [34:19<55:21,  2.86it/s]                                                     26%|██▌       | 3274/12776 [34:19<55:21,  2.86it/s] 26%|██▌       | 3275/12776 [34:20<51:53,  3.05it/s]                                                     26%|██▌       | 3275/12776 [34:20<51:53,  3.05it/s] 26%|██▌       | 3276/12776 [34:20<50:27,  3.14it/s]                                                     26%|██▌       | 3276/12776 [34:20<50:27,  3.14it/s] 26%|██▌       | 3277/12776 [34:20<54:46,  2.89it/s]                                                     26%|██▌       | 3277/12776 [34:20<54:46,  2.89it/s] 26%|██▌       | 3278/12776 [34:21<51:49,  3.05it/s]                                                     26%|██▌       | 3278/12776 [34:21<51:49,  3.05it/s] 26%|██▌       | 3279/12776 [34:21<49:24,  3.20it/s]                                                     26%|██▌       | 3279/12776 [34:21<49:24,  3.20it/s] 26%|██▌       | 3280/12776 [34:21<47:15,  3.35it/s]                                                     26%|██▌       | 3280/12776 [34:21<47:15,  3.35it/s] 26%|██▌       | 3281/12776 [34:21<48:19,  3.27it/s]                                                     26%|██▌       | 3281/12776 [34:21<48:19,  3.27it/s] 26%|██▌       | 3282/12776 [34:22<45:41,  3.46it/s]                                                     26%|██▌       | 3282/12776 [34:22<45:41,  3.46it/s] 26%|██▌       | 3283/12776 [34:22<43:34,  3.63it/s]                                                     26%|██▌       | 3283/12776 [34:22<43:34,  3.63it/s] 26%|██▌       | 3284/12776 [34:22<41:53,  3.78it/s]                                                     26%|██▌       | 3284/12776 [34:22<41:53,  3.78it/s] 26%|██▌       | 3285/12776 [34:22<38:50,  4.07it/s]                                                     26%|██▌       | 3285/12776 [34:22<38:50,  4.07it/s] 26%|██▌       | 3286/12776 [34:23<40:55,  3.87it/s]                                                     26%|██▌       | 3286/12776 [34:23<40:55,  3.87it/s] 26%|██▌       | 3287/12776 [34:23<39:07,  4.04it/s]                                                     26%|██▌       | 3287/12776 [34:23<39:07,  4.04it/s] 26%|██▌       | 3288/12776 [34:23<37:36,  4.21it/s]                                                     26%|██▌       | 3288/12776 [34:23<37:36,  4.21it/s] 26%|██▌       | 3289/12776 [34:23<36:17,  4.36it/s]                                                     26%|██▌       | 3289/12776 [34:23<36:17,  4.36it/s] 26%|██▌       | 3290/12776 [34:23<35:36,  4.44it/s]                                                     26%|██▌       | 3290/12776 [34:23<35:36,  4.44it/s] 26%|██▌       | 3291/12776 [34:24<40:23,  3.91it/s]                                                     26%|██▌       | 3291/12776 [34:24<40:23,  3.91it/s] 26%|██▌       | 3292/12776 [34:24<37:58,  4.16it/s]                                                     26%|██▌       | 3292/12776 [34:24<37:58,  4.16it/s] 26%|██▌       | 3293/12776 [34:24<36:06,  4.38it/s]                                                     26%|██▌       | 3293/12776 [34:24<36:06,  4.38it/s] 26%|██▌       | 3294/12776 [34:24<34:49,  4.54it/s]                                                     26%|██▌       | 3294/12776 [34:24<34:49,  4.54it/s] 26%|██▌       | 3295/12776 [34:25<33:45,  4.68it/s]                                                     26%|██▌       | 3295/12776 [34:25<33:45,  4.68it/s] 26%|██▌       | 3296/12776 [34:25<35:21,  4.47it/s]                                                     26%|██▌       | 3296/12776 [34:25<35:21,  4.47it/s] 26%|██▌       | 3297/12776 [34:25<33:53,  4.66it/s]                                                     26%|██▌       | 3297/12776 [34:25<33:53,  4.66it/s] 26%|██▌       | 3298/12776 [34:25<32:39,  4.84it/s]                                                     26%|██▌       | 3298/12776 [34:25<32:39,  4.84it/s] 26%|██▌       | 3299/12776 [34:25<31:44,  4.98it/s]                                                     26%|██▌       | 3299/12776 [34:25<31:44,  4.98it/s] 26%|██▌       | 3300/12776 [34:26<57:07,  2.76it/s]                                                     26%|██▌       | 3300/12776 [34:26<57:07,  2.76it/s] 26%|██▌       | 3301/12776 [34:28<1:47:29,  1.47it/s]                                                       26%|██▌       | 3301/12776 [34:28<1:47:29,  1.47it/s] 26%|██▌       | 3302/12776 [34:29<2:00:25,  1.31it/s]                                                       26%|██▌       | 3302/12776 [34:29<2:00:25,  1.31it/s] 26%|██▌       | 3303/12776 [34:29<2:04:05,  1.27it/s]                                                       26%|██▌       | 3303/12776 [34:29<2:04:05,  1.27it/s] 26%|██▌       | 3304/12776 [34:30<2:03:19,  1.28it/s]                                                       26%|██▌       | 3304/12776 [34:30<2:03:19,  1.28it/s] 26%|██▌       | 3305/12776 [34:31<2:01:04,  1.30it/s]                                                       26%|██▌       | 3305/12776 [34:31<2:01:04,  1.30it/s] 26%|██▌       | 3306/12776 [34:32<1:59:57,  1.32it/s]                                                       26%|██▌       | 3306/12776 [34:32<1:59:57,  1.32it/s] 26%|██▌       | 3307/12776 [34:32<1:54:14,  1.38it/s]                                                       26%|██▌       | 3307/12776 [34:32<1:54:14,  1.38it/s] 26%|██▌       | 3308/12776 [34:33<1:56:21,  1.36it/s]                                                       26%|██▌       | 3308/12776 [34:33<1:56:21,  1.36it/s] 26%|██▌       | 3309/12776 [34:34<1:48:35,  1.45it/s]                                                       26%|██▌       | 3309/12776 [34:34<1:48:35,  1.45it/s] 26%|██▌       | 3310/12776 [34:34<1:46:48,  1.48it/s]                                                       26%|██▌       | 3310/12776 [34:34<1:46:48,  1.48it/s] 26%|██▌       | 3311/12776 [34:35<1:39:40,  1.58it/s]                                                      {'loss': 1.1158, 'grad_norm': 3.7933554649353027, 'learning_rate': 0.0002333822091886608, 'epoch': 0.51}
+{'loss': 0.9443, 'grad_norm': 3.8589084148406982, 'learning_rate': 0.00023335777126099705, 'epoch': 0.51}
+{'loss': 1.1503, 'grad_norm': 2.620635986328125, 'learning_rate': 0.0002333333333333333, 'epoch': 0.51}
+{'loss': 1.1161, 'grad_norm': 2.9023118019104004, 'learning_rate': 0.00023330889540566958, 'epoch': 0.51}
+{'loss': 1.3424, 'grad_norm': 2.676131248474121, 'learning_rate': 0.00023328445747800583, 'epoch': 0.51}
+{'loss': 1.2241, 'grad_norm': 5.856595516204834, 'learning_rate': 0.0002332600195503421, 'epoch': 0.51}
+{'loss': 1.5444, 'grad_norm': 2.3947947025299072, 'learning_rate': 0.0002332355816226784, 'epoch': 0.51}
+{'loss': 0.983, 'grad_norm': 2.4484972953796387, 'learning_rate': 0.00023321114369501464, 'epoch': 0.51}
+{'loss': 1.0836, 'grad_norm': 1.774258017539978, 'learning_rate': 0.00023318670576735092, 'epoch': 0.51}
+{'loss': 1.9356, 'grad_norm': 3.7993736267089844, 'learning_rate': 0.0002331622678396872, 'epoch': 0.51}
+{'loss': 1.4901, 'grad_norm': 2.240050792694092, 'learning_rate': 0.00023313782991202342, 'epoch': 0.51}
+{'loss': 1.0782, 'grad_norm': 2.437676429748535, 'learning_rate': 0.0002331133919843597, 'epoch': 0.51}
+{'loss': 1.5938, 'grad_norm': 2.2974166870117188, 'learning_rate': 0.00023308895405669598, 'epoch': 0.51}
+{'loss': 0.4601, 'grad_norm': 1.5529897212982178, 'learning_rate': 0.00023306451612903223, 'epoch': 0.51}
+{'loss': 0.6735, 'grad_norm': 1.3996665477752686, 'learning_rate': 0.0002330400782013685, 'epoch': 0.51}
+{'loss': 0.9503, 'grad_norm': 1.607757329940796, 'learning_rate': 0.0002330156402737048, 'epoch': 0.51}
+{'loss': 0.844, 'grad_norm': 1.984406590461731, 'learning_rate': 0.00023299120234604104, 'epoch': 0.51}
+{'loss': 1.3768, 'grad_norm': 2.105788230895996, 'learning_rate': 0.00023296676441837732, 'epoch': 0.51}
+{'loss': 0.3916, 'grad_norm': 0.5248891115188599, 'learning_rate': 0.0002329423264907136, 'epoch': 0.51}
+{'loss': 0.3574, 'grad_norm': 0.6104065179824829, 'learning_rate': 0.00023291788856304982, 'epoch': 0.51}
+{'loss': 0.2812, 'grad_norm': 0.3868466913700104, 'learning_rate': 0.0002328934506353861, 'epoch': 0.51}
+{'loss': 0.3395, 'grad_norm': 0.5593306422233582, 'learning_rate': 0.00023286901270772238, 'epoch': 0.51}
+{'loss': 0.2767, 'grad_norm': 0.5168365836143494, 'learning_rate': 0.00023284457478005863, 'epoch': 0.51}
+{'loss': 0.4546, 'grad_norm': 0.5223838686943054, 'learning_rate': 0.0002328201368523949, 'epoch': 0.51}
+{'loss': 0.4196, 'grad_norm': 0.9906821846961975, 'learning_rate': 0.00023279569892473118, 'epoch': 0.51}
+{'loss': 0.6491, 'grad_norm': 1.1748069524765015, 'learning_rate': 0.00023277126099706743, 'epoch': 0.51}
+{'loss': 0.3803, 'grad_norm': 0.8128048777580261, 'learning_rate': 0.00023274682306940369, 'epoch': 0.51}
+{'loss': 0.4582, 'grad_norm': 1.0079333782196045, 'learning_rate': 0.00023272238514173996, 'epoch': 0.51}
+{'loss': 0.5789, 'grad_norm': 1.1329032182693481, 'learning_rate': 0.00023269794721407622, 'epoch': 0.51}
+{'loss': 0.3144, 'grad_norm': 0.8500834703445435, 'learning_rate': 0.0002326735092864125, 'epoch': 0.51}
+{'loss': 0.4592, 'grad_norm': 0.8295961022377014, 'learning_rate': 0.00023264907135874877, 'epoch': 0.51}
+{'loss': 0.4923, 'grad_norm': 1.165689468383789, 'learning_rate': 0.00023262463343108502, 'epoch': 0.51}
+{'loss': 0.4771, 'grad_norm': 1.2372647523880005, 'learning_rate': 0.0002326001955034213, 'epoch': 0.51}
+{'loss': 0.4397, 'grad_norm': 1.0081822872161865, 'learning_rate': 0.00023257575757575758, 'epoch': 0.51}
+{'loss': 0.5526, 'grad_norm': 1.4410061836242676, 'learning_rate': 0.0002325513196480938, 'epoch': 0.51}
+{'loss': 0.4355, 'grad_norm': 1.1677322387695312, 'learning_rate': 0.00023252688172043008, 'epoch': 0.51}
+{'loss': 0.5189, 'grad_norm': 1.1747709512710571, 'learning_rate': 0.00023250244379276636, 'epoch': 0.51}
+{'loss': 0.7244, 'grad_norm': 1.338614821434021, 'learning_rate': 0.0002324780058651026, 'epoch': 0.51}
+{'loss': 0.727, 'grad_norm': 1.5333219766616821, 'learning_rate': 0.0002324535679374389, 'epoch': 0.51}
+{'loss': 0.737, 'grad_norm': 1.5182764530181885, 'learning_rate': 0.00023242913000977517, 'epoch': 0.51}
+{'loss': 0.672, 'grad_norm': 1.514541745185852, 'learning_rate': 0.00023240469208211142, 'epoch': 0.51}
+{'loss': 0.8974, 'grad_norm': 1.7871018648147583, 'learning_rate': 0.0002323802541544477, 'epoch': 0.51}
+{'loss': 0.7339, 'grad_norm': 3.3677568435668945, 'learning_rate': 0.00023235581622678398, 'epoch': 0.51}
+{'loss': 0.6502, 'grad_norm': 1.6806715726852417, 'learning_rate': 0.0002323313782991202, 'epoch': 0.51}
+{'loss': 0.7692, 'grad_norm': 3.0515427589416504, 'learning_rate': 0.00023230694037145648, 'epoch': 0.51}
+{'loss': 1.0262, 'grad_norm': 1.5666495561599731, 'learning_rate': 0.00023228250244379276, 'epoch': 0.51}
+{'loss': 0.5212, 'grad_norm': 2.228578805923462, 'learning_rate': 0.000232258064516129, 'epoch': 0.51}
+{'loss': 0.5467, 'grad_norm': 1.372183084487915, 'learning_rate': 0.0002322336265884653, 'epoch': 0.51}
+{'loss': 1.0146, 'grad_norm': 3.1377692222595215, 'learning_rate': 0.00023220918866080157, 'epoch': 0.51}
+{'loss': 0.6966, 'grad_norm': 2.914731979370117, 'learning_rate': 0.0002321847507331378, 'epoch': 0.51}
+{'loss': 0.9771, 'grad_norm': 1.5598711967468262, 'learning_rate': 0.00023216031280547407, 'epoch': 0.51}
+{'loss': 1.0935, 'grad_norm': 2.859375238418579, 'learning_rate': 0.00023213587487781035, 'epoch': 0.51}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.00023213587487781035, 'epoch': 0.51}
+{'loss': 1.3124, 'grad_norm': 3.383396625518799, 'learning_rate': 0.0002321114369501466, 'epoch': 0.51}
+{'loss': 0.9329, 'grad_norm': 4.065854549407959, 'learning_rate': 0.00023208699902248288, 'epoch': 0.51}
+{'loss': 1.4921, 'grad_norm': 2.2295284271240234, 'learning_rate': 0.00023206256109481915, 'epoch': 0.51}
+{'loss': 1.0573, 'grad_norm': 1.8836601972579956, 'learning_rate': 0.0002320381231671554, 'epoch': 0.51}
+{'loss': 1.5235, 'grad_norm': 3.414989709854126, 'learning_rate': 0.00023201368523949168, 'epoch': 0.52}
+{'loss': 1.2622, 'grad_norm': 2.405909776687622, 'learning_rate': 0.00023198924731182796, 'epoch': 0.52}
+{'loss': 1.4015, 'grad_norm': 2.359362840652466, 'learning_rate': 0.0002319648093841642, 'epoch': 0.52}
+{'loss': 1.2348, 'grad_norm': 1.3080377578735352, 'learning_rate': 0.00023194037145650047, 'epoch': 0.52}
+{'loss': 1.8937, 'grad_norm': 3.4239389896392822, 'learning_rate': 0.00023191593352883674, 'epoch': 0.52}
+{'loss': 1.7055, 'grad_norm': 2.4649817943573, 'learning_rate': 0.000231891495601173, 'epoch': 0.52}
+{'loss': 0.9789, 'grad_norm': 2.2212533950805664, 'learning_rate': 0.00023186705767350927, 'epoch': 0.52}
+{'loss': 1.141, 'grad_norm': 1.765533685684204, 'learning_rate': 0.00023184261974584555, 'epoch': 0.52}
+{'loss': 0.4344, 'grad_norm': 2.404294013977051, 'learning_rate': 0.0002318181818181818, 'epoch': 0.52}
+{'loss': 0.9269, 'grad_norm': 2.090888500213623, 'learning_rate': 0.00023179374389051808, 'epoch': 0.52}
+{'loss': 1.101, 'grad_norm': 1.3735568523406982, 'learning_rate': 0.00023176930596285433, 'epoch': 0.52}
+{'loss': 0.3792, 'grad_norm': 0.6953723430633545, 'learning_rate': 0.00023174486803519058, 'epoch': 0.52}
+{'loss': 0.4269, 'grad_norm': 1.5250662565231323, 'learning_rate': 0.00023172043010752686, 'epoch': 0.52}
+{'loss': 0.3612, 'grad_norm': 0.602741539478302, 'learning_rate': 0.00023169599217986314, 'epoch': 0.52}
+{'loss': 0.5513, 'grad_norm': 2.0099756717681885, 'learning_rate': 0.0002316715542521994, 'epoch': 0.52}
+{'loss': 0.3664, 'grad_norm': 0.6258875131607056, 'learning_rate': 0.00023164711632453567, 'epoch': 0.52}
+{'loss': 0.5725, 'grad_norm': 0.8901511430740356, 'learning_rate': 0.00023162267839687195, 'epoch': 0.52}
+{'loss': 0.6214, 'grad_norm': 0.6851283311843872, 'learning_rate': 0.00023159824046920817, 'epoch': 0.52}
+{'loss': 0.4796, 'grad_norm': 0.9573870897293091, 'learning_rate': 0.00023157380254154445, 'epoch': 0.52}
+{'loss': 0.3986, 'grad_norm': 1.0000059604644775, 'learning_rate': 0.00023154936461388073, 'epoch': 0.52}
+{'loss': 0.8138, 'grad_norm': 2.725654125213623, 'learning_rate': 0.00023152492668621698, 'epoch': 0.52}
+ 26%|██▌       | 3311/12776 [34:35<1:39:40,  1.58it/s] 26%|██▌       | 3312/12776 [34:35<1:38:11,  1.61it/s]                                                       26%|██▌       | 3312/12776 [34:35<1:38:11,  1.61it/s] 26%|██▌       | 3313/12776 [34:36<1:31:57,  1.72it/s]                                                       26%|██▌       | 3313/12776 [34:36<1:31:57,  1.72it/s] 26%|██▌       | 3314/12776 [34:36<1:29:19,  1.77it/s]                                                       26%|██▌       | 3314/12776 [34:36<1:29:19,  1.77it/s] 26%|██▌       | 3315/12776 [34:37<1:23:34,  1.89it/s]                                                       26%|██▌       | 3315/12776 [34:37<1:23:34,  1.89it/s] 26%|██▌       | 3316/12776 [34:37<1:22:48,  1.90it/s]                                                       26%|██▌       | 3316/12776 [34:37<1:22:48,  1.90it/s] 26%|██▌       | 3317/12776 [34:38<1:17:32,  2.03it/s]                                                       26%|██▌       | 3317/12776 [34:38<1:17:32,  2.03it/s] 26%|██▌       | 3318/12776 [34:38<1:13:19,  2.15it/s]                                                       26%|██▌       | 3318/12776 [34:38<1:13:19,  2.15it/s] 26%|██▌       | 3319/12776 [34:39<1:14:20,  2.12it/s]                                                       26%|██▌       | 3319/12776 [34:39<1:14:20,  2.12it/s] 26%|██▌       | 3320/12776 [34:39<1:09:47,  2.26it/s]                                                       26%|██▌       | 3320/12776 [34:39<1:09:47,  2.26it/s] 26%|██▌       | 3321/12776 [34:39<1:05:28,  2.41it/s]                                                       26%|██▌       | 3321/12776 [34:39<1:05:28,  2.41it/s] 26%|██▌       | 3322/12776 [34:40<1:04:37,  2.44it/s]                                                       26%|██▌       | 3322/12776 [34:40<1:04:37,  2.44it/s] 26%|██▌       | 3323/12776 [34:40<1:01:01,  2.58it/s]                                                       26%|██▌       | 3323/12776 [34:40<1:01:01,  2.58it/s] 26%|██▌       | 3324/12776 [34:40<58:06,  2.71it/s]                                                       26%|██▌       | 3324/12776 [34:40<58:06,  2.71it/s] 26%|██▌       | 3325/12776 [34:41<56:05,  2.81it/s]                                                     26%|██▌       | 3325/12776 [34:41<56:05,  2.81it/s] 26%|██▌       | 3326/12776 [34:41<53:28,  2.95it/s]                                                     26%|██▌       | 3326/12776 [34:41<53:28,  2.95it/s] 26%|██▌       | 3327/12776 [34:41<51:23,  3.06it/s]                                                     26%|██▌       | 3327/12776 [34:41<51:23,  3.06it/s] 26%|██▌       | 3328/12776 [34:42<49:36,  3.17it/s]                                                     26%|██▌       | 3328/12776 [34:42<49:36,  3.17it/s] 26%|██▌       | 3329/12776 [34:42<53:30,  2.94it/s]                                                     26%|██▌       | 3329/12776 [34:42<53:30,  2.94it/s] 26%|██▌       | 3330/12776 [34:42<50:12,  3.14it/s]                                                     26%|██▌       | 3330/12776 [34:42<50:12,  3.14it/s] 26%|██▌       | 3331/12776 [34:43<47:51,  3.29it/s]                                                     26%|██▌       | 3331/12776 [34:43<47:51,  3.29it/s] 26%|██▌       | 3332/12776 [34:43<45:26,  3.46it/s]                                                     26%|██▌       | 3332/12776 [34:43<45:26,  3.46it/s] 26%|██▌       | 3333/12776 [34:43<46:31,  3.38it/s]                                                     26%|██▌       | 3333/12776 [34:43<46:31,  3.38it/s] 26%|██▌       | 3334/12776 [34:43<44:13,  3.56it/s]                                                     26%|██▌       | 3334/12776 [34:43<44:13,  3.56it/s] 26%|██▌       | 3335/12776 [34:44<42:59,  3.66it/s]                                                     26%|██▌       | 3335/12776 [34:44<42:59,  3.66it/s] 26%|██▌       | 3336/12776 [34:44<41:05,  3.83it/s]                                                     26%|██▌       | 3336/12776 [34:44<41:05,  3.83it/s] 26%|██▌       | 3337/12776 [34:44<41:11,  3.82it/s]                                                     26%|██▌       | 3337/12776 [34:44<41:11,  3.82it/s] 26%|██▌       | 3338/12776 [34:44<39:09,  4.02it/s]                                                     26%|██▌       | 3338/12776 [34:44<39:09,  4.02it/s] 26%|██▌       | 3339/12776 [34:45<37:23,  4.21it/s]                                                     26%|██▌       | 3339/12776 [34:45<37:23,  4.21it/s] 26%|██▌       | 3340/12776 [34:45<36:11,  4.34it/s]                                                     26%|██▌       | 3340/12776 [34:45<36:11,  4.34it/s] 26%|██▌       | 3341/12776 [34:45<35:08,  4.47it/s]                                                     26%|██▌       | 3341/12776 [34:45<35:08,  4.47it/s] 26%|██▌       | 3342/12776 [34:45<37:28,  4.20it/s]                                                     26%|██▌       | 3342/12776 [34:45<37:28,  4.20it/s] 26%|██▌       | 3343/12776 [34:45<35:43,  4.40it/s]                                                     26%|██▌       | 3343/12776 [34:45<35:43,  4.40it/s] 26%|██▌       | 3344/12776 [34:46<34:27,  4.56it/s]                                                     26%|██▌       | 3344/12776 [34:46<34:27,  4.56it/s] 26%|██▌       | 3345/12776 [34:46<33:28,  4.70it/s]                                                     26%|██▌       | 3345/12776 [34:46<33:28,  4.70it/s] 26%|██▌       | 3346/12776 [34:46<32:36,  4.82it/s]                                                     26%|██▌       | 3346/12776 [34:46<32:36,  4.82it/s] 26%|██▌       | 3347/12776 [34:46<31:51,  4.93it/s]                                                     26%|██▌       | 3347/12776 [34:46<31:51,  4.93it/s] 26%|██▌       | 3348/12776 [34:47<35:15,  4.46it/s]                                                     26%|██▌       | 3348/12776 [34:47<35:15,  4.46it/s] 26%|██▌       | 3349/12776 [34:47<33:26,  4.70it/s]                                                     26%|██▌       | 3349/12776 [34:47<33:26,  4.70it/s] 26%|██▌       | 3350/12776 [34:48<1:02:37,  2.51it/s]                                                       26%|██▌       | 3350/12776 [34:48<1:02:37,  2.51it/s] 26%|██▌       | 3351/12776 [34:49<1:56:34,  1.35it/s]                                                       26%|██▌       | 3351/12776 [34:49<1:56:34,  1.35it/s] 26%|██▌       | 3352/12776 [34:50<2:17:28,  1.14it/s]                                                       26%|██▌       | 3352/12776 [34:50<2:17:28,  1.14it/s] 26%|██▌       | 3353/12776 [34:51<2:19:50,  1.12it/s]                                                       26%|██▌       | 3353/12776 [34:51<2:19:50,  1.12it/s] 26%|██▋       | 3354/12776 [34:52<2:23:36,  1.09it/s]                                                       26%|██▋       | 3354/12776 [34:52<2:23:36,  1.09it/s] 26%|██▋       | 3355/12776 [34:53<2:15:34,  1.16it/s]                                                       26%|██▋       | 3355/12776 [34:53<2:15:34,  1.16it/s] 26%|██▋       | 3356/12776 [34:54<2:12:10,  1.19it/s]                                                       26%|██▋       | 3356/12776 [34:54<2:12:10,  1.19it/s] 26%|██▋       | 3357/12776 [34:54<2:08:01,  1.23it/s]                                                       26%|██▋       | 3357/12776 [34:54<2:08:01,  1.23it/s] 26%|██▋       | 3358/12776 [34:55<1:59:38,  1.31it/s]                                                       26%|██▋       | 3358/12776 [34:55<1:59:38,  1.31it/s] 26%|██▋       | 3359/12776 [34:56<1:59:48,  1.31it/s]                                                       26%|██▋       | 3359/12776 [34:56<1:59:48,  1.31it/s] 26%|██▋       | 3360/12776 [34:56<1:51:17,  1.41it/s]                                                       26%|██▋       | 3360/12776 [34:56<1:51:17,  1.41it/s] 26%|██▋       | 3361/12776 [34:57<1:47:20,  1.46it/s]                                                       26%|██▋       | 3361/12776 [34:57<1:47:20,  1.46it/s] 26%|██▋       | 3362/12776 [34:58<1:38:57,  1.59it/s]                                                       26%|██▋       | 3362/12776 [34:58<1:38:57,  1.59it/s] 26%|██▋       | 3363/12776 [34:58<1:34:35,  1.66it/s]                                                       26%|██▋       | 3363/12776 [34:58<1:34:35,  1.66it/s] 26%|██▋       | 3364/12776 [34:59<1:27:32,  1.79it/s]                                                       26%|██▋       | 3364/12776 [34:59<1:27:32,  1.79it/s] 26%|██▋       | 3365/12776 [34:59<1:24:17,  1.86it/s]                                                       26%|██▋       | 3365/12776 [34:59<1:24:17,  1.86it/s] 26%|██▋       | 3366/12776 [35:00<1:18:53,  1.99it/s]                                                       26%|██▋       | 3366/12776 [35:00<1:18:53,  1.99it/s] 26%|██▋       | 3367/12776 [35:00<1:14:20,  2.11it/s]                                                       26%|██▋       | 3367/12776 [35:00<1:14:20,  2.11it/s] 26%|██▋       | 3368/12776 [35:00<1:16:15,  2.06it/s]                                                       26%|██▋       | 3368/12776 [35:00<1:16:15,  2.06it/s] 26%|██▋       | 3369/12776 [35:01<1:11:31,  2.19it/s]                                                       26%|██▋       | 3369/12776 [35:01<1:11:31,  2.19it/s] 26%|██▋       | 3370/12776 [35:01<1:07:33,  2.32it/s]                                                       26%|██▋       | 3370/12776 [35:01<1:07:33,  2.32it/s] 26%|██▋       | 3371/12776 [35:02<1:06:52,  2.34it/s]                                                       26%|██▋       | 3371/12776 [35:02<1:06:52,  2.34it/s] 26%|██▋       | 3372/12776 [35:02<1:03:07,  2.48it/s]                                                       26%|██▋       | 3372/12776 [35:02<1:03:07,  2.48it/s] 26%|██▋       | 3373/12776 [35:02<1:00:05,  2.61it/s]                                                       26%|██▋       | 3373/12776 [35:02<1:00:05,  2.61it/s] 26%|██▋       | 3374/12776 [35:03<1:00:59,  2.57it/s]                                                       26%|██▋       | 3374/12776 [35:03<1:00:59,  2.57it/s] 26%|██▋       | 3375/12776 [35:03<57:50,  2.71it/s]                                                       26%|██▋       | 3375/12776 [35:03<57:50,  2.71it/s] 26%|██▋       | 3376/12776 [35:03<54:45,  2.86it/s]                                                     26%|██▋       | 3376/12776 [35:03<54:45,  2.86it/s] 26%|██▋       | 3377/12776 [35:04<54:32,  2.87it/s]                                                     26%|██▋       | 3377/12776 [35:04<54:32,  2.87it/s] 26%|██▋       | 3378/12776 [35:04<51:48,  3.02it/s]                                                     26%|██▋       | 3378/12776 [35:04<51:48,  3.02it/s] 26%|██▋       | 3379/12776 [35:04<49:24,  3.17it/s]                                                     26%|██▋       | 3379/12776 [35:04<49:24,  3.17it/s] 26%|██▋       | 3380/12776 [35:04<47:20,  3.31it/s]                                                     26%|██▋       | 3380/12776 [35:04<47:20,  3.31it/s] 26%|██▋       | 3381/12776 [35:05<48:06,  3.26it/s]                                                     26%|██▋       | 3381/12776 [35:05<48:06,  3.26it/s] 26%|██▋       | 3382/12776 [35:05<45:42,  3.43it/s]                                                     26%|██▋       | 3382/12776 [35:05<45:42,  3.43it/s] 26%|██▋       | 3383/12776 [35:05<43:46,  3.58it/s]                                                     26%|██▋       | 3383/12776 [35:05<43:46,  3.58it/s] 26%|██▋       | 3384/12776 [35:06<42:02,  3.72it/s]                                                     26%|██▋       | 3384/12776 [35:06<42:02,  3.72it/s] 26%|██▋       | 3385/12776 [35:06<40:38,  3.85it/s]                                                     26%|██▋       | 3385/12776 [35:06<40:38,  3.85it/s] 27%|██▋       | 3386/12776 [35:06<40:49,  3.83it/s]                                                     27%|██▋       | 3386/12776 [35:06<40:49,  3.83it/s] 27%|██▋       | 3387/12776 [35:06<39:11,  3.99it/s]                                                     27%|██▋       | 3387/12776 [35:06<39:11,  3.99it/s] 27%|██▋       | 3388/12776 [35:07<37:23,  4.18it/s]                                                    {'loss': 0.4892, 'grad_norm': 0.7232942581176758, 'learning_rate': 0.00023150048875855326, 'epoch': 0.52}
+{'loss': 0.359, 'grad_norm': 0.7372367978096008, 'learning_rate': 0.00023147605083088954, 'epoch': 0.52}
+{'loss': 0.4568, 'grad_norm': 0.6439573168754578, 'learning_rate': 0.0002314516129032258, 'epoch': 0.52}
+{'loss': 0.4037, 'grad_norm': 0.9462945461273193, 'learning_rate': 0.00023142717497556207, 'epoch': 0.52}
+{'loss': 0.5252, 'grad_norm': 1.0442487001419067, 'learning_rate': 0.00023140273704789835, 'epoch': 0.52}
+{'loss': 0.6814, 'grad_norm': 1.7141872644424438, 'learning_rate': 0.00023137829912023457, 'epoch': 0.52}
+{'loss': 0.7735, 'grad_norm': 1.502058982849121, 'learning_rate': 0.00023135386119257085, 'epoch': 0.52}
+{'loss': 0.4362, 'grad_norm': 1.0354453325271606, 'learning_rate': 0.00023132942326490713, 'epoch': 0.52}
+{'loss': 0.5186, 'grad_norm': 0.9755420088768005, 'learning_rate': 0.00023130498533724338, 'epoch': 0.52}
+{'loss': 0.5573, 'grad_norm': 2.601315975189209, 'learning_rate': 0.00023128054740957966, 'epoch': 0.52}
+{'loss': 0.5652, 'grad_norm': 0.9432253241539001, 'learning_rate': 0.00023125610948191593, 'epoch': 0.52}
+{'loss': 0.63, 'grad_norm': 1.0160396099090576, 'learning_rate': 0.00023123167155425219, 'epoch': 0.52}
+{'loss': 0.6753, 'grad_norm': 1.788796067237854, 'learning_rate': 0.00023120723362658846, 'epoch': 0.52}
+{'loss': 0.5324, 'grad_norm': 2.158668041229248, 'learning_rate': 0.0002311827956989247, 'epoch': 0.52}
+{'loss': 0.8801, 'grad_norm': 2.5072875022888184, 'learning_rate': 0.00023115835777126097, 'epoch': 0.52}
+{'loss': 0.6469, 'grad_norm': 2.151085138320923, 'learning_rate': 0.00023113391984359724, 'epoch': 0.52}
+{'loss': 0.5898, 'grad_norm': 1.2307536602020264, 'learning_rate': 0.0002311094819159335, 'epoch': 0.52}
+{'loss': 0.498, 'grad_norm': 1.785718321800232, 'learning_rate': 0.00023108504398826977, 'epoch': 0.52}
+{'loss': 0.9652, 'grad_norm': 2.295013666152954, 'learning_rate': 0.00023106060606060605, 'epoch': 0.52}
+{'loss': 0.7332, 'grad_norm': 2.417727470397949, 'learning_rate': 0.00023103616813294228, 'epoch': 0.52}
+{'loss': 0.6885, 'grad_norm': 2.488974094390869, 'learning_rate': 0.00023101173020527855, 'epoch': 0.52}
+{'loss': 0.5777, 'grad_norm': 1.3490437269210815, 'learning_rate': 0.00023098729227761483, 'epoch': 0.52}
+{'loss': 0.9452, 'grad_norm': 2.2661380767822266, 'learning_rate': 0.00023096285434995108, 'epoch': 0.52}
+{'loss': 0.7481, 'grad_norm': 3.1678786277770996, 'learning_rate': 0.00023093841642228736, 'epoch': 0.52}
+{'loss': 1.22, 'grad_norm': 3.216134548187256, 'learning_rate': 0.00023091397849462364, 'epoch': 0.52}
+{'loss': 0.9253, 'grad_norm': 1.301865816116333, 'learning_rate': 0.0002308895405669599, 'epoch': 0.52}
+{'loss': 0.5437, 'grad_norm': 2.037886142730713, 'learning_rate': 0.00023086510263929617, 'epoch': 0.52}
+{'loss': 1.2142, 'grad_norm': 2.8060178756713867, 'learning_rate': 0.00023084066471163245, 'epoch': 0.52}
+{'loss': 1.3183, 'grad_norm': 2.1127607822418213, 'learning_rate': 0.00023081622678396867, 'epoch': 0.52}
+{'loss': 1.2346, 'grad_norm': 2.004124879837036, 'learning_rate': 0.00023079178885630495, 'epoch': 0.52}
+{'loss': 1.6289, 'grad_norm': 3.113992691040039, 'learning_rate': 0.00023076735092864123, 'epoch': 0.52}
+{'loss': 1.2212, 'grad_norm': 1.6076124906539917, 'learning_rate': 0.00023074291300097748, 'epoch': 0.52}
+{'loss': 1.0095, 'grad_norm': 1.8852514028549194, 'learning_rate': 0.00023071847507331376, 'epoch': 0.52}
+{'loss': 1.042, 'grad_norm': 4.059223651885986, 'learning_rate': 0.00023069403714565004, 'epoch': 0.52}
+{'loss': 1.6616, 'grad_norm': 2.242157459259033, 'learning_rate': 0.0002306695992179863, 'epoch': 0.52}
+{'loss': 1.2571, 'grad_norm': 2.6829006671905518, 'learning_rate': 0.00023064516129032257, 'epoch': 0.52}
+{'loss': 0.7588, 'grad_norm': 1.4247349500656128, 'learning_rate': 0.00023062072336265885, 'epoch': 0.52}
+{'loss': 0.9346, 'grad_norm': 4.124013423919678, 'learning_rate': 0.00023059628543499507, 'epoch': 0.52}
+{'loss': 0.7006, 'grad_norm': 1.38236665725708, 'learning_rate': 0.00023057184750733135, 'epoch': 0.52}
+{'loss': 0.7983, 'grad_norm': 1.1349596977233887, 'learning_rate': 0.00023054740957966763, 'epoch': 0.52}
+{'loss': 0.4338, 'grad_norm': 0.6386610269546509, 'learning_rate': 0.00023052297165200388, 'epoch': 0.52}
+{'loss': 0.2964, 'grad_norm': 0.4819841980934143, 'learning_rate': 0.00023049853372434016, 'epoch': 0.52}
+{'loss': 0.4256, 'grad_norm': 0.8476313352584839, 'learning_rate': 0.00023047409579667643, 'epoch': 0.52}
+{'loss': 0.3657, 'grad_norm': 0.8270666003227234, 'learning_rate': 0.00023044965786901266, 'epoch': 0.53}
+{'loss': 0.4129, 'grad_norm': 0.8374744653701782, 'learning_rate': 0.00023042521994134894, 'epoch': 0.53}
+{'loss': 0.3006, 'grad_norm': 0.6448666453361511, 'learning_rate': 0.00023040078201368522, 'epoch': 0.53}
+{'loss': 0.5692, 'grad_norm': 1.3359466791152954, 'learning_rate': 0.00023037634408602147, 'epoch': 0.53}
+{'loss': 0.3944, 'grad_norm': 0.5898628830909729, 'learning_rate': 0.00023035190615835775, 'epoch': 0.53}
+{'loss': 0.5376, 'grad_norm': 1.0883667469024658, 'learning_rate': 0.00023032746823069402, 'epoch': 0.53}
+{'loss': 0.4935, 'grad_norm': 0.5729881525039673, 'learning_rate': 0.00023030303030303027, 'epoch': 0.53}
+{'loss': 0.4492, 'grad_norm': 1.0635184049606323, 'learning_rate': 0.00023027859237536655, 'epoch': 0.53}
+{'loss': 3.5422, 'grad_norm': 15.57529067993164, 'learning_rate': 0.00023025415444770283, 'epoch': 0.53}
+{'loss': 0.3876, 'grad_norm': 0.9564080238342285, 'learning_rate': 0.00023022971652003906, 'epoch': 0.53}
+{'loss': 0.493, 'grad_norm': 1.117563247680664, 'learning_rate': 0.00023020527859237533, 'epoch': 0.53}
+{'loss': 0.7056, 'grad_norm': 1.619956135749817, 'learning_rate': 0.0002301808406647116, 'epoch': 0.53}
+{'loss': 0.5172, 'grad_norm': 1.5382646322250366, 'learning_rate': 0.00023015640273704786, 'epoch': 0.53}
+{'loss': 0.8012, 'grad_norm': 2.17067813873291, 'learning_rate': 0.00023013196480938414, 'epoch': 0.53}
+{'loss': 0.6448, 'grad_norm': 1.4361052513122559, 'learning_rate': 0.00023010752688172042, 'epoch': 0.53}
+{'loss': 0.5594, 'grad_norm': 1.2153189182281494, 'learning_rate': 0.00023008308895405667, 'epoch': 0.53}
+{'loss': 0.6764, 'grad_norm': 1.1564387083053589, 'learning_rate': 0.00023005865102639295, 'epoch': 0.53}
+{'loss': 0.3817, 'grad_norm': 1.91134512424469, 'learning_rate': 0.0002300342130987292, 'epoch': 0.53}
+{'loss': 0.8981, 'grad_norm': 1.3591880798339844, 'learning_rate': 0.00023000977517106545, 'epoch': 0.53}
+{'loss': 0.777, 'grad_norm': 0.9462252259254456, 'learning_rate': 0.00022998533724340173, 'epoch': 0.53}
+{'loss': 0.5385, 'grad_norm': 1.3694474697113037, 'learning_rate': 0.000229960899315738, 'epoch': 0.53}
+{'loss': 0.7427, 'grad_norm': 1.7175570726394653, 'learning_rate': 0.00022993646138807426, 'epoch': 0.53}
+{'loss': 0.8763, 'grad_norm': 1.3409452438354492, 'learning_rate': 0.00022991202346041054, 'epoch': 0.53}
+{'loss': 0.8583, 'grad_norm': 2.1568386554718018, 'learning_rate': 0.00022988758553274682, 'epoch': 0.53}
+{'loss': 0.7728, 'grad_norm': 1.5670572519302368, 'learning_rate': 0.00022986314760508304, 'epoch': 0.53}
+{'loss': 1.4348, 'grad_norm': 3.02644419670105, 'learning_rate': 0.00022983870967741932, 'epoch': 0.53}
+{'loss': 0.7838, 'grad_norm': 1.4505369663238525, 'learning_rate': 0.0002298142717497556, 'epoch': 0.53}
+{'loss': 0.6987, 'grad_norm': 1.8097779750823975, 'learning_rate': 0.00022978983382209185, 'epoch': 0.53}
+{'loss': 0.9155, 'grad_norm': 2.318065643310547, 'learning_rate': 0.00022976539589442813, 'epoch': 0.53}
+{'loss': 0.4227, 'grad_norm': 1.7021375894546509, 'learning_rate': 0.0002297409579667644, 'epoch': 0.53}
+{'loss': 0.9867, 'grad_norm': 1.7113087177276611, 'learning_rate': 0.00022971652003910066, 'epoch': 0.53}
+{'loss': 0.745, 'grad_norm': 1.9517741203308105, 'learning_rate': 0.00022969208211143694, 'epoch': 0.53}
+{'loss': 1.193, 'grad_norm': 2.208434820175171, 'learning_rate': 0.00022966764418377321, 'epoch': 0.53}
+{'loss': 1.2062, 'grad_norm': 1.7686792612075806, 'learning_rate': 0.00022964320625610944, 'epoch': 0.53}
+ 27%|██▋       | 3388/12776 [35:07<37:23,  4.18it/s] 27%|██▋       | 3389/12776 [35:07<36:02,  4.34it/s]                                                     27%|██▋       | 3389/12776 [35:07<36:02,  4.34it/s] 27%|██▋       | 3390/12776 [35:07<35:14,  4.44it/s]                                                     27%|██▋       | 3390/12776 [35:07<35:14,  4.44it/s] 27%|██▋       | 3391/12776 [35:07<36:23,  4.30it/s]                                                     27%|██▋       | 3391/12776 [35:07<36:23,  4.30it/s] 27%|██▋       | 3392/12776 [35:07<35:02,  4.46it/s]                                                     27%|██▋       | 3392/12776 [35:07<35:02,  4.46it/s] 27%|██▋       | 3393/12776 [35:08<34:01,  4.60it/s]                                                     27%|██▋       | 3393/12776 [35:08<34:01,  4.60it/s] 27%|██▋       | 3394/12776 [35:08<33:12,  4.71it/s]                                                     27%|██▋       | 3394/12776 [35:08<33:12,  4.71it/s] 27%|██▋       | 3395/12776 [35:08<32:28,  4.82it/s]                                                     27%|██▋       | 3395/12776 [35:08<32:28,  4.82it/s] 27%|██▋       | 3396/12776 [35:08<38:10,  4.10it/s]                                                     27%|██▋       | 3396/12776 [35:08<38:10,  4.10it/s] 27%|██▋       | 3397/12776 [35:09<35:44,  4.37it/s]                                                     27%|██▋       | 3397/12776 [35:09<35:44,  4.37it/s] 27%|██▋       | 3398/12776 [35:09<33:42,  4.64it/s]                                                     27%|██▋       | 3398/12776 [35:09<33:42,  4.64it/s] 27%|██▋       | 3399/12776 [35:09<32:14,  4.85it/s]                                                     27%|██▋       | 3399/12776 [35:09<32:14,  4.85it/s] 27%|██▋       | 3400/12776 [35:10<55:38,  2.81it/s]                                                     27%|██▋       | 3400/12776 [35:10<55:38,  2.81it/s] 27%|██▋       | 3401/12776 [35:11<1:54:51,  1.36it/s]                                                       27%|██▋       | 3401/12776 [35:11<1:54:51,  1.36it/s] 27%|██▋       | 3402/12776 [35:12<2:06:54,  1.23it/s]                                                       27%|██▋       | 3402/12776 [35:12<2:06:54,  1.23it/s] 27%|██▋       | 3403/12776 [35:13<2:12:59,  1.17it/s]                                                       27%|██▋       | 3403/12776 [35:13<2:12:59,  1.17it/s] 27%|██▋       | 3404/12776 [35:14<2:12:31,  1.18it/s]                                                       27%|██▋       | 3404/12776 [35:14<2:12:31,  1.18it/s] 27%|██▋       | 3405/12776 [35:15<2:09:01,  1.21it/s]                                                       27%|██▋       | 3405/12776 [35:15<2:09:01,  1.21it/s] 27%|██▋       | 3406/12776 [35:16<2:07:51,  1.22it/s]                                                       27%|██▋       | 3406/12776 [35:16<2:07:51,  1.22it/s] 27%|██▋       | 3407/12776 [35:16<2:04:22,  1.26it/s]                                                       27%|██▋       | 3407/12776 [35:16<2:04:22,  1.26it/s] 27%|██▋       | 3408/12776 [35:17<1:56:45,  1.34it/s]                                                       27%|██▋       | 3408/12776 [35:17<1:56:45,  1.34it/s] 27%|██▋       | 3409/12776 [35:18<1:50:20,  1.41it/s]                                                       27%|██▋       | 3409/12776 [35:18<1:50:20,  1.41it/s] 27%|██▋       | 3410/12776 [35:18<1:43:33,  1.51it/s]                                                       27%|██▋       | 3410/12776 [35:18<1:43:33,  1.51it/s] 27%|██▋       | 3411/12776 [35:19<1:40:05,  1.56it/s]                                                       27%|██▋       | 3411/12776 [35:19<1:40:05,  1.56it/s] 27%|██▋       | 3412/12776 [35:19<1:34:37,  1.65it/s]                                                       27%|██▋       | 3412/12776 [35:19<1:34:37,  1.65it/s] 27%|██▋       | 3413/12776 [35:20<1:34:41,  1.65it/s]                                                       27%|██▋       | 3413/12776 [35:20<1:34:41,  1.65it/s] 27%|██▋       | 3414/12776 [35:20<1:28:09,  1.77it/s]                                                       27%|██▋       | 3414/12776 [35:20<1:28:09,  1.77it/s] 27%|██▋       | 3415/12776 [35:21<1:22:24,  1.89it/s]                                                       27%|██▋       | 3415/12776 [35:21<1:22:24,  1.89it/s] 27%|██▋       | 3416/12776 [35:21<1:21:38,  1.91it/s]                                                       27%|██▋       | 3416/12776 [35:21<1:21:38,  1.91it/s] 27%|██▋       | 3417/12776 [35:22<1:15:41,  2.06it/s]                                                       27%|██▋       | 3417/12776 [35:22<1:15:41,  2.06it/s] 27%|██▋       | 3418/12776 [35:22<1:16:33,  2.04it/s]                                                       27%|██▋       | 3418/12776 [35:22<1:16:33,  2.04it/s] 27%|██▋       | 3419/12776 [35:23<1:10:21,  2.22it/s]                                                       27%|██▋       | 3419/12776 [35:23<1:10:21,  2.22it/s] 27%|██▋       | 3420/12776 [35:23<1:05:43,  2.37it/s]                                                       27%|██▋       | 3420/12776 [35:23<1:05:43,  2.37it/s] 27%|██▋       | 3421/12776 [35:23<1:04:48,  2.41it/s]                                                       27%|██▋       | 3421/12776 [35:23<1:04:48,  2.41it/s] 27%|██▋       | 3422/12776 [35:24<1:00:58,  2.56it/s]                                                       27%|██▋       | 3422/12776 [35:24<1:00:58,  2.56it/s] 27%|██▋       | 3423/12776 [35:24<57:53,  2.69it/s]                                                       27%|██▋       | 3423/12776 [35:24<57:53,  2.69it/s] 27%|██▋       | 3424/12776 [35:24<54:52,  2.84it/s]                                                     27%|██▋       | 3424/12776 [35:24<54:52,  2.84it/s] 27%|██▋       | 3425/12776 [35:25<52:29,  2.97it/s]                                                     27%|██▋       | 3425/12776 [35:25<52:29,  2.97it/s] 27%|██▋       | 3426/12776 [35:25<50:53,  3.06it/s]                                                     27%|██▋       | 3426/12776 [35:25<50:53,  3.06it/s] 27%|██▋       | 3427/12776 [35:25<48:52,  3.19it/s]                                                     27%|██▋       | 3427/12776 [35:25<48:52,  3.19it/s] 27%|██▋       | 3428/12776 [35:26<53:31,  2.91it/s]                                                     27%|██▋       | 3428/12776 [35:26<53:31,  2.91it/s] 27%|██▋       | 3429/12776 [35:26<49:52,  3.12it/s]                                                     27%|██▋       | 3429/12776 [35:26<49:52,  3.12it/s] 27%|██▋       | 3430/12776 [35:26<46:43,  3.33it/s]                                                     27%|██▋       | 3430/12776 [35:26<46:43,  3.33it/s] 27%|██▋       | 3431/12776 [35:26<44:19,  3.51it/s]                                                     27%|██▋       | 3431/12776 [35:26<44:19,  3.51it/s] 27%|██▋       | 3432/12776 [35:27<49:11,  3.17it/s]                                                     27%|██▋       | 3432/12776 [35:27<49:11,  3.17it/s] 27%|██▋       | 3433/12776 [35:27<45:31,  3.42it/s]                                                     27%|██▋       | 3433/12776 [35:27<45:31,  3.42it/s] 27%|██▋       | 3434/12776 [35:27<42:40,  3.65it/s]                                                     27%|██▋       | 3434/12776 [35:27<42:40,  3.65it/s] 27%|██▋       | 3435/12776 [35:27<40:30,  3.84it/s]                                                     27%|██▋       | 3435/12776 [35:27<40:30,  3.84it/s] 27%|██▋       | 3436/12776 [35:28<38:41,  4.02it/s]                                                     27%|██▋       | 3436/12776 [35:28<38:41,  4.02it/s] 27%|██▋       | 3437/12776 [35:28<43:12,  3.60it/s]                                                     27%|██▋       | 3437/12776 [35:28<43:12,  3.60it/s] 27%|██▋       | 3438/12776 [35:28<40:19,  3.86it/s]                                                     27%|██▋       | 3438/12776 [35:28<40:19,  3.86it/s] 27%|██▋       | 3439/12776 [35:28<38:01,  4.09it/s]                                                     27%|██▋       | 3439/12776 [35:28<38:01,  4.09it/s] 27%|██▋       | 3440/12776 [35:29<36:23,  4.28it/s]                                                     27%|██▋       | 3440/12776 [35:29<36:23,  4.28it/s] 27%|██▋       | 3441/12776 [35:29<35:05,  4.43it/s]                                                     27%|██▋       | 3441/12776 [35:29<35:05,  4.43it/s] 27%|██▋       | 3442/12776 [35:29<38:33,  4.04it/s]                                                     27%|██▋       | 3442/12776 [35:29<38:33,  4.04it/s] 27%|██▋       | 3443/12776 [35:29<36:16,  4.29it/s]                                                     27%|██▋       | 3443/12776 [35:29<36:16,  4.29it/s] 27%|██▋       | 3444/12776 [35:29<34:32,  4.50it/s]                                                     27%|██▋       | 3444/12776 [35:29<34:32,  4.50it/s] 27%|██▋       | 3445/12776 [35:30<33:22,  4.66it/s]                                                     27%|██▋       | 3445/12776 [35:30<33:22,  4.66it/s] 27%|██▋       | 3446/12776 [35:30<32:24,  4.80it/s]                                                     27%|██▋       | 3446/12776 [35:30<32:24,  4.80it/s] 27%|██▋       | 3447/12776 [35:30<33:56,  4.58it/s]                                                     27%|██▋       | 3447/12776 [35:30<33:56,  4.58it/s] 27%|██▋       | 3448/12776 [35:30<32:32,  4.78it/s]                                                     27%|██▋       | 3448/12776 [35:30<32:32,  4.78it/s] 27%|██▋       | 3449/12776 [35:30<31:17,  4.97it/s]                                                     27%|██▋       | 3449/12776 [35:30<31:17,  4.97it/s] 27%|██▋       | 3450/12776 [35:31<57:30,  2.70it/s]                                                     27%|██▋       | 3450/12776 [35:31<57:30,  2.70it/s] 27%|██▋       | 3451/12776 [35:33<1:44:01,  1.49it/s]                                                       27%|██▋       | 3451/12776 [35:33<1:44:01,  1.49it/s] 27%|██▋       | 3452/12776 [35:34<1:57:43,  1.32it/s]                                                       27%|██▋       | 3452/12776 [35:34<1:57:43,  1.32it/s] 27%|██▋       | 3453/12776 [35:34<2:01:00,  1.28it/s]                                                       27%|██▋       | 3453/12776 [35:34<2:01:00,  1.28it/s] 27%|██▋       | 3454/12776 [35:35<2:00:04,  1.29it/s]                                                       27%|██▋       | 3454/12776 [35:35<2:00:04,  1.29it/s] 27%|██▋       | 3455/12776 [35:36<1:58:13,  1.31it/s]                                                       27%|██▋       | 3455/12776 [35:36<1:58:13,  1.31it/s] 27%|██▋       | 3456/12776 [35:37<1:53:55,  1.36it/s]                                                       27%|██▋       | 3456/12776 [35:37<1:53:55,  1.36it/s] 27%|██▋       | 3457/12776 [35:37<1:54:02,  1.36it/s]                                                       27%|██▋       | 3457/12776 [35:37<1:54:02,  1.36it/s] 27%|██▋       | 3458/12776 [35:38<1:48:20,  1.43it/s]                                                       27%|██▋       | 3458/12776 [35:38<1:48:20,  1.43it/s] 27%|██▋       | 3459/12776 [35:39<1:44:53,  1.48it/s]                                                       27%|██▋       | 3459/12776 [35:39<1:44:53,  1.48it/s] 27%|██▋       | 3460/12776 [35:39<1:39:05,  1.57it/s]                                                       27%|██▋       | 3460/12776 [35:39<1:39:05,  1.57it/s] 27%|██▋       | 3461/12776 [35:40<1:37:38,  1.59it/s]                                                       27%|██▋       | 3461/12776 [35:40<1:37:38,  1.59it/s] 27%|██▋       | 3462/12776 [35:40<1:32:03,  1.69it/s]                                                       27%|██▋       | 3462/12776 [35:40<1:32:03,  1.69it/s] 27%|██▋       | 3463/12776 [35:41<1:29:47,  1.73it/s]                                                       27%|██▋       | 3463/12776 [35:41<1:29:47,  1.73it/s] 27%|██▋       | 3464/12776 [35:41<1:24:16,  1.84it/s]                                                       27%|██▋       | 3464/12776 [35:41<1:24:16,  1.84it/s] 27%|██▋       | 3465/12776 [35:42<1:22:11,  1.89it/s]                                                      {'loss': 1.2893, 'grad_norm': 2.9696571826934814, 'learning_rate': 0.00022961876832844572, 'epoch': 0.53}
+{'loss': 0.9809, 'grad_norm': 1.779247522354126, 'learning_rate': 0.000229594330400782, 'epoch': 0.53}
+{'loss': 0.998, 'grad_norm': 3.7535805702209473, 'learning_rate': 0.00022956989247311825, 'epoch': 0.53}
+{'loss': 1.7128, 'grad_norm': 4.040312767028809, 'learning_rate': 0.00022954545454545452, 'epoch': 0.53}
+{'loss': 1.7015, 'grad_norm': 3.6977481842041016, 'learning_rate': 0.0002295210166177908, 'epoch': 0.53}
+{'loss': 1.397, 'grad_norm': 3.1999452114105225, 'learning_rate': 0.00022949657869012705, 'epoch': 0.53}
+{'loss': 0.6898, 'grad_norm': 1.5966978073120117, 'learning_rate': 0.00022947214076246333, 'epoch': 0.53}
+{'loss': 1.1193, 'grad_norm': 3.1094870567321777, 'learning_rate': 0.00022944770283479958, 'epoch': 0.53}
+{'loss': 1.4771, 'grad_norm': 4.34362268447876, 'learning_rate': 0.00022942326490713583, 'epoch': 0.53}
+{'loss': 0.9172, 'grad_norm': 2.2344250679016113, 'learning_rate': 0.0002293988269794721, 'epoch': 0.53}
+{'loss': 0.6987, 'grad_norm': 1.4617431163787842, 'learning_rate': 0.0002293743890518084, 'epoch': 0.53}
+{'loss': 1.653, 'grad_norm': 2.481635093688965, 'learning_rate': 0.00022934995112414464, 'epoch': 0.53}
+{'loss': 1.5556, 'grad_norm': 6.106346607208252, 'learning_rate': 0.00022932551319648092, 'epoch': 0.53}
+{'loss': 0.4391, 'grad_norm': 0.7310137748718262, 'learning_rate': 0.0002293010752688172, 'epoch': 0.53}
+{'loss': 0.3034, 'grad_norm': 0.5438380241394043, 'learning_rate': 0.00022927663734115342, 'epoch': 0.53}
+{'loss': 0.3638, 'grad_norm': 0.5217984318733215, 'learning_rate': 0.0002292521994134897, 'epoch': 0.53}
+{'loss': 0.5653, 'grad_norm': 0.9906480312347412, 'learning_rate': 0.00022922776148582598, 'epoch': 0.53}
+{'loss': 0.2615, 'grad_norm': 0.5368504524230957, 'learning_rate': 0.00022920332355816223, 'epoch': 0.53}
+{'loss': 0.3562, 'grad_norm': 0.5763323903083801, 'learning_rate': 0.0002291788856304985, 'epoch': 0.53}
+{'loss': 0.3222, 'grad_norm': 0.5000870227813721, 'learning_rate': 0.0002291544477028348, 'epoch': 0.53}
+{'loss': 0.5258, 'grad_norm': 1.1395305395126343, 'learning_rate': 0.00022913000977517104, 'epoch': 0.53}
+{'loss': 0.394, 'grad_norm': 0.6404957175254822, 'learning_rate': 0.00022910557184750732, 'epoch': 0.53}
+{'loss': 0.4798, 'grad_norm': 0.6041001677513123, 'learning_rate': 0.0002290811339198436, 'epoch': 0.53}
+{'loss': 0.4944, 'grad_norm': 0.9585782289505005, 'learning_rate': 0.00022905669599217982, 'epoch': 0.53}
+{'loss': 0.4911, 'grad_norm': 1.1818557977676392, 'learning_rate': 0.0002290322580645161, 'epoch': 0.53}
+{'loss': 0.4152, 'grad_norm': 1.2576113939285278, 'learning_rate': 0.00022900782013685238, 'epoch': 0.53}
+{'loss': 0.5462, 'grad_norm': 1.7009164094924927, 'learning_rate': 0.00022898338220918863, 'epoch': 0.53}
+{'loss': 0.5423, 'grad_norm': 2.0021321773529053, 'learning_rate': 0.0002289589442815249, 'epoch': 0.53}
+{'loss': 0.4347, 'grad_norm': 1.420309066772461, 'learning_rate': 0.00022893450635386118, 'epoch': 0.53}
+{'loss': 0.5028, 'grad_norm': 1.0252100229263306, 'learning_rate': 0.00022891006842619744, 'epoch': 0.53}
+{'loss': 0.7375, 'grad_norm': 1.3138428926467896, 'learning_rate': 0.00022888563049853371, 'epoch': 0.54}
+{'loss': 0.3795, 'grad_norm': 0.9500388503074646, 'learning_rate': 0.00022886119257086997, 'epoch': 0.54}
+{'loss': 0.3207, 'grad_norm': 0.8862390518188477, 'learning_rate': 0.00022883675464320622, 'epoch': 0.54}
+{'loss': 0.6341, 'grad_norm': 1.376956582069397, 'learning_rate': 0.0002288123167155425, 'epoch': 0.54}
+{'loss': 0.7873, 'grad_norm': 3.1740610599517822, 'learning_rate': 0.00022878787878787877, 'epoch': 0.54}
+{'loss': 0.6119, 'grad_norm': 1.5563631057739258, 'learning_rate': 0.00022876344086021502, 'epoch': 0.54}
+{'loss': 0.9051, 'grad_norm': 2.3107411861419678, 'learning_rate': 0.0002287390029325513, 'epoch': 0.54}
+{'loss': 0.5476, 'grad_norm': 1.220873236656189, 'learning_rate': 0.00022871456500488758, 'epoch': 0.54}
+{'loss': 1.1999, 'grad_norm': 3.4480738639831543, 'learning_rate': 0.0002286901270772238, 'epoch': 0.54}
+{'loss': 0.8199, 'grad_norm': 2.0392181873321533, 'learning_rate': 0.00022866568914956008, 'epoch': 0.54}
+{'loss': 0.9086, 'grad_norm': 1.5251580476760864, 'learning_rate': 0.00022864125122189636, 'epoch': 0.54}
+{'loss': 1.0547, 'grad_norm': 1.5712864398956299, 'learning_rate': 0.00022861681329423261, 'epoch': 0.54}
+{'loss': 0.6112, 'grad_norm': 2.3451268672943115, 'learning_rate': 0.0002285923753665689, 'epoch': 0.54}
+{'loss': 1.3906, 'grad_norm': 2.3855226039886475, 'learning_rate': 0.00022856793743890517, 'epoch': 0.54}
+{'loss': 0.9376, 'grad_norm': 2.014420747756958, 'learning_rate': 0.00022854349951124142, 'epoch': 0.54}
+{'loss': 0.8464, 'grad_norm': 3.3102900981903076, 'learning_rate': 0.0002285190615835777, 'epoch': 0.54}
+{'loss': 0.8186, 'grad_norm': 2.707771062850952, 'learning_rate': 0.00022849462365591398, 'epoch': 0.54}
+{'loss': 1.0236, 'grad_norm': 1.4002277851104736, 'learning_rate': 0.0002284701857282502, 'epoch': 0.54}
+{'loss': 1.027, 'grad_norm': 2.523838996887207, 'learning_rate': 0.00022844574780058648, 'epoch': 0.54}
+{'loss': 1.0235, 'grad_norm': 2.5714972019195557, 'learning_rate': 0.00022842130987292276, 'epoch': 0.54}
+{'loss': 1.1276, 'grad_norm': 3.162550926208496, 'learning_rate': 0.000228396871945259, 'epoch': 0.54}
+{'loss': 1.0137, 'grad_norm': 1.9457319974899292, 'learning_rate': 0.0002283724340175953, 'epoch': 0.54}
+{'loss': 1.1942, 'grad_norm': 1.903373122215271, 'learning_rate': 0.00022834799608993157, 'epoch': 0.54}
+{'loss': 0.9727, 'grad_norm': 2.4530985355377197, 'learning_rate': 0.00022832355816226782, 'epoch': 0.54}
+{'loss': 1.8357, 'grad_norm': 3.6654627323150635, 'learning_rate': 0.00022829912023460407, 'epoch': 0.54}
+{'loss': 1.2149, 'grad_norm': 2.052656650543213, 'learning_rate': 0.00022827468230694035, 'epoch': 0.54}
+{'loss': 1.7517, 'grad_norm': 2.6127777099609375, 'learning_rate': 0.0002282502443792766, 'epoch': 0.54}
+{'loss': 2.0915, 'grad_norm': 1.7864444255828857, 'learning_rate': 0.00022822580645161288, 'epoch': 0.54}
+{'loss': 0.6787, 'grad_norm': 1.7597533464431763, 'learning_rate': 0.00022820136852394916, 'epoch': 0.54}
+{'loss': 0.7049, 'grad_norm': 1.367319941520691, 'learning_rate': 0.0002281769305962854, 'epoch': 0.54}
+{'loss': 1.3499, 'grad_norm': 3.0287587642669678, 'learning_rate': 0.00022815249266862169, 'epoch': 0.54}
+{'loss': 0.6301, 'grad_norm': 1.6455621719360352, 'learning_rate': 0.00022812805474095796, 'epoch': 0.54}
+{'loss': 1.35, 'grad_norm': 1.5209710597991943, 'learning_rate': 0.0002281036168132942, 'epoch': 0.54}
+{'loss': 0.3548, 'grad_norm': 0.5051299333572388, 'learning_rate': 0.00022807917888563047, 'epoch': 0.54}
+{'loss': 0.518, 'grad_norm': 0.9242395162582397, 'learning_rate': 0.00022805474095796674, 'epoch': 0.54}
+{'loss': 0.2494, 'grad_norm': 0.676452100276947, 'learning_rate': 0.000228030303030303, 'epoch': 0.54}
+{'loss': 0.3723, 'grad_norm': 0.5453509092330933, 'learning_rate': 0.00022800586510263927, 'epoch': 0.54}
+{'loss': 0.3795, 'grad_norm': 0.7466689348220825, 'learning_rate': 0.00022798142717497555, 'epoch': 0.54}
+{'loss': 0.5283, 'grad_norm': 2.0270116329193115, 'learning_rate': 0.0002279569892473118, 'epoch': 0.54}
+{'loss': 0.4212, 'grad_norm': 0.8889615535736084, 'learning_rate': 0.00022793255131964808, 'epoch': 0.54}
+{'loss': 0.4152, 'grad_norm': 0.9550113677978516, 'learning_rate': 0.00022790811339198436, 'epoch': 0.54}
+{'loss': 0.298, 'grad_norm': 0.8343170881271362, 'learning_rate': 0.00022788367546432058, 'epoch': 0.54}
+{'loss': 0.4314, 'grad_norm': 0.9619960784912109, 'learning_rate': 0.00022785923753665686, 'epoch': 0.54}
+{'loss': 0.3688, 'grad_norm': 0.5965786576271057, 'learning_rate': 0.00022783479960899314, 'epoch': 0.54}
+{'loss': 0.4797, 'grad_norm': 0.9296935200691223, 'learning_rate': 0.0002278103616813294, 'epoch': 0.54}
+{'loss': 0.513, 'grad_norm': 1.0688129663467407, 'learning_rate': 0.00022778592375366567, 'epoch': 0.54}
+{'loss': 0.6811, 'grad_norm': 1.701122522354126, 'learning_rate': 0.00022776148582600195, 'epoch': 0.54}
+ 27%|██▋       | 3465/12776 [35:42<1:22:11,  1.89it/s] 27%|██▋       | 3466/12776 [35:42<1:17:23,  2.00it/s]                                                       27%|██▋       | 3466/12776 [35:42<1:17:23,  2.00it/s] 27%|██▋       | 3467/12776 [35:43<1:13:15,  2.12it/s]                                                       27%|██▋       | 3467/12776 [35:43<1:13:15,  2.12it/s] 27%|██▋       | 3468/12776 [35:43<1:13:12,  2.12it/s]                                                       27%|██▋       | 3468/12776 [35:43<1:13:12,  2.12it/s] 27%|██▋       | 3469/12776 [35:43<1:09:01,  2.25it/s]                                                       27%|██▋       | 3469/12776 [35:43<1:09:01,  2.25it/s] 27%|██▋       | 3470/12776 [35:44<1:04:45,  2.39it/s]                                                       27%|██▋       | 3470/12776 [35:44<1:04:45,  2.39it/s] 27%|██▋       | 3471/12776 [35:44<1:06:43,  2.32it/s]                                                       27%|██▋       | 3471/12776 [35:44<1:06:43,  2.32it/s] 27%|██▋       | 3472/12776 [35:45<1:02:32,  2.48it/s]                                                       27%|██▋       | 3472/12776 [35:45<1:02:32,  2.48it/s] 27%|██▋       | 3473/12776 [35:45<59:19,  2.61it/s]                                                       27%|██▋       | 3473/12776 [35:45<59:19,  2.61it/s] 27%|██▋       | 3474/12776 [35:45<59:48,  2.59it/s]                                                     27%|██▋       | 3474/12776 [35:45<59:48,  2.59it/s] 27%|██▋       | 3475/12776 [35:46<56:02,  2.77it/s]                                                     27%|██▋       | 3475/12776 [35:46<56:02,  2.77it/s] 27%|██▋       | 3476/12776 [35:46<53:00,  2.92it/s]                                                     27%|██▋       | 3476/12776 [35:46<53:00,  2.92it/s] 27%|██▋       | 3477/12776 [35:46<53:38,  2.89it/s]                                                     27%|██▋       | 3477/12776 [35:46<53:38,  2.89it/s] 27%|██▋       | 3478/12776 [35:47<50:09,  3.09it/s]                                                     27%|██▋       | 3478/12776 [35:47<50:09,  3.09it/s] 27%|██▋       | 3479/12776 [35:47<47:20,  3.27it/s]                                                     27%|██▋       | 3479/12776 [35:47<47:20,  3.27it/s] 27%|██▋       | 3480/12776 [35:47<44:40,  3.47it/s]                                                     27%|██▋       | 3480/12776 [35:47<44:40,  3.47it/s] 27%|██▋       | 3481/12776 [35:47<48:39,  3.18it/s]                                                     27%|██▋       | 3481/12776 [35:47<48:39,  3.18it/s] 27%|██▋       | 3482/12776 [35:48<45:43,  3.39it/s]                                                     27%|██▋       | 3482/12776 [35:48<45:43,  3.39it/s] 27%|██▋       | 3483/12776 [35:48<43:15,  3.58it/s]                                                     27%|██▋       | 3483/12776 [35:48<43:15,  3.58it/s] 27%|██▋       | 3484/12776 [35:48<41:15,  3.75it/s]                                                     27%|██▋       | 3484/12776 [35:48<41:15,  3.75it/s] 27%|██▋       | 3485/12776 [35:48<39:35,  3.91it/s]                                                     27%|██▋       | 3485/12776 [35:48<39:35,  3.91it/s] 27%|██▋       | 3486/12776 [35:49<41:52,  3.70it/s]                                                     27%|██▋       | 3486/12776 [35:49<41:52,  3.70it/s] 27%|██▋       | 3487/12776 [35:49<39:13,  3.95it/s]                                                     27%|██▋       | 3487/12776 [35:49<39:13,  3.95it/s] 27%|██▋       | 3488/12776 [35:49<37:35,  4.12it/s]                                                     27%|██▋       | 3488/12776 [35:49<37:35,  4.12it/s] 27%|██▋       | 3489/12776 [35:49<36:00,  4.30it/s]                                                     27%|██▋       | 3489/12776 [35:49<36:00,  4.30it/s] 27%|██▋       | 3490/12776 [35:50<34:54,  4.43it/s]                                                     27%|██▋       | 3490/12776 [35:50<34:54,  4.43it/s] 27%|██▋       | 3491/12776 [35:50<37:36,  4.11it/s]                                                     27%|██▋       | 3491/12776 [35:50<37:36,  4.11it/s] 27%|██▋       | 3492/12776 [35:50<35:49,  4.32it/s]                                                     27%|██▋       | 3492/12776 [35:50<35:49,  4.32it/s] 27%|██▋       | 3493/12776 [35:50<34:24,  4.50it/s]                                                     27%|██▋       | 3493/12776 [35:50<34:24,  4.50it/s] 27%|██▋       | 3494/12776 [35:50<33:19,  4.64it/s]                                                     27%|██▋       | 3494/12776 [35:50<33:19,  4.64it/s] 27%|██▋       | 3495/12776 [35:51<32:28,  4.76it/s]                                                     27%|██▋       | 3495/12776 [35:51<32:28,  4.76it/s] 27%|██▋       | 3496/12776 [35:51<36:33,  4.23it/s]                                                     27%|██▋       | 3496/12776 [35:51<36:33,  4.23it/s] 27%|██▋       | 3497/12776 [35:51<33:52,  4.56it/s]                                                     27%|██▋       | 3497/12776 [35:51<33:52,  4.56it/s] 27%|██▋       | 3498/12776 [35:51<32:27,  4.76it/s]                                                     27%|██▋       | 3498/12776 [35:51<32:27,  4.76it/s] 27%|██▋       | 3499/12776 [35:51<31:21,  4.93it/s]                                                     27%|██▋       | 3499/12776 [35:51<31:21,  4.93it/s] 27%|██▋       | 3500/12776 [35:52<55:11,  2.80it/s]                                                     27%|██▋       | 3500/12776 [35:52<55:11,  2.80it/s] 27%|██▋       | 3501/12776 [35:54<1:47:08,  1.44it/s]                                                       27%|██▋       | 3501/12776 [35:54<1:47:08,  1.44it/s] 27%|██▋       | 3502/12776 [35:55<2:05:24,  1.23it/s]                                                       27%|██▋       | 3502/12776 [35:55<2:05:24,  1.23it/s] 27%|██▋       | 3503/12776 [35:56<2:10:23,  1.19it/s]                                                       27%|██▋       | 3503/12776 [35:56<2:10:23,  1.19it/s] 27%|██▋       | 3504/12776 [35:57<2:17:15,  1.13it/s]                                                       27%|██▋       | 3504/12776 [35:57<2:17:15,  1.13it/s] 27%|██▋       | 3505/12776 [35:57<2:10:57,  1.18it/s]                                                       27%|██▋       | 3505/12776 [35:57<2:10:57,  1.18it/s] 27%|██▋       | 3506/12776 [35:58<2:08:03,  1.21it/s]                                                       27%|██▋       | 3506/12776 [35:58<2:08:03,  1.21it/s] 27%|██▋       | 3507/12776 [35:59<2:03:41,  1.25it/s]                                                       27%|██▋       | 3507/12776 [35:59<2:03:41,  1.25it/s] 27%|██▋       | 3508/12776 [36:00<1:56:03,  1.33it/s]                                                       27%|██▋       | 3508/12776 [36:00<1:56:03,  1.33it/s] 27%|██▋       | 3509/12776 [36:00<1:55:37,  1.34it/s]                                                       27%|██▋       | 3509/12776 [36:00<1:55:37,  1.34it/s] 27%|██▋       | 3510/12776 [36:01<1:46:58,  1.44it/s]                                                       27%|██▋       | 3510/12776 [36:01<1:46:58,  1.44it/s] 27%|██▋       | 3511/12776 [36:01<1:42:04,  1.51it/s]                                                       27%|██▋       | 3511/12776 [36:01<1:42:04,  1.51it/s] 27%|██▋       | 3512/12776 [36:02<1:35:06,  1.62it/s]                                                       27%|██▋       | 3512/12776 [36:02<1:35:06,  1.62it/s] 27%|██▋       | 3513/12776 [36:03<1:34:18,  1.64it/s]                                                       27%|██▋       | 3513/12776 [36:03<1:34:18,  1.64it/s] 28%|██▊       | 3514/12776 [36:03<1:26:58,  1.77it/s]                                                       28%|██▊       | 3514/12776 [36:03<1:26:58,  1.77it/s] 28%|██▊       | 3515/12776 [36:04<1:23:57,  1.84it/s]                                                       28%|██▊       | 3515/12776 [36:04<1:23:57,  1.84it/s] 28%|██▊       | 3516/12776 [36:04<1:18:21,  1.97it/s]                                                       28%|██▊       | 3516/12776 [36:04<1:18:21,  1.97it/s] 28%|██▊       | 3517/12776 [36:04<1:13:18,  2.11it/s]                                                       28%|██▊       | 3517/12776 [36:04<1:13:18,  2.11it/s] 28%|██▊       | 3518/12776 [36:05<1:14:13,  2.08it/s]                                                       28%|██▊       | 3518/12776 [36:05<1:14:13,  2.08it/s] 28%|██▊       | 3519/12776 [36:05<1:08:59,  2.24it/s]                                                       28%|██▊       | 3519/12776 [36:05<1:08:59,  2.24it/s] 28%|██▊       | 3520/12776 [36:06<1:04:33,  2.39it/s]                                                       28%|██▊       | 3520/12776 [36:06<1:04:33,  2.39it/s] 28%|██▊       | 3521/12776 [36:06<1:03:55,  2.41it/s]                                                       28%|██▊       | 3521/12776 [36:06<1:03:55,  2.41it/s] 28%|██▊       | 3522/12776 [36:06<1:00:15,  2.56it/s]                                                       28%|██▊       | 3522/12776 [36:06<1:00:15,  2.56it/s] 28%|██▊       | 3523/12776 [36:07<57:28,  2.68it/s]                                                       28%|██▊       | 3523/12776 [36:07<57:28,  2.68it/s] 28%|██▊       | 3524/12776 [36:07<1:00:55,  2.53it/s]                                                       28%|██▊       | 3524/12776 [36:07<1:00:55,  2.53it/s] 28%|██▊       | 3525/12776 [36:07<56:39,  2.72it/s]                                                       28%|██▊       | 3525/12776 [36:07<56:39,  2.72it/s] 28%|██▊       | 3526/12776 [36:08<53:15,  2.89it/s]                                                     28%|██▊       | 3526/12776 [36:08<53:15,  2.89it/s] 28%|██▊       | 3527/12776 [36:08<54:47,  2.81it/s]                                                     28%|██▊       | 3527/12776 [36:08<54:47,  2.81it/s] 28%|██▊       | 3528/12776 [36:08<51:06,  3.02it/s]                                                     28%|██▊       | 3528/12776 [36:08<51:06,  3.02it/s] 28%|██▊       | 3529/12776 [36:09<48:05,  3.20it/s]                                                     28%|██▊       | 3529/12776 [36:09<48:05,  3.20it/s] 28%|██▊       | 3530/12776 [36:09<45:43,  3.37it/s]                                                     28%|██▊       | 3530/12776 [36:09<45:43,  3.37it/s] 28%|██▊       | 3531/12776 [36:09<46:36,  3.31it/s]                                                     28%|██▊       | 3531/12776 [36:09<46:36,  3.31it/s] 28%|██▊       | 3532/12776 [36:09<43:52,  3.51it/s]                                                     28%|██▊       | 3532/12776 [36:09<43:52,  3.51it/s] 28%|██▊       | 3533/12776 [36:10<41:42,  3.69it/s]                                                     28%|██▊       | 3533/12776 [36:10<41:42,  3.69it/s] 28%|██▊       | 3534/12776 [36:10<39:55,  3.86it/s]                                                     28%|██▊       | 3534/12776 [36:10<39:55,  3.86it/s] 28%|██▊       | 3535/12776 [36:10<38:29,  4.00it/s]                                                     28%|██▊       | 3535/12776 [36:10<38:29,  4.00it/s] 28%|██▊       | 3536/12776 [36:10<41:27,  3.71it/s]                                                     28%|██▊       | 3536/12776 [36:10<41:27,  3.71it/s] 28%|██▊       | 3537/12776 [36:11<38:56,  3.95it/s]                                                     28%|██▊       | 3537/12776 [36:11<38:56,  3.95it/s] 28%|██▊       | 3538/12776 [36:11<37:03,  4.16it/s]                                                     28%|██▊       | 3538/12776 [36:11<37:03,  4.16it/s] 28%|██▊       | 3539/12776 [36:11<35:32,  4.33it/s]                                                     28%|██▊       | 3539/12776 [36:11<35:32,  4.33it/s] 28%|██▊       | 3540/12776 [36:11<34:30,  4.46it/s]                                                     28%|██▊       | 3540/12776 [36:11<34:30,  4.46it/s] 28%|██▊       | 3541/12776 [36:12<37:03,  4.15it/s]                                                     28%|██▊       | 3541/12776 [36:12<37:03,  4.15it/s] 28%|██▊       | 3542/12776 [36:12<35:15,  4.37it/s]                                                    {'loss': 0.5554, 'grad_norm': 1.191488265991211, 'learning_rate': 0.0002277370478983382, 'epoch': 0.54}
+{'loss': 0.5331, 'grad_norm': 0.9095986485481262, 'learning_rate': 0.00022771260997067445, 'epoch': 0.54}
+{'loss': 0.4866, 'grad_norm': 1.1927762031555176, 'learning_rate': 0.00022768817204301073, 'epoch': 0.54}
+{'loss': 0.9596, 'grad_norm': 1.4253158569335938, 'learning_rate': 0.00022766373411534698, 'epoch': 0.54}
+{'loss': 0.7098, 'grad_norm': 1.2457616329193115, 'learning_rate': 0.00022763929618768326, 'epoch': 0.54}
+{'loss': 0.9893, 'grad_norm': 1.5363194942474365, 'learning_rate': 0.00022761485826001954, 'epoch': 0.54}
+{'loss': 0.671, 'grad_norm': 1.4052916765213013, 'learning_rate': 0.0002275904203323558, 'epoch': 0.54}
+{'loss': 0.9099, 'grad_norm': 1.5679124593734741, 'learning_rate': 0.00022756598240469207, 'epoch': 0.54}
+{'loss': 0.6937, 'grad_norm': 1.642220139503479, 'learning_rate': 0.00022754154447702835, 'epoch': 0.54}
+{'loss': 0.6677, 'grad_norm': 2.2005414962768555, 'learning_rate': 0.00022751710654936457, 'epoch': 0.54}
+{'loss': 0.6974, 'grad_norm': 1.1939611434936523, 'learning_rate': 0.00022749266862170085, 'epoch': 0.54}
+{'loss': 0.7005, 'grad_norm': 1.181011438369751, 'learning_rate': 0.00022746823069403713, 'epoch': 0.54}
+{'loss': 1.0437, 'grad_norm': 2.0277912616729736, 'learning_rate': 0.00022744379276637338, 'epoch': 0.54}
+{'loss': 1.2138, 'grad_norm': 2.596615791320801, 'learning_rate': 0.00022741935483870966, 'epoch': 0.54}
+{'loss': 0.6488, 'grad_norm': 1.2545539140701294, 'learning_rate': 0.00022739491691104593, 'epoch': 0.54}
+{'loss': 0.8829, 'grad_norm': 1.7148044109344482, 'learning_rate': 0.00022737047898338219, 'epoch': 0.54}
+{'loss': 1.189, 'grad_norm': 3.2928473949432373, 'learning_rate': 0.00022734604105571846, 'epoch': 0.54}
+{'loss': 1.0388, 'grad_norm': 2.6867687702178955, 'learning_rate': 0.00022732160312805474, 'epoch': 0.55}
+{'loss': 0.8676, 'grad_norm': 2.231804132461548, 'learning_rate': 0.00022729716520039097, 'epoch': 0.55}
+{'loss': 0.8428, 'grad_norm': 1.7448841333389282, 'learning_rate': 0.00022727272727272725, 'epoch': 0.55}
+{'loss': 0.788, 'grad_norm': 1.843652606010437, 'learning_rate': 0.00022724828934506352, 'epoch': 0.55}
+{'loss': 1.2652, 'grad_norm': 2.4735724925994873, 'learning_rate': 0.00022722385141739978, 'epoch': 0.55}
+{'loss': 1.159, 'grad_norm': 1.6568492650985718, 'learning_rate': 0.00022719941348973605, 'epoch': 0.55}
+{'loss': 1.0109, 'grad_norm': 2.2103464603424072, 'learning_rate': 0.00022717497556207233, 'epoch': 0.55}
+{'loss': 0.7408, 'grad_norm': 2.2817304134368896, 'learning_rate': 0.00022715053763440856, 'epoch': 0.55}
+{'loss': 0.9899, 'grad_norm': 1.5361602306365967, 'learning_rate': 0.00022712609970674483, 'epoch': 0.55}
+{'loss': 1.565, 'grad_norm': 2.22003436088562, 'learning_rate': 0.0002271016617790811, 'epoch': 0.55}
+{'loss': 1.1689, 'grad_norm': 1.5563693046569824, 'learning_rate': 0.00022707722385141736, 'epoch': 0.55}
+{'loss': 1.2611, 'grad_norm': 1.7934682369232178, 'learning_rate': 0.00022705278592375364, 'epoch': 0.55}
+{'loss': 1.3886, 'grad_norm': 1.765387773513794, 'learning_rate': 0.00022702834799608992, 'epoch': 0.55}
+{'loss': 1.7589, 'grad_norm': 4.801526069641113, 'learning_rate': 0.00022700391006842617, 'epoch': 0.55}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.00022700391006842617, 'epoch': 0.55}
+{'loss': 1.0594, 'grad_norm': 2.799443244934082, 'learning_rate': 0.00022697947214076245, 'epoch': 0.55}
+{'loss': 0.6357, 'grad_norm': 1.0865267515182495, 'learning_rate': 0.00022695503421309873, 'epoch': 0.55}
+{'loss': 0.5283, 'grad_norm': 1.8002020120620728, 'learning_rate': 0.00022693059628543495, 'epoch': 0.55}
+{'loss': 1.3573, 'grad_norm': 2.474811315536499, 'learning_rate': 0.00022690615835777123, 'epoch': 0.55}
+{'loss': 0.255, 'grad_norm': 0.7308578491210938, 'learning_rate': 0.0002268817204301075, 'epoch': 0.55}
+{'loss': 0.5089, 'grad_norm': 0.9229421615600586, 'learning_rate': 0.00022685728250244376, 'epoch': 0.55}
+{'loss': 0.3171, 'grad_norm': 0.6674167513847351, 'learning_rate': 0.00022683284457478004, 'epoch': 0.55}
+{'loss': 0.3403, 'grad_norm': 0.6612511873245239, 'learning_rate': 0.00022680840664711632, 'epoch': 0.55}
+{'loss': 0.3597, 'grad_norm': 0.6101376414299011, 'learning_rate': 0.00022678396871945257, 'epoch': 0.55}
+{'loss': 0.4338, 'grad_norm': 0.6959841251373291, 'learning_rate': 0.00022675953079178885, 'epoch': 0.55}
+{'loss': 0.3483, 'grad_norm': 0.6419839262962341, 'learning_rate': 0.00022673509286412513, 'epoch': 0.55}
+{'loss': 0.2916, 'grad_norm': 0.5177487134933472, 'learning_rate': 0.00022671065493646135, 'epoch': 0.55}
+{'loss': 0.2922, 'grad_norm': 0.5536925196647644, 'learning_rate': 0.00022668621700879763, 'epoch': 0.55}
+{'loss': 0.4654, 'grad_norm': 0.7985596656799316, 'learning_rate': 0.0002266617790811339, 'epoch': 0.55}
+{'loss': 0.423, 'grad_norm': 0.6217026710510254, 'learning_rate': 0.00022663734115347016, 'epoch': 0.55}
+{'loss': 0.3649, 'grad_norm': 0.8252955675125122, 'learning_rate': 0.00022661290322580644, 'epoch': 0.55}
+{'loss': 0.2662, 'grad_norm': 0.7620217204093933, 'learning_rate': 0.00022658846529814271, 'epoch': 0.55}
+{'loss': 0.8678, 'grad_norm': 1.2428134679794312, 'learning_rate': 0.00022656402737047894, 'epoch': 0.55}
+{'loss': 0.6157, 'grad_norm': 1.2475255727767944, 'learning_rate': 0.00022653958944281522, 'epoch': 0.55}
+{'loss': 0.4179, 'grad_norm': 0.6180065870285034, 'learning_rate': 0.0002265151515151515, 'epoch': 0.55}
+{'loss': 0.5168, 'grad_norm': 1.0421558618545532, 'learning_rate': 0.00022649071358748775, 'epoch': 0.55}
+{'loss': 0.4618, 'grad_norm': 1.2510666847229004, 'learning_rate': 0.00022646627565982402, 'epoch': 0.55}
+{'loss': 0.4542, 'grad_norm': 0.9947577118873596, 'learning_rate': 0.0002264418377321603, 'epoch': 0.55}
+{'loss': 0.5551, 'grad_norm': 1.770212173461914, 'learning_rate': 0.00022641739980449655, 'epoch': 0.55}
+{'loss': 0.5831, 'grad_norm': 1.2235960960388184, 'learning_rate': 0.00022639296187683283, 'epoch': 0.55}
+{'loss': 0.8161, 'grad_norm': 3.5953123569488525, 'learning_rate': 0.0002263685239491691, 'epoch': 0.55}
+{'loss': 0.8925, 'grad_norm': 2.296663761138916, 'learning_rate': 0.00022634408602150533, 'epoch': 0.55}
+{'loss': 1.1062, 'grad_norm': 1.9417630434036255, 'learning_rate': 0.0002263196480938416, 'epoch': 0.55}
+{'loss': 0.7668, 'grad_norm': 1.476485013961792, 'learning_rate': 0.0002262952101661779, 'epoch': 0.55}
+{'loss': 0.9898, 'grad_norm': 2.0487520694732666, 'learning_rate': 0.00022627077223851414, 'epoch': 0.55}
+{'loss': 1.3848, 'grad_norm': 4.198084354400635, 'learning_rate': 0.00022624633431085042, 'epoch': 0.55}
+{'loss': 0.8235, 'grad_norm': 2.556408166885376, 'learning_rate': 0.0002262218963831867, 'epoch': 0.55}
+{'loss': 0.8973, 'grad_norm': 2.0827739238739014, 'learning_rate': 0.00022619745845552295, 'epoch': 0.55}
+{'loss': 1.2371, 'grad_norm': 3.2092297077178955, 'learning_rate': 0.00022617302052785923, 'epoch': 0.55}
+{'loss': 0.9781, 'grad_norm': 2.5607619285583496, 'learning_rate': 0.00022614858260019548, 'epoch': 0.55}
+{'loss': 1.0107, 'grad_norm': 1.599326729774475, 'learning_rate': 0.00022612414467253173, 'epoch': 0.55}
+{'loss': 0.766, 'grad_norm': 2.7297661304473877, 'learning_rate': 0.000226099706744868, 'epoch': 0.55}
+{'loss': 0.9601, 'grad_norm': 2.919910430908203, 'learning_rate': 0.0002260752688172043, 'epoch': 0.55}
+{'loss': 0.9165, 'grad_norm': 2.319716691970825, 'learning_rate': 0.00022605083088954054, 'epoch': 0.55}
+{'loss': 0.954, 'grad_norm': 2.5673739910125732, 'learning_rate': 0.00022602639296187682, 'epoch': 0.55}
+{'loss': 0.9416, 'grad_norm': 3.195618152618408, 'learning_rate': 0.0002260019550342131, 'epoch': 0.55}
+{'loss': 1.2491, 'grad_norm': 3.3584465980529785, 'learning_rate': 0.00022597751710654932, 'epoch': 0.55}
+{'loss': 0.9559, 'grad_norm': 2.711599349975586, 'learning_rate': 0.0002259530791788856, 'epoch': 0.55}
+{'loss': 1.0383, 'grad_norm': 3.927182912826538, 'learning_rate': 0.00022592864125122188, 'epoch': 0.55}
+{'loss': 1.2182, 'grad_norm': 2.409851312637329, 'learning_rate': 0.00022590420332355813, 'epoch': 0.55}
+ 28%|██▊       | 3542/12776 [36:12<35:15,  4.37it/s] 28%|██▊       | 3543/12776 [36:12<33:51,  4.55it/s]                                                     28%|██▊       | 3543/12776 [36:12<33:51,  4.55it/s] 28%|██▊       | 3544/12776 [36:12<32:47,  4.69it/s]                                                     28%|██▊       | 3544/12776 [36:12<32:47,  4.69it/s] 28%|██▊       | 3545/12776 [36:12<31:59,  4.81it/s]                                                     28%|██▊       | 3545/12776 [36:12<31:59,  4.81it/s] 28%|██▊       | 3546/12776 [36:13<36:40,  4.20it/s]                                                     28%|██▊       | 3546/12776 [36:13<36:40,  4.20it/s] 28%|██▊       | 3547/12776 [36:13<34:27,  4.46it/s]                                                     28%|██▊       | 3547/12776 [36:13<34:27,  4.46it/s] 28%|██▊       | 3548/12776 [36:13<32:43,  4.70it/s]                                                     28%|██▊       | 3548/12776 [36:13<32:43,  4.70it/s] 28%|██▊       | 3549/12776 [36:13<31:28,  4.89it/s]                                                     28%|██▊       | 3549/12776 [36:13<31:28,  4.89it/s] 28%|██▊       | 3550/12776 [36:14<56:35,  2.72it/s]                                                     28%|██▊       | 3550/12776 [36:14<56:35,  2.72it/s] 28%|██▊       | 3551/12776 [36:16<1:53:46,  1.35it/s]                                                       28%|██▊       | 3551/12776 [36:16<1:53:46,  1.35it/s] 28%|██▊       | 3552/12776 [36:17<2:06:06,  1.22it/s]                                                       28%|██▊       | 3552/12776 [36:17<2:06:06,  1.22it/s] 28%|██▊       | 3553/12776 [36:17<2:09:54,  1.18it/s]                                                       28%|██▊       | 3553/12776 [36:17<2:09:54,  1.18it/s] 28%|██▊       | 3554/12776 [36:18<2:08:50,  1.19it/s]                                                       28%|██▊       | 3554/12776 [36:18<2:08:50,  1.19it/s] 28%|██▊       | 3555/12776 [36:19<2:04:27,  1.23it/s]                                                       28%|██▊       | 3555/12776 [36:19<2:04:27,  1.23it/s] 28%|██▊       | 3556/12776 [36:20<2:03:40,  1.24it/s]                                                       28%|██▊       | 3556/12776 [36:20<2:03:40,  1.24it/s] 28%|██▊       | 3557/12776 [36:21<2:01:29,  1.26it/s]                                                       28%|██▊       | 3557/12776 [36:21<2:01:29,  1.26it/s] 28%|██▊       | 3558/12776 [36:21<1:53:36,  1.35it/s]                                                       28%|██▊       | 3558/12776 [36:21<1:53:36,  1.35it/s] 28%|██▊       | 3559/12776 [36:22<1:46:22,  1.44it/s]                                                       28%|██▊       | 3559/12776 [36:22<1:46:22,  1.44it/s] 28%|██▊       | 3560/12776 [36:22<1:40:53,  1.52it/s]                                                       28%|██▊       | 3560/12776 [36:22<1:40:53,  1.52it/s] 28%|██▊       | 3561/12776 [36:23<1:37:36,  1.57it/s]                                                       28%|██▊       | 3561/12776 [36:23<1:37:36,  1.57it/s] 28%|██▊       | 3562/12776 [36:23<1:32:08,  1.67it/s]                                                       28%|██▊       | 3562/12776 [36:23<1:32:08,  1.67it/s] 28%|██▊       | 3563/12776 [36:24<1:33:08,  1.65it/s]                                                       28%|██▊       | 3563/12776 [36:24<1:33:08,  1.65it/s] 28%|██▊       | 3564/12776 [36:25<1:26:29,  1.78it/s]                                                       28%|██▊       | 3564/12776 [36:25<1:26:29,  1.78it/s] 28%|██▊       | 3565/12776 [36:25<1:19:34,  1.93it/s]                                                       28%|██▊       | 3565/12776 [36:25<1:19:34,  1.93it/s] 28%|██▊       | 3566/12776 [36:25<1:20:06,  1.92it/s]                                                       28%|██▊       | 3566/12776 [36:25<1:20:06,  1.92it/s] 28%|██▊       | 3567/12776 [36:26<1:15:41,  2.03it/s]                                                       28%|██▊       | 3567/12776 [36:26<1:15:41,  2.03it/s] 28%|██▊       | 3568/12776 [36:26<1:13:56,  2.08it/s]                                                       28%|██▊       | 3568/12776 [36:26<1:13:56,  2.08it/s] 28%|██▊       | 3569/12776 [36:27<1:09:31,  2.21it/s]                                                       28%|██▊       | 3569/12776 [36:27<1:09:31,  2.21it/s] 28%|██▊       | 3570/12776 [36:27<1:05:57,  2.33it/s]                                                       28%|██▊       | 3570/12776 [36:27<1:05:57,  2.33it/s] 28%|██▊       | 3571/12776 [36:28<1:04:00,  2.40it/s]                                                       28%|██▊       | 3571/12776 [36:28<1:04:00,  2.40it/s] 28%|██▊       | 3572/12776 [36:28<1:00:42,  2.53it/s]                                                       28%|██▊       | 3572/12776 [36:28<1:00:42,  2.53it/s] 28%|██▊       | 3573/12776 [36:28<57:53,  2.65it/s]                                                       28%|██▊       | 3573/12776 [36:28<57:53,  2.65it/s] 28%|██▊       | 3574/12776 [36:29<1:01:23,  2.50it/s]                                                       28%|██▊       | 3574/12776 [36:29<1:01:23,  2.50it/s] 28%|██▊       | 3575/12776 [36:29<57:38,  2.66it/s]                                                       28%|██▊       | 3575/12776 [36:29<57:38,  2.66it/s] 28%|██▊       | 3576/12776 [36:29<54:11,  2.83it/s]                                                     28%|██▊       | 3576/12776 [36:29<54:11,  2.83it/s] 28%|██▊       | 3577/12776 [36:30<51:29,  2.98it/s]                                                     28%|██▊       | 3577/12776 [36:30<51:29,  2.98it/s] 28%|██▊       | 3578/12776 [36:30<52:55,  2.90it/s]                                                     28%|██▊       | 3578/12776 [36:30<52:55,  2.90it/s] 28%|██▊       | 3579/12776 [36:30<49:40,  3.09it/s]                                                     28%|██▊       | 3579/12776 [36:30<49:40,  3.09it/s] 28%|██▊       | 3580/12776 [36:30<47:10,  3.25it/s]                                                     28%|██▊       | 3580/12776 [36:30<47:10,  3.25it/s] 28%|██▊       | 3581/12776 [36:31<45:12,  3.39it/s]                                                     28%|██▊       | 3581/12776 [36:31<45:12,  3.39it/s] 28%|██▊       | 3582/12776 [36:31<47:38,  3.22it/s]                                                     28%|██▊       | 3582/12776 [36:31<47:38,  3.22it/s] 28%|██▊       | 3583/12776 [36:31<44:52,  3.41it/s]                                                     28%|██▊       | 3583/12776 [36:31<44:52,  3.41it/s] 28%|██▊       | 3584/12776 [36:32<42:45,  3.58it/s]                                                     28%|██▊       | 3584/12776 [36:32<42:45,  3.58it/s] 28%|██▊       | 3585/12776 [36:32<41:08,  3.72it/s]                                                     28%|██▊       | 3585/12776 [36:32<41:08,  3.72it/s] 28%|██▊       | 3586/12776 [36:32<45:17,  3.38it/s]                                                     28%|██▊       | 3586/12776 [36:32<45:17,  3.38it/s] 28%|██▊       | 3587/12776 [36:32<42:11,  3.63it/s]                                                     28%|██▊       | 3587/12776 [36:32<42:11,  3.63it/s] 28%|██▊       | 3588/12776 [36:33<39:38,  3.86it/s]                                                     28%|██▊       | 3588/12776 [36:33<39:38,  3.86it/s] 28%|██▊       | 3589/12776 [36:33<37:40,  4.06it/s]                                                     28%|██▊       | 3589/12776 [36:33<37:40,  4.06it/s] 28%|██▊       | 3590/12776 [36:33<40:47,  3.75it/s]                                                     28%|██▊       | 3590/12776 [36:33<40:47,  3.75it/s] 28%|██▊       | 3591/12776 [36:33<38:13,  4.00it/s]                                                     28%|██▊       | 3591/12776 [36:33<38:13,  4.00it/s] 28%|██▊       | 3592/12776 [36:34<36:07,  4.24it/s]                                                     28%|██▊       | 3592/12776 [36:34<36:07,  4.24it/s] 28%|██▊       | 3593/12776 [36:34<34:30,  4.43it/s]                                                     28%|██▊       | 3593/12776 [36:34<34:30,  4.43it/s] 28%|██▊       | 3594/12776 [36:34<33:14,  4.60it/s]                                                     28%|██▊       | 3594/12776 [36:34<33:14,  4.60it/s] 28%|██▊       | 3595/12776 [36:34<36:39,  4.17it/s]                                                     28%|██▊       | 3595/12776 [36:34<36:39,  4.17it/s] 28%|██▊       | 3596/12776 [36:34<34:35,  4.42it/s]                                                     28%|██▊       | 3596/12776 [36:34<34:35,  4.42it/s] 28%|██▊       | 3597/12776 [36:35<33:03,  4.63it/s]                                                     28%|██▊       | 3597/12776 [36:35<33:03,  4.63it/s] 28%|██▊       | 3598/12776 [36:35<31:39,  4.83it/s]                                                     28%|██▊       | 3598/12776 [36:35<31:39,  4.83it/s] 28%|██▊       | 3599/12776 [36:35<30:41,  4.98it/s]                                                     28%|██▊       | 3599/12776 [36:35<30:41,  4.98it/s] 28%|██▊       | 3600/12776 [36:36<52:07,  2.93it/s]                                                     28%|██▊       | 3600/12776 [36:36<52:07,  2.93it/s]Saving model checkpoint to ./checkpoint-3600
+Configuration saved in ./checkpoint-3600/config.json
+Model weights saved in ./checkpoint-3600/model.safetensors
+Feature extractor saved in ./checkpoint-3600/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-3600/tokenizer_config.json
+Special tokens file saved in ./checkpoint-3600/special_tokens_map.json
+added tokens file saved in ./checkpoint-3600/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-2400] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 28%|██▊       | 3601/12776 [36:42<5:38:08,  2.21s/it]                                                       28%|██▊       | 3601/12776 [36:42<5:38:08,  2.21s/it] 28%|██▊       | 3602/12776 [36:43<4:39:17,  1.83s/it]                                                       28%|██▊       | 3602/12776 [36:43<4:39:17,  1.83s/it] 28%|██▊       | 3603/12776 [36:44<3:51:41,  1.52s/it]                                                       28%|██▊       | 3603/12776 [36:44<3:51:41,  1.52s/it] 28%|██▊       | 3604/12776 [36:45<3:20:46,  1.31s/it]                                                       28%|██▊       | 3604/12776 [36:45<3:20:46,  1.31s/it] 28%|██▊       | 3605/12776 [36:46<2:52:06,  1.13s/it]                                                       28%|██▊       | 3605/12776 [36:46<2:52:06,  1.13s/it] 28%|██▊       | 3606/12776 [36:46<2:28:19,  1.03it/s]                                                       28%|██▊       | 3606/12776 [36:46<2:28:19,  1.03it/s] 28%|██▊       | 3607/12776 [36:47<2:10:25,  1.17it/s]                                                       28%|██▊       | 3607/12776 [36:47<2:10:25,  1.17it/s] 28%|██▊       | 3608/12776 [36:47<1:55:48,  1.32it/s]                                                       28%|██▊       | 3608/12776 [36:47<1:55:48,  1.32it/s] 28%|██▊       | 3609/12776 [36:48<1:44:30,  1.46it/s]                                                       28%|██▊       | 3609/12776 [36:48<1:44:30,  1.46it/s] 28%|██▊       | 3610/12776 [36:48<1:37:23,  1.57it/s]                                                       28%|██▊       | 3610/12776 [36:48<1:37:23,  1.57it/s] 28%|██▊       | 3611/12776 [36:49<1:29:41,  1.70it/s]                                                       28%|██▊       | 3611/12776 [36:49<1:29:41,  1.70it/s] 28%|██▊       | 3612/12776 [36:49<1:29:58,  1.70it/s]                                                       28%|██▊       | 3612/12776 [36:49<1:29:58,  1.70it/s] 28%|██▊       | 3613/12776 [36:50<1:22:08,  1.86it/s]                                                       28%|██▊       | 3613/12776 [36:50<1:22:08,  1.86it/s] 28%|██▊       | 3614/12776 [36:50<1:21:46,  1.87it/s]                                                       28%|██▊       | 3614/12776 [36:50<1:21:46,  1.87it/s] 28%|██▊       | 3615/12776 [36:51<1:14:58,  2.04it/s]                                                       28%|██▊       | 3615/12776 [36:51<1:14:58,  2.04it/s] 28%|██▊       | 3616/12776 [36:51<1:09:15,  2.20it/s]                                                       28%|██▊       | 3616/12776 [36:51<1:09:15,  2.20it/s] 28%|██▊       | 3617/12776 [36:51<1:07:57,  2.25it/s]                                                       28%|██▊       | 3617/12776 [36:51<1:07:57,  2.25it/s] 28%|██▊       | 3618/12776 [36:52<1:03:26,  2.41it/s]                                                       28%|██▊       | 3618/12776 [36:52<1:03:26,  2.41it/s] 28%|██▊       | 3619/12776 [36:52<59:35,  2.56it/s]                                                       28%|██▊       | 3619/12776 [36:52<59:35,  2.56it/s] 28%|██▊       | 3620/12776 [36:53<1:02:59,  2.42it/s]                                                      {'loss': 1.6216, 'grad_norm': 2.19413423538208, 'learning_rate': 0.0002258797653958944, 'epoch': 0.55}
+{'loss': 1.5274, 'grad_norm': 2.6822988986968994, 'learning_rate': 0.00022585532746823069, 'epoch': 0.55}
+{'loss': 1.097, 'grad_norm': 1.415163516998291, 'learning_rate': 0.00022583088954056694, 'epoch': 0.55}
+{'loss': 0.7808, 'grad_norm': 2.0902304649353027, 'learning_rate': 0.00022580645161290321, 'epoch': 0.55}
+{'loss': 0.4876, 'grad_norm': 2.3548595905303955, 'learning_rate': 0.0002257820136852395, 'epoch': 0.56}
+{'loss': 0.8252, 'grad_norm': 1.715476632118225, 'learning_rate': 0.00022575757575757572, 'epoch': 0.56}
+{'loss': 0.8234, 'grad_norm': 2.3043689727783203, 'learning_rate': 0.000225733137829912, 'epoch': 0.56}
+{'loss': 1.091, 'grad_norm': 1.6674572229385376, 'learning_rate': 0.00022570869990224827, 'epoch': 0.56}
+{'loss': 1.4916, 'grad_norm': 4.341742038726807, 'learning_rate': 0.00022568426197458453, 'epoch': 0.56}
+{'loss': 0.3373, 'grad_norm': 0.5292091369628906, 'learning_rate': 0.0002256598240469208, 'epoch': 0.56}
+{'loss': 0.316, 'grad_norm': 0.5695773959159851, 'learning_rate': 0.00022563538611925708, 'epoch': 0.56}
+{'loss': 0.3825, 'grad_norm': 0.6100854277610779, 'learning_rate': 0.00022561094819159333, 'epoch': 0.56}
+{'loss': 0.4018, 'grad_norm': 0.7489566802978516, 'learning_rate': 0.0002255865102639296, 'epoch': 0.56}
+{'loss': 0.4573, 'grad_norm': 0.8366956114768982, 'learning_rate': 0.00022556207233626586, 'epoch': 0.56}
+{'loss': 0.4246, 'grad_norm': 0.6405544877052307, 'learning_rate': 0.00022553763440860211, 'epoch': 0.56}
+{'loss': 0.2276, 'grad_norm': 0.7330952882766724, 'learning_rate': 0.0002255131964809384, 'epoch': 0.56}
+{'loss': 0.3543, 'grad_norm': 0.7685224413871765, 'learning_rate': 0.00022548875855327467, 'epoch': 0.56}
+{'loss': 0.506, 'grad_norm': 0.679145872592926, 'learning_rate': 0.00022546432062561092, 'epoch': 0.56}
+{'loss': 0.4348, 'grad_norm': 0.8586009740829468, 'learning_rate': 0.0002254398826979472, 'epoch': 0.56}
+{'loss': 0.3119, 'grad_norm': 0.8567745685577393, 'learning_rate': 0.00022541544477028348, 'epoch': 0.56}
+{'loss': 0.4098, 'grad_norm': 0.9251272082328796, 'learning_rate': 0.0002253910068426197, 'epoch': 0.56}
+{'loss': 0.3483, 'grad_norm': 0.8240041136741638, 'learning_rate': 0.00022536656891495598, 'epoch': 0.56}
+{'loss': 0.3981, 'grad_norm': 0.5420299172401428, 'learning_rate': 0.00022534213098729226, 'epoch': 0.56}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.00022534213098729226, 'epoch': 0.56}
+{'loss': 0.8704, 'grad_norm': 5.230978488922119, 'learning_rate': 0.0002253176930596285, 'epoch': 0.56}
+{'loss': 0.5912, 'grad_norm': 1.3879233598709106, 'learning_rate': 0.0002252932551319648, 'epoch': 0.56}
+{'loss': 0.6871, 'grad_norm': 1.5150583982467651, 'learning_rate': 0.00022526881720430107, 'epoch': 0.56}
+{'loss': 0.6863, 'grad_norm': 1.0342755317687988, 'learning_rate': 0.00022524437927663732, 'epoch': 0.56}
+{'loss': 0.7084, 'grad_norm': 1.3024189472198486, 'learning_rate': 0.0002252199413489736, 'epoch': 0.56}
+{'loss': 0.9109, 'grad_norm': 1.985766887664795, 'learning_rate': 0.00022519550342130988, 'epoch': 0.56}
+{'loss': 0.5658, 'grad_norm': 1.3010162115097046, 'learning_rate': 0.0002251710654936461, 'epoch': 0.56}
+{'loss': 1.1083, 'grad_norm': 1.4666105508804321, 'learning_rate': 0.00022514662756598238, 'epoch': 0.56}
+{'loss': 0.655, 'grad_norm': 1.7869631052017212, 'learning_rate': 0.00022512218963831866, 'epoch': 0.56}
+{'loss': 0.62, 'grad_norm': 1.4627180099487305, 'learning_rate': 0.0002250977517106549, 'epoch': 0.56}
+{'loss': 1.0856, 'grad_norm': 2.270118236541748, 'learning_rate': 0.00022507331378299119, 'epoch': 0.56}
+{'loss': 0.3643, 'grad_norm': 1.2834758758544922, 'learning_rate': 0.00022504887585532746, 'epoch': 0.56}
+{'loss': 0.6055, 'grad_norm': 2.207064628601074, 'learning_rate': 0.00022502443792766372, 'epoch': 0.56}
+{'loss': 1.3798, 'grad_norm': 4.434436798095703, 'learning_rate': 0.000225, 'epoch': 0.56}
+{'loss': 1.0831, 'grad_norm': 2.0381810665130615, 'learning_rate': 0.00022497556207233625, 'epoch': 0.56}
+{'loss': 0.7524, 'grad_norm': 2.15852689743042, 'learning_rate': 0.0002249511241446725, 'epoch': 0.56}
+{'loss': 0.8807, 'grad_norm': 3.1960198879241943, 'learning_rate': 0.00022492668621700877, 'epoch': 0.56}
+{'loss': 0.4864, 'grad_norm': 1.4990772008895874, 'learning_rate': 0.00022490224828934505, 'epoch': 0.56}
+{'loss': 0.9871, 'grad_norm': 2.38069486618042, 'learning_rate': 0.0002248778103616813, 'epoch': 0.56}
+{'loss': 1.1582, 'grad_norm': 3.0171408653259277, 'learning_rate': 0.00022485337243401758, 'epoch': 0.56}
+{'loss': 0.6874, 'grad_norm': 2.1977977752685547, 'learning_rate': 0.00022482893450635386, 'epoch': 0.56}
+{'loss': 0.8322, 'grad_norm': 2.109252452850342, 'learning_rate': 0.00022480449657869009, 'epoch': 0.56}
+{'loss': 1.0376, 'grad_norm': 2.189072370529175, 'learning_rate': 0.00022478005865102636, 'epoch': 0.56}
+{'loss': 1.7067, 'grad_norm': 3.6662800312042236, 'learning_rate': 0.00022475562072336264, 'epoch': 0.56}
+{'loss': 0.9702, 'grad_norm': 2.0984466075897217, 'learning_rate': 0.0002247311827956989, 'epoch': 0.56}
+{'loss': 1.1355, 'grad_norm': 1.8233792781829834, 'learning_rate': 0.00022470674486803517, 'epoch': 0.56}
+{'loss': 1.3035, 'grad_norm': 1.7763501405715942, 'learning_rate': 0.00022468230694037145, 'epoch': 0.56}
+{'loss': 1.4573, 'grad_norm': 3.240084171295166, 'learning_rate': 0.0002246578690127077, 'epoch': 0.56}
+{'loss': 1.5207, 'grad_norm': 2.5310521125793457, 'learning_rate': 0.00022463343108504398, 'epoch': 0.56}
+{'loss': 1.0834, 'grad_norm': 1.6432092189788818, 'learning_rate': 0.00022460899315738026, 'epoch': 0.56}
+{'loss': 0.5104, 'grad_norm': 0.8675611019134521, 'learning_rate': 0.00022458455522971648, 'epoch': 0.56}
+{'loss': 0.7592, 'grad_norm': 1.3985061645507812, 'learning_rate': 0.00022456011730205276, 'epoch': 0.56}
+{'loss': 0.8605, 'grad_norm': 2.843593120574951, 'learning_rate': 0.00022453567937438904, 'epoch': 0.56}
+{'loss': 0.9727, 'grad_norm': 2.444558620452881, 'learning_rate': 0.0002245112414467253, 'epoch': 0.56}
+{'loss': 1.3655, 'grad_norm': 2.5446102619171143, 'learning_rate': 0.00022448680351906157, 'epoch': 0.56}
+{'loss': 0.35, 'grad_norm': 0.5711503624916077, 'learning_rate': 0.00022446236559139785, 'epoch': 0.56}
+{'loss': 0.238, 'grad_norm': 0.39799460768699646, 'learning_rate': 0.0002244379276637341, 'epoch': 0.56}
+{'loss': 0.4052, 'grad_norm': 0.5093404054641724, 'learning_rate': 0.00022441348973607035, 'epoch': 0.56}
+{'loss': 0.6028, 'grad_norm': 0.7445962429046631, 'learning_rate': 0.00022438905180840663, 'epoch': 0.56}
+{'loss': 0.3982, 'grad_norm': 0.8046715259552002, 'learning_rate': 0.00022436461388074288, 'epoch': 0.56}
+{'loss': 0.3914, 'grad_norm': 0.7545210123062134, 'learning_rate': 0.00022434017595307916, 'epoch': 0.56}
+{'loss': 0.4382, 'grad_norm': 0.8643534183502197, 'learning_rate': 0.00022431573802541544, 'epoch': 0.56}
+{'loss': 0.3623, 'grad_norm': 0.7606629133224487, 'learning_rate': 0.0002242913000977517, 'epoch': 0.56}
+{'loss': 0.5698, 'grad_norm': 1.1187993288040161, 'learning_rate': 0.00022426686217008796, 'epoch': 0.56}
+{'loss': 0.319, 'grad_norm': 0.6099269390106201, 'learning_rate': 0.00022424242424242424, 'epoch': 0.57}
+{'loss': 0.6025, 'grad_norm': 1.3326719999313354, 'learning_rate': 0.00022421798631476047, 'epoch': 0.57}
+{'loss': 1.0935, 'grad_norm': 2.37567400932312, 'learning_rate': 0.00022419354838709675, 'epoch': 0.57}
+{'loss': 0.3043, 'grad_norm': 0.8539568185806274, 'learning_rate': 0.00022416911045943302, 'epoch': 0.57}
+{'loss': 0.4338, 'grad_norm': 1.237491488456726, 'learning_rate': 0.00022414467253176928, 'epoch': 0.57}
+{'loss': 0.4656, 'grad_norm': 0.9956717491149902, 'learning_rate': 0.00022412023460410555, 'epoch': 0.57}
+{'loss': 0.7444, 'grad_norm': 1.1546032428741455, 'learning_rate': 0.00022409579667644183, 'epoch': 0.57}
+{'loss': 0.8117, 'grad_norm': 1.377625823020935, 'learning_rate': 0.00022407135874877808, 'epoch': 0.57}
+{'loss': 0.566, 'grad_norm': 1.8123804330825806, 'learning_rate': 0.00022404692082111436, 'epoch': 0.57}
+{'loss': 0.5954, 'grad_norm': 1.1724187135696411, 'learning_rate': 0.00022402248289345064, 'epoch': 0.57}
+ 28%|██▊       | 3620/12776 [36:53<1:02:59,  2.42it/s] 28%|██▊       | 3621/12776 [36:53<58:24,  2.61it/s]                                                       28%|██▊       | 3621/12776 [36:53<58:24,  2.61it/s] 28%|██▊       | 3622/12776 [36:53<54:48,  2.78it/s]                                                     28%|██▊       | 3622/12776 [36:53<54:48,  2.78it/s] 28%|██▊       | 3623/12776 [36:54<51:37,  2.95it/s]                                                     28%|██▊       | 3623/12776 [36:54<51:37,  2.95it/s] 28%|██▊       | 3624/12776 [36:54<51:32,  2.96it/s]                                                     28%|██▊       | 3624/12776 [36:54<51:32,  2.96it/s] 28%|██▊       | 3625/12776 [36:54<48:23,  3.15it/s]                                                     28%|██▊       | 3625/12776 [36:54<48:23,  3.15it/s] 28%|██▊       | 3626/12776 [36:54<45:53,  3.32it/s]                                                     28%|██▊       | 3626/12776 [36:54<45:53,  3.32it/s] 28%|██▊       | 3627/12776 [36:55<43:39,  3.49it/s]                                                     28%|██▊       | 3627/12776 [36:55<43:39,  3.49it/s] 28%|██▊       | 3628/12776 [36:55<45:00,  3.39it/s]                                                     28%|██▊       | 3628/12776 [36:55<45:00,  3.39it/s] 28%|██▊       | 3629/12776 [36:55<42:10,  3.61it/s]                                                     28%|██▊       | 3629/12776 [36:55<42:10,  3.61it/s] 28%|██▊       | 3630/12776 [36:55<40:04,  3.80it/s]                                                     28%|██▊       | 3630/12776 [36:55<40:04,  3.80it/s] 28%|██▊       | 3631/12776 [36:56<38:21,  3.97it/s]                                                     28%|██▊       | 3631/12776 [36:56<38:21,  3.97it/s] 28%|██▊       | 3632/12776 [36:56<41:45,  3.65it/s]                                                     28%|██▊       | 3632/12776 [36:56<41:45,  3.65it/s] 28%|██▊       | 3633/12776 [36:56<39:24,  3.87it/s]                                                     28%|██▊       | 3633/12776 [36:56<39:24,  3.87it/s] 28%|██▊       | 3634/12776 [36:56<37:13,  4.09it/s]                                                     28%|██▊       | 3634/12776 [36:56<37:13,  4.09it/s] 28%|██▊       | 3635/12776 [36:57<35:27,  4.30it/s]                                                     28%|██▊       | 3635/12776 [36:57<35:27,  4.30it/s] 28%|██▊       | 3636/12776 [36:57<33:53,  4.50it/s]                                                     28%|██▊       | 3636/12776 [36:57<33:53,  4.50it/s] 28%|██▊       | 3637/12776 [36:57<36:16,  4.20it/s]                                                     28%|██▊       | 3637/12776 [36:57<36:16,  4.20it/s] 28%|██▊       | 3638/12776 [36:57<34:05,  4.47it/s]                                                     28%|██▊       | 3638/12776 [36:57<34:05,  4.47it/s] 28%|██▊       | 3639/12776 [36:57<32:29,  4.69it/s]                                                     28%|██▊       | 3639/12776 [36:57<32:29,  4.69it/s] 28%|██▊       | 3640/12776 [36:58<31:07,  4.89it/s]                                                     28%|██▊       | 3640/12776 [36:58<31:07,  4.89it/s] 28%|██▊       | 3641/12776 [36:58<30:09,  5.05it/s]                                                     28%|██▊       | 3641/12776 [36:58<30:09,  5.05it/s] 29%|██▊       | 3642/12776 [36:58<29:14,  5.21it/s]                                                     29%|██▊       | 3642/12776 [36:58<29:14,  5.21it/s] 29%|██▊       | 3643/12776 [36:58<34:55,  4.36it/s]                                                     29%|██▊       | 3643/12776 [36:58<34:55,  4.36it/s] 29%|██▊       | 3644/12776 [36:59<32:26,  4.69it/s]                                                     29%|██▊       | 3644/12776 [36:59<32:26,  4.69it/s] 29%|██▊       | 3645/12776 [36:59<30:34,  4.98it/s]                                                     29%|██▊       | 3645/12776 [36:59<30:34,  4.98it/s] 29%|██▊       | 3646/12776 [36:59<29:11,  5.21it/s]                                                     29%|██▊       | 3646/12776 [36:59<29:11,  5.21it/s] 29%|██▊       | 3647/12776 [36:59<28:14,  5.39it/s]                                                     29%|██▊       | 3647/12776 [36:59<28:14,  5.39it/s] 29%|██▊       | 3648/12776 [36:59<27:14,  5.58it/s]                                                     29%|██▊       | 3648/12776 [36:59<27:14,  5.58it/s] 29%|██▊       | 3649/12776 [36:59<31:45,  4.79it/s]                                                     29%|██▊       | 3649/12776 [36:59<31:45,  4.79it/s] 29%|██▊       | 3650/12776 [37:00<53:42,  2.83it/s]                                                     29%|██▊       | 3650/12776 [37:00<53:42,  2.83it/s] 29%|██▊       | 3651/12776 [37:01<1:34:27,  1.61it/s]                                                       29%|██▊       | 3651/12776 [37:01<1:34:27,  1.61it/s] 29%|██▊       | 3652/12776 [37:02<1:47:43,  1.41it/s]                                                       29%|██▊       | 3652/12776 [37:02<1:47:43,  1.41it/s] 29%|██▊       | 3653/12776 [37:03<1:54:36,  1.33it/s]                                                       29%|██▊       | 3653/12776 [37:03<1:54:36,  1.33it/s] 29%|██▊       | 3654/12776 [37:04<1:57:52,  1.29it/s]                                                       29%|██▊       | 3654/12776 [37:04<1:57:52,  1.29it/s] 29%|██▊       | 3655/12776 [37:05<1:54:45,  1.32it/s]                                                       29%|██▊       | 3655/12776 [37:05<1:54:45,  1.32it/s] 29%|██▊       | 3656/12776 [37:05<1:53:17,  1.34it/s]                                                       29%|██▊       | 3656/12776 [37:05<1:53:17,  1.34it/s] 29%|██▊       | 3657/12776 [37:06<1:47:24,  1.42it/s]                                                       29%|██▊       | 3657/12776 [37:06<1:47:24,  1.42it/s] 29%|██▊       | 3658/12776 [37:07<1:42:43,  1.48it/s]                                                       29%|██▊       | 3658/12776 [37:07<1:42:43,  1.48it/s] 29%|██▊       | 3659/12776 [37:07<1:36:20,  1.58it/s]                                                       29%|██▊       | 3659/12776 [37:07<1:36:20,  1.58it/s] 29%|██▊       | 3660/12776 [37:08<1:32:38,  1.64it/s]                                                       29%|██▊       | 3660/12776 [37:08<1:32:38,  1.64it/s] 29%|██▊       | 3661/12776 [37:08<1:27:32,  1.74it/s]                                                       29%|██▊       | 3661/12776 [37:08<1:27:32,  1.74it/s] 29%|██▊       | 3662/12776 [37:09<1:26:28,  1.76it/s]                                                       29%|██▊       | 3662/12776 [37:09<1:26:28,  1.76it/s] 29%|██▊       | 3663/12776 [37:09<1:20:59,  1.88it/s]                                                       29%|██▊       | 3663/12776 [37:09<1:20:59,  1.88it/s] 29%|██▊       | 3664/12776 [37:10<1:19:02,  1.92it/s]                                                       29%|██▊       | 3664/12776 [37:10<1:19:02,  1.92it/s] 29%|██▊       | 3665/12776 [37:10<1:13:59,  2.05it/s]                                                       29%|██▊       | 3665/12776 [37:10<1:13:59,  2.05it/s] 29%|██▊       | 3666/12776 [37:11<1:09:46,  2.18it/s]                                                       29%|██▊       | 3666/12776 [37:11<1:09:46,  2.18it/s] 29%|██▊       | 3667/12776 [37:11<1:13:02,  2.08it/s]                                                       29%|██▊       | 3667/12776 [37:11<1:13:02,  2.08it/s] 29%|██▊       | 3668/12776 [37:11<1:07:42,  2.24it/s]                                                       29%|██▊       | 3668/12776 [37:11<1:07:42,  2.24it/s] 29%|██▊       | 3669/12776 [37:12<1:03:39,  2.38it/s]                                                       29%|██▊       | 3669/12776 [37:12<1:03:39,  2.38it/s] 29%|██▊       | 3670/12776 [37:12<1:04:59,  2.34it/s]                                                       29%|██▊       | 3670/12776 [37:12<1:04:59,  2.34it/s] 29%|██▊       | 3671/12776 [37:13<1:00:09,  2.52it/s]                                                       29%|██▊       | 3671/12776 [37:13<1:00:09,  2.52it/s] 29%|██▊       | 3672/12776 [37:13<56:38,  2.68it/s]                                                       29%|██▊       | 3672/12776 [37:13<56:38,  2.68it/s] 29%|██▊       | 3673/12776 [37:13<56:39,  2.68it/s]                                                     29%|██▊       | 3673/12776 [37:13<56:39,  2.68it/s] 29%|██▉       | 3674/12776 [37:14<52:48,  2.87it/s]                                                     29%|██▉       | 3674/12776 [37:14<52:48,  2.87it/s] 29%|██▉       | 3675/12776 [37:14<49:26,  3.07it/s]                                                     29%|██▉       | 3675/12776 [37:14<49:26,  3.07it/s] 29%|██▉       | 3676/12776 [37:14<46:51,  3.24it/s]                                                     29%|██▉       | 3676/12776 [37:14<46:51,  3.24it/s] 29%|██▉       | 3677/12776 [37:14<47:12,  3.21it/s]                                                     29%|██▉       | 3677/12776 [37:14<47:12,  3.21it/s] 29%|██▉       | 3678/12776 [37:15<45:41,  3.32it/s]                                                     29%|██▉       | 3678/12776 [37:15<45:41,  3.32it/s] 29%|██▉       | 3679/12776 [37:15<44:14,  3.43it/s]                                                     29%|██▉       | 3679/12776 [37:15<44:14,  3.43it/s] 29%|██▉       | 3680/12776 [37:15<43:03,  3.52it/s]                                                     29%|██▉       | 3680/12776 [37:15<43:03,  3.52it/s] 29%|██▉       | 3681/12776 [37:16<44:22,  3.42it/s]                                                     29%|██▉       | 3681/12776 [37:16<44:22,  3.42it/s] 29%|██▉       | 3682/12776 [37:16<42:24,  3.57it/s]                                                     29%|██▉       | 3682/12776 [37:16<42:24,  3.57it/s] 29%|██▉       | 3683/12776 [37:16<40:49,  3.71it/s]                                                     29%|██▉       | 3683/12776 [37:16<40:49,  3.71it/s] 29%|██▉       | 3684/12776 [37:16<39:24,  3.85it/s]                                                     29%|██▉       | 3684/12776 [37:16<39:24,  3.85it/s] 29%|██▉       | 3685/12776 [37:17<38:09,  3.97it/s]                                                     29%|██▉       | 3685/12776 [37:17<38:09,  3.97it/s] 29%|██▉       | 3686/12776 [37:17<41:04,  3.69it/s]                                                     29%|██▉       | 3686/12776 [37:17<41:04,  3.69it/s] 29%|██▉       | 3687/12776 [37:17<38:46,  3.91it/s]                                                     29%|██▉       | 3687/12776 [37:17<38:46,  3.91it/s] 29%|██▉       | 3688/12776 [37:17<36:58,  4.10it/s]                                                     29%|██▉       | 3688/12776 [37:17<36:58,  4.10it/s] 29%|██▉       | 3689/12776 [37:17<35:31,  4.26it/s]                                                     29%|██▉       | 3689/12776 [37:17<35:31,  4.26it/s] 29%|██▉       | 3690/12776 [37:18<34:24,  4.40it/s]                                                     29%|██▉       | 3690/12776 [37:18<34:24,  4.40it/s] 29%|██▉       | 3691/12776 [37:18<37:44,  4.01it/s]                                                     29%|██▉       | 3691/12776 [37:18<37:44,  4.01it/s] 29%|██▉       | 3692/12776 [37:18<35:40,  4.24it/s]                                                     29%|██▉       | 3692/12776 [37:18<35:40,  4.24it/s] 29%|██▉       | 3693/12776 [37:18<34:04,  4.44it/s]                                                     29%|██▉       | 3693/12776 [37:18<34:04,  4.44it/s] 29%|██▉       | 3694/12776 [37:19<32:52,  4.61it/s]                                                     29%|██▉       | 3694/12776 [37:19<32:52,  4.61it/s] 29%|██▉       | 3695/12776 [37:19<31:56,  4.74it/s]                                                     29%|██▉       | 3695/12776 [37:19<31:56,  4.74it/s] 29%|██▉       | 3696/12776 [37:19<35:57,  4.21it/s]                                                     29%|██▉       | 3696/12776 [37:19<35:57,  4.21it/s] 29%|██▉       | 3697/12776 [37:19<33:54,  4.46it/s]                                                    {'loss': 0.6772, 'grad_norm': 1.4252662658691406, 'learning_rate': 0.00022399804496578686, 'epoch': 0.57}
+{'loss': 0.897, 'grad_norm': 2.668489456176758, 'learning_rate': 0.00022397360703812314, 'epoch': 0.57}
+{'loss': 0.6887, 'grad_norm': 1.0190789699554443, 'learning_rate': 0.00022394916911045942, 'epoch': 0.57}
+{'loss': 0.7768, 'grad_norm': 1.3881101608276367, 'learning_rate': 0.00022392473118279567, 'epoch': 0.57}
+{'loss': 0.6853, 'grad_norm': 1.1338660717010498, 'learning_rate': 0.00022390029325513195, 'epoch': 0.57}
+{'loss': 0.7319, 'grad_norm': 1.3985800743103027, 'learning_rate': 0.00022387585532746823, 'epoch': 0.57}
+{'loss': 1.2041, 'grad_norm': 3.043156147003174, 'learning_rate': 0.00022385141739980448, 'epoch': 0.57}
+{'loss': 0.5941, 'grad_norm': 1.2043315172195435, 'learning_rate': 0.00022382697947214073, 'epoch': 0.57}
+{'loss': 1.1688, 'grad_norm': 1.9357560873031616, 'learning_rate': 0.000223802541544477, 'epoch': 0.57}
+{'loss': 0.9962, 'grad_norm': 2.4211204051971436, 'learning_rate': 0.00022377810361681326, 'epoch': 0.57}
+{'loss': 0.7847, 'grad_norm': 2.997868299484253, 'learning_rate': 0.00022375366568914954, 'epoch': 0.57}
+{'loss': 1.0314, 'grad_norm': 2.5542614459991455, 'learning_rate': 0.00022372922776148582, 'epoch': 0.57}
+{'loss': 1.4012, 'grad_norm': 2.0041849613189697, 'learning_rate': 0.00022370478983382207, 'epoch': 0.57}
+{'loss': 1.2137, 'grad_norm': 2.1820318698883057, 'learning_rate': 0.00022368035190615835, 'epoch': 0.57}
+{'loss': 1.0993, 'grad_norm': 2.45853853225708, 'learning_rate': 0.00022365591397849463, 'epoch': 0.57}
+{'loss': 1.1108, 'grad_norm': 2.2817904949188232, 'learning_rate': 0.00022363147605083085, 'epoch': 0.57}
+{'loss': 0.9754, 'grad_norm': 1.8414242267608643, 'learning_rate': 0.00022360703812316713, 'epoch': 0.57}
+{'loss': 0.6568, 'grad_norm': 1.4161016941070557, 'learning_rate': 0.0002235826001955034, 'epoch': 0.57}
+{'loss': 1.1306, 'grad_norm': 1.995675802230835, 'learning_rate': 0.00022355816226783966, 'epoch': 0.57}
+{'loss': 1.55, 'grad_norm': 2.4192090034484863, 'learning_rate': 0.00022353372434017594, 'epoch': 0.57}
+{'loss': 1.8578, 'grad_norm': 3.0786073207855225, 'learning_rate': 0.00022350928641251221, 'epoch': 0.57}
+{'loss': 1.5919, 'grad_norm': 2.814448118209839, 'learning_rate': 0.00022348484848484847, 'epoch': 0.57}
+{'loss': 0.9254, 'grad_norm': 1.6886667013168335, 'learning_rate': 0.00022346041055718474, 'epoch': 0.57}
+{'loss': 1.1636, 'grad_norm': 3.294468879699707, 'learning_rate': 0.00022343597262952102, 'epoch': 0.57}
+{'loss': 1.2624, 'grad_norm': 2.3327720165252686, 'learning_rate': 0.00022341153470185725, 'epoch': 0.57}
+{'loss': 0.4536, 'grad_norm': 1.5095564126968384, 'learning_rate': 0.00022338709677419352, 'epoch': 0.57}
+{'loss': 0.9293, 'grad_norm': 3.851837635040283, 'learning_rate': 0.0002233626588465298, 'epoch': 0.57}
+{'loss': 0.8068, 'grad_norm': 2.2999038696289062, 'learning_rate': 0.00022333822091886605, 'epoch': 0.57}
+{'loss': 0.3304, 'grad_norm': 1.686329960823059, 'learning_rate': 0.00022331378299120233, 'epoch': 0.57}
+{'loss': 0.7134, 'grad_norm': 2.073951244354248, 'learning_rate': 0.0002232893450635386, 'epoch': 0.57}
+{'loss': 1.1497, 'grad_norm': 1.7170746326446533, 'learning_rate': 0.00022326490713587484, 'epoch': 0.57}
+{'loss': 0.3378, 'grad_norm': 0.47634878754615784, 'learning_rate': 0.00022324046920821111, 'epoch': 0.57}
+{'loss': 0.4027, 'grad_norm': 0.629563570022583, 'learning_rate': 0.0002232160312805474, 'epoch': 0.57}
+{'loss': 0.3611, 'grad_norm': 0.8603024482727051, 'learning_rate': 0.00022319159335288364, 'epoch': 0.57}
+{'loss': 0.3809, 'grad_norm': 0.5923214554786682, 'learning_rate': 0.00022316715542521992, 'epoch': 0.57}
+{'loss': 0.4392, 'grad_norm': 0.8817975521087646, 'learning_rate': 0.0002231427174975562, 'epoch': 0.57}
+{'loss': 0.4117, 'grad_norm': 0.8190473914146423, 'learning_rate': 0.00022311827956989245, 'epoch': 0.57}
+{'loss': 0.2774, 'grad_norm': 0.6798378229141235, 'learning_rate': 0.00022309384164222873, 'epoch': 0.57}
+{'loss': 0.4713, 'grad_norm': 0.8386220932006836, 'learning_rate': 0.000223069403714565, 'epoch': 0.57}
+{'loss': 0.5223, 'grad_norm': 0.8392332196235657, 'learning_rate': 0.00022304496578690123, 'epoch': 0.57}
+{'loss': 0.4972, 'grad_norm': 1.1230872869491577, 'learning_rate': 0.0002230205278592375, 'epoch': 0.57}
+{'loss': 0.3689, 'grad_norm': 1.0100561380386353, 'learning_rate': 0.0002229960899315738, 'epoch': 0.57}
+{'loss': 0.3825, 'grad_norm': 0.7954285144805908, 'learning_rate': 0.00022297165200391004, 'epoch': 0.57}
+{'loss': 0.5164, 'grad_norm': 1.0235164165496826, 'learning_rate': 0.00022294721407624632, 'epoch': 0.57}
+{'loss': 0.5941, 'grad_norm': 1.4720885753631592, 'learning_rate': 0.0002229227761485826, 'epoch': 0.57}
+{'loss': 0.4673, 'grad_norm': 0.6552546620368958, 'learning_rate': 0.00022289833822091885, 'epoch': 0.57}
+{'loss': 0.5022, 'grad_norm': 0.9891802072525024, 'learning_rate': 0.00022287390029325513, 'epoch': 0.57}
+{'loss': 0.5776, 'grad_norm': 1.503885269165039, 'learning_rate': 0.0002228494623655914, 'epoch': 0.57}
+{'loss': 0.5049, 'grad_norm': 1.1407116651535034, 'learning_rate': 0.00022282502443792763, 'epoch': 0.57}
+{'loss': 0.5241, 'grad_norm': 1.1594735383987427, 'learning_rate': 0.0002228005865102639, 'epoch': 0.57}
+{'loss': 0.3553, 'grad_norm': 0.8916786313056946, 'learning_rate': 0.00022277614858260019, 'epoch': 0.57}
+{'loss': 0.7108, 'grad_norm': 1.9808990955352783, 'learning_rate': 0.00022275171065493644, 'epoch': 0.57}
+{'loss': 0.5996, 'grad_norm': 1.4254517555236816, 'learning_rate': 0.00022272727272727272, 'epoch': 0.57}
+{'loss': 0.691, 'grad_norm': 1.409542441368103, 'learning_rate': 0.000222702834799609, 'epoch': 0.57}
+{'loss': 0.5769, 'grad_norm': 1.5165317058563232, 'learning_rate': 0.00022267839687194522, 'epoch': 0.58}
+{'loss': 0.7262, 'grad_norm': 2.6416120529174805, 'learning_rate': 0.0002226539589442815, 'epoch': 0.58}
+{'loss': 0.6493, 'grad_norm': 2.1240227222442627, 'learning_rate': 0.00022262952101661777, 'epoch': 0.58}
+{'loss': 0.8552, 'grad_norm': 1.722774624824524, 'learning_rate': 0.00022260508308895403, 'epoch': 0.58}
+{'loss': 1.0542, 'grad_norm': 1.5990309715270996, 'learning_rate': 0.0002225806451612903, 'epoch': 0.58}
+{'loss': 0.752, 'grad_norm': 1.7411298751831055, 'learning_rate': 0.00022255620723362658, 'epoch': 0.58}
+{'loss': 1.3362, 'grad_norm': 2.393709182739258, 'learning_rate': 0.00022253176930596283, 'epoch': 0.58}
+{'loss': 0.77, 'grad_norm': 1.4573614597320557, 'learning_rate': 0.0002225073313782991, 'epoch': 0.58}
+{'loss': 0.9428, 'grad_norm': 1.8054333925247192, 'learning_rate': 0.0002224828934506354, 'epoch': 0.58}
+{'loss': 0.8786, 'grad_norm': 2.158707618713379, 'learning_rate': 0.00022245845552297161, 'epoch': 0.58}
+{'loss': 0.7271, 'grad_norm': 1.175864577293396, 'learning_rate': 0.0002224340175953079, 'epoch': 0.58}
+{'loss': 1.1071, 'grad_norm': 3.2046947479248047, 'learning_rate': 0.00022240957966764417, 'epoch': 0.58}
+{'loss': 1.247, 'grad_norm': 2.6925206184387207, 'learning_rate': 0.00022238514173998042, 'epoch': 0.58}
+{'loss': 0.9495, 'grad_norm': 2.9383318424224854, 'learning_rate': 0.0002223607038123167, 'epoch': 0.58}
+{'loss': 0.862, 'grad_norm': 2.484539031982422, 'learning_rate': 0.00022233626588465298, 'epoch': 0.58}
+{'loss': 1.6457, 'grad_norm': 3.069092035293579, 'learning_rate': 0.00022231182795698923, 'epoch': 0.58}
+{'loss': 0.7699, 'grad_norm': 2.6385087966918945, 'learning_rate': 0.0002222873900293255, 'epoch': 0.58}
+{'loss': 1.2982, 'grad_norm': 4.0282487869262695, 'learning_rate': 0.00022226295210166176, 'epoch': 0.58}
+{'loss': 1.1737, 'grad_norm': 1.7925729751586914, 'learning_rate': 0.000222238514173998, 'epoch': 0.58}
+{'loss': 1.3953, 'grad_norm': 3.2553768157958984, 'learning_rate': 0.0002222140762463343, 'epoch': 0.58}
+{'loss': 0.9012, 'grad_norm': 3.523599147796631, 'learning_rate': 0.00022218963831867057, 'epoch': 0.58}
+{'loss': 1.0458, 'grad_norm': 2.5558550357818604, 'learning_rate': 0.00022216520039100682, 'epoch': 0.58}
+{'loss': 0.251, 'grad_norm': 0.8834622502326965, 'learning_rate': 0.0002221407624633431, 'epoch': 0.58}
+ 29%|██▉       | 3697/12776 [37:19<33:54,  4.46it/s] 29%|██▉       | 3698/12776 [37:19<32:11,  4.70it/s]                                                     29%|██▉       | 3698/12776 [37:19<32:11,  4.70it/s] 29%|██▉       | 3699/12776 [37:20<30:56,  4.89it/s]                                                     29%|██▉       | 3699/12776 [37:20<30:56,  4.89it/s] 29%|██▉       | 3700/12776 [37:20<54:57,  2.75it/s]                                                     29%|██▉       | 3700/12776 [37:20<54:57,  2.75it/s] 29%|██▉       | 3701/12776 [37:22<1:49:09,  1.39it/s]                                                       29%|██▉       | 3701/12776 [37:22<1:49:09,  1.39it/s] 29%|██▉       | 3702/12776 [37:23<2:05:08,  1.21it/s]                                                       29%|██▉       | 3702/12776 [37:23<2:05:08,  1.21it/s] 29%|██▉       | 3703/12776 [37:24<2:10:30,  1.16it/s]                                                       29%|██▉       | 3703/12776 [37:24<2:10:30,  1.16it/s] 29%|██▉       | 3704/12776 [37:25<2:09:50,  1.16it/s]                                                       29%|██▉       | 3704/12776 [37:25<2:09:50,  1.16it/s] 29%|██▉       | 3705/12776 [37:26<2:05:30,  1.20it/s]                                                       29%|██▉       | 3705/12776 [37:26<2:05:30,  1.20it/s] 29%|██▉       | 3706/12776 [37:26<2:02:12,  1.24it/s]                                                       29%|██▉       | 3706/12776 [37:26<2:02:12,  1.24it/s] 29%|██▉       | 3707/12776 [37:27<1:58:53,  1.27it/s]                                                       29%|██▉       | 3707/12776 [37:27<1:58:53,  1.27it/s] 29%|██▉       | 3708/12776 [37:28<1:51:36,  1.35it/s]                                                       29%|██▉       | 3708/12776 [37:28<1:51:36,  1.35it/s] 29%|██▉       | 3709/12776 [37:28<1:44:38,  1.44it/s]                                                       29%|██▉       | 3709/12776 [37:28<1:44:38,  1.44it/s] 29%|██▉       | 3710/12776 [37:29<1:38:46,  1.53it/s]                                                       29%|██▉       | 3710/12776 [37:29<1:38:46,  1.53it/s] 29%|██▉       | 3711/12776 [37:29<1:34:54,  1.59it/s]                                                       29%|██▉       | 3711/12776 [37:29<1:34:54,  1.59it/s] 29%|██▉       | 3712/12776 [37:30<1:29:49,  1.68it/s]                                                       29%|██▉       | 3712/12776 [37:30<1:29:49,  1.68it/s] 29%|██▉       | 3713/12776 [37:31<1:31:40,  1.65it/s]                                                       29%|██▉       | 3713/12776 [37:31<1:31:40,  1.65it/s] 29%|██▉       | 3714/12776 [37:31<1:24:24,  1.79it/s]                                                       29%|██▉       | 3714/12776 [37:31<1:24:24,  1.79it/s] 29%|██▉       | 3715/12776 [37:31<1:18:42,  1.92it/s]                                                       29%|██▉       | 3715/12776 [37:31<1:18:42,  1.92it/s] 29%|██▉       | 3716/12776 [37:32<1:16:32,  1.97it/s]                                                       29%|██▉       | 3716/12776 [37:32<1:16:32,  1.97it/s] 29%|██▉       | 3717/12776 [37:32<1:11:44,  2.10it/s]                                                       29%|██▉       | 3717/12776 [37:32<1:11:44,  2.10it/s] 29%|██▉       | 3718/12776 [37:33<1:08:00,  2.22it/s]                                                       29%|██▉       | 3718/12776 [37:33<1:08:00,  2.22it/s] 29%|██▉       | 3719/12776 [37:33<1:07:35,  2.23it/s]                                                       29%|██▉       | 3719/12776 [37:33<1:07:35,  2.23it/s] 29%|██▉       | 3720/12776 [37:34<1:03:25,  2.38it/s]                                                       29%|██▉       | 3720/12776 [37:34<1:03:25,  2.38it/s] 29%|██▉       | 3721/12776 [37:34<1:01:09,  2.47it/s]                                                       29%|██▉       | 3721/12776 [37:34<1:01:09,  2.47it/s] 29%|██▉       | 3722/12776 [37:34<1:01:11,  2.47it/s]                                                       29%|██▉       | 3722/12776 [37:34<1:01:11,  2.47it/s] 29%|██▉       | 3723/12776 [37:35<57:41,  2.62it/s]                                                       29%|██▉       | 3723/12776 [37:35<57:41,  2.62it/s] 29%|██▉       | 3724/12776 [37:35<54:14,  2.78it/s]                                                     29%|██▉       | 3724/12776 [37:35<54:14,  2.78it/s] 29%|██▉       | 3725/12776 [37:35<56:19,  2.68it/s]                                                     29%|██▉       | 3725/12776 [37:35<56:19,  2.68it/s] 29%|██▉       | 3726/12776 [37:36<52:56,  2.85it/s]                                                     29%|██▉       | 3726/12776 [37:36<52:56,  2.85it/s] 29%|██▉       | 3727/12776 [37:36<49:56,  3.02it/s]                                                     29%|██▉       | 3727/12776 [37:36<49:56,  3.02it/s] 29%|██▉       | 3728/12776 [37:36<52:37,  2.87it/s]                                                     29%|██▉       | 3728/12776 [37:36<52:37,  2.87it/s] 29%|██▉       | 3729/12776 [37:37<49:02,  3.07it/s]                                                     29%|██▉       | 3729/12776 [37:37<49:02,  3.07it/s] 29%|██▉       | 3730/12776 [37:37<46:14,  3.26it/s]                                                     29%|██▉       | 3730/12776 [37:37<46:14,  3.26it/s] 29%|██▉       | 3731/12776 [37:37<43:54,  3.43it/s]                                                     29%|██▉       | 3731/12776 [37:37<43:54,  3.43it/s] 29%|██▉       | 3732/12776 [37:37<46:43,  3.23it/s]                                                     29%|██▉       | 3732/12776 [37:37<46:43,  3.23it/s] 29%|██▉       | 3733/12776 [37:38<43:55,  3.43it/s]                                                     29%|██▉       | 3733/12776 [37:38<43:55,  3.43it/s] 29%|██▉       | 3734/12776 [37:38<41:46,  3.61it/s]                                                     29%|██▉       | 3734/12776 [37:38<41:46,  3.61it/s] 29%|██▉       | 3735/12776 [37:38<39:57,  3.77it/s]                                                     29%|██▉       | 3735/12776 [37:38<39:57,  3.77it/s] 29%|██▉       | 3736/12776 [37:38<38:25,  3.92it/s]                                                     29%|██▉       | 3736/12776 [37:38<38:25,  3.92it/s] 29%|██▉       | 3737/12776 [37:39<40:26,  3.72it/s]                                                     29%|██▉       | 3737/12776 [37:39<40:26,  3.72it/s] 29%|██▉       | 3738/12776 [37:39<38:14,  3.94it/s]                                                     29%|██▉       | 3738/12776 [37:39<38:14,  3.94it/s] 29%|██▉       | 3739/12776 [37:39<36:26,  4.13it/s]                                                     29%|██▉       | 3739/12776 [37:39<36:26,  4.13it/s] 29%|██▉       | 3740/12776 [37:39<35:00,  4.30it/s]                                                     29%|██▉       | 3740/12776 [37:39<35:00,  4.30it/s] 29%|██▉       | 3741/12776 [37:40<33:59,  4.43it/s]                                                     29%|██▉       | 3741/12776 [37:40<33:59,  4.43it/s] 29%|██▉       | 3742/12776 [37:40<37:15,  4.04it/s]                                                     29%|██▉       | 3742/12776 [37:40<37:15,  4.04it/s] 29%|██▉       | 3743/12776 [37:40<35:17,  4.27it/s]                                                     29%|██▉       | 3743/12776 [37:40<35:17,  4.27it/s] 29%|██▉       | 3744/12776 [37:40<33:41,  4.47it/s]                                                     29%|██▉       | 3744/12776 [37:40<33:41,  4.47it/s] 29%|██▉       | 3745/12776 [37:40<32:31,  4.63it/s]                                                     29%|██▉       | 3745/12776 [37:40<32:31,  4.63it/s] 29%|██▉       | 3746/12776 [37:41<31:38,  4.76it/s]                                                     29%|██▉       | 3746/12776 [37:41<31:38,  4.76it/s] 29%|██▉       | 3747/12776 [37:41<36:11,  4.16it/s]                                                     29%|██▉       | 3747/12776 [37:41<36:11,  4.16it/s] 29%|██▉       | 3748/12776 [37:41<33:56,  4.43it/s]                                                     29%|██▉       | 3748/12776 [37:41<33:56,  4.43it/s] 29%|██▉       | 3749/12776 [37:41<32:04,  4.69it/s]                                                     29%|██▉       | 3749/12776 [37:41<32:04,  4.69it/s] 29%|██▉       | 3750/12776 [37:42<58:32,  2.57it/s]                                                     29%|██▉       | 3750/12776 [37:42<58:32,  2.57it/s] 29%|██▉       | 3751/12776 [37:44<1:59:00,  1.26it/s]                                                       29%|██▉       | 3751/12776 [37:44<1:59:00,  1.26it/s] 29%|██▉       | 3752/12776 [37:45<2:11:48,  1.14it/s]                                                       29%|██▉       | 3752/12776 [37:45<2:11:48,  1.14it/s] 29%|██▉       | 3753/12776 [37:46<2:18:07,  1.09it/s]                                                       29%|██▉       | 3753/12776 [37:46<2:18:07,  1.09it/s] 29%|██▉       | 3754/12776 [37:47<2:18:23,  1.09it/s]                                                       29%|██▉       | 3754/12776 [37:47<2:18:23,  1.09it/s] 29%|██▉       | 3755/12776 [37:48<2:13:36,  1.13it/s]                                                       29%|██▉       | 3755/12776 [37:48<2:13:36,  1.13it/s] 29%|██▉       | 3756/12776 [37:48<2:07:06,  1.18it/s]                                                       29%|██▉       | 3756/12776 [37:48<2:07:06,  1.18it/s] 29%|██▉       | 3757/12776 [37:49<2:04:43,  1.21it/s]                                                       29%|██▉       | 3757/12776 [37:49<2:04:43,  1.21it/s] 29%|██▉       | 3758/12776 [37:50<2:00:41,  1.25it/s]                                                       29%|██▉       | 3758/12776 [37:50<2:00:41,  1.25it/s] 29%|██▉       | 3759/12776 [37:51<1:53:29,  1.32it/s]                                                       29%|██▉       | 3759/12776 [37:51<1:53:29,  1.32it/s] 29%|██▉       | 3760/12776 [37:51<1:54:04,  1.32it/s]                                                       29%|██▉       | 3760/12776 [37:51<1:54:04,  1.32it/s] 29%|██▉       | 3761/12776 [37:52<1:46:27,  1.41it/s]                                                       29%|██▉       | 3761/12776 [37:52<1:46:27,  1.41it/s] 29%|██▉       | 3762/12776 [37:53<1:42:32,  1.47it/s]                                                       29%|██▉       | 3762/12776 [37:53<1:42:32,  1.47it/s] 29%|██▉       | 3763/12776 [37:53<1:35:44,  1.57it/s]                                                       29%|██▉       | 3763/12776 [37:53<1:35:44,  1.57it/s] 29%|██▉       | 3764/12776 [37:54<1:33:20,  1.61it/s]                                                       29%|██▉       | 3764/12776 [37:54<1:33:20,  1.61it/s] 29%|██▉       | 3765/12776 [37:54<1:26:14,  1.74it/s]                                                       29%|██▉       | 3765/12776 [37:54<1:26:14,  1.74it/s] 29%|██▉       | 3766/12776 [37:55<1:25:51,  1.75it/s]                                                       29%|█���▉       | 3766/12776 [37:55<1:25:51,  1.75it/s] 29%|██▉       | 3767/12776 [37:55<1:19:38,  1.89it/s]                                                       29%|██▉       | 3767/12776 [37:55<1:19:38,  1.89it/s] 29%|██▉       | 3768/12776 [37:56<1:19:16,  1.89it/s]                                                       29%|██▉       | 3768/12776 [37:56<1:19:16,  1.89it/s] 30%|██▉       | 3769/12776 [37:56<1:13:12,  2.05it/s]                                                       30%|██▉       | 3769/12776 [37:56<1:13:12,  2.05it/s] 30%|██▉       | 3770/12776 [37:56<1:08:26,  2.19it/s]                                                       30%|██▉       | 3770/12776 [37:56<1:08:26,  2.19it/s] 30%|██▉       | 3771/12776 [37:57<1:04:44,  2.32it/s]                                                       30%|██▉       | 3771/12776 [37:57<1:04:44,  2.32it/s] 30%|██▉       | 3772/12776 [37:57<1:01:11,  2.45it/s]                                                       30%|██▉       | 3772/12776 [37:57<1:01:11,  2.45it/s] 30%|██▉       | 3773/12776 [37:58<58:23,  2.57it/s]                                                       30%|██▉       | 3773/12776 [37:58<58:23,  2.57it/s] 30%|██▉       | 3774/12776 [37:58<1:00:14,  2.49it/s]                                                      {'loss': 0.653, 'grad_norm': 1.9280049800872803, 'learning_rate': 0.00022211632453567938, 'epoch': 0.58}
+{'loss': 0.8649, 'grad_norm': 2.2830820083618164, 'learning_rate': 0.0002220918866080156, 'epoch': 0.58}
+{'loss': 0.7386, 'grad_norm': 3.383026123046875, 'learning_rate': 0.00022206744868035188, 'epoch': 0.58}
+{'loss': 1.1781, 'grad_norm': 2.898946523666382, 'learning_rate': 0.00022204301075268816, 'epoch': 0.58}
+{'loss': 0.4284, 'grad_norm': 0.6618011593818665, 'learning_rate': 0.0002220185728250244, 'epoch': 0.58}
+{'loss': 0.3873, 'grad_norm': 0.7165639996528625, 'learning_rate': 0.00022199413489736069, 'epoch': 0.58}
+{'loss': 0.5621, 'grad_norm': 0.8477781414985657, 'learning_rate': 0.00022196969696969696, 'epoch': 0.58}
+{'loss': 0.5527, 'grad_norm': 1.0597972869873047, 'learning_rate': 0.00022194525904203322, 'epoch': 0.58}
+{'loss': 0.4481, 'grad_norm': 0.9503958225250244, 'learning_rate': 0.0002219208211143695, 'epoch': 0.58}
+{'loss': 0.4143, 'grad_norm': 1.0318113565444946, 'learning_rate': 0.00022189638318670577, 'epoch': 0.58}
+{'loss': 0.3822, 'grad_norm': 0.5940424203872681, 'learning_rate': 0.000221871945259042, 'epoch': 0.58}
+{'loss': 0.3572, 'grad_norm': 0.8478888869285583, 'learning_rate': 0.00022184750733137828, 'epoch': 0.58}
+{'loss': 0.2797, 'grad_norm': 0.69258052110672, 'learning_rate': 0.00022182306940371455, 'epoch': 0.58}
+{'loss': 0.4239, 'grad_norm': 0.7424476742744446, 'learning_rate': 0.0002217986314760508, 'epoch': 0.58}
+{'loss': 0.4964, 'grad_norm': 1.1544733047485352, 'learning_rate': 0.00022177419354838708, 'epoch': 0.58}
+{'loss': 0.5127, 'grad_norm': 1.1875557899475098, 'learning_rate': 0.00022174975562072336, 'epoch': 0.58}
+{'loss': 0.4603, 'grad_norm': 1.065439224243164, 'learning_rate': 0.0002217253176930596, 'epoch': 0.58}
+{'loss': 0.6937, 'grad_norm': 1.477260708808899, 'learning_rate': 0.0002217008797653959, 'epoch': 0.58}
+{'loss': 0.6814, 'grad_norm': 1.7115600109100342, 'learning_rate': 0.00022167644183773214, 'epoch': 0.58}
+{'loss': 0.5137, 'grad_norm': 1.2147424221038818, 'learning_rate': 0.0002216520039100684, 'epoch': 0.58}
+{'loss': 0.4772, 'grad_norm': 1.544769287109375, 'learning_rate': 0.00022162756598240467, 'epoch': 0.58}
+{'loss': 0.7419, 'grad_norm': 1.2717753648757935, 'learning_rate': 0.00022160312805474095, 'epoch': 0.58}
+{'loss': 0.8306, 'grad_norm': 1.403295636177063, 'learning_rate': 0.0002215786901270772, 'epoch': 0.58}
+{'loss': 0.8163, 'grad_norm': 2.2158901691436768, 'learning_rate': 0.00022155425219941348, 'epoch': 0.58}
+{'loss': 0.7549, 'grad_norm': 1.945855975151062, 'learning_rate': 0.00022152981427174976, 'epoch': 0.58}
+{'loss': 0.6222, 'grad_norm': 2.006751775741577, 'learning_rate': 0.00022150537634408598, 'epoch': 0.58}
+{'loss': 0.5719, 'grad_norm': 1.3016244173049927, 'learning_rate': 0.00022148093841642226, 'epoch': 0.58}
+{'loss': 0.8886, 'grad_norm': 1.7161877155303955, 'learning_rate': 0.00022145650048875854, 'epoch': 0.58}
+{'loss': 0.8454, 'grad_norm': 1.5674386024475098, 'learning_rate': 0.0002214320625610948, 'epoch': 0.58}
+{'loss': 1.0747, 'grad_norm': 2.3730225563049316, 'learning_rate': 0.00022140762463343107, 'epoch': 0.58}
+{'loss': 0.6091, 'grad_norm': 2.1554417610168457, 'learning_rate': 0.00022138318670576735, 'epoch': 0.58}
+{'loss': 0.4839, 'grad_norm': 1.7122255563735962, 'learning_rate': 0.0002213587487781036, 'epoch': 0.58}
+{'loss': 0.7182, 'grad_norm': 1.168192744255066, 'learning_rate': 0.00022133431085043988, 'epoch': 0.58}
+{'loss': 1.1607, 'grad_norm': 1.9943832159042358, 'learning_rate': 0.00022130987292277615, 'epoch': 0.58}
+{'loss': 0.8025, 'grad_norm': 4.930220603942871, 'learning_rate': 0.00022128543499511238, 'epoch': 0.58}
+{'loss': 0.7035, 'grad_norm': 1.833145260810852, 'learning_rate': 0.00022126099706744866, 'epoch': 0.58}
+{'loss': 1.1647, 'grad_norm': 2.07147479057312, 'learning_rate': 0.00022123655913978494, 'epoch': 0.58}
+{'loss': 0.7464, 'grad_norm': 2.026843309402466, 'learning_rate': 0.0002212121212121212, 'epoch': 0.58}
+{'loss': 1.1607, 'grad_norm': 1.1421523094177246, 'learning_rate': 0.00022118768328445747, 'epoch': 0.58}
+{'loss': 1.0897, 'grad_norm': 3.0231730937957764, 'learning_rate': 0.00022116324535679374, 'epoch': 0.58}
+{'loss': 0.869, 'grad_norm': 1.649326205253601, 'learning_rate': 0.00022113880742913, 'epoch': 0.59}
+{'loss': 1.1095, 'grad_norm': 3.129502296447754, 'learning_rate': 0.00022111436950146625, 'epoch': 0.59}
+{'loss': 0.7454, 'grad_norm': 2.2908878326416016, 'learning_rate': 0.00022108993157380252, 'epoch': 0.59}
+{'loss': 1.3902, 'grad_norm': 2.386479377746582, 'learning_rate': 0.00022106549364613878, 'epoch': 0.59}
+{'loss': 1.2762, 'grad_norm': 2.256403923034668, 'learning_rate': 0.00022104105571847505, 'epoch': 0.59}
+{'loss': 1.1909, 'grad_norm': 3.2702090740203857, 'learning_rate': 0.00022101661779081133, 'epoch': 0.59}
+{'loss': 1.6311, 'grad_norm': 1.4588786363601685, 'learning_rate': 0.00022099217986314758, 'epoch': 0.59}
+{'loss': 0.6543, 'grad_norm': 1.102078914642334, 'learning_rate': 0.00022096774193548386, 'epoch': 0.59}
+{'loss': 1.4536, 'grad_norm': 2.0560879707336426, 'learning_rate': 0.00022094330400782014, 'epoch': 0.59}
+{'loss': 1.5638, 'grad_norm': 1.8703267574310303, 'learning_rate': 0.00022091886608015636, 'epoch': 0.59}
+{'loss': 0.919, 'grad_norm': 1.5793174505233765, 'learning_rate': 0.00022089442815249264, 'epoch': 0.59}
+{'loss': 1.7933, 'grad_norm': 3.379425287246704, 'learning_rate': 0.00022086999022482892, 'epoch': 0.59}
+{'loss': 0.6506, 'grad_norm': 2.630582571029663, 'learning_rate': 0.00022084555229716517, 'epoch': 0.59}
+{'loss': 1.0893, 'grad_norm': 1.3981788158416748, 'learning_rate': 0.00022082111436950145, 'epoch': 0.59}
+{'loss': 0.3598, 'grad_norm': 0.4805038273334503, 'learning_rate': 0.00022079667644183773, 'epoch': 0.59}
+{'loss': 0.3031, 'grad_norm': 0.3983316719532013, 'learning_rate': 0.00022077223851417398, 'epoch': 0.59}
+{'loss': 1.3055, 'grad_norm': 1.9718865156173706, 'learning_rate': 0.00022074780058651026, 'epoch': 0.59}
+{'loss': 0.3184, 'grad_norm': 0.5567598938941956, 'learning_rate': 0.00022072336265884654, 'epoch': 0.59}
+{'loss': 0.3816, 'grad_norm': 0.6025465130805969, 'learning_rate': 0.00022069892473118276, 'epoch': 0.59}
+{'loss': 0.4014, 'grad_norm': 0.9688357710838318, 'learning_rate': 0.00022067448680351904, 'epoch': 0.59}
+{'loss': 0.4133, 'grad_norm': 0.6536909341812134, 'learning_rate': 0.00022065004887585532, 'epoch': 0.59}
+{'loss': 0.3517, 'grad_norm': 0.6324391961097717, 'learning_rate': 0.00022062561094819157, 'epoch': 0.59}
+{'loss': 0.404, 'grad_norm': 0.7064645886421204, 'learning_rate': 0.00022060117302052785, 'epoch': 0.59}
+{'loss': 0.517, 'grad_norm': 0.8982453942298889, 'learning_rate': 0.00022057673509286413, 'epoch': 0.59}
+{'loss': 0.4526, 'grad_norm': 1.20693838596344, 'learning_rate': 0.00022055229716520038, 'epoch': 0.59}
+{'loss': 0.3104, 'grad_norm': 0.7581743597984314, 'learning_rate': 0.00022052785923753663, 'epoch': 0.59}
+{'loss': 0.2852, 'grad_norm': 0.5927846431732178, 'learning_rate': 0.0002205034213098729, 'epoch': 0.59}
+{'loss': 0.3494, 'grad_norm': 1.2253514528274536, 'learning_rate': 0.00022047898338220916, 'epoch': 0.59}
+{'loss': 0.3888, 'grad_norm': 0.9083871245384216, 'learning_rate': 0.00022045454545454544, 'epoch': 0.59}
+{'loss': 0.487, 'grad_norm': 0.8482705950737, 'learning_rate': 0.00022043010752688171, 'epoch': 0.59}
+{'loss': 0.4058, 'grad_norm': 0.903735876083374, 'learning_rate': 0.00022040566959921797, 'epoch': 0.59}
+{'loss': 0.7215, 'grad_norm': 1.6391104459762573, 'learning_rate': 0.00022038123167155424, 'epoch': 0.59}
+{'loss': 0.3585, 'grad_norm': 1.211521863937378, 'learning_rate': 0.00022035679374389052, 'epoch': 0.59}
+{'loss': 0.5602, 'grad_norm': 1.2085306644439697, 'learning_rate': 0.00022033235581622675, 'epoch': 0.59}
+{'loss': 0.6018, 'grad_norm': 1.251050591468811, 'learning_rate': 0.00022030791788856303, 'epoch': 0.59}
+{'loss': 1.5824, 'grad_norm': 3.4225316047668457, 'learning_rate': 0.0002202834799608993, 'epoch': 0.59}
+{'loss': 0.6043, 'grad_norm': 1.3945788145065308, 'learning_rate': 0.00022025904203323555, 'epoch': 0.59}
+ 30%|██▉       | 3774/12776 [37:58<1:00:14,  2.49it/s] 30%|██▉       | 3775/12776 [37:58<57:14,  2.62it/s]                                                       30%|██▉       | 3775/12776 [37:58<57:14,  2.62it/s] 30%|██▉       | 3776/12776 [37:59<54:27,  2.75it/s]                                                     30%|██▉       | 3776/12776 [37:59<54:27,  2.75it/s] 30%|██▉       | 3777/12776 [37:59<51:48,  2.90it/s]                                                     30%|██▉       | 3777/12776 [37:59<51:48,  2.90it/s] 30%|██▉       | 3778/12776 [37:59<51:24,  2.92it/s]                                                     30%|██▉       | 3778/12776 [37:59<51:24,  2.92it/s] 30%|██▉       | 3779/12776 [38:00<48:39,  3.08it/s]                                                     30%|██▉       | 3779/12776 [38:00<48:39,  3.08it/s] 30%|██▉       | 3780/12776 [38:00<46:29,  3.22it/s]                                                     30%|██▉       | 3780/12776 [38:00<46:29,  3.22it/s] 30%|██▉       | 3781/12776 [38:00<44:37,  3.36it/s]                                                     30%|██▉       | 3781/12776 [38:00<44:37,  3.36it/s] 30%|██▉       | 3782/12776 [38:00<45:04,  3.33it/s]                                                     30%|██▉       | 3782/12776 [38:00<45:04,  3.33it/s] 30%|██▉       | 3783/12776 [38:01<42:51,  3.50it/s]                                                     30%|██▉       | 3783/12776 [38:01<42:51,  3.50it/s] 30%|██▉       | 3784/12776 [38:01<41:08,  3.64it/s]                                                     30%|██▉       | 3784/12776 [38:01<41:08,  3.64it/s] 30%|██▉       | 3785/12776 [38:01<39:41,  3.78it/s]                                                     30%|██▉       | 3785/12776 [38:01<39:41,  3.78it/s] 30%|██▉       | 3786/12776 [38:02<44:15,  3.39it/s]                                                     30%|██▉       | 3786/12776 [38:02<44:15,  3.39it/s] 30%|██▉       | 3787/12776 [38:02<41:18,  3.63it/s]                                                     30%|██▉       | 3787/12776 [38:02<41:18,  3.63it/s] 30%|██▉       | 3788/12776 [38:02<39:04,  3.83it/s]                                                     30%|██▉       | 3788/12776 [38:02<39:04,  3.83it/s] 30%|██▉       | 3789/12776 [38:02<37:06,  4.04it/s]                                                     30%|██▉       | 3789/12776 [38:02<37:06,  4.04it/s] 30%|██▉       | 3790/12776 [38:02<37:28,  4.00it/s]                                                     30%|██▉       | 3790/12776 [38:02<37:28,  4.00it/s] 30%|██▉       | 3791/12776 [38:03<37:43,  3.97it/s]                                                     30%|██▉       | 3791/12776 [38:03<37:43,  3.97it/s] 30%|██▉       | 3792/12776 [38:03<35:56,  4.17it/s]                                                     30%|██▉       | 3792/12776 [38:03<35:56,  4.17it/s] 30%|██▉       | 3793/12776 [38:03<34:17,  4.37it/s]                                                     30%|██▉       | 3793/12776 [38:03<34:17,  4.37it/s] 30%|██▉       | 3794/12776 [38:03<32:56,  4.55it/s]                                                     30%|██▉       | 3794/12776 [38:03<32:56,  4.55it/s] 30%|██▉       | 3795/12776 [38:04<31:54,  4.69it/s]                                                     30%|██▉       | 3795/12776 [38:04<31:54,  4.69it/s] 30%|██▉       | 3796/12776 [38:04<34:12,  4.38it/s]                                                     30%|██▉       | 3796/12776 [38:04<34:12,  4.38it/s] 30%|██▉       | 3797/12776 [38:04<32:34,  4.59it/s]                                                     30%|██▉       | 3797/12776 [38:04<32:34,  4.59it/s] 30%|██▉       | 3798/12776 [38:04<31:07,  4.81it/s]                                                     30%|██▉       | 3798/12776 [38:04<31:07,  4.81it/s] 30%|██▉       | 3799/12776 [38:04<29:58,  4.99it/s]                                                     30%|██▉       | 3799/12776 [38:04<29:58,  4.99it/s] 30%|██▉       | 3800/12776 [38:05<51:37,  2.90it/s]                                                     30%|██▉       | 3800/12776 [38:05<51:37,  2.90it/s] 30%|██▉       | 3801/12776 [38:07<1:45:02,  1.42it/s]                                                       30%|██▉       | 3801/12776 [38:07<1:45:02,  1.42it/s] 30%|██▉       | 3802/12776 [38:07<1:55:25,  1.30it/s]                                                       30%|██▉       | 3802/12776 [38:07<1:55:25,  1.30it/s] 30%|██▉       | 3803/12776 [38:08<1:58:09,  1.27it/s]                                                       30%|██▉       | 3803/12776 [38:08<1:58:09,  1.27it/s] 30%|██▉       | 3804/12776 [38:09<1:56:42,  1.28it/s]                                                       30%|██▉       | 3804/12776 [38:09<1:56:42,  1.28it/s] 30%|██▉       | 3805/12776 [38:10<1:56:15,  1.29it/s]                                                       30%|██▉       | 3805/12776 [38:10<1:56:15,  1.29it/s] 30%|██▉       | 3806/12776 [38:11<1:53:50,  1.31it/s]                                                       30%|██▉       | 3806/12776 [38:11<1:53:50,  1.31it/s] 30%|██▉       | 3807/12776 [38:11<1:48:37,  1.38it/s]                                                       30%|██▉       | 3807/12776 [38:11<1:48:37,  1.38it/s] 30%|██▉       | 3808/12776 [38:12<1:49:51,  1.36it/s]                                                       30%|██▉       | 3808/12776 [38:12<1:49:51,  1.36it/s] 30%|██▉       | 3809/12776 [38:13<1:44:29,  1.43it/s]                                                       30%|██▉       | 3809/12776 [38:13<1:44:29,  1.43it/s] 30%|██▉       | 3810/12776 [38:13<1:40:25,  1.49it/s]                                                       30%|██▉       | 3810/12776 [38:13<1:40:25,  1.49it/s] 30%|██▉       | 3811/12776 [38:14<1:34:45,  1.58it/s]                                                       30%|██▉       | 3811/12776 [38:14<1:34:45,  1.58it/s] 30%|██▉       | 3812/12776 [38:14<1:33:04,  1.61it/s]                                                       30%|██▉       | 3812/12776 [38:14<1:33:04,  1.61it/s] 30%|██▉       | 3813/12776 [38:15<1:27:26,  1.71it/s]                                                       30%|██▉       | 3813/12776 [38:15<1:27:26,  1.71it/s] 30%|██▉       | 3814/12776 [38:15<1:24:40,  1.76it/s]                                                       30%|██▉       | 3814/12776 [38:15<1:24:40,  1.76it/s] 30%|██▉       | 3815/12776 [38:16<1:19:30,  1.88it/s]                                                       30%|██▉       | 3815/12776 [38:16<1:19:30,  1.88it/s] 30%|██▉       | 3816/12776 [38:16<1:18:23,  1.90it/s]                                                       30%|██▉       | 3816/12776 [38:16<1:18:23,  1.90it/s] 30%|██▉       | 3817/12776 [38:17<1:13:29,  2.03it/s]                                                       30%|██▉       | 3817/12776 [38:17<1:13:29,  2.03it/s] 30%|██▉       | 3818/12776 [38:17<1:09:43,  2.14it/s]                                                       30%|██▉       | 3818/12776 [38:17<1:09:43,  2.14it/s] 30%|██▉       | 3819/12776 [38:18<1:10:54,  2.11it/s]                                                       30%|██▉       | 3819/12776 [38:18<1:10:54,  2.11it/s] 30%|██▉       | 3820/12776 [38:18<1:06:17,  2.25it/s]                                                       30%|██▉       | 3820/12776 [38:18<1:06:17,  2.25it/s] 30%|██▉       | 3821/12776 [38:18<1:02:06,  2.40it/s]                                                       30%|██▉       | 3821/12776 [38:18<1:02:06,  2.40it/s] 30%|██▉       | 3822/12776 [38:19<1:01:22,  2.43it/s]                                                       30%|██▉       | 3822/12776 [38:19<1:01:22,  2.43it/s] 30%|██▉       | 3823/12776 [38:19<57:46,  2.58it/s]                                                       30%|██▉       | 3823/12776 [38:19<57:46,  2.58it/s] 30%|██▉       | 3824/12776 [38:19<54:39,  2.73it/s]                                                     30%|██▉       | 3824/12776 [38:19<54:39,  2.73it/s] 30%|██▉       | 3825/12776 [38:20<53:22,  2.80it/s]                                                     30%|██▉       | 3825/12776 [38:20<53:22,  2.80it/s] 30%|██▉       | 3826/12776 [38:20<50:32,  2.95it/s]                                                     30%|██▉       | 3826/12776 [38:20<50:32,  2.95it/s] 30%|██▉       | 3827/12776 [38:20<48:01,  3.11it/s]                                                     30%|██▉       | 3827/12776 [38:20<48:01,  3.11it/s] 30%|██▉       | 3828/12776 [38:21<46:12,  3.23it/s]                                                     30%|██▉       | 3828/12776 [38:21<46:12,  3.23it/s] 30%|██▉       | 3829/12776 [38:21<50:44,  2.94it/s]                                                     30%|██▉       | 3829/12776 [38:21<50:44,  2.94it/s] 30%|██▉       | 3830/12776 [38:21<47:23,  3.15it/s]                                                     30%|██▉       | 3830/12776 [38:21<47:23,  3.15it/s] 30%|██▉       | 3831/12776 [38:22<44:09,  3.38it/s]                                                     30%|██▉       | 3831/12776 [38:22<44:09,  3.38it/s] 30%|██▉       | 3832/12776 [38:22<41:57,  3.55it/s]                                                     30%|██▉       | 3832/12776 [38:22<41:57,  3.55it/s] 30%|███       | 3833/12776 [38:22<46:05,  3.23it/s]                                                     30%|███       | 3833/12776 [38:22<46:05,  3.23it/s] 30%|███       | 3834/12776 [38:22<42:59,  3.47it/s]                                                     30%|███       | 3834/12776 [38:22<42:59,  3.47it/s] 30%|███       | 3835/12776 [38:23<40:28,  3.68it/s]                                                     30%|███       | 3835/12776 [38:23<40:28,  3.68it/s] 30%|███       | 3836/12776 [38:23<38:33,  3.86it/s]                                                     30%|███       | 3836/12776 [38:23<38:33,  3.86it/s] 30%|███       | 3837/12776 [38:23<41:38,  3.58it/s]                                                     30%|███       | 3837/12776 [38:23<41:38,  3.58it/s] 30%|███       | 3838/12776 [38:23<38:41,  3.85it/s]                                                     30%|███       | 3838/12776 [38:23<38:41,  3.85it/s] 30%|███       | 3839/12776 [38:24<36:30,  4.08it/s]                                                     30%|███       | 3839/12776 [38:24<36:30,  4.08it/s] 30%|███       | 3840/12776 [38:24<35:04,  4.25it/s]                                                     30%|███       | 3840/12776 [38:24<35:04,  4.25it/s] 30%|███       | 3841/12776 [38:24<33:51,  4.40it/s]                                                     30%|███       | 3841/12776 [38:24<33:51,  4.40it/s] 30%|███       | 3842/12776 [38:24<35:55,  4.14it/s]                                                     30%|███       | 3842/12776 [38:24<35:55,  4.14it/s] 30%|███       | 3843/12776 [38:25<34:13,  4.35it/s]                                                     30%|███       | 3843/12776 [38:25<34:13,  4.35it/s] 30%|███       | 3844/12776 [38:25<33:17,  4.47it/s]                                                     30%|███       | 3844/12776 [38:25<33:17,  4.47it/s] 30%|███       | 3845/12776 [38:25<32:14,  4.62it/s]                                                     30%|███       | 3845/12776 [38:25<32:14,  4.62it/s] 30%|███       | 3846/12776 [38:25<31:25,  4.74it/s]                                                     30%|███       | 3846/12776 [38:25<31:25,  4.74it/s] 30%|███       | 3847/12776 [38:25<35:35,  4.18it/s]                                                     30%|███       | 3847/12776 [38:25<35:35,  4.18it/s] 30%|███       | 3848/12776 [38:26<33:24,  4.45it/s]                                                     30%|███       | 3848/12776 [38:26<33:24,  4.45it/s] 30%|███       | 3849/12776 [38:26<31:39,  4.70it/s]                                                     30%|███       | 3849/12776 [38:26<31:39,  4.70it/s] 30%|███       | 3850/12776 [38:27<1:00:12,  2.47it/s]                                                       30%|███       | 3850/12776 [38:27<1:00:12,  2.47it/s] 30%|███       | 3851/12776 [38:28<1:47:25,  1.38it/s]                                                       30%|███       | 3851/12776 [38:28<1:47:25,  1.38it/s] 30%|███       | 3852/12776 [38:29<2:05:47,  1.18it/s]                                                      {'loss': 0.7169, 'grad_norm': 2.437675952911377, 'learning_rate': 0.00022023460410557183, 'epoch': 0.59}
+{'loss': 0.7835, 'grad_norm': 1.3050737380981445, 'learning_rate': 0.0002202101661779081, 'epoch': 0.59}
+{'loss': 0.6321, 'grad_norm': 2.816662549972534, 'learning_rate': 0.00022018572825024436, 'epoch': 0.59}
+{'loss': 0.479, 'grad_norm': 1.4117590188980103, 'learning_rate': 0.00022016129032258064, 'epoch': 0.59}
+{'loss': 0.8931, 'grad_norm': 1.9365350008010864, 'learning_rate': 0.00022013685239491692, 'epoch': 0.59}
+{'loss': 0.6348, 'grad_norm': 1.9675894975662231, 'learning_rate': 0.00022011241446725314, 'epoch': 0.59}
+{'loss': 0.7128, 'grad_norm': 1.806617259979248, 'learning_rate': 0.00022008797653958942, 'epoch': 0.59}
+{'loss': 0.7107, 'grad_norm': 2.15679669380188, 'learning_rate': 0.0002200635386119257, 'epoch': 0.59}
+{'loss': 0.6923, 'grad_norm': 1.5504449605941772, 'learning_rate': 0.00022003910068426195, 'epoch': 0.59}
+{'loss': 0.9243, 'grad_norm': 2.190402030944824, 'learning_rate': 0.00022001466275659823, 'epoch': 0.59}
+{'loss': 1.3498, 'grad_norm': 2.572727918624878, 'learning_rate': 0.0002199902248289345, 'epoch': 0.59}
+{'loss': 1.3511, 'grad_norm': 2.3958308696746826, 'learning_rate': 0.00021996578690127076, 'epoch': 0.59}
+{'loss': 1.1582, 'grad_norm': 3.290600538253784, 'learning_rate': 0.000219941348973607, 'epoch': 0.59}
+{'loss': 0.6797, 'grad_norm': 2.0067548751831055, 'learning_rate': 0.0002199169110459433, 'epoch': 0.59}
+{'loss': 1.1569, 'grad_norm': 2.599518060684204, 'learning_rate': 0.00021989247311827954, 'epoch': 0.59}
+{'loss': 1.0037, 'grad_norm': 3.275362968444824, 'learning_rate': 0.00021986803519061582, 'epoch': 0.59}
+{'loss': 1.6154, 'grad_norm': 1.9998888969421387, 'learning_rate': 0.0002198435972629521, 'epoch': 0.59}
+{'loss': 1.5551, 'grad_norm': 2.5467236042022705, 'learning_rate': 0.00021981915933528835, 'epoch': 0.59}
+{'loss': 1.3565, 'grad_norm': 5.1116042137146, 'learning_rate': 0.00021979472140762463, 'epoch': 0.59}
+{'loss': 1.0188, 'grad_norm': 1.5216659307479858, 'learning_rate': 0.0002197702834799609, 'epoch': 0.59}
+{'loss': 1.9111, 'grad_norm': 2.700352191925049, 'learning_rate': 0.00021974584555229713, 'epoch': 0.59}
+{'loss': 1.1349, 'grad_norm': 2.896486759185791, 'learning_rate': 0.0002197214076246334, 'epoch': 0.59}
+{'loss': 0.7021, 'grad_norm': 2.902263879776001, 'learning_rate': 0.00021969696969696969, 'epoch': 0.59}
+{'loss': 0.9319, 'grad_norm': 1.9411492347717285, 'learning_rate': 0.00021967253176930594, 'epoch': 0.59}
+{'loss': 0.6419, 'grad_norm': 1.5800654888153076, 'learning_rate': 0.00021964809384164222, 'epoch': 0.59}
+{'loss': 1.0969, 'grad_norm': 2.5261833667755127, 'learning_rate': 0.0002196236559139785, 'epoch': 0.59}
+{'loss': 1.2766, 'grad_norm': 3.4451301097869873, 'learning_rate': 0.00021959921798631475, 'epoch': 0.59}
+{'loss': 0.461, 'grad_norm': 0.6500614285469055, 'learning_rate': 0.00021957478005865102, 'epoch': 0.6}
+{'loss': 0.4256, 'grad_norm': 0.5970913767814636, 'learning_rate': 0.0002195503421309873, 'epoch': 0.6}
+{'loss': 0.3076, 'grad_norm': 0.7132553458213806, 'learning_rate': 0.00021952590420332353, 'epoch': 0.6}
+{'loss': 0.3146, 'grad_norm': 0.6507532000541687, 'learning_rate': 0.0002195014662756598, 'epoch': 0.6}
+{'loss': 0.3291, 'grad_norm': 0.8440534472465515, 'learning_rate': 0.00021947702834799608, 'epoch': 0.6}
+{'loss': 0.2867, 'grad_norm': 0.7817156314849854, 'learning_rate': 0.00021945259042033233, 'epoch': 0.6}
+{'loss': 0.3247, 'grad_norm': 0.5882556438446045, 'learning_rate': 0.0002194281524926686, 'epoch': 0.6}
+{'loss': 0.4633, 'grad_norm': 2.025453805923462, 'learning_rate': 0.0002194037145650049, 'epoch': 0.6}
+{'loss': 0.4273, 'grad_norm': 0.8439894318580627, 'learning_rate': 0.00021937927663734111, 'epoch': 0.6}
+{'loss': 0.6922, 'grad_norm': 4.892188549041748, 'learning_rate': 0.0002193548387096774, 'epoch': 0.6}
+{'loss': 0.4494, 'grad_norm': 1.2125036716461182, 'learning_rate': 0.00021933040078201367, 'epoch': 0.6}
+{'loss': 0.4823, 'grad_norm': 1.2884728908538818, 'learning_rate': 0.00021930596285434992, 'epoch': 0.6}
+{'loss': 0.5722, 'grad_norm': 1.1847399473190308, 'learning_rate': 0.0002192815249266862, 'epoch': 0.6}
+{'loss': 0.3829, 'grad_norm': 0.9156528115272522, 'learning_rate': 0.00021925708699902248, 'epoch': 0.6}
+{'loss': 0.364, 'grad_norm': 1.0122822523117065, 'learning_rate': 0.00021923264907135873, 'epoch': 0.6}
+{'loss': 0.2851, 'grad_norm': 0.5517539978027344, 'learning_rate': 0.000219208211143695, 'epoch': 0.6}
+{'loss': 0.363, 'grad_norm': 0.777860164642334, 'learning_rate': 0.0002191837732160313, 'epoch': 0.6}
+{'loss': 0.9008, 'grad_norm': 3.110654354095459, 'learning_rate': 0.0002191593352883675, 'epoch': 0.6}
+{'loss': 0.5437, 'grad_norm': 1.7335413694381714, 'learning_rate': 0.0002191348973607038, 'epoch': 0.6}
+{'loss': 0.3403, 'grad_norm': 1.2909818887710571, 'learning_rate': 0.00021911045943304007, 'epoch': 0.6}
+{'loss': 0.6593, 'grad_norm': 2.242083787918091, 'learning_rate': 0.00021908602150537632, 'epoch': 0.6}
+{'loss': 0.64, 'grad_norm': 1.45481538772583, 'learning_rate': 0.0002190615835777126, 'epoch': 0.6}
+{'loss': 0.8033, 'grad_norm': 1.841384768486023, 'learning_rate': 0.00021903714565004888, 'epoch': 0.6}
+{'loss': 0.9931, 'grad_norm': 4.309696197509766, 'learning_rate': 0.00021901270772238513, 'epoch': 0.6}
+{'loss': 0.7233, 'grad_norm': 2.3548316955566406, 'learning_rate': 0.0002189882697947214, 'epoch': 0.6}
+{'loss': 1.0121, 'grad_norm': 2.9939208030700684, 'learning_rate': 0.00021896383186705768, 'epoch': 0.6}
+{'loss': 0.6097, 'grad_norm': 1.7699570655822754, 'learning_rate': 0.0002189393939393939, 'epoch': 0.6}
+{'loss': 0.8177, 'grad_norm': 2.700235605239868, 'learning_rate': 0.0002189149560117302, 'epoch': 0.6}
+{'loss': 0.6307, 'grad_norm': 1.7766852378845215, 'learning_rate': 0.00021889051808406647, 'epoch': 0.6}
+{'loss': 0.9373, 'grad_norm': 1.4954442977905273, 'learning_rate': 0.00021886608015640272, 'epoch': 0.6}
+{'loss': 0.627, 'grad_norm': 1.9515974521636963, 'learning_rate': 0.000218841642228739, 'epoch': 0.6}
+{'loss': 0.8141, 'grad_norm': 1.922837734222412, 'learning_rate': 0.00021881720430107527, 'epoch': 0.6}
+{'loss': 1.1225, 'grad_norm': 2.2533392906188965, 'learning_rate': 0.0002187927663734115, 'epoch': 0.6}
+{'loss': 0.8138, 'grad_norm': 1.5181680917739868, 'learning_rate': 0.00021876832844574778, 'epoch': 0.6}
+{'loss': 1.0635, 'grad_norm': 2.555852174758911, 'learning_rate': 0.00021874389051808403, 'epoch': 0.6}
+{'loss': 1.2145, 'grad_norm': 1.9086858034133911, 'learning_rate': 0.0002187194525904203, 'epoch': 0.6}
+{'loss': 0.4878, 'grad_norm': 2.0855298042297363, 'learning_rate': 0.00021869501466275658, 'epoch': 0.6}
+{'loss': 1.3879, 'grad_norm': 2.812436819076538, 'learning_rate': 0.00021867057673509283, 'epoch': 0.6}
+{'loss': 1.0922, 'grad_norm': 2.317641258239746, 'learning_rate': 0.0002186461388074291, 'epoch': 0.6}
+{'loss': 1.0511, 'grad_norm': 1.9752837419509888, 'learning_rate': 0.0002186217008797654, 'epoch': 0.6}
+{'loss': 1.2536, 'grad_norm': 3.892122745513916, 'learning_rate': 0.00021859726295210162, 'epoch': 0.6}
+{'loss': 1.7708, 'grad_norm': 3.568765640258789, 'learning_rate': 0.0002185728250244379, 'epoch': 0.6}
+{'loss': 1.8441, 'grad_norm': 4.544713497161865, 'learning_rate': 0.00021854838709677417, 'epoch': 0.6}
+{'loss': 1.3045, 'grad_norm': 3.862272024154663, 'learning_rate': 0.00021852394916911042, 'epoch': 0.6}
+{'loss': 0.8241, 'grad_norm': 2.4139466285705566, 'learning_rate': 0.0002184995112414467, 'epoch': 0.6}
+{'loss': 1.4736, 'grad_norm': 2.8001489639282227, 'learning_rate': 0.00021847507331378298, 'epoch': 0.6}
+{'loss': 0.8626, 'grad_norm': 1.7694708108901978, 'learning_rate': 0.00021845063538611923, 'epoch': 0.6}
+{'loss': 0.8587, 'grad_norm': 1.846111536026001, 'learning_rate': 0.0002184261974584555, 'epoch': 0.6}
+{'loss': 0.8416, 'grad_norm': 2.2559170722961426, 'learning_rate': 0.0002184017595307918, 'epoch': 0.6}
+{'loss': 0.8497, 'grad_norm': 3.0026137828826904, 'learning_rate': 0.000218377321603128, 'epoch': 0.6}
+{'loss': 0.4743, 'grad_norm': 2.2647368907928467, 'learning_rate': 0.0002183528836754643, 'epoch': 0.6}
+ 30%|███       | 3852/12776 [38:29<2:05:47,  1.18it/s] 30%|███       | 3853/12776 [38:30<2:08:57,  1.15it/s]                                                       30%|███       | 3853/12776 [38:30<2:08:57,  1.15it/s] 30%|███       | 3854/12776 [38:31<2:14:37,  1.10it/s]                                                       30%|███       | 3854/12776 [38:31<2:14:37,  1.10it/s] 30%|███       | 3855/12776 [38:32<2:09:24,  1.15it/s]                                                       30%|███       | 3855/12776 [38:32<2:09:24,  1.15it/s] 30%|███       | 3856/12776 [38:33<2:06:05,  1.18it/s]                                                       30%|███       | 3856/12776 [38:33<2:06:05,  1.18it/s] 30%|███       | 3857/12776 [38:33<2:00:21,  1.24it/s]                                                       30%|███       | 3857/12776 [38:33<2:00:21,  1.24it/s] 30%|███       | 3858/12776 [38:34<1:54:10,  1.30it/s]                                                       30%|███       | 3858/12776 [38:34<1:54:10,  1.30it/s] 30%|███       | 3859/12776 [38:35<1:52:25,  1.32it/s]                                                       30%|███       | 3859/12776 [38:35<1:52:25,  1.32it/s] 30%|███       | 3860/12776 [38:35<1:46:16,  1.40it/s]                                                       30%|███       | 3860/12776 [38:35<1:46:16,  1.40it/s] 30%|███       | 3861/12776 [38:36<1:42:23,  1.45it/s]                                                       30%|███       | 3861/12776 [38:36<1:42:23,  1.45it/s] 30%|███       | 3862/12776 [38:37<1:36:07,  1.55it/s]                                                       30%|███       | 3862/12776 [38:37<1:36:07,  1.55it/s] 30%|███       | 3863/12776 [38:37<1:33:32,  1.59it/s]                                                       30%|███       | 3863/12776 [38:37<1:33:32,  1.59it/s] 30%|███       | 3864/12776 [38:38<1:28:11,  1.68it/s]                                                       30%|███       | 3864/12776 [38:38<1:28:11,  1.68it/s] 30%|███       | 3865/12776 [38:38<1:27:38,  1.69it/s]                                                       30%|███       | 3865/12776 [38:38<1:27:38,  1.69it/s] 30%|███       | 3866/12776 [38:39<1:21:22,  1.82it/s]                                                       30%|███       | 3866/12776 [38:39<1:21:22,  1.82it/s] 30%|███       | 3867/12776 [38:39<1:22:07,  1.81it/s]                                                       30%|███       | 3867/12776 [38:39<1:22:07,  1.81it/s] 30%|███       | 3868/12776 [38:40<1:16:10,  1.95it/s]                                                       30%|███       | 3868/12776 [38:40<1:16:10,  1.95it/s] 30%|███       | 3869/12776 [38:40<1:17:09,  1.92it/s]                                                       30%|███       | 3869/12776 [38:40<1:17:09,  1.92it/s] 30%|███       | 3870/12776 [38:41<1:11:33,  2.07it/s]                                                       30%|███       | 3870/12776 [38:41<1:11:33,  2.07it/s] 30%|███       | 3871/12776 [38:41<1:06:43,  2.22it/s]                                                       30%|███       | 3871/12776 [38:41<1:06:43,  2.22it/s] 30%|███       | 3872/12776 [38:41<1:03:14,  2.35it/s]                                                       30%|███       | 3872/12776 [38:41<1:03:14,  2.35it/s] 30%|███       | 3873/12776 [38:42<59:06,  2.51it/s]                                                       30%|███       | 3873/12776 [38:42<59:06,  2.51it/s] 30%|███       | 3874/12776 [38:42<56:10,  2.64it/s]                                                     30%|███       | 3874/12776 [38:42<56:10,  2.64it/s] 30%|███       | 3875/12776 [38:42<54:03,  2.74it/s]                                                     30%|███       | 3875/12776 [38:42<54:03,  2.74it/s] 30%|███       | 3876/12776 [38:43<51:57,  2.85it/s]                                                     30%|███       | 3876/12776 [38:43<51:57,  2.85it/s] 30%|███       | 3877/12776 [38:43<49:47,  2.98it/s]                                                     30%|███       | 3877/12776 [38:43<49:47,  2.98it/s] 30%|███       | 3878/12776 [38:43<47:52,  3.10it/s]                                                     30%|███       | 3878/12776 [38:43<47:52,  3.10it/s] 30%|███       | 3879/12776 [38:44<49:27,  3.00it/s]                                                     30%|███       | 3879/12776 [38:44<49:27,  3.00it/s] 30%|███       | 3880/12776 [38:44<46:56,  3.16it/s]                                                     30%|███       | 3880/12776 [38:44<46:56,  3.16it/s] 30%|███       | 3881/12776 [38:44<44:53,  3.30it/s]                                                     30%|███       | 3881/12776 [38:44<44:53,  3.30it/s] 30%|███       | 3882/12776 [38:45<42:52,  3.46it/s]                                                     30%|███       | 3882/12776 [38:45<42:52,  3.46it/s] 30%|███       | 3883/12776 [38:45<44:05,  3.36it/s]                                                     30%|███       | 3883/12776 [38:45<44:05,  3.36it/s] 30%|███       | 3884/12776 [38:45<41:56,  3.53it/s]                                                     30%|███       | 3884/12776 [38:45<41:56,  3.53it/s] 30%|███       | 3885/12776 [38:45<40:16,  3.68it/s]                                                     30%|███       | 3885/12776 [38:45<40:16,  3.68it/s] 30%|███       | 3886/12776 [38:46<38:36,  3.84it/s]                                                     30%|███       | 3886/12776 [38:46<38:36,  3.84it/s] 30%|███       | 3887/12776 [38:46<37:02,  4.00it/s]                                                     30%|███       | 3887/12776 [38:46<37:02,  4.00it/s] 30%|███       | 3888/12776 [38:46<38:06,  3.89it/s]                                                     30%|███       | 3888/12776 [38:46<38:06,  3.89it/s] 30%|███       | 3889/12776 [38:46<36:04,  4.11it/s]                                                     30%|███       | 3889/12776 [38:46<36:04,  4.11it/s] 30%|███       | 3890/12776 [38:46<34:41,  4.27it/s]                                                     30%|███       | 3890/12776 [38:46<34:41,  4.27it/s] 30%|███       | 3891/12776 [38:47<33:32,  4.42it/s]                                                     30%|███       | 3891/12776 [38:47<33:32,  4.42it/s] 30%|███       | 3892/12776 [38:47<32:38,  4.54it/s]                                                     30%|███       | 3892/12776 [38:47<32:38,  4.54it/s] 30%|███       | 3893/12776 [38:47<34:47,  4.26it/s]                                                     30%|███       | 3893/12776 [38:47<34:47,  4.26it/s] 30%|███       | 3894/12776 [38:47<33:09,  4.46it/s]                                                     30%|███       | 3894/12776 [38:47<33:09,  4.46it/s] 30%|███       | 3895/12776 [38:48<31:58,  4.63it/s]                                                     30%|███       | 3895/12776 [38:48<31:58,  4.63it/s] 30%|███       | 3896/12776 [38:48<30:58,  4.78it/s]                                                     30%|███       | 3896/12776 [38:48<30:58,  4.78it/s] 31%|███       | 3897/12776 [38:48<30:14,  4.89it/s]                                                     31%|███       | 3897/12776 [38:48<30:14,  4.89it/s] 31%|███       | 3898/12776 [38:48<31:59,  4.62it/s]                                                     31%|███       | 3898/12776 [38:48<31:59,  4.62it/s] 31%|███       | 3899/12776 [38:48<30:36,  4.83it/s]                                                     31%|███       | 3899/12776 [38:48<30:36,  4.83it/s] 31%|███       | 3900/12776 [38:49<55:01,  2.69it/s]                                                     31%|███       | 3900/12776 [38:49<55:01,  2.69it/s] 31%|███       | 3901/12776 [38:51<1:51:02,  1.33it/s]                                                       31%|███       | 3901/12776 [38:51<1:51:02,  1.33it/s] 31%|███       | 3902/12776 [38:52<1:59:09,  1.24it/s]                                                       31%|███       | 3902/12776 [38:52<1:59:09,  1.24it/s] 31%|███       | 3903/12776 [38:53<2:01:05,  1.22it/s]                                                       31%|███       | 3903/12776 [38:53<2:01:05,  1.22it/s] 31%|███       | 3904/12776 [38:53<1:59:08,  1.24it/s]                                                       31%|███       | 3904/12776 [38:53<1:59:08,  1.24it/s] 31%|███       | 3905/12776 [38:54<1:56:13,  1.27it/s]                                                       31%|███       | 3905/12776 [38:54<1:56:13,  1.27it/s] 31%|███       | 3906/12776 [38:55<1:54:26,  1.29it/s]                                                       31%|███       | 3906/12776 [38:55<1:54:26,  1.29it/s] 31%|███       | 3907/12776 [38:55<1:49:53,  1.35it/s]                                                       31%|███       | 3907/12776 [38:55<1:49:53,  1.35it/s] 31%|███       | 3908/12776 [38:56<1:49:10,  1.35it/s]                                                       31%|███       | 3908/12776 [38:56<1:49:10,  1.35it/s] 31%|███       | 3909/12776 [38:57<1:42:58,  1.44it/s]                                                       31%|███       | 3909/12776 [38:57<1:42:58,  1.44it/s] 31%|███       | 3910/12776 [38:57<1:37:26,  1.52it/s]                                                       31%|███       | 3910/12776 [38:57<1:37:26,  1.52it/s] 31%|███       | 3911/12776 [38:58<1:32:16,  1.60it/s]                                                       31%|███       | 3911/12776 [38:58<1:32:16,  1.60it/s] 31%|███       | 3912/12776 [38:59<1:31:43,  1.61it/s]                                                       31%|███       | 3912/12776 [38:59<1:31:43,  1.61it/s] 31%|███       | 3913/12776 [38:59<1:26:16,  1.71it/s]                                                       31%|███       | 3913/12776 [38:59<1:26:16,  1.71it/s] 31%|███       | 3914/12776 [39:00<1:20:42,  1.83it/s]                                                       31%|███       | 3914/12776 [39:00<1:20:42,  1.83it/s] 31%|███       | 3915/12776 [39:00<1:19:34,  1.86it/s]                                                       31%|███       | 3915/12776 [39:00<1:19:34,  1.86it/s] 31%|███       | 3916/12776 [39:00<1:15:06,  1.97it/s]                                                       31%|███       | 3916/12776 [39:00<1:15:06,  1.97it/s] 31%|███       | 3917/12776 [39:01<1:12:58,  2.02it/s]                                                       31%|███       | 3917/12776 [39:01<1:12:58,  2.02it/s] 31%|███       | 3918/12776 [39:01<1:09:10,  2.13it/s]                                                       31%|███       | 3918/12776 [39:01<1:09:10,  2.13it/s] 31%|███       | 3919/12776 [39:02<1:06:00,  2.24it/s]                                                       31%|███       | 3919/12776 [39:02<1:06:00,  2.24it/s] 31%|███       | 3920/12776 [39:02<1:08:21,  2.16it/s]                                                       31%|███       | 3920/12776 [39:02<1:08:21,  2.16it/s] 31%|███       | 3921/12776 [39:03<1:03:56,  2.31it/s]                                                       31%|███       | 3921/12776 [39:03<1:03:56,  2.31it/s] 31%|███       | 3922/12776 [39:03<1:00:25,  2.44it/s]                                                       31%|███       | 3922/12776 [39:03<1:00:25,  2.44it/s] 31%|███       | 3923/12776 [39:03<1:01:10,  2.41it/s]                                                       31%|███       | 3923/12776 [39:03<1:01:10,  2.41it/s] 31%|███       | 3924/12776 [39:04<57:45,  2.55it/s]                                                       31%|███       | 3924/12776 [39:04<57:45,  2.55it/s] 31%|███       | 3925/12776 [39:04<54:57,  2.68it/s]                                                     31%|███       | 3925/12776 [39:04<54:57,  2.68it/s] 31%|███       | 3926/12776 [39:04<57:13,  2.58it/s]                                                     31%|███       | 3926/12776 [39:04<57:13,  2.58it/s] 31%|███       | 3927/12776 [39:05<53:21,  2.76it/s]                                                     31%|███       | 3927/12776 [39:05<53:21,  2.76it/s] 31%|███       | 3928/12776 [39:05<50:22,  2.93it/s]                                                     31%|███       | 3928/12776 [39:05<50:22,  2.93it/s] 31%|███       | 3929/12776 [39:05<52:13,  2.82it/s]                                                    {'loss': 0.3079, 'grad_norm': 0.49257951974868774, 'learning_rate': 0.00021832844574780057, 'epoch': 0.6}
+{'loss': 0.4189, 'grad_norm': 0.505824089050293, 'learning_rate': 0.00021830400782013682, 'epoch': 0.6}
+{'loss': 0.2503, 'grad_norm': 0.44036567211151123, 'learning_rate': 0.0002182795698924731, 'epoch': 0.6}
+{'loss': 0.2296, 'grad_norm': 0.5498443245887756, 'learning_rate': 0.00021825513196480938, 'epoch': 0.6}
+{'loss': 0.4274, 'grad_norm': 0.8247083425521851, 'learning_rate': 0.00021823069403714563, 'epoch': 0.6}
+{'loss': 0.2763, 'grad_norm': 0.6773415207862854, 'learning_rate': 0.00021820625610948188, 'epoch': 0.6}
+{'loss': 0.4864, 'grad_norm': 0.810204803943634, 'learning_rate': 0.00021818181818181816, 'epoch': 0.6}
+{'loss': 0.3392, 'grad_norm': 0.6132627129554749, 'learning_rate': 0.0002181573802541544, 'epoch': 0.6}
+{'loss': 0.3409, 'grad_norm': 0.5890900492668152, 'learning_rate': 0.0002181329423264907, 'epoch': 0.6}
+{'loss': 0.5649, 'grad_norm': 1.143919587135315, 'learning_rate': 0.00021810850439882697, 'epoch': 0.6}
+{'loss': 0.354, 'grad_norm': 0.9753417372703552, 'learning_rate': 0.00021808406647116322, 'epoch': 0.6}
+{'loss': 0.2972, 'grad_norm': 0.9301658272743225, 'learning_rate': 0.0002180596285434995, 'epoch': 0.6}
+{'loss': 0.3733, 'grad_norm': 0.7712864279747009, 'learning_rate': 0.00021803519061583577, 'epoch': 0.6}
+{'loss': 0.5233, 'grad_norm': 1.1033275127410889, 'learning_rate': 0.000218010752688172, 'epoch': 0.61}
+{'loss': 0.6654, 'grad_norm': 1.8046352863311768, 'learning_rate': 0.00021798631476050828, 'epoch': 0.61}
+{'loss': 0.4882, 'grad_norm': 1.1659013032913208, 'learning_rate': 0.00021796187683284455, 'epoch': 0.61}
+{'loss': 0.7751, 'grad_norm': 2.8485543727874756, 'learning_rate': 0.0002179374389051808, 'epoch': 0.61}
+{'loss': 0.7052, 'grad_norm': 2.1419053077697754, 'learning_rate': 0.00021791300097751708, 'epoch': 0.61}
+{'loss': 0.4797, 'grad_norm': 1.2547987699508667, 'learning_rate': 0.00021788856304985336, 'epoch': 0.61}
+{'loss': 0.5729, 'grad_norm': 1.7469266653060913, 'learning_rate': 0.00021786412512218961, 'epoch': 0.61}
+{'loss': 0.63, 'grad_norm': 1.2163182497024536, 'learning_rate': 0.0002178396871945259, 'epoch': 0.61}
+{'loss': 0.723, 'grad_norm': 1.283544898033142, 'learning_rate': 0.00021781524926686217, 'epoch': 0.61}
+{'loss': 0.7387, 'grad_norm': 1.9158495664596558, 'learning_rate': 0.0002177908113391984, 'epoch': 0.61}
+{'loss': 0.8682, 'grad_norm': 2.14650821685791, 'learning_rate': 0.00021776637341153467, 'epoch': 0.61}
+{'loss': 0.5418, 'grad_norm': 1.1081241369247437, 'learning_rate': 0.00021774193548387095, 'epoch': 0.61}
+{'loss': 0.9634, 'grad_norm': 1.6972672939300537, 'learning_rate': 0.0002177174975562072, 'epoch': 0.61}
+{'loss': 0.9434, 'grad_norm': 2.2414941787719727, 'learning_rate': 0.00021769305962854348, 'epoch': 0.61}
+{'loss': 0.5801, 'grad_norm': 1.9545202255249023, 'learning_rate': 0.00021766862170087976, 'epoch': 0.61}
+{'loss': 0.9481, 'grad_norm': 2.0093443393707275, 'learning_rate': 0.00021764418377321598, 'epoch': 0.61}
+{'loss': 0.3291, 'grad_norm': 1.2470927238464355, 'learning_rate': 0.00021761974584555226, 'epoch': 0.61}
+{'loss': 0.4683, 'grad_norm': 3.9128541946411133, 'learning_rate': 0.00021759530791788854, 'epoch': 0.61}
+{'loss': 0.9931, 'grad_norm': 1.703982949256897, 'learning_rate': 0.0002175708699902248, 'epoch': 0.61}
+{'loss': 1.0189, 'grad_norm': 2.462613582611084, 'learning_rate': 0.00021754643206256107, 'epoch': 0.61}
+{'loss': 0.7844, 'grad_norm': 5.5664591789245605, 'learning_rate': 0.00021752199413489735, 'epoch': 0.61}
+{'loss': 1.414, 'grad_norm': 2.5077731609344482, 'learning_rate': 0.0002174975562072336, 'epoch': 0.61}
+{'loss': 0.8179, 'grad_norm': 2.240034341812134, 'learning_rate': 0.00021747311827956988, 'epoch': 0.61}
+{'loss': 1.0263, 'grad_norm': 1.7230664491653442, 'learning_rate': 0.00021744868035190616, 'epoch': 0.61}
+{'loss': 1.0993, 'grad_norm': 3.0299770832061768, 'learning_rate': 0.00021742424242424238, 'epoch': 0.61}
+{'loss': 1.0955, 'grad_norm': 4.0090413093566895, 'learning_rate': 0.00021739980449657866, 'epoch': 0.61}
+{'loss': 1.6831, 'grad_norm': 3.151949644088745, 'learning_rate': 0.00021737536656891494, 'epoch': 0.61}
+{'loss': 1.0109, 'grad_norm': 1.1991798877716064, 'learning_rate': 0.0002173509286412512, 'epoch': 0.61}
+{'loss': 1.4323, 'grad_norm': 3.1792802810668945, 'learning_rate': 0.00021732649071358747, 'epoch': 0.61}
+{'loss': 1.1116, 'grad_norm': 1.9545060396194458, 'learning_rate': 0.00021730205278592374, 'epoch': 0.61}
+{'loss': 1.1406, 'grad_norm': 2.42977237701416, 'learning_rate': 0.00021727761485826, 'epoch': 0.61}
+{'loss': 0.7086, 'grad_norm': 1.8895028829574585, 'learning_rate': 0.00021725317693059627, 'epoch': 0.61}
+{'loss': 1.2922, 'grad_norm': 4.542184829711914, 'learning_rate': 0.00021722873900293253, 'epoch': 0.61}
+{'loss': 0.777, 'grad_norm': 1.5227937698364258, 'learning_rate': 0.00021720430107526878, 'epoch': 0.61}
+{'loss': 0.5624, 'grad_norm': 2.370554208755493, 'learning_rate': 0.00021717986314760506, 'epoch': 0.61}
+{'loss': 0.91, 'grad_norm': 1.582291603088379, 'learning_rate': 0.00021715542521994133, 'epoch': 0.61}
+{'loss': 0.4112, 'grad_norm': 0.5427595973014832, 'learning_rate': 0.00021713098729227758, 'epoch': 0.61}
+{'loss': 0.3359, 'grad_norm': 0.9115118980407715, 'learning_rate': 0.00021710654936461386, 'epoch': 0.61}
+{'loss': 0.3834, 'grad_norm': 0.6298912763595581, 'learning_rate': 0.00021708211143695014, 'epoch': 0.61}
+{'loss': 0.5009, 'grad_norm': 0.669316828250885, 'learning_rate': 0.00021705767350928637, 'epoch': 0.61}
+{'loss': 0.2752, 'grad_norm': 0.608873188495636, 'learning_rate': 0.00021703323558162264, 'epoch': 0.61}
+{'loss': 0.3135, 'grad_norm': 0.4760167598724365, 'learning_rate': 0.00021700879765395892, 'epoch': 0.61}
+{'loss': 0.4159, 'grad_norm': 0.5999681353569031, 'learning_rate': 0.00021698435972629517, 'epoch': 0.61}
+{'loss': 0.5722, 'grad_norm': 0.8838676810264587, 'learning_rate': 0.00021695992179863145, 'epoch': 0.61}
+{'loss': 0.5293, 'grad_norm': 0.8558535575866699, 'learning_rate': 0.00021693548387096773, 'epoch': 0.61}
+{'loss': 0.3242, 'grad_norm': 0.6933575868606567, 'learning_rate': 0.00021691104594330398, 'epoch': 0.61}
+{'loss': 0.288, 'grad_norm': 0.6868442893028259, 'learning_rate': 0.00021688660801564026, 'epoch': 0.61}
+{'loss': 0.49, 'grad_norm': 0.9867046475410461, 'learning_rate': 0.00021686217008797654, 'epoch': 0.61}
+{'loss': 0.4821, 'grad_norm': 0.8089808225631714, 'learning_rate': 0.00021683773216031276, 'epoch': 0.61}
+{'loss': 0.5644, 'grad_norm': 1.2595003843307495, 'learning_rate': 0.00021681329423264904, 'epoch': 0.61}
+{'loss': 0.3876, 'grad_norm': 1.136533260345459, 'learning_rate': 0.00021678885630498532, 'epoch': 0.61}
+{'loss': 0.3581, 'grad_norm': 0.9469622373580933, 'learning_rate': 0.00021676441837732157, 'epoch': 0.61}
+{'loss': 0.9731, 'grad_norm': 1.390121340751648, 'learning_rate': 0.00021673998044965785, 'epoch': 0.61}
+{'loss': 0.5445, 'grad_norm': 0.9338005185127258, 'learning_rate': 0.00021671554252199413, 'epoch': 0.61}
+{'loss': 0.5307, 'grad_norm': 1.3326369524002075, 'learning_rate': 0.00021669110459433038, 'epoch': 0.61}
+{'loss': 0.7355, 'grad_norm': 1.399505615234375, 'learning_rate': 0.00021666666666666666, 'epoch': 0.61}
+{'loss': 0.3582, 'grad_norm': 0.6472836136817932, 'learning_rate': 0.0002166422287390029, 'epoch': 0.61}
+{'loss': 0.5519, 'grad_norm': 1.1896679401397705, 'learning_rate': 0.00021661779081133916, 'epoch': 0.61}
+{'loss': 0.5286, 'grad_norm': 1.0599162578582764, 'learning_rate': 0.00021659335288367544, 'epoch': 0.61}
+{'loss': 0.5916, 'grad_norm': 4.264036178588867, 'learning_rate': 0.00021656891495601172, 'epoch': 0.61}
+{'loss': 1.2145, 'grad_norm': 5.218554973602295, 'learning_rate': 0.00021654447702834797, 'epoch': 0.61}
+{'loss': 0.6093, 'grad_norm': 1.6409800052642822, 'learning_rate': 0.00021652003910068425, 'epoch': 0.61}
+{'loss': 0.7161, 'grad_norm': 1.286957025527954, 'learning_rate': 0.00021649560117302052, 'epoch': 0.61}
+{'loss': 0.7036, 'grad_norm': 2.0003950595855713, 'learning_rate': 0.00021647116324535675, 'epoch': 0.61}
+ 31%|███       | 3929/12776 [39:05<52:13,  2.82it/s] 31%|███       | 3930/12776 [39:06<48:49,  3.02it/s]                                                     31%|███       | 3930/12776 [39:06<48:49,  3.02it/s] 31%|███       | 3931/12776 [39:06<46:02,  3.20it/s]                                                     31%|███       | 3931/12776 [39:06<46:02,  3.20it/s] 31%|███       | 3932/12776 [39:06<43:21,  3.40it/s]                                                     31%|███       | 3932/12776 [39:06<43:21,  3.40it/s] 31%|███       | 3933/12776 [39:07<44:33,  3.31it/s]                                                     31%|███       | 3933/12776 [39:07<44:33,  3.31it/s] 31%|███       | 3934/12776 [39:07<42:14,  3.49it/s]                                                     31%|███       | 3934/12776 [39:07<42:14,  3.49it/s] 31%|███       | 3935/12776 [39:07<40:21,  3.65it/s]                                                     31%|███       | 3935/12776 [39:07<40:21,  3.65it/s] 31%|███       | 3936/12776 [39:07<38:41,  3.81it/s]                                                     31%|███       | 3936/12776 [39:07<38:41,  3.81it/s] 31%|███       | 3937/12776 [39:08<37:15,  3.95it/s]                                                     31%|███       | 3937/12776 [39:08<37:15,  3.95it/s] 31%|███       | 3938/12776 [39:08<39:44,  3.71it/s]                                                     31%|███       | 3938/12776 [39:08<39:44,  3.71it/s] 31%|███       | 3939/12776 [39:08<37:21,  3.94it/s]                                                     31%|███       | 3939/12776 [39:08<37:21,  3.94it/s] 31%|███       | 3940/12776 [39:08<35:31,  4.15it/s]                                                     31%|███       | 3940/12776 [39:08<35:31,  4.15it/s] 31%|███       | 3941/12776 [39:08<34:13,  4.30it/s]                                                     31%|███       | 3941/12776 [39:08<34:13,  4.30it/s] 31%|███       | 3942/12776 [39:09<33:01,  4.46it/s]                                                     31%|███       | 3942/12776 [39:09<33:01,  4.46it/s] 31%|███       | 3943/12776 [39:09<36:11,  4.07it/s]                                                     31%|███       | 3943/12776 [39:09<36:11,  4.07it/s] 31%|███       | 3944/12776 [39:09<34:09,  4.31it/s]                                                     31%|███       | 3944/12776 [39:09<34:09,  4.31it/s] 31%|███       | 3945/12776 [39:09<32:38,  4.51it/s]                                                     31%|███       | 3945/12776 [39:09<32:38,  4.51it/s] 31%|███       | 3946/12776 [39:10<31:34,  4.66it/s]                                                     31%|███       | 3946/12776 [39:10<31:34,  4.66it/s] 31%|███       | 3947/12776 [39:10<30:37,  4.81it/s]                                                     31%|███       | 3947/12776 [39:10<30:37,  4.81it/s] 31%|███       | 3948/12776 [39:10<32:58,  4.46it/s]                                                     31%|███       | 3948/12776 [39:10<32:58,  4.46it/s] 31%|███       | 3949/12776 [39:10<31:18,  4.70it/s]                                                     31%|███       | 3949/12776 [39:10<31:18,  4.70it/s] 31%|███       | 3950/12776 [39:11<55:13,  2.66it/s]                                                     31%|███       | 3950/12776 [39:11<55:13,  2.66it/s] 31%|███       | 3951/12776 [39:12<1:43:04,  1.43it/s]                                                       31%|███       | 3951/12776 [39:12<1:43:04,  1.43it/s] 31%|███       | 3952/12776 [39:13<1:57:09,  1.26it/s]                                                       31%|███       | 3952/12776 [39:13<1:57:09,  1.26it/s] 31%|███       | 3953/12776 [39:14<1:59:36,  1.23it/s]                                                       31%|███       | 3953/12776 [39:14<1:59:36,  1.23it/s] 31%|███       | 3954/12776 [39:15<1:58:14,  1.24it/s]                                                       31%|███       | 3954/12776 [39:15<1:58:14,  1.24it/s] 31%|███       | 3955/12776 [39:16<1:55:15,  1.28it/s]                                                       31%|███       | 3955/12776 [39:16<1:55:15,  1.28it/s] 31%|███       | 3956/12776 [39:17<1:51:38,  1.32it/s]                                                       31%|███       | 3956/12776 [39:17<1:51:38,  1.32it/s] 31%|███       | 3957/12776 [39:17<1:48:50,  1.35it/s]                                                       31%|███       | 3957/12776 [39:17<1:48:50,  1.35it/s] 31%|███       | 3958/12776 [39:18<1:44:10,  1.41it/s]                                                       31%|███       | 3958/12776 [39:18<1:44:10,  1.41it/s] 31%|███       | 3959/12776 [39:18<1:39:26,  1.48it/s]                                                       31%|███       | 3959/12776 [39:18<1:39:26,  1.48it/s] 31%|███       | 3960/12776 [39:19<1:34:08,  1.56it/s]                                                       31%|███       | 3960/12776 [39:19<1:34:08,  1.56it/s] 31%|███       | 3961/12776 [39:20<1:29:12,  1.65it/s]                                                       31%|███       | 3961/12776 [39:20<1:29:12,  1.65it/s] 31%|███       | 3962/12776 [39:20<1:25:07,  1.73it/s]                                                       31%|███       | 3962/12776 [39:20<1:25:07,  1.73it/s] 31%|███       | 3963/12776 [39:21<1:25:19,  1.72it/s]                                                       31%|███       | 3963/12776 [39:21<1:25:19,  1.72it/s] 31%|███       | 3964/12776 [39:21<1:19:50,  1.84it/s]                                                       31%|███       | 3964/12776 [39:21<1:19:50,  1.84it/s] 31%|███       | 3965/12776 [39:22<1:21:38,  1.80it/s]                                                       31%|███       | 3965/12776 [39:22<1:21:38,  1.80it/s] 31%|███       | 3966/12776 [39:22<1:16:24,  1.92it/s]                                                       31%|███       | 3966/12776 [39:22<1:16:24,  1.92it/s] 31%|███       | 3967/12776 [39:23<1:16:47,  1.91it/s]                                                       31%|███       | 3967/12776 [39:23<1:16:47,  1.91it/s] 31%|███       | 3968/12776 [39:23<1:12:02,  2.04it/s]                                                       31%|███       | 3968/12776 [39:23<1:12:02,  2.04it/s] 31%|███       | 3969/12776 [39:23<1:07:36,  2.17it/s]                                                       31%|███       | 3969/12776 [39:23<1:07:36,  2.17it/s] 31%|███       | 3970/12776 [39:24<1:08:47,  2.13it/s]                                                       31%|███       | 3970/12776 [39:24<1:08:47,  2.13it/s] 31%|███       | 3971/12776 [39:24<1:04:12,  2.29it/s]                                                       31%|███       | 3971/12776 [39:24<1:04:12,  2.29it/s] 31%|███       | 3972/12776 [39:25<1:00:29,  2.43it/s]                                                       31%|███       | 3972/12776 [39:25<1:00:29,  2.43it/s] 31%|███       | 3973/12776 [39:25<59:45,  2.46it/s]                                                       31%|███       | 3973/12776 [39:25<59:45,  2.46it/s] 31%|███       | 3974/12776 [39:25<56:32,  2.59it/s]                                                     31%|███       | 3974/12776 [39:25<56:32,  2.59it/s] 31%|███       | 3975/12776 [39:26<53:59,  2.72it/s]                                                     31%|███       | 3975/12776 [39:26<53:59,  2.72it/s] 31%|███       | 3976/12776 [39:26<57:21,  2.56it/s]                                                     31%|███       | 3976/12776 [39:26<57:21,  2.56it/s] 31%|███       | 3977/12776 [39:26<53:39,  2.73it/s]                                                     31%|███       | 3977/12776 [39:26<53:39,  2.73it/s] 31%|███       | 3978/12776 [39:27<50:42,  2.89it/s]                                                     31%|███       | 3978/12776 [39:27<50:42,  2.89it/s] 31%|███       | 3979/12776 [39:27<51:51,  2.83it/s]                                                     31%|███       | 3979/12776 [39:27<51:51,  2.83it/s] 31%|███       | 3980/12776 [39:27<48:39,  3.01it/s]                                                     31%|███       | 3980/12776 [39:27<48:39,  3.01it/s] 31%|███       | 3981/12776 [39:28<46:00,  3.19it/s]                                                     31%|███       | 3981/12776 [39:28<46:00,  3.19it/s] 31%|███       | 3982/12776 [39:28<43:51,  3.34it/s]                                                     31%|███       | 3982/12776 [39:28<43:51,  3.34it/s] 31%|███       | 3983/12776 [39:28<44:29,  3.29it/s]                                                     31%|███       | 3983/12776 [39:28<44:29,  3.29it/s] 31%|███       | 3984/12776 [39:29<42:14,  3.47it/s]                                                     31%|███       | 3984/12776 [39:29<42:14,  3.47it/s] 31%|███       | 3985/12776 [39:29<40:24,  3.63it/s]                                                     31%|███       | 3985/12776 [39:29<40:24,  3.63it/s] 31%|███       | 3986/12776 [39:29<38:52,  3.77it/s]                                                     31%|███       | 3986/12776 [39:29<38:52,  3.77it/s] 31%|███       | 3987/12776 [39:29<37:33,  3.90it/s]                                                     31%|███       | 3987/12776 [39:29<37:33,  3.90it/s] 31%|███       | 3988/12776 [39:30<39:42,  3.69it/s]                                                     31%|███       | 3988/12776 [39:30<39:42,  3.69it/s] 31%|███       | 3989/12776 [39:30<37:28,  3.91it/s]                                                     31%|███       | 3989/12776 [39:30<37:28,  3.91it/s] 31%|███       | 3990/12776 [39:30<35:36,  4.11it/s]                                                     31%|███       | 3990/12776 [39:30<35:36,  4.11it/s] 31%|███       | 3991/12776 [39:30<34:04,  4.30it/s]                                                     31%|███       | 3991/12776 [39:30<34:04,  4.30it/s] 31%|███       | 3992/12776 [39:30<33:03,  4.43it/s]                                                     31%|███       | 3992/12776 [39:30<33:03,  4.43it/s] 31%|███▏      | 3993/12776 [39:31<35:52,  4.08it/s]                                                     31%|███▏      | 3993/12776 [39:31<35:52,  4.08it/s] 31%|███▏      | 3994/12776 [39:31<33:51,  4.32it/s]                                                     31%|███▏      | 3994/12776 [39:31<33:51,  4.32it/s] 31%|███▏      | 3995/12776 [39:31<32:17,  4.53it/s]                                                     31%|███▏      | 3995/12776 [39:31<32:17,  4.53it/s] 31%|███▏      | 3996/12776 [39:31<31:16,  4.68it/s]                                                     31%|███▏      | 3996/12776 [39:31<31:16,  4.68it/s] 31%|███▏      | 3997/12776 [39:31<30:24,  4.81it/s]                                                     31%|███▏      | 3997/12776 [39:31<30:24,  4.81it/s] 31%|███▏      | 3998/12776 [39:32<31:43,  4.61it/s]                                                     31%|███▏      | 3998/12776 [39:32<31:43,  4.61it/s] 31%|███▏      | 3999/12776 [39:32<30:25,  4.81it/s]                                                     31%|███▏      | 3999/12776 [39:32<30:25,  4.81it/s] 31%|███▏      | 4000/12776 [39:33<52:40,  2.78it/s]                                                     31%|███▏      | 4000/12776 [39:33<52:40,  2.78it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.683, 'grad_norm': 1.561766505241394, 'learning_rate': 0.00021644672531769303, 'epoch': 0.62}
+{'loss': 0.5201, 'grad_norm': 1.2200226783752441, 'learning_rate': 0.0002164222873900293, 'epoch': 0.62}
+{'loss': 0.7908, 'grad_norm': 3.1129469871520996, 'learning_rate': 0.00021639784946236556, 'epoch': 0.62}
+{'loss': 1.0388, 'grad_norm': 2.578989028930664, 'learning_rate': 0.00021637341153470183, 'epoch': 0.62}
+{'loss': 0.6982, 'grad_norm': 1.8467810153961182, 'learning_rate': 0.0002163489736070381, 'epoch': 0.62}
+{'loss': 0.8299, 'grad_norm': 2.9458441734313965, 'learning_rate': 0.00021632453567937436, 'epoch': 0.62}
+{'loss': 0.9103, 'grad_norm': 4.411853313446045, 'learning_rate': 0.00021630009775171064, 'epoch': 0.62}
+{'loss': 0.9272, 'grad_norm': 1.8267143964767456, 'learning_rate': 0.00021627565982404692, 'epoch': 0.62}
+{'loss': 2.282, 'grad_norm': 5.815080165863037, 'learning_rate': 0.00021625122189638314, 'epoch': 0.62}
+{'loss': 1.254, 'grad_norm': 3.6911022663116455, 'learning_rate': 0.00021622678396871942, 'epoch': 0.62}
+{'loss': 1.1136, 'grad_norm': 2.4619340896606445, 'learning_rate': 0.0002162023460410557, 'epoch': 0.62}
+{'loss': 1.067, 'grad_norm': 6.795989036560059, 'learning_rate': 0.00021617790811339195, 'epoch': 0.62}
+{'loss': 1.8717, 'grad_norm': 4.4798126220703125, 'learning_rate': 0.00021615347018572823, 'epoch': 0.62}
+{'loss': 1.3237, 'grad_norm': 5.152458667755127, 'learning_rate': 0.0002161290322580645, 'epoch': 0.62}
+{'loss': 1.6574, 'grad_norm': 4.771958827972412, 'learning_rate': 0.00021610459433040076, 'epoch': 0.62}
+{'loss': 2.0387, 'grad_norm': 2.8596343994140625, 'learning_rate': 0.00021608015640273704, 'epoch': 0.62}
+{'loss': 1.2219, 'grad_norm': 2.4196653366088867, 'learning_rate': 0.0002160557184750733, 'epoch': 0.62}
+{'loss': 0.9666, 'grad_norm': 1.575811505317688, 'learning_rate': 0.00021603128054740954, 'epoch': 0.62}
+{'loss': 0.6596, 'grad_norm': 1.7998102903366089, 'learning_rate': 0.00021600684261974582, 'epoch': 0.62}
+{'loss': 0.7692, 'grad_norm': 2.5774621963500977, 'learning_rate': 0.0002159824046920821, 'epoch': 0.62}
+{'loss': 1.069, 'grad_norm': 2.7812228202819824, 'learning_rate': 0.00021595796676441835, 'epoch': 0.62}
+{'loss': 1.0517, 'grad_norm': 3.195280075073242, 'learning_rate': 0.00021593352883675463, 'epoch': 0.62}
+{'loss': 0.2721, 'grad_norm': 0.4578677713871002, 'learning_rate': 0.0002159090909090909, 'epoch': 0.62}
+{'loss': 0.3176, 'grad_norm': 0.7027708292007446, 'learning_rate': 0.00021588465298142713, 'epoch': 0.62}
+{'loss': 0.416, 'grad_norm': 0.6740389466285706, 'learning_rate': 0.0002158602150537634, 'epoch': 0.62}
+{'loss': 0.3664, 'grad_norm': 0.6556975245475769, 'learning_rate': 0.0002158357771260997, 'epoch': 0.62}
+{'loss': 0.3161, 'grad_norm': 0.4339908957481384, 'learning_rate': 0.00021581133919843594, 'epoch': 0.62}
+{'loss': 0.4315, 'grad_norm': 0.7102330923080444, 'learning_rate': 0.00021578690127077222, 'epoch': 0.62}
+{'loss': 0.3424, 'grad_norm': 0.7351087331771851, 'learning_rate': 0.0002157624633431085, 'epoch': 0.62}
+{'loss': 0.4188, 'grad_norm': 0.7561622858047485, 'learning_rate': 0.00021573802541544475, 'epoch': 0.62}
+{'loss': 0.4811, 'grad_norm': 1.3164188861846924, 'learning_rate': 0.00021571358748778102, 'epoch': 0.62}
+{'loss': 0.3488, 'grad_norm': 0.9041202664375305, 'learning_rate': 0.0002156891495601173, 'epoch': 0.62}
+{'loss': 0.3205, 'grad_norm': 1.003924012184143, 'learning_rate': 0.00021566471163245353, 'epoch': 0.62}
+{'loss': 0.2906, 'grad_norm': 0.5604303479194641, 'learning_rate': 0.0002156402737047898, 'epoch': 0.62}
+{'loss': 0.3154, 'grad_norm': 0.638725221157074, 'learning_rate': 0.00021561583577712608, 'epoch': 0.62}
+{'loss': 0.5327, 'grad_norm': 1.5510947704315186, 'learning_rate': 0.00021559139784946234, 'epoch': 0.62}
+{'loss': 0.5919, 'grad_norm': 1.3755923509597778, 'learning_rate': 0.0002155669599217986, 'epoch': 0.62}
+{'loss': 0.2272, 'grad_norm': 0.9352520704269409, 'learning_rate': 0.0002155425219941349, 'epoch': 0.62}
+{'loss': 0.352, 'grad_norm': 1.7107229232788086, 'learning_rate': 0.00021551808406647114, 'epoch': 0.62}
+{'loss': 0.7203, 'grad_norm': 1.7082419395446777, 'learning_rate': 0.0002154936461388074, 'epoch': 0.62}
+{'loss': 0.8749, 'grad_norm': 2.179602861404419, 'learning_rate': 0.00021546920821114367, 'epoch': 0.62}
+{'loss': 0.6856, 'grad_norm': 1.580485224723816, 'learning_rate': 0.00021544477028347992, 'epoch': 0.62}
+{'loss': 0.4401, 'grad_norm': 1.842761754989624, 'learning_rate': 0.0002154203323558162, 'epoch': 0.62}
+{'loss': 0.6815, 'grad_norm': 1.207093596458435, 'learning_rate': 0.00021539589442815248, 'epoch': 0.62}
+{'loss': 0.5138, 'grad_norm': 1.4717131853103638, 'learning_rate': 0.00021537145650048873, 'epoch': 0.62}
+{'loss': 0.6397, 'grad_norm': 1.9231157302856445, 'learning_rate': 0.000215347018572825, 'epoch': 0.62}
+{'loss': 0.6357, 'grad_norm': 1.4516040086746216, 'learning_rate': 0.0002153225806451613, 'epoch': 0.62}
+{'loss': 0.497, 'grad_norm': 1.3185113668441772, 'learning_rate': 0.0002152981427174975, 'epoch': 0.62}
+{'loss': 0.6394, 'grad_norm': 1.5022931098937988, 'learning_rate': 0.0002152737047898338, 'epoch': 0.62}
+{'loss': 0.6296, 'grad_norm': 2.147803544998169, 'learning_rate': 0.00021524926686217007, 'epoch': 0.62}
+{'loss': 1.1217, 'grad_norm': 2.3835220336914062, 'learning_rate': 0.00021522482893450632, 'epoch': 0.62}
+{'loss': 0.9852, 'grad_norm': 2.3003957271575928, 'learning_rate': 0.0002152003910068426, 'epoch': 0.62}
+{'loss': 0.9064, 'grad_norm': 2.8181819915771484, 'learning_rate': 0.00021517595307917888, 'epoch': 0.62}
+{'loss': 0.8785, 'grad_norm': 1.4563791751861572, 'learning_rate': 0.00021515151515151513, 'epoch': 0.62}
+{'loss': 0.708, 'grad_norm': 1.4717764854431152, 'learning_rate': 0.0002151270772238514, 'epoch': 0.62}
+{'loss': 1.0, 'grad_norm': 1.8528285026550293, 'learning_rate': 0.00021510263929618769, 'epoch': 0.62}
+{'loss': 0.6012, 'grad_norm': 1.295715093612671, 'learning_rate': 0.0002150782013685239, 'epoch': 0.62}
+{'loss': 1.2052, 'grad_norm': 3.2727482318878174, 'learning_rate': 0.0002150537634408602, 'epoch': 0.62}
+{'loss': 1.0637, 'grad_norm': 2.074563980102539, 'learning_rate': 0.00021502932551319647, 'epoch': 0.62}
+{'loss': 0.8989, 'grad_norm': 2.350457191467285, 'learning_rate': 0.00021500488758553272, 'epoch': 0.62}
+{'loss': 0.3382, 'grad_norm': 2.035271406173706, 'learning_rate': 0.000214980449657869, 'epoch': 0.62}
+{'loss': 1.3783, 'grad_norm': 2.9534759521484375, 'learning_rate': 0.00021495601173020527, 'epoch': 0.62}
+{'loss': 1.3872, 'grad_norm': 1.612892508506775, 'learning_rate': 0.00021493157380254153, 'epoch': 0.62}
+{'loss': 1.1709, 'grad_norm': 1.9635756015777588, 'learning_rate': 0.00021490713587487778, 'epoch': 0.62}
+{'loss': 1.7638, 'grad_norm': 3.8679616451263428, 'learning_rate': 0.00021488269794721405, 'epoch': 0.63}
+{'loss': 1.1448, 'grad_norm': 2.8190555572509766, 'learning_rate': 0.0002148582600195503, 'epoch': 0.63}
+{'loss': 0.749, 'grad_norm': 1.001844882965088, 'learning_rate': 0.00021483382209188658, 'epoch': 0.63}
+{'loss': 0.5163, 'grad_norm': 1.1124273538589478, 'learning_rate': 0.00021480938416422286, 'epoch': 0.63}
+{'loss': 0.2587, 'grad_norm': 2.1196939945220947, 'learning_rate': 0.00021478494623655911, 'epoch': 0.63}
+{'loss': 0.7465, 'grad_norm': 3.0413784980773926, 'learning_rate': 0.0002147605083088954, 'epoch': 0.63}
+{'loss': 0.7421, 'grad_norm': 2.728447437286377, 'learning_rate': 0.00021473607038123167, 'epoch': 0.63}
+{'loss': 0.5418, 'grad_norm': 1.2172927856445312, 'learning_rate': 0.0002147116324535679, 'epoch': 0.63}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:05,  6.14it/s][A
+  0%|          | 3/774 [00:00<02:50,  4.53it/s][A
+  1%|          | 4/774 [00:00<03:15,  3.93it/s][A
+  1%|          | 5/774 [00:01<03:16,  3.91it/s][A
+  1%|          | 6/774 [00:01<03:32,  3.61it/s][A
+  1%|          | 7/774 [00:01<03:30,  3.64it/s][A
+  1%|          | 8/774 [00:02<03:36,  3.54it/s][A
+  1%|          | 9/774 [00:02<03:21,  3.79it/s][A
+  1%|▏         | 10/774 [00:02<03:20,  3.82it/s][A
+  1%|▏         | 11/774 [00:02<03:34,  3.56it/s][A
+  2%|▏         | 12/774 [00:03<03:19,  3.82it/s][A
+  2%|▏         | 13/774 [00:03<03:09,  4.01it/s][A
+  2%|▏         | 14/774 [00:03<03:22,  3.76it/s][A
+  2%|▏         | 15/774 [00:03<03:41,  3.43it/s][A
+  2%|▏         | 16/774 [00:04<03:43,  3.40it/s][A
+  2%|▏         | 17/774 [00:04<03:17,  3.84it/s][A
+  2%|▏         | 18/774 [00:04<03:09,  3.99it/s][A
+  2%|▏         | 19/774 [00:04<03:20,  3.76it/s][A
+  3%|▎         | 20/774 [00:05<03:20,  3.76it/s][A
+  3%|▎         | 21/774 [00:05<03:23,  3.70it/s][A
+  3%|▎         | 22/774 [00:05<03:26,  3.64it/s][A
+  3%|▎         | 23/774 [00:06<03:36,  3.46it/s][A
+  3%|▎         | 24/774 [00:06<03:34,  3.49it/s][A
+  3%|▎         | 25/774 [00:06<03:39,  3.41it/s][A
+  3%|▎         | 26/774 [00:07<03:45,  3.32it/s][A
+  3%|▎         | 27/774 [00:07<03:42,  3.36it/s][A
+  4%|▎         | 28/774 [00:07<03:46,  3.29it/s][A
+  4%|▎         | 29/774 [00:07<03:47,  3.27it/s][A
+  4%|▍         | 30/774 [00:08<03:34,  3.47it/s][A
+  4%|▍         | 31/774 [00:08<03:41,  3.36it/s][A
+  4%|▍         | 32/774 [00:08<04:07,  3.00it/s][A
+  4%|▍         | 33/774 [00:09<03:53,  3.17it/s][A
+  4%|▍         | 34/774 [00:09<03:41,  3.35it/s][A
+  5%|▍         | 35/774 [00:09<03:48,  3.23it/s][A
+  5%|▍         | 36/774 [00:10<03:55,  3.14it/s][A
+  5%|▍         | 37/774 [00:10<03:53,  3.16it/s][A
+  5%|▍         | 38/774 [00:10<03:41,  3.32it/s][A
+  5%|▌         | 39/774 [00:10<03:26,  3.57it/s][A
+  5%|▌         | 40/774 [00:11<03:32,  3.46it/s][A
+  5%|▌         | 41/774 [00:11<03:26,  3.56it/s][A
+  5%|▌         | 42/774 [00:11<03:14,  3.76it/s][A
+  6%|▌         | 43/774 [00:12<03:26,  3.54it/s][A
+  6%|▌         | 44/774 [00:12<03:30,  3.47it/s][A
+  6%|▌         | 45/774 [00:12<03:17,  3.68it/s][A
+  6%|▌         | 46/774 [00:12<03:02,  3.98it/s][A
+  6%|▌         | 47/774 [00:13<02:51,  4.24it/s][A
+  6%|▌         | 48/774 [00:13<02:53,  4.18it/s][A
+  6%|▋         | 49/774 [00:13<02:56,  4.11it/s][A
+  6%|▋         | 50/774 [00:13<02:57,  4.07it/s][A
+  7%|▋         | 51/774 [00:14<02:59,  4.04it/s][A
+  7%|▋         | 52/774 [00:14<02:59,  4.02it/s][A
+  7%|▋         | 53/774 [00:14<03:09,  3.81it/s][A
+  7%|▋         | 54/774 [00:14<03:12,  3.74it/s][A
+  7%|▋         | 55/774 [00:15<03:20,  3.58it/s][A
+  7%|▋         | 56/774 [00:15<03:22,  3.55it/s][A
+  7%|▋         | 57/774 [00:15<03:27,  3.45it/s][A
+  7%|▋         | 58/774 [00:16<03:26,  3.46it/s][A
+  8%|▊         | 59/774 [00:16<03:08,  3.80it/s][A
+  8%|▊         | 60/774 [00:16<02:54,  4.08it/s][A
+  8%|▊         | 61/774 [00:16<02:32,  4.67it/s][A
+  8%|▊         | 62/774 [00:16<02:30,  4.73it/s][A
+  8%|▊         | 63/774 [00:17<02:55,  4.05it/s][A
+  8%|▊         | 64/774 [00:17<02:46,  4.27it/s][A
+  8%|▊         | 65/774 [00:17<02:52,  4.12it/s][A
+  9%|▊         | 66/774 [00:17<02:48,  4.19it/s][A
+  9%|▊         | 67/774 [00:18<02:42,  4.35it/s][A
+  9%|▉         | 68/774 [00:18<02:38,  4.46it/s][A
+  9%|▉         | 69/774 [00:18<02:29,  4.71it/s][A
+  9%|▉         | 70/774 [00:18<02:40,  4.40it/s][A
+  9%|▉         | 71/774 [00:18<02:34,  4.56it/s][A
+  9%|▉         | 72/774 [00:19<02:44,  4.27it/s][A
+  9%|▉         | 73/774 [00:19<02:54,  4.02it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.91it/s][A
+ 10%|▉         | 75/774 [00:20<03:05,  3.77it/s][A
+ 10%|▉         | 76/774 [00:20<03:02,  3.83it/s][A
+ 10%|▉         | 77/774 [00:20<03:18,  3.52it/s][A
+ 10%|█         | 78/774 [00:20<02:57,  3.92it/s][A
+ 10%|█         | 79/774 [00:20<02:44,  4.22it/s][A
+ 10%|█         | 80/774 [00:21<02:41,  4.30it/s][A
+ 10%|█         | 81/774 [00:21<02:19,  4.98it/s][A
+ 11%|█         | 82/774 [00:21<02:25,  4.75it/s][A
+ 11%|█         | 83/774 [00:21<02:27,  4.68it/s][A
+ 11%|█         | 84/774 [00:22<02:30,  4.59it/s][A
+ 11%|█         | 85/774 [00:22<02:38,  4.35it/s][A
+ 11%|█         | 86/774 [00:22<02:45,  4.17it/s][A
+ 11%|█         | 87/774 [00:22<02:46,  4.14it/s][A
+ 11%|█▏        | 88/774 [00:22<02:34,  4.45it/s][A
+ 11%|█▏        | 89/774 [00:23<02:27,  4.66it/s][A
+ 12%|█▏        | 90/774 [00:23<02:35,  4.39it/s][A
+ 12%|█▏        | 91/774 [00:23<02:55,  3.89it/s][A
+ 12%|█▏        | 92/774 [00:24<03:08,  3.63it/s][A
+ 12%|█▏        | 93/774 [00:24<03:01,  3.76it/s][A
+ 12%|█▏        | 94/774 [00:24<03:04,  3.68it/s][A
+ 12%|█▏        | 95/774 [00:24<03:03,  3.70it/s][A
+ 12%|█▏        | 96/774 [00:25<02:59,  3.78it/s][A
+ 13%|█▎        | 97/774 [00:25<02:45,  4.09it/s][A
+ 13%|█▎        | 98/774 [00:25<02:35,  4.34it/s][A
+ 13%|█▎        | 99/774 [00:25<02:47,  4.02it/s][A
+ 13%|█▎        | 100/774 [00:26<02:59,  3.76it/s][A
+ 13%|█▎        | 101/774 [00:26<03:04,  3.64it/s][A
+ 13%|█▎        | 102/774 [00:26<03:15,  3.45it/s][A
+ 13%|█▎        | 103/774 [00:27<03:21,  3.33it/s][A
+ 13%|█▎        | 104/774 [00:27<03:21,  3.32it/s][A
+ 14%|█▎        | 105/774 [00:27<03:20,  3.33it/s][A
+ 14%|█▎        | 106/774 [00:28<03:39,  3.04it/s][A
+ 14%|█▍        | 107/774 [00:28<03:50,  2.89it/s][A
+ 14%|█▍        | 108/774 [00:28<03:41,  3.01it/s][A
+ 14%|█▍        | 109/774 [00:29<03:36,  3.08it/s][A
+ 14%|█▍        | 110/774 [00:29<03:26,  3.22it/s][A
+ 14%|█▍        | 111/774 [00:29<03:25,  3.23it/s][A
+ 14%|█▍        | 112/774 [00:29<03:15,  3.39it/s][A
+ 15%|█▍        | 113/774 [00:30<03:18,  3.33it/s][A
+ 15%|█▍        | 114/774 [00:30<03:25,  3.21it/s][A
+ 15%|█▍        | 115/774 [00:30<03:19,  3.31it/s][A
+ 15%|█▍        | 116/774 [00:31<03:04,  3.57it/s][A
+ 15%|█▌        | 117/774 [00:31<03:08,  3.48it/s][A
+ 15%|█▌        | 118/774 [00:31<03:07,  3.50it/s][A
+ 15%|█▌        | 119/774 [00:31<02:59,  3.65it/s][A
+ 16%|█▌        | 120/774 [00:32<03:09,  3.45it/s][A
+ 16%|█▌        | 121/774 [00:32<03:04,  3.54it/s][A
+ 16%|█▌        | 122/774 [00:32<03:07,  3.47it/s][A
+ 16%|█▌        | 123/774 [00:33<02:59,  3.62it/s][A
+ 16%|█▌        | 124/774 [00:33<03:00,  3.59it/s][A
+ 16%|█▌        | 125/774 [00:33<03:02,  3.55it/s][A
+ 16%|█▋        | 126/774 [00:33<03:10,  3.40it/s][A
+ 16%|█▋        | 127/774 [00:34<03:20,  3.23it/s][A
+ 17%|█▋        | 128/774 [00:34<03:10,  3.40it/s][A
+ 17%|█▋        | 129/774 [00:34<03:11,  3.37it/s][A
+ 17%|█▋        | 130/774 [00:35<03:18,  3.25it/s][A
+ 17%|█▋        | 131/774 [00:35<03:08,  3.42it/s][A
+ 17%|█▋        | 132/774 [00:35<03:08,  3.40it/s][A
+ 17%|█▋        | 133/774 [00:35<03:04,  3.47it/s][A
+ 17%|█▋        | 134/774 [00:36<03:04,  3.47it/s][A
+ 17%|█▋        | 135/774 [00:36<03:21,  3.17it/s][A
+ 18%|█▊        | 136/774 [00:37<03:29,  3.05it/s][A
+ 18%|█▊        | 137/774 [00:37<03:27,  3.07it/s][A
+ 18%|█▊        | 138/774 [00:37<03:23,  3.13it/s][A
+ 18%|█▊        | 139/774 [00:37<03:24,  3.11it/s][A
+ 18%|█▊        | 140/774 [00:38<03:20,  3.16it/s][A
+ 18%|█▊        | 141/774 [00:38<03:12,  3.29it/s][A
+ 18%|█▊        | 142/774 [00:38<03:25,  3.08it/s][A
+ 18%|█▊        | 143/774 [00:39<03:21,  3.14it/s][A
+ 19%|█▊        | 144/774 [00:39<03:10,  3.31it/s][A
+ 19%|█▊        | 145/774 [00:39<03:02,  3.44it/s][A
+ 19%|█▉        | 146/774 [00:39<02:51,  3.67it/s][A
+ 19%|█▉        | 147/774 [00:40<02:41,  3.87it/s][A
+ 19%|█▉        | 148/774 [00:40<02:51,  3.65it/s][A
+ 19%|█▉        | 149/774 [00:40<03:03,  3.41it/s][A
+ 19%|█▉        | 150/774 [00:41<03:06,  3.35it/s][A
+ 20%|█▉        | 151/774 [00:41<02:56,  3.54it/s][A
+ 20%|█▉        | 152/774 [00:41<02:48,  3.70it/s][A
+ 20%|█▉        | 153/774 [00:41<02:54,  3.57it/s][A
+ 20%|█▉        | 154/774 [00:42<02:49,  3.65it/s][A
+ 20%|██        | 155/774 [00:42<02:47,  3.70it/s][A
+ 20%|██        | 156/774 [00:42<02:41,  3.82it/s][A
+ 20%|██        | 157/774 [00:42<02:35,  3.97it/s][A
+ 20%|██        | 158/774 [00:43<02:38,  3.88it/s][A
+ 21%|██        | 159/774 [00:43<02:40,  3.82it/s][A
+ 21%|██        | 160/774 [00:43<02:32,  4.02it/s][A
+ 21%|██        | 161/774 [00:44<02:42,  3.77it/s][A
+ 21%|██        | 162/774 [00:44<02:47,  3.65it/s][A
+ 21%|██        | 163/774 [00:44<02:47,  3.66it/s][A
+ 21%|██        | 164/774 [00:44<02:41,  3.78it/s][A
+ 21%|██▏       | 165/774 [00:45<02:39,  3.82it/s][A
+ 21%|██▏       | 166/774 [00:45<02:43,  3.72it/s][A
+ 22%|██▏       | 167/774 [00:45<02:45,  3.66it/s][A
+ 22%|██▏       | 168/774 [00:45<02:36,  3.86it/s][A
+ 22%|██▏       | 169/774 [00:46<02:29,  4.05it/s][A
+ 22%|██▏       | 170/774 [00:46<02:38,  3.81it/s][A
+ 22%|██▏       | 171/774 [00:46<02:48,  3.58it/s][A
+ 22%|██▏       | 172/774 [00:47<02:56,  3.42it/s][A
+ 22%|██▏       | 173/774 [00:47<02:52,  3.49it/s][A
+ 22%|██▏       | 174/774 [00:47<02:45,  3.63it/s][A
+ 23%|██▎       | 175/774 [00:47<02:44,  3.64it/s][A
+ 23%|██▎       | 176/774 [00:48<02:38,  3.78it/s][A
+ 23%|██▎       | 177/774 [00:48<02:51,  3.47it/s][A
+ 23%|██▎       | 178/774 [00:48<02:36,  3.81it/s][A
+ 23%|██▎       | 179/774 [00:48<02:23,  4.15it/s][A
+ 23%|██▎       | 180/774 [00:49<02:16,  4.35it/s][A
+ 23%|██▎       | 181/774 [00:49<02:20,  4.22it/s][A
+ 24%|██▎       | 182/774 [00:49<02:24,  4.11it/s][A
+ 24%|██▎       | 183/774 [00:49<02:25,  4.06it/s][A
+ 24%|██▍       | 184/774 [00:50<02:36,  3.78it/s][A
+ 24%|██▍       | 185/774 [00:50<02:44,  3.57it/s][A
+ 24%|██▍       | 186/774 [00:50<02:43,  3.60it/s][A
+ 24%|██▍       | 187/774 [00:50<02:37,  3.73it/s][A
+ 24%|██▍       | 188/774 [00:51<02:36,  3.75it/s][A
+ 24%|██▍       | 189/774 [00:51<02:32,  3.84it/s][A
+ 25%|██▍       | 190/774 [00:51<02:27,  3.96it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.81it/s][A
+ 25%|██▍       | 192/774 [00:52<02:37,  3.69it/s][A
+ 25%|██▍       | 193/774 [00:52<02:39,  3.63it/s][A
+ 25%|██▌       | 194/774 [00:52<02:49,  3.42it/s][A
+ 25%|██▌       | 195/774 [00:53<02:58,  3.24it/s][A
+ 25%|██▌       | 196/774 [00:53<02:59,  3.23it/s][A
+ 25%|██▌       | 197/774 [00:53<02:55,  3.29it/s][A
+ 26%|██▌       | 198/774 [00:54<02:45,  3.48it/s][A
+ 26%|██▌       | 199/774 [00:54<02:47,  3.44it/s][A
+ 26%|██▌       | 200/774 [00:54<02:41,  3.54it/s][A
+ 26%|██▌       | 201/774 [00:54<02:37,  3.64it/s][A
+ 26%|██▌       | 202/774 [00:55<02:34,  3.70it/s][A
+ 26%|██▌       | 203/774 [00:55<02:27,  3.88it/s][A
+ 26%|██▋       | 204/774 [00:55<02:31,  3.77it/s][A
+ 26%|██▋       | 205/774 [00:55<02:41,  3.53it/s][A
+ 27%|██▋       | 206/774 [00:56<02:36,  3.63it/s][A
+ 27%|██▋       | 207/774 [00:56<02:33,  3.69it/s][A
+ 27%|██▋       | 208/774 [00:56<02:33,  3.68it/s][A
+ 27%|██▋       | 209/774 [00:57<02:32,  3.70it/s][A
+ 27%|██▋       | 210/774 [00:57<02:31,  3.73it/s][A
+ 27%|██▋       | 211/774 [00:57<02:27,  3.80it/s][A
+ 27%|██▋       | 212/774 [00:57<02:16,  4.10it/s][A
+ 28%|██▊       | 213/774 [00:57<02:01,  4.60it/s][A
+ 28%|██▊       | 214/774 [00:58<02:04,  4.50it/s][A
+ 28%|██▊       | 215/774 [00:58<02:03,  4.54it/s][A
+ 28%|██▊       | 216/774 [00:58<02:02,  4.57it/s][A
+ 28%|██▊       | 217/774 [00:58<02:05,  4.43it/s][A
+ 28%|██▊       | 218/774 [00:59<02:11,  4.22it/s][A
+ 28%|██▊       | 219/774 [00:59<02:21,  3.92it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.98it/s][A
+ 29%|██▊       | 221/774 [00:59<02:25,  3.81it/s][A
+ 29%|██▊       | 222/774 [01:00<02:34,  3.56it/s][A
+ 29%|██▉       | 223/774 [01:00<02:52,  3.20it/s][A
+ 29%|██▉       | 224/774 [01:00<03:02,  3.02it/s][A
+ 29%|██▉       | 225/774 [01:01<03:14,  2.83it/s][A
+ 29%|██▉       | 226/774 [01:01<03:19,  2.75it/s][A
+ 29%|██▉       | 227/774 [01:02<03:15,  2.80it/s][A
+ 29%|██▉       | 228/774 [01:02<03:07,  2.91it/s][A
+ 30%|██▉       | 229/774 [01:02<03:21,  2.71it/s][A
+ 30%|██▉       | 230/774 [01:03<03:05,  2.93it/s][A
+ 30%|██▉       | 231/774 [01:03<03:02,  2.97it/s][A
+ 30%|██▉       | 232/774 [01:03<02:54,  3.11it/s][A
+ 30%|███       | 233/774 [01:04<03:09,  2.86it/s][A
+ 30%|███       | 234/774 [01:04<03:12,  2.81it/s][A
+ 30%|███       | 235/774 [01:04<03:11,  2.82it/s][A
+ 30%|███       | 236/774 [01:05<03:15,  2.76it/s][A
+ 31%|███       | 237/774 [01:05<03:12,  2.79it/s][A
+ 31%|███       | 238/774 [01:05<03:02,  2.94it/s][A
+ 31%|███       | 239/774 [01:06<03:00,  2.96it/s][A
+ 31%|███       | 240/774 [01:06<02:58,  3.00it/s][A
+ 31%|███       | 241/774 [01:06<03:01,  2.94it/s][A
+ 31%|███▏      | 242/774 [01:07<03:12,  2.77it/s][A
+ 31%|███▏      | 243/774 [01:07<03:21,  2.63it/s][A
+ 32%|███▏      | 244/774 [01:08<03:16,  2.69it/s][A
+ 32%|███▏      | 245/774 [01:08<03:08,  2.80it/s][A
+ 32%|███▏      | 246/774 [01:08<03:08,  2.80it/s][A
+ 32%|███▏      | 247/774 [01:09<03:46,  2.32it/s][A
+ 32%|███▏      | 248/774 [01:09<03:52,  2.26it/s][A
+ 32%|███▏      | 249/774 [01:10<03:27,  2.53it/s][A
+ 32%|███▏      | 250/774 [01:10<03:20,  2.61it/s][A
+ 32%|███▏      | 251/774 [01:10<03:19,  2.63it/s][A
+ 33%|███▎      | 252/774 [01:11<03:14,  2.69it/s][A
+ 33%|███▎      | 253/774 [01:11<03:12,  2.70it/s][A
+ 33%|███▎      | 254/774 [01:11<03:07,  2.77it/s][A
+ 33%|███▎      | 255/774 [01:12<03:03,  2.83it/s][A
+ 33%|███▎      | 256/774 [01:12<02:59,  2.89it/s][A
+ 33%|███▎      | 257/774 [01:12<02:57,  2.92it/s][A
+ 33%|███▎      | 258/774 [01:13<02:42,  3.18it/s][A
+ 33%|███▎      | 259/774 [01:13<02:25,  3.55it/s][A
+ 34%|███▎      | 260/774 [01:13<02:24,  3.56it/s][A
+ 34%|███▎      | 261/774 [01:13<02:29,  3.43it/s][A
+ 34%|███▍      | 262/774 [01:14<02:13,  3.82it/s][A
+ 34%|███▍      | 263/774 [01:14<02:06,  4.03it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.77it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.92it/s][A
+ 34%|███▍      | 266/774 [01:15<02:03,  4.12it/s][A
+ 34%|███▍      | 267/774 [01:15<02:01,  4.16it/s][A
+ 35%|███▍      | 268/774 [01:15<02:08,  3.92it/s][A
+ 35%|███▍      | 269/774 [01:15<02:13,  3.78it/s][A
+ 35%|███▍      | 270/774 [01:16<02:20,  3.60it/s][A
+ 35%|███▌      | 271/774 [01:16<02:15,  3.70it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  4.02it/s][A
+ 35%|███▌      | 273/774 [01:16<02:01,  4.14it/s][A
+ 35%|███▌      | 274/774 [01:17<02:05,  4.00it/s][A
+ 36%|███▌      | 275/774 [01:17<01:58,  4.20it/s][A
+ 36%|███▌      | 276/774 [01:17<01:52,  4.41it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.24it/s][A
+ 36%|███▌      | 278/774 [01:18<01:59,  4.15it/s][A
+ 36%|███▌      | 279/774 [01:18<01:52,  4.41it/s][A
+ 36%|███▌      | 280/774 [01:18<01:54,  4.32it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.93it/s][A
+ 36%|███▋      | 282/774 [01:19<02:16,  3.59it/s][A
+ 37%|███▋      | 283/774 [01:19<02:12,  3.70it/s][A
+ 37%|███▋      | 284/774 [01:19<02:13,  3.66it/s][A
+ 37%|███▋      | 285/774 [01:19<02:06,  3.87it/s][A
+ 37%|███▋      | 286/774 [01:20<02:01,  4.03it/s][A
+ 37%|███▋      | 287/774 [01:20<02:12,  3.68it/s][A
+ 37%|███▋      | 288/774 [01:20<02:16,  3.56it/s][A
+ 37%|███▋      | 289/774 [01:21<02:14,  3.60it/s][A
+ 37%|███▋      | 290/774 [01:21<02:10,  3.71it/s][A
+ 38%|███▊      | 291/774 [01:21<02:10,  3.71it/s][A
+ 38%|███▊      | 292/774 [01:21<02:05,  3.84it/s][A
+ 38%|███▊      | 293/774 [01:22<01:55,  4.18it/s][A
+ 38%|███▊      | 294/774 [01:22<01:51,  4.31it/s][A
+ 38%|███▊      | 295/774 [01:22<01:49,  4.37it/s][A
+ 38%|███▊      | 296/774 [01:22<01:44,  4.56it/s][A
+ 38%|███▊      | 297/774 [01:22<01:39,  4.81it/s][A
+ 39%|███▊      | 298/774 [01:23<01:43,  4.60it/s][A
+ 39%|███▊      | 299/774 [01:23<01:47,  4.43it/s][A
+ 39%|███▉      | 300/774 [01:23<01:53,  4.16it/s][A
+ 39%|███▉      | 301/774 [01:23<01:46,  4.45it/s][A
+ 39%|███▉      | 302/774 [01:23<01:40,  4.69it/s][A
+ 39%|███▉      | 303/774 [01:24<01:38,  4.80it/s][A
+ 39%|███▉      | 304/774 [01:24<01:25,  5.51it/s][A
+ 39%|███▉      | 305/774 [01:24<01:24,  5.52it/s][A
+ 40%|███▉      | 306/774 [01:24<01:37,  4.81it/s][A
+ 40%|███▉      | 307/774 [01:24<01:42,  4.55it/s][A
+ 40%|███▉      | 308/774 [01:25<01:37,  4.78it/s][A
+ 40%|███▉      | 309/774 [01:25<01:37,  4.77it/s][A
+ 40%|████      | 310/774 [01:25<01:42,  4.51it/s][A
+ 40%|████      | 311/774 [01:25<01:41,  4.56it/s][A
+ 40%|████      | 312/774 [01:26<01:38,  4.67it/s][A
+ 40%|████      | 313/774 [01:26<01:38,  4.67it/s][A
+ 41%|████      | 314/774 [01:26<01:40,  4.60it/s][A
+ 41%|████      | 315/774 [01:26<01:48,  4.23it/s][A
+ 41%|████      | 316/774 [01:26<01:39,  4.58it/s][A
+ 41%|████      | 317/774 [01:27<01:33,  4.90it/s][A
+ 41%|████      | 318/774 [01:27<01:36,  4.70it/s][A
+ 41%|████      | 319/774 [01:27<01:39,  4.59it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.59it/s][A
+ 41%|████▏     | 321/774 [01:27<01:31,  4.97it/s][A
+ 42%|████▏     | 322/774 [01:28<01:25,  5.28it/s][A
+ 42%|████▏     | 323/774 [01:28<01:17,  5.81it/s][A
+ 42%|████▏     | 324/774 [01:28<01:24,  5.34it/s][A
+ 42%|████▏     | 325/774 [01:28<01:28,  5.08it/s][A
+ 42%|████▏     | 326/774 [01:28<01:24,  5.30it/s][A
+ 42%|████▏     | 327/774 [01:29<01:27,  5.08it/s][A
+ 42%|████▏     | 328/774 [01:29<01:25,  5.19it/s][A
+ 43%|████▎     | 329/774 [01:29<01:34,  4.71it/s][A
+ 43%|████▎     | 330/774 [01:29<01:30,  4.91it/s][A
+ 43%|████▎     | 331/774 [01:29<01:22,  5.39it/s][A
+ 43%|████▎     | 332/774 [01:30<01:19,  5.58it/s][A
+ 43%|████▎     | 333/774 [01:30<01:22,  5.33it/s][A
+ 43%|████▎     | 334/774 [01:30<01:26,  5.08it/s][A
+ 43%|████▎     | 335/774 [01:30<01:27,  5.03it/s][A
+ 43%|████▎     | 336/774 [01:30<01:26,  5.06it/s][A
+ 44%|████▎     | 337/774 [01:30<01:20,  5.43it/s][A
+ 44%|████▎     | 338/774 [01:31<01:14,  5.82it/s][A
+ 44%|████▍     | 339/774 [01:31<01:10,  6.18it/s][A
+ 44%|████▍     | 340/774 [01:31<01:10,  6.17it/s][A
+ 44%|████▍     | 341/774 [01:31<01:27,  4.92it/s][A
+ 44%|████▍     | 342/774 [01:31<01:37,  4.45it/s][A
+ 44%|████▍     | 343/774 [01:32<01:37,  4.41it/s][A
+ 44%|████▍     | 344/774 [01:32<01:41,  4.22it/s][A
+ 45%|████▍     | 345/774 [01:32<01:44,  4.09it/s][A
+ 45%|████▍     | 346/774 [01:33<01:46,  4.00it/s][A
+ 45%|████▍     | 347/774 [01:33<01:43,  4.11it/s][A
+ 45%|████▍     | 348/774 [01:33<01:38,  4.34it/s][A
+ 45%|████▌     | 349/774 [01:33<01:34,  4.50it/s][A
+ 45%|████▌     | 350/774 [01:33<01:37,  4.34it/s][A
+ 45%|████▌     | 351/774 [01:34<01:37,  4.33it/s][A
+ 45%|████▌     | 352/774 [01:34<01:33,  4.50it/s][A
+ 46%|████▌     | 353/774 [01:34<01:33,  4.49it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.51it/s][A
+ 46%|████▌     | 355/774 [01:35<01:37,  4.30it/s][A
+ 46%|████▌     | 356/774 [01:35<01:47,  3.88it/s][A
+ 46%|████▌     | 357/774 [01:35<02:04,  3.36it/s][A
+ 46%|████▋     | 358/774 [01:36<02:08,  3.23it/s][A
+ 46%|████▋     | 359/774 [01:36<02:07,  3.25it/s][A
+ 47%|████▋     | 360/774 [01:36<02:08,  3.23it/s][A
+ 47%|████▋     | 361/774 [01:36<02:01,  3.41it/s][A
+ 47%|████▋     | 362/774 [01:37<02:07,  3.22it/s][A
+ 47%|████▋     | 363/774 [01:37<02:06,  3.25it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.20it/s][A
+ 47%|████▋     | 365/774 [01:38<02:04,  3.27it/s][A
+ 47%|████▋     | 366/774 [01:38<01:55,  3.53it/s][A
+ 47%|████▋     | 367/774 [01:38<01:50,  3.68it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.77it/s][A
+ 48%|████▊     | 369/774 [01:39<01:54,  3.54it/s][A
+ 48%|████▊     | 370/774 [01:39<02:09,  3.13it/s][A
+ 48%|████▊     | 371/774 [01:39<02:00,  3.35it/s][A
+ 48%|████▊     | 372/774 [01:40<02:00,  3.34it/s][A
+ 48%|████▊     | 373/774 [01:40<01:57,  3.40it/s][A
+ 48%|████▊     | 374/774 [01:40<01:55,  3.47it/s][A
+ 48%|████▊     | 375/774 [01:41<01:55,  3.45it/s][A
+ 49%|████▊     | 376/774 [01:41<02:00,  3.31it/s][A
+ 49%|████▊     | 377/774 [01:41<02:12,  3.00it/s][A
+ 49%|████▉     | 378/774 [01:42<02:13,  2.97it/s][A
+ 49%|████▉     | 379/774 [01:42<02:04,  3.18it/s][A
+ 49%|████▉     | 380/774 [01:42<01:53,  3.46it/s][A
+ 49%|████▉     | 381/774 [01:42<01:45,  3.74it/s][A
+ 49%|████▉     | 382/774 [01:43<01:41,  3.85it/s][A
+ 49%|████▉     | 383/774 [01:43<01:39,  3.92it/s][A
+ 50%|████▉     | 384/774 [01:43<01:47,  3.63it/s][A
+ 50%|████▉     | 385/774 [01:44<01:55,  3.36it/s][A
+ 50%|████▉     | 386/774 [01:44<01:48,  3.56it/s][A
+ 50%|█████     | 387/774 [01:44<01:42,  3.77it/s][A
+ 50%|█████     | 388/774 [01:44<01:48,  3.56it/s][A
+ 50%|█████     | 389/774 [01:45<01:44,  3.67it/s][A
+ 50%|█████     | 390/774 [01:45<01:58,  3.24it/s][A
+ 51%|█████     | 391/774 [01:45<01:59,  3.21it/s][A
+ 51%|█████     | 392/774 [01:46<01:49,  3.49it/s][A
+ 51%|█████     | 393/774 [01:46<01:40,  3.78it/s][A
+ 51%|█████     | 394/774 [01:46<01:41,  3.76it/s][A
+ 51%|█████     | 395/774 [01:46<01:48,  3.49it/s][A
+ 51%|█████     | 396/774 [01:47<01:46,  3.56it/s][A
+ 51%|█████▏    | 397/774 [01:47<01:49,  3.44it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:44,  3.61it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:42,  3.65it/s][A
+ 52%|█████▏    | 400/774 [01:48<01:34,  3.94it/s][A
+ 52%|█████▏    | 401/774 [01:48<01:31,  4.08it/s][A
+ 52%|█████▏    | 402/774 [01:48<01:30,  4.09it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:34,  3.91it/s][A
+ 52%|█████▏    | 404/774 [01:49<01:40,  3.67it/s][A
+ 52%|█████▏    | 405/774 [01:49<01:36,  3.81it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:39,  3.68it/s][A
+ 53%|█████▎    | 407/774 [01:50<01:45,  3.49it/s][A
+ 53%|█████▎    | 408/774 [01:50<01:42,  3.59it/s][A
+ 53%|█████▎    | 409/774 [01:50<01:38,  3.69it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.65it/s][A
+ 53%|█████▎    | 411/774 [01:51<01:39,  3.66it/s][A
+ 53%|█████▎    | 412/774 [01:51<01:40,  3.60it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:38,  3.66it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:36,  3.75it/s][A
+ 54%|█████▎    | 415/774 [01:52<01:24,  4.23it/s][A
+ 54%|█████▎    | 416/774 [01:52<01:24,  4.23it/s][A
+ 54%|█████▍    | 417/774 [01:52<01:23,  4.27it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.58it/s][A
+ 54%|█████▍    | 419/774 [01:53<01:32,  3.84it/s][A
+ 54%|█████▍    | 420/774 [01:53<01:36,  3.66it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:36,  3.66it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:36,  3.64it/s][A
+ 55%|█████▍    | 423/774 [01:54<01:37,  3.59it/s][A
+ 55%|█████▍    | 424/774 [01:54<01:35,  3.66it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:23,  4.17it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:17,  4.46it/s][A
+ 55%|█████▌    | 427/774 [01:55<01:14,  4.68it/s][A
+ 55%|█████▌    | 428/774 [01:55<01:16,  4.54it/s][A
+ 55%|█████▌    | 429/774 [01:55<01:18,  4.38it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:22,  4.17it/s][A
+ 56%|█████▌    | 431/774 [01:56<01:35,  3.59it/s][A
+ 56%|█████▌    | 432/774 [01:56<01:34,  3.62it/s][A
+ 56%|█████▌    | 433/774 [01:56<01:27,  3.89it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:22,  4.10it/s][A
+ 56%|█████▌    | 435/774 [01:57<01:21,  4.14it/s][A
+ 56%|█████▋    | 436/774 [01:57<01:23,  4.05it/s][A
+ 56%|█████▋    | 437/774 [01:57<01:20,  4.21it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:16,  4.38it/s][A
+ 57%|█████▋    | 439/774 [01:58<01:19,  4.21it/s][A
+ 57%|█████▋    | 440/774 [01:58<01:23,  4.00it/s][A
+ 57%|█████▋    | 441/774 [01:58<01:27,  3.80it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:29,  3.72it/s][A
+ 57%|█████▋    | 443/774 [01:59<01:27,  3.80it/s][A
+ 57%|█████▋    | 444/774 [01:59<01:24,  3.89it/s][A
+ 57%|█████▋    | 445/774 [01:59<01:25,  3.86it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:22,  3.96it/s][A
+ 58%|█████▊    | 447/774 [02:00<01:21,  4.01it/s][A
+ 58%|█████▊    | 448/774 [02:00<01:14,  4.38it/s][A
+ 58%|█████▊    | 449/774 [02:00<01:14,  4.37it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:17,  4.18it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.28it/s][A
+ 58%|█████▊    | 452/774 [02:01<01:11,  4.49it/s][A
+ 59%|█████▊    | 453/774 [02:01<01:10,  4.55it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:15,  4.24it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:20,  3.96it/s][A
+ 59%|█████▉    | 456/774 [02:02<01:25,  3.74it/s][A
+ 59%|█████▉    | 457/774 [02:02<01:19,  4.01it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:18,  4.01it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:17,  4.08it/s][A
+ 59%|█████▉    | 460/774 [02:03<01:22,  3.80it/s][A
+ 60%|█████▉    | 461/774 [02:03<01:29,  3.50it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:26,  3.59it/s][A
+ 60%|█████▉    | 463/774 [02:04<01:23,  3.71it/s][A
+ 60%|█████▉    | 464/774 [02:04<01:23,  3.71it/s][A
+ 60%|██████    | 465/774 [02:04<01:15,  4.10it/s][A
+ 60%|██████    | 466/774 [02:04<01:12,  4.25it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.48it/s][A
+ 60%|██████    | 468/774 [02:05<01:08,  4.49it/s][A
+ 61%|██████    | 469/774 [02:05<01:02,  4.89it/s][A
+ 61%|██████    | 470/774 [02:05<01:00,  5.05it/s][A
+ 61%|██████    | 471/774 [02:05<01:02,  4.84it/s][A
+ 61%|██████    | 472/774 [02:06<01:07,  4.47it/s][A
+ 61%|██████    | 473/774 [02:06<01:10,  4.28it/s][A
+ 61%|██████    | 474/774 [02:06<01:08,  4.37it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:10,  4.26it/s][A
+ 61%|██████▏   | 476/774 [02:07<01:17,  3.82it/s][A
+ 62%|██████▏   | 477/774 [02:07<01:31,  3.23it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:33,  3.18it/s][A
+ 62%|██████▏   | 479/774 [02:08<01:30,  3.25it/s][A
+ 62%|██████▏   | 480/774 [02:08<01:27,  3.36it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:27,  3.33it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:26,  3.39it/s][A
+ 62%|██████▏   | 483/774 [02:09<01:23,  3.47it/s][A
+ 63%|██████▎   | 484/774 [02:09<01:25,  3.41it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:26,  3.34it/s][A
+ 63%|██████▎   | 486/774 [02:10<01:23,  3.45it/s][A
+ 63%|██████▎   | 487/774 [02:10<01:24,  3.38it/s][A
+ 63%|██████▎   | 488/774 [02:10<01:22,  3.46it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:17,  3.68it/s][A
+ 63%|██████▎   | 490/774 [02:11<01:17,  3.65it/s][A
+ 63%|██████▎   | 491/774 [02:11<01:16,  3.69it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.60it/s][A
+ 64%|██████▎   | 493/774 [02:12<01:19,  3.56it/s][A
+ 64%|██████▍   | 494/774 [02:12<01:17,  3.63it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:17,  3.59it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:22,  3.35it/s][A
+ 64%|██████▍   | 497/774 [02:13<01:23,  3.30it/s][A
+ 64%|██████▍   | 498/774 [02:13<01:22,  3.36it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:20,  3.43it/s][A
+ 65%|██████▍   | 500/774 [02:14<01:17,  3.52it/s][A
+ 65%|██████▍   | 501/774 [02:14<01:14,  3.65it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:13,  3.69it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:20,  3.39it/s][A
+ 65%|██████▌   | 504/774 [02:15<01:23,  3.23it/s][A
+ 65%|██████▌   | 505/774 [02:15<01:20,  3.36it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:18,  3.39it/s][A
+ 66%|██████▌   | 507/774 [02:16<01:23,  3.18it/s][A
+ 66%|██████▌   | 508/774 [02:16<01:21,  3.26it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:20,  3.30it/s][A
+ 66%|██████▌   | 510/774 [02:17<01:17,  3.41it/s][A
+ 66%|██████▌   | 511/774 [02:17<01:13,  3.59it/s][A
+ 66%|██████▌   | 512/774 [02:17<01:11,  3.67it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.51it/s][A
+ 66%|██████▋   | 514/774 [02:18<01:15,  3.44it/s][A
+ 67%|██████▋   | 515/774 [02:18<01:21,  3.16it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:16,  3.37it/s][A
+ 67%|██████▋   | 517/774 [02:19<01:10,  3.66it/s][A
+ 67%|██████▋   | 518/774 [02:19<01:07,  3.79it/s][A
+ 67%|██████▋   | 519/774 [02:19<01:10,  3.62it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:09,  3.64it/s][A
+ 67%|██████▋   | 521/774 [02:20<01:07,  3.75it/s][A
+ 67%|██████▋   | 522/774 [02:20<01:03,  3.96it/s][A
+ 68%|██████▊   | 523/774 [02:20<01:01,  4.10it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:05,  3.81it/s][A
+ 68%|██████▊   | 525/774 [02:21<01:06,  3.72it/s][A
+ 68%|██████▊   | 526/774 [02:21<01:09,  3.56it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:10,  3.50it/s][A
+ 68%|██████▊   | 528/774 [02:22<01:10,  3.51it/s][A
+ 68%|██████▊   | 529/774 [02:22<01:06,  3.69it/s][A
+ 68%|██████▊   | 530/774 [02:22<01:05,  3.74it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:04,  3.75it/s][A
+ 69%|██████▊   | 532/774 [02:23<01:02,  3.86it/s][A
+ 69%|██████▉   | 533/774 [02:23<00:59,  4.04it/s][A
+ 69%|██████▉   | 534/774 [02:23<00:56,  4.23it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:59,  4.03it/s][A
+ 69%|██████▉   | 536/774 [02:24<01:01,  3.88it/s][A
+ 69%|██████▉   | 537/774 [02:24<01:02,  3.82it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.58it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:05,  3.61it/s][A
+ 70%|██████▉   | 540/774 [02:25<01:04,  3.63it/s][A
+ 70%|██████▉   | 541/774 [02:25<01:02,  3.73it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.71it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.62it/s][A
+ 70%|███████   | 544/774 [02:26<01:03,  3.61it/s][A
+ 70%|████���██   | 545/774 [02:26<01:01,  3.75it/s][A
+ 71%|███████   | 546/774 [02:26<00:57,  3.94it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.08it/s][A
+ 71%|███████   | 548/774 [02:27<00:54,  4.13it/s][A
+ 71%|███████   | 549/774 [02:27<00:55,  4.05it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.81it/s][A
+ 71%|███████   | 551/774 [02:28<01:01,  3.65it/s][A
+ 71%|███████▏  | 552/774 [02:28<01:04,  3.45it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:07,  3.25it/s][A
+ 72%|███████▏  | 554/774 [02:29<01:07,  3.28it/s][A
+ 72%|███████▏  | 555/774 [02:29<01:06,  3.28it/s][A
+ 72%|███████▏  | 556/774 [02:29<01:03,  3.45it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:06,  3.26it/s][A
+ 72%|███████▏  | 558/774 [02:30<01:00,  3.55it/s][A
+ 72%|███████▏  | 559/774 [02:30<00:56,  3.84it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.54it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:56,  3.74it/s][A
+ 73%|███████▎  | 562/774 [02:31<00:51,  4.10it/s][A
+ 73%|███████▎  | 563/774 [02:31<00:49,  4.23it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:51,  4.06it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:53,  3.88it/s][A
+ 73%|███████▎  | 566/774 [02:32<00:50,  4.15it/s][A
+ 73%|███████▎  | 567/774 [02:32<00:45,  4.51it/s][A
+ 73%|███████▎  | 568/774 [02:32<00:47,  4.31it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.23it/s][A
+ 74%|███████▎  | 570/774 [02:33<00:48,  4.21it/s][A
+ 74%|███████▍  | 571/774 [02:33<00:52,  3.87it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.74it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.74it/s][A
+ 74%|███████▍  | 574/774 [02:34<00:52,  3.83it/s][A
+ 74%|███████▍  | 575/774 [02:34<00:51,  3.84it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:56,  3.50it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:54,  3.60it/s][A
+ 75%|███████▍  | 578/774 [02:35<00:53,  3.65it/s][A
+ 75%|███████▍  | 579/774 [02:35<00:56,  3.47it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.51it/s][A
+ 75%|███████▌  | 581/774 [02:36<00:54,  3.52it/s][A
+ 75%|███████▌  | 582/774 [02:36<00:53,  3.60it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:51,  3.74it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.77it/s][A
+ 76%|███████▌  | 585/774 [02:37<00:52,  3.60it/s][A
+ 76%|███████▌  | 586/774 [02:37<00:52,  3.56it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.62it/s][A
+ 76%|███████▌  | 588/774 [02:38<00:50,  3.70it/s][A
+ 76%|███████▌  | 589/774 [02:38<00:49,  3.77it/s][A
+ 76%|███████▌  | 590/774 [02:38<00:45,  4.04it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:47,  3.88it/s][A
+ 76%|███████▋  | 592/774 [02:39<00:49,  3.65it/s][A
+ 77%|███████▋  | 593/774 [02:39<00:50,  3.61it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.60it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:54,  3.31it/s][A
+ 77%|███████▋  | 596/774 [02:40<00:56,  3.14it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.12it/s][A
+ 77%|███████▋  | 598/774 [02:41<00:57,  3.05it/s][A
+ 77%|███████▋  | 599/774 [02:41<00:58,  3.00it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:57,  3.02it/s][A
+ 78%|███████▊  | 601/774 [02:42<00:58,  2.98it/s][A
+ 78%|███████▊  | 602/774 [02:42<00:58,  2.94it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:57,  2.99it/s][A
+ 78%|███████▊  | 604/774 [02:43<00:57,  2.94it/s][A
+ 78%|███████▊  | 605/774 [02:43<00:56,  2.99it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:58,  2.89it/s][A
+ 78%|███████▊  | 607/774 [02:44<00:56,  2.94it/s][A
+ 79%|███████▊  | 608/774 [02:44<00:56,  2.92it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:54,  3.04it/s][A
+ 79%|███████▉  | 610/774 [02:45<00:55,  2.97it/s][A
+ 79%|███████▉  | 611/774 [02:45<00:59,  2.73it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:02,  2.61it/s][A
+ 79%|███████▉  | 613/774 [02:46<00:57,  2.79it/s][A
+ 79%|███████▉  | 614/774 [02:46<00:55,  2.87it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.02it/s][A
+ 80%|███████▉  | 616/774 [02:47<00:51,  3.06it/s][A
+ 80%|███████▉  | 617/774 [02:47<00:50,  3.09it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:48,  3.24it/s][A
+ 80%|███████▉  | 619/774 [02:48<00:45,  3.40it/s][A
+ 80%|████████  | 620/774 [02:48<00:44,  3.43it/s][A
+ 80%|████████  | 621/774 [02:48<00:41,  3.70it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.97it/s][A
+ 80%|████████  | 623/774 [02:49<00:38,  3.93it/s][A
+ 81%|████████  | 624/774 [02:49<00:41,  3.61it/s][A
+ 81%|████████  | 625/774 [02:49<00:41,  3.55it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.31it/s][A
+ 81%|████████  | 627/774 [02:50<00:45,  3.22it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.21it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.32it/s][A
+ 81%|████████▏ | 630/774 [02:51<00:40,  3.57it/s][A
+ 82%|████████▏ | 631/774 [02:51<00:38,  3.75it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:37,  3.75it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.57it/s][A
+ 82%|████████▏ | 634/774 [02:52<00:40,  3.47it/s][A
+ 82%|████████▏ | 635/774 [02:52<00:39,  3.53it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:40,  3.44it/s][A
+ 82%|████████▏ | 637/774 [02:53<00:39,  3.48it/s][A
+ 82%|████████▏ | 638/774 [02:53<00:39,  3.46it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.08it/s][A
+ 83%|████████▎ | 640/774 [02:54<00:50,  2.66it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.71it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.88it/s][A
+ 83%|████████▎ | 643/774 [02:55<00:45,  2.88it/s][A
+ 83%|████████▎ | 644/774 [02:55<00:42,  3.09it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:37,  3.40it/s][A
+ 83%|████████▎ | 646/774 [02:56<00:35,  3.63it/s][A
+ 84%|████████▎ | 647/774 [02:56<00:32,  3.89it/s][A
+ 84%|████████▎ | 648/774 [02:56<00:31,  4.04it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.07it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.29it/s][A
+ 84%|████████▍ | 651/774 [02:57<00:29,  4.22it/s][A
+ 84%|████████▍ | 652/774 [02:57<00:29,  4.11it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.85it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.09it/s][A
+ 85%|████████▍ | 655/774 [02:58<00:26,  4.43it/s][A
+ 85%|████████▍ | 656/774 [02:58<00:28,  4.20it/s][A
+ 85%|████████▍ | 657/774 [02:58<00:26,  4.39it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.18it/s][A
+ 85%|████████▌ | 659/774 [02:59<00:29,  3.86it/s][A
+ 85%|████████▌ | 660/774 [02:59<00:30,  3.76it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.70it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.87it/s][A
+ 86%|████████▌ | 663/774 [03:00<00:30,  3.65it/s][A
+ 86%|████████▌ | 664/774 [03:00<00:30,  3.66it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:27,  3.93it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:24,  4.33it/s][A
+ 86%|████████▌ | 667/774 [03:01<00:23,  4.59it/s][A
+ 86%|████████▋ | 668/774 [03:01<00:23,  4.42it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.17it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.31it/s][A
+ 87%|████████▋ | 671/774 [03:02<00:26,  3.92it/s][A
+ 87%|████████▋ | 672/774 [03:02<00:25,  3.99it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:24,  4.06it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.05it/s][A
+ 87%|████████▋ | 675/774 [03:03<00:23,  4.24it/s][A
+ 87%|████████▋ | 676/774 [03:03<00:22,  4.40it/s][A
+ 87%|████████▋ | 677/774 [03:03<00:22,  4.38it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.45it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.25it/s][A
+ 88%|████████▊ | 680/774 [03:04<00:22,  4.19it/s][A
+ 88%|████████▊ | 681/774 [03:04<00:20,  4.47it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.51it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:22,  4.14it/s][A
+ 88%|████████▊ | 684/774 [03:05<00:22,  3.93it/s][A
+ 89%|████████▊ | 685/774 [03:05<00:23,  3.72it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:23,  3.82it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.01it/s][A
+ 89%|████████▉ | 688/774 [03:06<00:21,  4.02it/s][A
+ 89%|████████▉ | 689/774 [03:06<00:20,  4.20it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.30it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.39it/s][A
+ 89%|████████▉ | 692/774 [03:07<00:18,  4.44it/s][A
+ 90%|████████▉ | 693/774 [03:07<00:18,  4.45it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.20it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.85it/s][A
+ 90%|████████▉ | 696/774 [03:08<00:19,  3.95it/s][A
+ 90%|█████████ | 697/774 [03:08<00:19,  3.95it/s][A
+ 90%|█████████ | 698/774 [03:08<00:17,  4.35it/s][A
+ 90%|█████████ | 699/774 [03:08<00:15,  4.73it/s][A
+ 90%|█████████ | 700/774 [03:09<00:17,  4.33it/s][A
+ 91%|█████████ | 701/774 [03:09<00:16,  4.40it/s][A
+ 91%|█████████ | 702/774 [03:09<00:16,  4.38it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.37it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.25it/s][A
+ 91%|█████████ | 705/774 [03:10<00:15,  4.60it/s][A
+ 91%|█████████ | 706/774 [03:10<00:14,  4.76it/s][A
+ 91%|█████████▏| 707/774 [03:10<00:14,  4.66it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  4.93it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.76it/s][A
+ 92%|█████████▏| 710/774 [03:11<00:13,  4.73it/s][A
+ 92%|█████████▏| 711/774 [03:11<00:12,  4.88it/s][A
+ 92%|█████████▏| 712/774 [03:11<00:12,  5.09it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.92it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.63it/s][A
+ 92%|█████████▏| 715/774 [03:12<00:12,  4.75it/s][A
+ 93%|█████████▎| 716/774 [03:12<00:10,  5.31it/s][A
+ 93%|█████████▎| 717/774 [03:12<00:10,  5.34it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:11,  4.72it/s][A
+ 93%|█████████▎| 719/774 [03:13<00:11,  4.60it/s][A
+ 93%|█████████▎| 720/774 [03:13<00:10,  4.93it/s][A
+ 93%|█████████▎| 721/774 [03:13<00:10,  5.20it/s][A
+ 93%|█████████▎| 722/774 [03:13<00:09,  5.69it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.42it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.34it/s][A
+ 94%|█████████▎| 725/774 [03:14<00:08,  5.49it/s][A
+ 94%|█████████▍| 726/774 [03:14<00:08,  5.54it/s][A
+ 94%|█████████▍| 727/774 [03:14<00:08,  5.31it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.84it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.10it/s][A
+ 94%|█████████▍| 730/774 [03:15<00:08,  5.35it/s][A
+ 94%|█████████▍| 731/774 [03:15<00:08,  5.35it/s][A
+ 95%|█████████▍| 732/774 [03:15<00:07,  5.45it/s][A
+ 95%|█████████▍| 733/774 [03:15<00:07,  5.45it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.54it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.65it/s][A
+ 95%|█████████▌| 736/774 [03:16<00:06,  5.68it/s][A
+ 95%|█████████▌| 737/774 [03:16<00:06,  5.60it/s][A
+ 95%|█████████▌| 738/774 [03:16<00:06,  5.44it/s][A
+ 95%|█████████▌| 739/774 [03:16<00:06,  5.39it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.33it/s][A
+ 96%|█████████▌| 741/774 [03:17<00:06,  5.04it/s][A
+ 96%|█████████▌| 742/774 [03:17<00:06,  5.21it/s][A
+ 96%|█████████▌| 743/774 [03:17<00:05,  5.54it/s][A
+ 96%|█████████▌| 744/774 [03:17<00:05,  5.36it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.44it/s][A
+ 96%|█████████▋| 746/774 [03:18<00:07,  3.85it/s][A
+ 97%|█████████▋| 747/774 [03:18<00:06,  4.06it/s][A
+ 97%|█████████▋| 748/774 [03:18<00:06,  4.28it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.58it/s][A
+ 97%|█████████▋| 750/774 [03:19<00:05,  4.27it/s][A
+ 97%|█████████▋| 751/774 [03:19<00:05,  4.46it/s][A
+ 97%|█████████▋| 752/774 [03:19<00:04,  4.41it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.70it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.33it/s][A
+ 98%|█████████▊| 755/774 [03:20<00:03,  5.63it/s][A
+ 98%|█████████▊| 756/774 [03:20<00:03,  5.47it/s][A
+ 98%|█████████▊| 757/774 [03:20<00:03,  5.31it/s][A
+ 98%|█████████▊| 758/774 [03:20<00:03,  5.21it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.44it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.42it/s][A
+ 98%|█████████▊| 761/774 [03:21<00:02,  5.89it/s][A
+ 98%|█████████▊| 762/774 [03:21<00:02,  5.99it/s][A
+ 99%|█████████▊| 763/774 [03:21<00:01,  6.19it/s][A
+ 99%|█████████▊| 764/774 [03:21<00:01,  6.29it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.20it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.31it/s][A
+ 99%|█████████▉| 767/774 [03:22<00:01,  5.47it/s][A
+ 99%|█████████▉| 768/774 [03:22<00:01,  5.46it/s][A
+ 99%|█████████▉| 769/774 [03:22<00:00,  5.17it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.11it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.39it/s][A
+100%|█████████▉| 772/774 [03:23<00:00,  5.08it/s][A
+100%|█████████▉| 773/774 [03:23<00:00,  4.91it/s][A                                                    
+                                                 [A 31%|███▏      | 4000/12776 [42:59<52:40,  2.78it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.91it/s][A
+                                                 [ASaving model checkpoint to ./checkpoint-4000
+Configuration saved in ./checkpoint-4000/config.json
+Model weights saved in ./checkpoint-4000/model.safetensors
+Feature extractor saved in ./checkpoint-4000/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-4000/tokenizer_config.json
+Special tokens file saved in ./checkpoint-4000/special_tokens_map.json
+added tokens file saved in ./checkpoint-4000/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-2800] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 31%|███▏      | 4001/12776 [43:06<156:27:19, 64.19s/it]                                                         31%|███▏      | 4001/12776 [43:06<156:27:19, 64.19s/it] 31%|███▏      | 4002/12776 [43:07<110:10:05, 45.20s/it]                                                         31%|███▏      | 4002/12776 [43:07<110:10:05, 45.20s/it] 31%|███▏      | 4003/12776 [43:08<77:44:37, 31.90s/it]                                                         31%|███▏      | 4003/12776 [43:08<77:44:37, 31.90s/it] 31%|███▏      | 4004/12776 [43:08<55:00:59, 22.58s/it]                                                        31%|███▏      | 4004/12776 [43:08<55:00:59, 22.58s/it] 31%|███▏      | 4005/12776 [43:09<39:01:56, 16.02s/it]                                                        31%|███▏      | 4005/12776 [43:09<39:01:56, 16.02s/it] 31%|███▏      | 4006/12776 [43:10<27:49:35, 11.42s/it]                                                        31%|███▏      | 4006/12776 [43:10<27:49:35, 11.42s/it] 31%|███▏      | 4007/12776 [43:10<19:56:19,  8.19s/it]                                                        31%|███▏      | 4007/12776 [43:10<19:56:19,  8.19s/it] 31%|███▏      | 4008/12776 [43:11<14:23:37,  5.91s/it]                                                        31%|███▏      | 4008/12776 [43:11<14:23:37,  5.91s/it] 31%|███▏      | 4009/12776 [43:12<10:29:53,  4.31s/it]                                                        31%|███▏      | 4009/12776 [43:12<10:29:53,  4.31s/it] 31%|███▏      | 4010/12776 [43:12<7:45:27,  3.19s/it]                                                        31%|███▏      | 4010/12776 [43:12<7:45:27,  3.19s/it] 31%|███▏      | 4011/12776 [43:13<5:48:24,  2.38s/it]                                                       31%|███▏      | 4011/12776 [43:13<5:48:24,  2.38s/it] 31%|███▏      | 4012/12776 [43:13<4:30:26,  1.85s/it]                                                       31%|███▏      | 4012/12776 [43:13<4:30:26,  1.85s/it] 31%|███▏      | 4013/12776 [43:14<3:30:09,  1.44s/it]                                                       31%|███▏      | 4013/12776 [43:14<3:30:09,  1.44s/it] 31%|███▏      | 4014/12776 [43:14<2:51:35,  1.17s/it]                                                       31%|███▏      | 4014/12776 [43:14<2:51:35,  1.17s/it] 31%|███▏      | 4015/12776 [43:15<2:18:25,  1.05it/s]                                                       31%|███▏      | 4015/12776 [43:15<2:18:25,  1.05it/s] 31%|███▏      | 4016/12776 [43:15<2:00:27,  1.21it/s]                                                       31%|███▏      | 4016/12776 [43:15<2:00:27,  1.21it/s] 31%|███▏      | 4017/12776 [43:16<1:41:13,  1.44it/s]                                                       31%|███▏      | 4017/12776 [43:16<1:41:13,  1.44it/s] 31%|███▏      | 4018/12776 [43:16<1:26:54,  1.68it/s]                                                       31%|███▏      | 4018/12776 [43:16<1:26:54,  1.68it/s] 31%|███▏      | 4019/12776 [43:16<1:18:35,  1.86it/s]                                                       31%|███▏      | 4019/12776 [43:16<1:18:35,  1.86it/s] 31%|███▏      | 4020/12776 [43:17<1:10:16,  2.08it/s]                                                       31%|███▏      | 4020/12776 [43:17<1:10:16,  2.08it/s] 31%|███▏      | 4021/12776 [43:17<1:03:32,  2.30it/s]                                                       31%|███▏      | 4021/12776 [43:17<1:03:32,  2.30it/s] 31%|███▏      | 4022/12776 [43:17<58:33,  2.49it/s]                                                       31%|███▏      | 4022/12776 [43:17<58:33,  2.49it/s] 31%|███▏      | 4023/12776 [43:18<56:48,  2.57it/s]                                                     31%|███▏      | 4023/12776 [43:18<56:48,  2.57it/s] 31%|███▏      | 4024/12776 [43:18<53:05,  2.75it/s]                                                     31%|███▏      | 4024/12776 [43:18<53:05,  2.75it/s] 32%|███▏      | 4025/12776 [43:18<50:03,  2.91it/s]                                                     32%|███▏      | 4025/12776 [43:18<50:03,  2.91it/s] 32%|███▏      | 4026/12776 [43:19<49:18,  2.96it/s]                                                     32%|███▏      | 4026/12776 [43:19<49:18,  2.96it/s] 32%|███▏      | 4027/12776 [43:19<46:19,  3.15it/s]                                                     32%|███▏      | 4027/12776 [43:19<46:19,  3.15it/s] 32%|███▏      | 4028/12776 [43:19<43:51,  3.32it/s]                                                     32%|███▏      | 4028/12776 [43:19<43:51,  3.32it/s] 32%|███▏      | 4029/12776 [43:19<41:39,  3.50it/s]                                                     32%|███▏      | 4029/12776 [43:19<41:39,  3.50it/s] 32%|███▏      | 4030/12776 [43:20<43:42,  3.34it/s]                                                     32%|███▏      | 4030/12776 [43:20<43:42,  3.34it/s] 32%|███▏      | 4031/12776 [43:20<41:12,  3.54it/s]                                                     32%|███▏      | 4031/12776 [43:20<41:12,  3.54it/s] 32%|███▏      | 4032/12776 [43:20<39:04,  3.73it/s]                                                     32%|███▏      | 4032/12776 [43:20<39:04,  3.73it/s] 32%|███▏      | 4033/12776 [43:21<37:44,  3.86it/s]                                                     32%|███▏      | 4033/12776 [43:21<37:44,  3.86it/s] 32%|███▏      | 4034/12776 [43:21<36:07,  4.03it/s]                                                     32%|███▏      | 4034/12776 [43:21<36:07,  4.03it/s] 32%|███▏      | 4035/12776 [43:21<38:39,  3.77it/s]                                                     32%|███▏      | 4035/12776 [43:21<38:39,  3.77it/s] 32%|███▏      | 4036/12776 [43:21<36:12,  4.02it/s]                                                     32%|███▏      | 4036/12776 [43:21<36:12,  4.02it/s] 32%|███▏      | 4037/12776 [43:21<34:16,  4.25it/s]                                                     32%|███▏      | 4037/12776 [43:21<34:16,  4.25it/s] 32%|███▏      | 4038/12776 [43:22<32:44,  4.45it/s]                                                     32%|███▏      | 4038/12776 [43:22<32:44,  4.45it/s] 32%|███▏      | 4039/12776 [43:22<31:28,  4.63it/s]                                                     32%|███▏      | 4039/12776 [43:22<31:28,  4.63it/s] 32%|███▏      | 4040/12776 [43:22<34:42,  4.20it/s]                                                     32%|███▏      | 4040/12776 [43:22<34:42,  4.20it/s] 32%|███▏      | 4041/12776 [43:22<32:43,  4.45it/s]                                                     32%|███▏      | 4041/12776 [43:22<32:43,  4.45it/s] 32%|███▏      | 4042/12776 [43:23<31:07,  4.68it/s]                                                     32%|███▏      | 4042/12776 [43:23<31:07,  4.68it/s] 32%|███▏      | 4043/12776 [43:23<29:45,  4.89it/s]                                                     32%|███▏      | 4043/12776 [43:23<29:45,  4.89it/s] 32%|███▏      | 4044/12776 [43:23<28:45,  5.06it/s]                                                     32%|███▏      | 4044/12776 [43:23<28:45,  5.06it/s] 32%|███▏      | 4045/12776 [43:23<31:48,  4.57it/s]                                                     32%|███▏      | 4045/12776 [43:23<31:48,  4.57it/s] 32%|███▏      | 4046/12776 [43:23<29:58,  4.85it/s]                                                     32%|███▏      | 4046/12776 [43:23<29:58,  4.85it/s] 32%|███▏      | 4047/12776 [43:23<28:31,  5.10it/s]                                                     32%|███▏      | 4047/12776 [43:23<28:31,  5.10it/s] 32%|███▏      | 4048/12776 [43:24<27:20,  5.32it/s]                                                     32%|███▏      | 4048/12776 [43:24<27:20,  5.32it/s] 32%|███▏      | 4049/12776 [43:24<26:16,  5.54it/s]                                                     32%|███▏      | 4049/12776 [43:24<26:16,  5.54it/s] 32%|███▏      | 4050/12776 [43:25<47:54,  3.04it/s]                                                     32%|███▏      | 4050/12776 [43:25<47:54,  3.04it/s] 32%|███▏      | 4051/12776 [43:26<1:32:13,  1.58it/s]                                                       32%|███▏      | 4051/12776 [43:26<1:32:13,  1.58it/s] 32%|███▏      | 4052/12776 [43:27<1:51:36,  1.30it/s]                                                       32%|███▏      | 4052/12776 [43:27<1:51:36,  1.30it/s] 32%|███▏      | 4053/12776 [43:28<1:55:51,  1.25it/s]                                                       32%|███▏      | 4053/12776 [43:28<1:55:51,  1.25it/s] 32%|███▏      | 4054/12776 [43:29<1:54:56,  1.26it/s]                                                       32%|███▏      | 4054/12776 [43:29<1:54:56,  1.26it/s] 32%|███▏      | 4055/12776 [43:29<1:52:14,  1.29it/s]                                                       32%|███▏      | 4055/12776 [43:29<1:52:14,  1.29it/s] 32%|███▏      | 4056/12776 [43:30<1:49:57,  1.32it/s]                                                       32%|███▏      | 4056/12776 [43:30<1:49:57,  1.32it/s] 32%|███▏      | 4057/12776 [43:31<1:44:31,  1.39it/s]                                                       32%|███▏      | 4057/12776 [43:31<1:44:31,  1.39it/s] 32%|███▏      | 4058/12776 [43:31<1:46:18,  1.37it/s]                                                       32%|███▏      | 4058/12776 [43:31<1:46:18,  1.37it/s] 32%|███▏      | 4059/12776 [43:32<1:39:40,  1.46it/s]                                                       32%|███▏      | 4059/12776 [43:32<1:39:40,  1.46it/s] 32%|███▏      | 4060/12776 [43:33<1:37:22,  1.49it/s]                                                       32%|███▏      | 4060/12776 [43:33<1:37:22,  1.49it/s] 32%|███▏      | 4061/12776 [43:33<1:30:40,  1.60it/s]                                                       32%|███▏      | 4061/12776 [43:33<1:30:40,  1.60it/s] 32%|███▏      | 4062/12776 [43:34<1:29:02,  1.63it/s]                                                       32%|███▏      | 4062/12776 [43:34<1:29:02,  1.63it/s] 32%|███▏      | 4063/12776 [43:34<1:23:22,  1.74it/s]                                                       32%|███▏      | 4063/12776 [43:34<1:23:22,  1.74it/s] 32%|███▏      | 4064/12776 [43:35<1:24:41,  1.71it/s]                                                       32%|███▏      | 4064/12776 [43:35<1:24:41,  1.71it/s] 32%|███▏      | 4065/12776 [43:35<1:17:16,  1.88it/s]                                                       32%|███▏      | 4065/12776 [43:35<1:17:16,  1.88it/s] 32%|███▏      | 4066/12776 [43:36<1:17:18,  1.88it/s]                                                       32%|███▏      | 4066/12776 [43:36<1:17:18,  1.88it/s] 32%|███▏      | 4067/12776 [43:36<1:11:50,  2.02it/s]                                                       32%|███▏      | 4067/12776 [43:36<1:11:50,  2.02it/s] 32%|███▏      | 4068/12776 [43:37<1:07:21,  2.15it/s]                                                       32%|███▏      | 4068/12776 [43:37<1:07:21,  2.15it/s] 32%|███▏      | 4069/12776 [43:37<1:09:35,  2.09it/s]                                                       32%|███▏      | 4069/12776 [43:37<1:09:35,  2.09it/s] 32%|███▏      | 4070/12776 [43:37<1:04:33,  2.25it/s]                                                       32%|███▏      | 4070/12776 [43:37<1:04:33,  2.25it/s] 32%|███▏      | 4071/12776 [43:38<1:00:16,  2.41it/s]                                                       32%|███▏      | 4071/12776 [43:38<1:00:16,  2.41it/s] 32%|███▏      | 4072/12776 [43:38<1:00:04,  2.41it/s]                                                       32%|███▏      | 4072/12776 [43:38<1:00:04,  2.41it/s] 32%|███▏      | 4073/12776 [43:39<56:27,  2.57it/s]                                                       32%|███▏      | 4073/12776 [43:39<56:27,  2.57it/s] 32%|███▏      | 4074/12776 [43:39<53:18,  2.72it/s]                                                     32%|███▏      | 4074/12776 [43:39<53:18,  2.72it/s] 32%|███▏      | 4075/12776 [43:39<52:05,  2.78it/s]                                                     32%|███▏      | 4075/12776 [43:39<52:05,  2.78it/s] 32%|███▏      | 4076/12776 [43:39<49:33,  2.93it/s]                                                    {'eval_loss': 0.6170632839202881, 'eval_wer': 0.37072061542384765, 'eval_runtime': 206.2566, 'eval_samples_per_second': 60.037, 'eval_steps_per_second': 3.753, 'epoch': 0.63}
+{'loss': 0.3052, 'grad_norm': 0.6230515837669373, 'learning_rate': 0.00021468719452590417, 'epoch': 0.63}
+{'loss': 0.253, 'grad_norm': 0.36038628220558167, 'learning_rate': 0.00021466275659824045, 'epoch': 0.63}
+{'loss': 0.1601, 'grad_norm': 0.40060994029045105, 'learning_rate': 0.0002146383186705767, 'epoch': 0.63}
+{'loss': 0.3302, 'grad_norm': 0.9217357635498047, 'learning_rate': 0.00021461388074291298, 'epoch': 0.63}
+{'loss': 0.3022, 'grad_norm': 0.5147327780723572, 'learning_rate': 0.00021458944281524926, 'epoch': 0.63}
+{'loss': 0.3489, 'grad_norm': 0.6115538477897644, 'learning_rate': 0.0002145650048875855, 'epoch': 0.63}
+{'loss': 0.2962, 'grad_norm': 0.9012541174888611, 'learning_rate': 0.0002145405669599218, 'epoch': 0.63}
+{'loss': 0.3945, 'grad_norm': 0.6350597143173218, 'learning_rate': 0.00021451612903225807, 'epoch': 0.63}
+{'loss': 0.411, 'grad_norm': 0.8227857947349548, 'learning_rate': 0.0002144916911045943, 'epoch': 0.63}
+{'loss': 0.3542, 'grad_norm': 0.618043839931488, 'learning_rate': 0.00021446725317693057, 'epoch': 0.63}
+{'loss': 0.9303, 'grad_norm': 1.9552290439605713, 'learning_rate': 0.00021444281524926685, 'epoch': 0.63}
+{'loss': 0.3111, 'grad_norm': 0.5457572937011719, 'learning_rate': 0.0002144183773216031, 'epoch': 0.63}
+{'loss': 0.3717, 'grad_norm': 1.2322858572006226, 'learning_rate': 0.00021439393939393938, 'epoch': 0.63}
+{'loss': 0.4199, 'grad_norm': 1.5767748355865479, 'learning_rate': 0.00021436950146627566, 'epoch': 0.63}
+{'loss': 0.416, 'grad_norm': 1.3694690465927124, 'learning_rate': 0.0002143450635386119, 'epoch': 0.63}
+{'loss': 0.4201, 'grad_norm': 0.9982991814613342, 'learning_rate': 0.00021432062561094816, 'epoch': 0.63}
+{'loss': 0.677, 'grad_norm': 1.6237698793411255, 'learning_rate': 0.00021429618768328444, 'epoch': 0.63}
+{'loss': 0.5136, 'grad_norm': 1.1065876483917236, 'learning_rate': 0.0002142717497556207, 'epoch': 0.63}
+{'loss': 0.4697, 'grad_norm': 0.8187892436981201, 'learning_rate': 0.00021424731182795697, 'epoch': 0.63}
+{'loss': 0.3248, 'grad_norm': 0.7985676527023315, 'learning_rate': 0.00021422287390029325, 'epoch': 0.63}
+{'loss': 0.788, 'grad_norm': 2.245656967163086, 'learning_rate': 0.0002141984359726295, 'epoch': 0.63}
+{'loss': 0.9463, 'grad_norm': 2.0072994232177734, 'learning_rate': 0.00021417399804496577, 'epoch': 0.63}
+{'loss': 0.7412, 'grad_norm': 2.0363926887512207, 'learning_rate': 0.00021414956011730205, 'epoch': 0.63}
+{'loss': 0.435, 'grad_norm': 2.1571948528289795, 'learning_rate': 0.00021412512218963828, 'epoch': 0.63}
+{'loss': 0.7009, 'grad_norm': 1.8495596647262573, 'learning_rate': 0.00021410068426197456, 'epoch': 0.63}
+{'loss': 0.5252, 'grad_norm': 1.2187695503234863, 'learning_rate': 0.00021407624633431083, 'epoch': 0.63}
+{'loss': 0.586, 'grad_norm': 1.3533210754394531, 'learning_rate': 0.00021405180840664709, 'epoch': 0.63}
+{'loss': 0.9532, 'grad_norm': 3.1232190132141113, 'learning_rate': 0.00021402737047898336, 'epoch': 0.63}
+{'loss': 0.7114, 'grad_norm': 1.3464336395263672, 'learning_rate': 0.00021400293255131964, 'epoch': 0.63}
+{'loss': 1.0843, 'grad_norm': 1.8784873485565186, 'learning_rate': 0.0002139784946236559, 'epoch': 0.63}
+{'loss': 0.8251, 'grad_norm': 1.593939185142517, 'learning_rate': 0.00021395405669599217, 'epoch': 0.63}
+{'loss': 0.9525, 'grad_norm': 3.432223320007324, 'learning_rate': 0.00021392961876832845, 'epoch': 0.63}
+{'loss': 0.5878, 'grad_norm': 2.074753522872925, 'learning_rate': 0.00021390518084066467, 'epoch': 0.63}
+{'loss': 0.6233, 'grad_norm': 1.5874582529067993, 'learning_rate': 0.00021388074291300095, 'epoch': 0.63}
+{'loss': 0.9886, 'grad_norm': 1.6352641582489014, 'learning_rate': 0.00021385630498533723, 'epoch': 0.63}
+{'loss': 1.6576, 'grad_norm': 2.6665916442871094, 'learning_rate': 0.00021383186705767348, 'epoch': 0.63}
+{'loss': 0.6991, 'grad_norm': 1.7960261106491089, 'learning_rate': 0.00021380742913000976, 'epoch': 0.63}
+{'loss': 0.8864, 'grad_norm': 1.923484206199646, 'learning_rate': 0.00021378299120234604, 'epoch': 0.63}
+{'loss': 0.9939, 'grad_norm': 1.8930014371871948, 'learning_rate': 0.00021375855327468226, 'epoch': 0.63}
+{'loss': 1.4059, 'grad_norm': 3.678338050842285, 'learning_rate': 0.00021373411534701854, 'epoch': 0.63}
+{'loss': 1.4417, 'grad_norm': 2.632375955581665, 'learning_rate': 0.00021370967741935482, 'epoch': 0.63}
+{'loss': 0.9454, 'grad_norm': 2.788484573364258, 'learning_rate': 0.00021368523949169107, 'epoch': 0.63}
+{'loss': 1.4575, 'grad_norm': 2.319645643234253, 'learning_rate': 0.00021366080156402735, 'epoch': 0.63}
+{'loss': 1.7115, 'grad_norm': 3.663972854614258, 'learning_rate': 0.00021363636363636363, 'epoch': 0.63}
+{'loss': 0.8049, 'grad_norm': 1.7628999948501587, 'learning_rate': 0.00021361192570869988, 'epoch': 0.63}
+{'loss': 0.9988, 'grad_norm': 1.7270902395248413, 'learning_rate': 0.00021358748778103616, 'epoch': 0.63}
+{'loss': 0.9895, 'grad_norm': 2.860292911529541, 'learning_rate': 0.00021356304985337244, 'epoch': 0.63}
+{'loss': 1.4816, 'grad_norm': 3.5345377922058105, 'learning_rate': 0.00021353861192570866, 'epoch': 0.63}
+{'loss': 1.0884, 'grad_norm': 3.5948832035064697, 'learning_rate': 0.00021351417399804494, 'epoch': 0.63}
+{'loss': 1.7915, 'grad_norm': 3.8039002418518066, 'learning_rate': 0.00021348973607038122, 'epoch': 0.63}
+{'loss': 0.313, 'grad_norm': 0.4440103769302368, 'learning_rate': 0.00021346529814271747, 'epoch': 0.63}
+{'loss': 0.3249, 'grad_norm': 0.5781520009040833, 'learning_rate': 0.00021344086021505375, 'epoch': 0.63}
+{'loss': 0.3927, 'grad_norm': 0.5911738276481628, 'learning_rate': 0.00021341642228739002, 'epoch': 0.63}
+{'loss': 0.243, 'grad_norm': 0.4731442332267761, 'learning_rate': 0.00021339198435972628, 'epoch': 0.63}
+{'loss': 0.3089, 'grad_norm': 0.5945417881011963, 'learning_rate': 0.00021336754643206255, 'epoch': 0.63}
+{'loss': 0.4119, 'grad_norm': 0.6877188086509705, 'learning_rate': 0.0002133431085043988, 'epoch': 0.63}
+{'loss': 0.3965, 'grad_norm': 0.788280725479126, 'learning_rate': 0.00021331867057673506, 'epoch': 0.64}
+{'loss': 0.4293, 'grad_norm': 0.7950897216796875, 'learning_rate': 0.00021329423264907133, 'epoch': 0.64}
+{'loss': 0.3425, 'grad_norm': 0.7185248732566833, 'learning_rate': 0.0002132697947214076, 'epoch': 0.64}
+{'loss': 0.3744, 'grad_norm': 1.1782954931259155, 'learning_rate': 0.00021324535679374386, 'epoch': 0.64}
+{'loss': 0.7161, 'grad_norm': 1.3076963424682617, 'learning_rate': 0.00021322091886608014, 'epoch': 0.64}
+{'loss': 0.3778, 'grad_norm': 0.9288998246192932, 'learning_rate': 0.00021319648093841642, 'epoch': 0.64}
+{'loss': 0.5499, 'grad_norm': 1.1923285722732544, 'learning_rate': 0.00021317204301075265, 'epoch': 0.64}
+{'loss': 0.4468, 'grad_norm': 0.8304926156997681, 'learning_rate': 0.00021314760508308892, 'epoch': 0.64}
+{'loss': 0.4214, 'grad_norm': 0.9042999744415283, 'learning_rate': 0.0002131231671554252, 'epoch': 0.64}
+{'loss': 0.4863, 'grad_norm': 1.1802743673324585, 'learning_rate': 0.00021309872922776145, 'epoch': 0.64}
+{'loss': 0.5456, 'grad_norm': 0.9650530219078064, 'learning_rate': 0.00021307429130009773, 'epoch': 0.64}
+{'loss': 0.4523, 'grad_norm': 1.8889888525009155, 'learning_rate': 0.000213049853372434, 'epoch': 0.64}
+{'loss': 0.5295, 'grad_norm': 1.3778254985809326, 'learning_rate': 0.00021302541544477026, 'epoch': 0.64}
+{'loss': 0.8532, 'grad_norm': 1.3689180612564087, 'learning_rate': 0.00021300097751710654, 'epoch': 0.64}
+{'loss': 0.5106, 'grad_norm': 1.6118444204330444, 'learning_rate': 0.00021297653958944282, 'epoch': 0.64}
+{'loss': 0.8254, 'grad_norm': 2.750725507736206, 'learning_rate': 0.00021295210166177904, 'epoch': 0.64}
+{'loss': 0.8185, 'grad_norm': 2.30176043510437, 'learning_rate': 0.00021292766373411532, 'epoch': 0.64}
+{'loss': 0.7568, 'grad_norm': 2.448899745941162, 'learning_rate': 0.0002129032258064516, 'epoch': 0.64}
+{'loss': 0.7279, 'grad_norm': 1.9214074611663818, 'learning_rate': 0.00021287878787878785, 'epoch': 0.64}
+{'loss': 0.8629, 'grad_norm': 1.4484351873397827, 'learning_rate': 0.00021285434995112413, 'epoch': 0.64}
+ 32%|███▏      | 4076/12776 [43:39<49:33,  2.93it/s] 32%|███▏      | 4077/12776 [43:40<47:25,  3.06it/s]                                                     32%|███▏      | 4077/12776 [43:40<47:25,  3.06it/s] 32%|███▏      | 4078/12776 [43:40<45:23,  3.19it/s]                                                     32%|███▏      | 4078/12776 [43:40<45:23,  3.19it/s] 32%|███▏      | 4079/12776 [43:40<48:46,  2.97it/s]                                                     32%|███▏      | 4079/12776 [43:40<48:46,  2.97it/s] 32%|███▏      | 4080/12776 [43:41<45:27,  3.19it/s]                                                     32%|███▏      | 4080/12776 [43:41<45:27,  3.19it/s] 32%|███▏      | 4081/12776 [43:41<42:43,  3.39it/s]                                                     32%|███▏      | 4081/12776 [43:41<42:43,  3.39it/s] 32%|███▏      | 4082/12776 [43:41<40:32,  3.57it/s]                                                     32%|███▏      | 4082/12776 [43:41<40:32,  3.57it/s] 32%|███▏      | 4083/12776 [43:42<43:47,  3.31it/s]                                                     32%|███▏      | 4083/12776 [43:42<43:47,  3.31it/s] 32%|███▏      | 4084/12776 [43:42<41:10,  3.52it/s]                                                     32%|███▏      | 4084/12776 [43:42<41:10,  3.52it/s] 32%|███▏      | 4085/12776 [43:42<38:37,  3.75it/s]                                                     32%|███▏      | 4085/12776 [43:42<38:37,  3.75it/s] 32%|███▏      | 4086/12776 [43:42<36:28,  3.97it/s]                                                     32%|███▏      | 4086/12776 [43:42<36:28,  3.97it/s] 32%|███▏      | 4087/12776 [43:43<39:22,  3.68it/s]                                                     32%|███▏      | 4087/12776 [43:43<39:22,  3.68it/s] 32%|███▏      | 4088/12776 [43:43<36:36,  3.96it/s]                                                     32%|███▏      | 4088/12776 [43:43<36:36,  3.96it/s] 32%|███▏      | 4089/12776 [43:43<34:47,  4.16it/s]                                                     32%|███▏      | 4089/12776 [43:43<34:47,  4.16it/s] 32%|███▏      | 4090/12776 [43:43<33:15,  4.35it/s]                                                     32%|███▏      | 4090/12776 [43:43<33:15,  4.35it/s] 32%|███▏      | 4091/12776 [43:43<32:03,  4.52it/s]                                                     32%|███▏      | 4091/12776 [43:43<32:03,  4.52it/s] 32%|███▏      | 4092/12776 [43:44<35:46,  4.05it/s]                                                     32%|███▏      | 4092/12776 [43:44<35:46,  4.05it/s] 32%|███▏      | 4093/12776 [43:44<33:38,  4.30it/s]                                                     32%|███▏      | 4093/12776 [43:44<33:38,  4.30it/s] 32%|███▏      | 4094/12776 [43:44<32:03,  4.51it/s]                                                     32%|███▏      | 4094/12776 [43:44<32:03,  4.51it/s] 32%|███▏      | 4095/12776 [43:44<30:50,  4.69it/s]                                                     32%|███▏      | 4095/12776 [43:44<30:50,  4.69it/s] 32%|███▏      | 4096/12776 [43:44<29:56,  4.83it/s]                                                     32%|███▏      | 4096/12776 [43:44<29:56,  4.83it/s] 32%|███▏      | 4097/12776 [43:45<29:05,  4.97it/s]                                                     32%|███▏      | 4097/12776 [43:45<29:05,  4.97it/s] 32%|███▏      | 4098/12776 [43:45<33:00,  4.38it/s]                                                     32%|███▏      | 4098/12776 [43:45<33:00,  4.38it/s] 32%|███▏      | 4099/12776 [43:45<31:01,  4.66it/s]                                                     32%|███▏      | 4099/12776 [43:45<31:01,  4.66it/s] 32%|███▏      | 4100/12776 [43:46<58:52,  2.46it/s]                                                     32%|███▏      | 4100/12776 [43:46<58:52,  2.46it/s] 32%|███▏      | 4101/12776 [43:47<1:42:12,  1.41it/s]                                                       32%|███▏      | 4101/12776 [43:47<1:42:12,  1.41it/s] 32%|███▏      | 4102/12776 [43:48<1:52:40,  1.28it/s]                                                       32%|███▏      | 4102/12776 [43:48<1:52:40,  1.28it/s] 32%|███▏      | 4103/12776 [43:49<1:55:27,  1.25it/s]                                                       32%|███▏      | 4103/12776 [43:49<1:55:27,  1.25it/s] 32%|███▏      | 4104/12776 [43:50<1:54:04,  1.27it/s]                                                       32%|███▏      | 4104/12776 [43:50<1:54:04,  1.27it/s] 32%|███▏      | 4105/12776 [43:51<1:50:48,  1.30it/s]                                                       32%|███▏      | 4105/12776 [43:51<1:50:48,  1.30it/s] 32%|███▏      | 4106/12776 [43:51<1:46:37,  1.36it/s]                                                       32%|███▏      | 4106/12776 [43:51<1:46:37,  1.36it/s] 32%|███▏      | 4107/12776 [43:52<1:45:04,  1.37it/s]                                                       32%|███▏      | 4107/12776 [43:52<1:45:04,  1.37it/s] 32%|███▏      | 4108/12776 [43:53<1:39:58,  1.44it/s]                                                       32%|███▏      | 4108/12776 [43:53<1:39:58,  1.44it/s] 32%|███▏      | 4109/12776 [43:53<1:36:46,  1.49it/s]                                                       32%|███▏      | 4109/12776 [43:53<1:36:46,  1.49it/s] 32%|███▏      | 4110/12776 [43:54<1:31:26,  1.58it/s]                                                       32%|███▏      | 4110/12776 [43:54<1:31:26,  1.58it/s] 32%|███▏      | 4111/12776 [43:54<1:28:21,  1.63it/s]                                                       32%|███▏      | 4111/12776 [43:54<1:28:21,  1.63it/s] 32%|███▏      | 4112/12776 [43:55<1:23:43,  1.72it/s]                                                       32%|███▏      | 4112/12776 [43:55<1:23:43,  1.72it/s] 32%|███▏      | 4113/12776 [43:55<1:22:41,  1.75it/s]                                                       32%|███▏      | 4113/12776 [43:55<1:22:41,  1.75it/s] 32%|███▏      | 4114/12776 [43:56<1:17:28,  1.86it/s]                                                       32%|███▏      | 4114/12776 [43:56<1:17:28,  1.86it/s] 32%|███▏      | 4115/12776 [43:56<1:16:32,  1.89it/s]                                                       32%|███▏      | 4115/12776 [43:56<1:16:32,  1.89it/s] 32%|███▏      | 4116/12776 [43:57<1:11:52,  2.01it/s]                                                       32%|███▏      | 4116/12776 [43:57<1:11:52,  2.01it/s] 32%|███▏      | 4117/12776 [43:57<1:07:36,  2.13it/s]                                                       32%|███▏      | 4117/12776 [43:57<1:07:36,  2.13it/s] 32%|███▏      | 4118/12776 [43:58<1:09:27,  2.08it/s]                                                       32%|███▏      | 4118/12776 [43:58<1:09:27,  2.08it/s] 32%|███▏      | 4119/12776 [43:58<1:04:47,  2.23it/s]                                                       32%|███▏      | 4119/12776 [43:58<1:04:47,  2.23it/s] 32%|███▏      | 4120/12776 [43:58<1:00:56,  2.37it/s]                                                       32%|███▏      | 4120/12776 [43:58<1:00:56,  2.37it/s] 32%|███▏      | 4121/12776 [43:59<1:01:31,  2.34it/s]                                                       32%|███▏      | 4121/12776 [43:59<1:01:31,  2.34it/s] 32%|███▏      | 4122/12776 [43:59<57:54,  2.49it/s]                                                       32%|███▏      | 4122/12776 [43:59<57:54,  2.49it/s] 32%|███▏      | 4123/12776 [44:00<55:01,  2.62it/s]                                                     32%|███▏      | 4123/12776 [44:00<55:01,  2.62it/s] 32%|███▏      | 4124/12776 [44:00<57:00,  2.53it/s]                                                     32%|███▏      | 4124/12776 [44:00<57:00,  2.53it/s] 32%|███▏      | 4125/12776 [44:00<53:05,  2.72it/s]                                                     32%|███▏      | 4125/12776 [44:00<53:05,  2.72it/s] 32%|███▏      | 4126/12776 [44:01<49:48,  2.89it/s]                                                     32%|███▏      | 4126/12776 [44:01<49:48,  2.89it/s] 32%|███▏      | 4127/12776 [44:01<51:21,  2.81it/s]                                                     32%|███▏      | 4127/12776 [44:01<51:21,  2.81it/s] 32%|███▏      | 4128/12776 [44:01<48:05,  3.00it/s]                                                     32%|███▏      | 4128/12776 [44:01<48:05,  3.00it/s] 32%|███▏      | 4129/12776 [44:02<45:25,  3.17it/s]                                                     32%|███▏      | 4129/12776 [44:02<45:25,  3.17it/s] 32%|███▏      | 4130/12776 [44:02<43:16,  3.33it/s]                                                     32%|███▏      | 4130/12776 [44:02<43:16,  3.33it/s] 32%|███▏      | 4131/12776 [44:02<44:18,  3.25it/s]                                                     32%|███▏      | 4131/12776 [44:02<44:18,  3.25it/s] 32%|███▏      | 4132/12776 [44:02<41:48,  3.45it/s]                                                     32%|███▏      | 4132/12776 [44:02<41:48,  3.45it/s] 32%|███▏      | 4133/12776 [44:03<39:45,  3.62it/s]                                                     32%|███▏      | 4133/12776 [44:03<39:45,  3.62it/s] 32%|███▏      | 4134/12776 [44:03<38:16,  3.76it/s]                                                     32%|███▏      | 4134/12776 [44:03<38:16,  3.76it/s] 32%|███▏      | 4135/12776 [44:03<36:55,  3.90it/s]                                                     32%|███▏      | 4135/12776 [44:03<36:55,  3.90it/s] 32%|███▏      | 4136/12776 [44:03<38:47,  3.71it/s]                                                     32%|███▏      | 4136/12776 [44:03<38:47,  3.71it/s] 32%|███▏      | 4137/12776 [44:04<36:59,  3.89it/s]                                                     32%|███▏      | 4137/12776 [44:04<36:59,  3.89it/s] 32%|███▏      | 4138/12776 [44:04<35:23,  4.07it/s]                                                     32%|███▏      | 4138/12776 [44:04<35:23,  4.07it/s] 32%|███▏      | 4139/12776 [44:04<34:02,  4.23it/s]                                                     32%|███▏      | 4139/12776 [44:04<34:02,  4.23it/s] 32%|███▏      | 4140/12776 [44:04<32:48,  4.39it/s]                                                     32%|███▏      | 4140/12776 [44:04<32:48,  4.39it/s] 32%|███▏      | 4141/12776 [44:05<36:37,  3.93it/s]                                                     32%|███▏      | 4141/12776 [44:05<36:37,  3.93it/s] 32%|███▏      | 4142/12776 [44:05<34:36,  4.16it/s]                                                     32%|███▏      | 4142/12776 [44:05<34:36,  4.16it/s] 32%|███▏      | 4143/12776 [44:05<33:10,  4.34it/s]                                                     32%|███▏      | 4143/12776 [44:05<33:10,  4.34it/s] 32%|███▏      | 4144/12776 [44:05<31:58,  4.50it/s]                                                     32%|███▏      | 4144/12776 [44:05<31:58,  4.50it/s] 32%|███▏      | 4145/12776 [44:05<31:02,  4.63it/s]                                                     32%|███▏      | 4145/12776 [44:05<31:02,  4.63it/s] 32%|███▏      | 4146/12776 [44:06<34:17,  4.19it/s]                                                     32%|███▏      | 4146/12776 [44:06<34:17,  4.19it/s] 32%|███▏      | 4147/12776 [44:06<32:23,  4.44it/s]                                                     32%|███▏      | 4147/12776 [44:06<32:23,  4.44it/s] 32%|███▏      | 4148/12776 [44:06<30:47,  4.67it/s]                                                     32%|███▏      | 4148/12776 [44:06<30:47,  4.67it/s] 32%|███▏      | 4149/12776 [44:06<29:38,  4.85it/s]                                                     32%|███▏      | 4149/12776 [44:06<29:38,  4.85it/s] 32%|███▏      | 4150/12776 [44:07<53:50,  2.67it/s]                                                     32%|███▏      | 4150/12776 [44:07<53:50,  2.67it/s] 32%|███▏      | 4151/12776 [44:09<1:45:16,  1.37it/s]                                                       32%|███▏      | 4151/12776 [44:09<1:45:16,  1.37it/s] 32%|███▏      | 4152/12776 [44:10<1:59:46,  1.20it/s]                                                       32%|███▏      | 4152/12776 [44:10<1:59:46,  1.20it/s] 33%|███▎      | 4153/12776 [44:11<2:08:11,  1.12it/s]                                                       33%|███▎      | 4153/12776 [44:11<2:08:11,  1.12it/s] 33%|███▎      | 4154/12776 [44:12<2:11:51,  1.09it/s]                                                      {'loss': 0.7699, 'grad_norm': 1.9307438135147095, 'learning_rate': 0.0002128299120234604, 'epoch': 0.64}
+{'loss': 0.5525, 'grad_norm': 1.6077808141708374, 'learning_rate': 0.00021280547409579666, 'epoch': 0.64}
+{'loss': 0.688, 'grad_norm': 2.4951558113098145, 'learning_rate': 0.00021278103616813294, 'epoch': 0.64}
+{'loss': 1.2918, 'grad_norm': 3.6255581378936768, 'learning_rate': 0.0002127565982404692, 'epoch': 0.64}
+{'loss': 0.8204, 'grad_norm': 2.62573504447937, 'learning_rate': 0.00021273216031280544, 'epoch': 0.64}
+{'loss': 0.5305, 'grad_norm': 1.5834779739379883, 'learning_rate': 0.00021270772238514172, 'epoch': 0.64}
+{'loss': 1.1743, 'grad_norm': 2.354691505432129, 'learning_rate': 0.000212683284457478, 'epoch': 0.64}
+{'loss': 0.5419, 'grad_norm': 1.5539332628250122, 'learning_rate': 0.00021265884652981425, 'epoch': 0.64}
+{'loss': 1.0443, 'grad_norm': 1.5284450054168701, 'learning_rate': 0.00021263440860215052, 'epoch': 0.64}
+{'loss': 1.6458, 'grad_norm': 3.700737476348877, 'learning_rate': 0.0002126099706744868, 'epoch': 0.64}
+{'loss': 0.8026, 'grad_norm': 2.4430134296417236, 'learning_rate': 0.00021258553274682303, 'epoch': 0.64}
+{'loss': 1.4549, 'grad_norm': 2.9838156700134277, 'learning_rate': 0.0002125610948191593, 'epoch': 0.64}
+{'loss': 1.2308, 'grad_norm': 1.9823230504989624, 'learning_rate': 0.00021253665689149558, 'epoch': 0.64}
+{'loss': 1.7271, 'grad_norm': 2.805058717727661, 'learning_rate': 0.00021251221896383184, 'epoch': 0.64}
+{'loss': 1.0577, 'grad_norm': 3.0151147842407227, 'learning_rate': 0.00021248778103616811, 'epoch': 0.64}
+{'loss': 0.9225, 'grad_norm': 2.8304717540740967, 'learning_rate': 0.0002124633431085044, 'epoch': 0.64}
+{'loss': 1.0837, 'grad_norm': 1.2722463607788086, 'learning_rate': 0.00021243890518084064, 'epoch': 0.64}
+{'loss': 1.4426, 'grad_norm': 2.478703498840332, 'learning_rate': 0.00021241446725317692, 'epoch': 0.64}
+{'loss': 0.5695, 'grad_norm': 1.236244559288025, 'learning_rate': 0.0002123900293255132, 'epoch': 0.64}
+{'loss': 0.6294, 'grad_norm': 1.9802755117416382, 'learning_rate': 0.00021236559139784942, 'epoch': 0.64}
+{'loss': 1.5215, 'grad_norm': 2.97894549369812, 'learning_rate': 0.0002123411534701857, 'epoch': 0.64}
+{'loss': 1.141, 'grad_norm': 7.575010776519775, 'learning_rate': 0.00021231671554252198, 'epoch': 0.64}
+{'loss': 1.2502, 'grad_norm': 2.812551975250244, 'learning_rate': 0.00021229227761485823, 'epoch': 0.64}
+{'loss': 1.0142, 'grad_norm': 1.490632176399231, 'learning_rate': 0.0002122678396871945, 'epoch': 0.64}
+{'loss': 0.479, 'grad_norm': 0.6560826897621155, 'learning_rate': 0.0002122434017595308, 'epoch': 0.64}
+{'loss': 0.2937, 'grad_norm': 0.4517360329627991, 'learning_rate': 0.00021221896383186704, 'epoch': 0.64}
+{'loss': 0.2687, 'grad_norm': 0.5508907437324524, 'learning_rate': 0.00021219452590420332, 'epoch': 0.64}
+{'loss': 0.3279, 'grad_norm': 0.5405100584030151, 'learning_rate': 0.00021217008797653957, 'epoch': 0.64}
+{'loss': 0.4322, 'grad_norm': 0.8774932622909546, 'learning_rate': 0.00021214565004887582, 'epoch': 0.64}
+{'loss': 0.2838, 'grad_norm': 0.5635179281234741, 'learning_rate': 0.0002121212121212121, 'epoch': 0.64}
+{'loss': 0.3582, 'grad_norm': 2.6404831409454346, 'learning_rate': 0.00021209677419354838, 'epoch': 0.64}
+{'loss': 0.4306, 'grad_norm': 0.9229952692985535, 'learning_rate': 0.00021207233626588463, 'epoch': 0.64}
+{'loss': 0.4203, 'grad_norm': 1.216314435005188, 'learning_rate': 0.0002120478983382209, 'epoch': 0.64}
+{'loss': 0.3948, 'grad_norm': 0.6581313014030457, 'learning_rate': 0.00021202346041055719, 'epoch': 0.64}
+{'loss': 0.4591, 'grad_norm': 0.8522962331771851, 'learning_rate': 0.0002119990224828934, 'epoch': 0.64}
+{'loss': 0.581, 'grad_norm': 1.1675865650177002, 'learning_rate': 0.0002119745845552297, 'epoch': 0.64}
+{'loss': 0.3727, 'grad_norm': 0.7074406147003174, 'learning_rate': 0.00021195014662756597, 'epoch': 0.64}
+{'loss': 0.4141, 'grad_norm': 1.105228066444397, 'learning_rate': 0.00021192570869990222, 'epoch': 0.64}
+{'loss': 0.4892, 'grad_norm': 1.1589969396591187, 'learning_rate': 0.0002119012707722385, 'epoch': 0.64}
+{'loss': 0.861, 'grad_norm': 2.829336404800415, 'learning_rate': 0.00021187683284457477, 'epoch': 0.64}
+{'loss': 0.3913, 'grad_norm': 0.7126592993736267, 'learning_rate': 0.00021185239491691103, 'epoch': 0.64}
+{'loss': 0.7084, 'grad_norm': 1.3239237070083618, 'learning_rate': 0.0002118279569892473, 'epoch': 0.64}
+{'loss': 0.4494, 'grad_norm': 0.9522157311439514, 'learning_rate': 0.00021180351906158358, 'epoch': 0.64}
+{'loss': 0.6281, 'grad_norm': 1.3508566617965698, 'learning_rate': 0.0002117790811339198, 'epoch': 0.64}
+{'loss': 0.5852, 'grad_norm': 2.4964592456817627, 'learning_rate': 0.00021175464320625608, 'epoch': 0.65}
+{'loss': 0.5809, 'grad_norm': 1.3371846675872803, 'learning_rate': 0.00021173020527859236, 'epoch': 0.65}
+{'loss': 0.5822, 'grad_norm': 1.801464319229126, 'learning_rate': 0.00021170576735092861, 'epoch': 0.65}
+{'loss': 0.6492, 'grad_norm': 1.4076706171035767, 'learning_rate': 0.0002116813294232649, 'epoch': 0.65}
+{'loss': 0.7113, 'grad_norm': 1.2247923612594604, 'learning_rate': 0.00021165689149560117, 'epoch': 0.65}
+{'loss': 0.9173, 'grad_norm': 2.9736526012420654, 'learning_rate': 0.00021163245356793742, 'epoch': 0.65}
+{'loss': 0.717, 'grad_norm': 1.522207498550415, 'learning_rate': 0.00021160801564027367, 'epoch': 0.65}
+{'loss': 0.7599, 'grad_norm': 2.8256654739379883, 'learning_rate': 0.00021158357771260995, 'epoch': 0.65}
+{'loss': 0.9544, 'grad_norm': 2.5539469718933105, 'learning_rate': 0.0002115591397849462, 'epoch': 0.65}
+{'loss': 0.9117, 'grad_norm': 1.8205986022949219, 'learning_rate': 0.00021153470185728248, 'epoch': 0.65}
+{'loss': 1.1332, 'grad_norm': 3.9707062244415283, 'learning_rate': 0.00021151026392961876, 'epoch': 0.65}
+{'loss': 0.5995, 'grad_norm': 1.7716293334960938, 'learning_rate': 0.000211485826001955, 'epoch': 0.65}
+{'loss': 1.1797, 'grad_norm': 2.0093953609466553, 'learning_rate': 0.0002114613880742913, 'epoch': 0.65}
+{'loss': 0.5744, 'grad_norm': 1.3525505065917969, 'learning_rate': 0.00021143695014662757, 'epoch': 0.65}
+{'loss': 0.8285, 'grad_norm': 0.9893065690994263, 'learning_rate': 0.0002114125122189638, 'epoch': 0.65}
+{'loss': 1.8302, 'grad_norm': 3.0879127979278564, 'learning_rate': 0.00021138807429130007, 'epoch': 0.65}
+{'loss': 0.8313, 'grad_norm': 1.6415681838989258, 'learning_rate': 0.00021136363636363635, 'epoch': 0.65}
+{'loss': 0.9433, 'grad_norm': 2.8045177459716797, 'learning_rate': 0.0002113391984359726, 'epoch': 0.65}
+{'loss': 0.7906, 'grad_norm': 1.968576431274414, 'learning_rate': 0.00021131476050830888, 'epoch': 0.65}
+{'loss': 1.2082, 'grad_norm': 1.4854947328567505, 'learning_rate': 0.00021129032258064516, 'epoch': 0.65}
+{'loss': 1.9279, 'grad_norm': 3.7475380897521973, 'learning_rate': 0.0002112658846529814, 'epoch': 0.65}
+{'loss': 1.4245, 'grad_norm': 1.9504555463790894, 'learning_rate': 0.00021124144672531769, 'epoch': 0.65}
+{'loss': 1.1829, 'grad_norm': 2.5647873878479004, 'learning_rate': 0.00021121700879765396, 'epoch': 0.65}
+{'loss': 1.759, 'grad_norm': 1.7342371940612793, 'learning_rate': 0.0002111925708699902, 'epoch': 0.65}
+{'loss': 1.0318, 'grad_norm': 1.559617519378662, 'learning_rate': 0.00021116813294232647, 'epoch': 0.65}
+{'loss': 1.1075, 'grad_norm': 1.5105706453323364, 'learning_rate': 0.00021114369501466275, 'epoch': 0.65}
+{'loss': 0.7295, 'grad_norm': 1.054640769958496, 'learning_rate': 0.000211119257086999, 'epoch': 0.65}
+{'loss': 0.6747, 'grad_norm': 1.3204190731048584, 'learning_rate': 0.00021109481915933528, 'epoch': 0.65}
+{'loss': 1.0632, 'grad_norm': 1.6427069902420044, 'learning_rate': 0.00021107038123167155, 'epoch': 0.65}
+{'loss': 1.724, 'grad_norm': 2.6483640670776367, 'learning_rate': 0.0002110459433040078, 'epoch': 0.65}
+{'loss': 0.2836, 'grad_norm': 0.44321414828300476, 'learning_rate': 0.00021102150537634406, 'epoch': 0.65}
+{'loss': 0.2743, 'grad_norm': 0.5823266506195068, 'learning_rate': 0.00021099706744868033, 'epoch': 0.65}
+{'loss': 0.2912, 'grad_norm': 0.6924779415130615, 'learning_rate': 0.00021097262952101659, 'epoch': 0.65}
+ 33%|███▎      | 4154/12776 [44:12<2:11:51,  1.09it/s] 33%|███▎      | 4155/12776 [44:13<2:08:05,  1.12it/s]                                                       33%|███▎      | 4155/12776 [44:13<2:08:05,  1.12it/s] 33%|███▎      | 4156/12776 [44:13<2:00:56,  1.19it/s]                                                       33%|███▎      | 4156/12776 [44:13<2:00:56,  1.19it/s] 33%|███▎      | 4157/12776 [44:14<1:55:51,  1.24it/s]                                                       33%|███▎      | 4157/12776 [44:14<1:55:51,  1.24it/s] 33%|███▎      | 4158/12776 [44:15<1:48:33,  1.32it/s]                                                       33%|███▎      | 4158/12776 [44:15<1:48:33,  1.32it/s] 33%|███▎      | 4159/12776 [44:15<1:41:29,  1.42it/s]                                                       33%|███▎      | 4159/12776 [44:15<1:41:29,  1.42it/s] 33%|███▎      | 4160/12776 [44:16<1:35:22,  1.51it/s]                                                       33%|███▎      | 4160/12776 [44:16<1:35:22,  1.51it/s] 33%|███▎      | 4161/12776 [44:16<1:30:52,  1.58it/s]                                                       33%|███▎      | 4161/12776 [44:16<1:30:52,  1.58it/s] 33%|███▎      | 4162/12776 [44:17<1:25:47,  1.67it/s]                                                       33%|███▎      | 4162/12776 [44:17<1:25:47,  1.67it/s] 33%|███▎      | 4163/12776 [44:17<1:25:19,  1.68it/s]                                                       33%|███▎      | 4163/12776 [44:17<1:25:19,  1.68it/s] 33%|███▎      | 4164/12776 [44:18<1:19:42,  1.80it/s]                                                       33%|███▎      | 4164/12776 [44:18<1:19:42,  1.80it/s] 33%|███▎      | 4165/12776 [44:18<1:19:17,  1.81it/s]                                                       33%|███▎      | 4165/12776 [44:18<1:19:17,  1.81it/s] 33%|███▎      | 4166/12776 [44:19<1:14:19,  1.93it/s]                                                       33%|███▎      | 4166/12776 [44:19<1:14:19,  1.93it/s] 33%|███▎      | 4167/12776 [44:19<1:13:53,  1.94it/s]                                                       33%|███▎      | 4167/12776 [44:19<1:13:53,  1.94it/s] 33%|███▎      | 4168/12776 [44:20<1:09:21,  2.07it/s]                                                       33%|███▎      | 4168/12776 [44:20<1:09:21,  2.07it/s] 33%|███▎      | 4169/12776 [44:20<1:05:12,  2.20it/s]                                                       33%|███▎      | 4169/12776 [44:20<1:05:12,  2.20it/s] 33%|███▎      | 4170/12776 [44:21<1:07:51,  2.11it/s]                                                       33%|███▎      | 4170/12776 [44:21<1:07:51,  2.11it/s] 33%|███▎      | 4171/12776 [44:21<1:03:22,  2.26it/s]                                                       33%|███▎      | 4171/12776 [44:21<1:03:22,  2.26it/s] 33%|███▎      | 4172/12776 [44:21<59:41,  2.40it/s]                                                       33%|███▎      | 4172/12776 [44:21<59:41,  2.40it/s] 33%|███▎      | 4173/12776 [44:22<59:06,  2.43it/s]                                                     33%|███▎      | 4173/12776 [44:22<59:06,  2.43it/s] 33%|███▎      | 4174/12776 [44:22<56:05,  2.56it/s]                                                     33%|███▎      | 4174/12776 [44:22<56:05,  2.56it/s] 33%|███▎      | 4175/12776 [44:23<53:31,  2.68it/s]                                                     33%|███▎      | 4175/12776 [44:23<53:31,  2.68it/s] 33%|███▎      | 4176/12776 [44:23<56:19,  2.55it/s]                                                     33%|███▎      | 4176/12776 [44:23<56:19,  2.55it/s] 33%|███▎      | 4177/12776 [44:23<52:41,  2.72it/s]                                                     33%|███▎      | 4177/12776 [44:23<52:41,  2.72it/s] 33%|███▎      | 4178/12776 [44:24<49:37,  2.89it/s]                                                     33%|███▎      | 4178/12776 [44:24<49:37,  2.89it/s] 33%|███▎      | 4179/12776 [44:24<50:40,  2.83it/s]                                                     33%|███▎      | 4179/12776 [44:24<50:40,  2.83it/s] 33%|███▎      | 4180/12776 [44:24<47:30,  3.02it/s]                                                     33%|███▎      | 4180/12776 [44:24<47:30,  3.02it/s] 33%|███▎      | 4181/12776 [44:24<45:02,  3.18it/s]                                                     33%|███▎      | 4181/12776 [44:24<45:02,  3.18it/s] 33%|███▎      | 4182/12776 [44:25<42:51,  3.34it/s]                                                     33%|███▎      | 4182/12776 [44:25<42:51,  3.34it/s] 33%|███▎      | 4183/12776 [44:25<43:48,  3.27it/s]                                                     33%|███▎      | 4183/12776 [44:25<43:48,  3.27it/s] 33%|███▎      | 4184/12776 [44:25<41:27,  3.45it/s]                                                     33%|███▎      | 4184/12776 [44:25<41:27,  3.45it/s] 33%|███▎      | 4185/12776 [44:26<39:25,  3.63it/s]                                                     33%|███▎      | 4185/12776 [44:26<39:25,  3.63it/s] 33%|███▎      | 4186/12776 [44:26<37:34,  3.81it/s]                                                     33%|███▎      | 4186/12776 [44:26<37:34,  3.81it/s] 33%|███▎      | 4187/12776 [44:26<36:04,  3.97it/s]                                                     33%|███▎      | 4187/12776 [44:26<36:04,  3.97it/s] 33%|███▎      | 4188/12776 [44:26<38:31,  3.72it/s]                                                     33%|███▎      | 4188/12776 [44:26<38:31,  3.72it/s] 33%|███▎      | 4189/12776 [44:27<36:04,  3.97it/s]                                                     33%|███▎      | 4189/12776 [44:27<36:04,  3.97it/s] 33%|███▎      | 4190/12776 [44:27<33:56,  4.22it/s]                                                     33%|███▎      | 4190/12776 [44:27<33:56,  4.22it/s] 33%|███▎      | 4191/12776 [44:27<32:43,  4.37it/s]                                                     33%|███▎      | 4191/12776 [44:27<32:43,  4.37it/s] 33%|███▎      | 4192/12776 [44:27<31:42,  4.51it/s]                                                     33%|███▎      | 4192/12776 [44:27<31:42,  4.51it/s] 33%|███▎      | 4193/12776 [44:27<35:17,  4.05it/s]                                                     33%|███▎      | 4193/12776 [44:27<35:17,  4.05it/s] 33%|███▎      | 4194/12776 [44:28<33:12,  4.31it/s]                                                     33%|███▎      | 4194/12776 [44:28<33:12,  4.31it/s] 33%|███▎      | 4195/12776 [44:28<31:38,  4.52it/s]                                                     33%|███▎      | 4195/12776 [44:28<31:38,  4.52it/s] 33%|███▎      | 4196/12776 [44:28<30:23,  4.71it/s]                                                     33%|███▎      | 4196/12776 [44:28<30:23,  4.71it/s] 33%|███▎      | 4197/12776 [44:28<29:27,  4.85it/s]                                                     33%|███▎      | 4197/12776 [44:28<29:27,  4.85it/s] 33%|███▎      | 4198/12776 [44:28<31:03,  4.60it/s]                                                     33%|███▎      | 4198/12776 [44:28<31:03,  4.60it/s] 33%|███▎      | 4199/12776 [44:29<29:38,  4.82it/s]                                                     33%|███▎      | 4199/12776 [44:29<29:38,  4.82it/s] 33%|███▎      | 4200/12776 [44:29<50:29,  2.83it/s]                                                     33%|███▎      | 4200/12776 [44:29<50:29,  2.83it/s] 33%|███▎      | 4201/12776 [44:31<1:40:07,  1.43it/s]                                                       33%|███▎      | 4201/12776 [44:31<1:40:07,  1.43it/s] 33%|███▎      | 4202/12776 [44:32<1:53:33,  1.26it/s]                                                       33%|███▎      | 4202/12776 [44:32<1:53:33,  1.26it/s] 33%|███▎      | 4203/12776 [44:33<1:56:14,  1.23it/s]                                                       33%|███▎      | 4203/12776 [44:33<1:56:14,  1.23it/s] 33%|███▎      | 4204/12776 [44:34<1:55:07,  1.24it/s]                                                       33%|███▎      | 4204/12776 [44:34<1:55:07,  1.24it/s] 33%|███▎      | 4205/12776 [44:34<1:51:30,  1.28it/s]                                                       33%|███▎      | 4205/12776 [44:34<1:51:30,  1.28it/s] 33%|███▎      | 4206/12776 [44:35<1:47:30,  1.33it/s]                                                       33%|███▎      | 4206/12776 [44:35<1:47:30,  1.33it/s] 33%|███▎      | 4207/12776 [44:36<1:48:00,  1.32it/s]                                                       33%|███▎      | 4207/12776 [44:36<1:48:00,  1.32it/s] 33%|███▎      | 4208/12776 [44:36<1:47:50,  1.32it/s]                                                       33%|███▎      | 4208/12776 [44:36<1:47:50,  1.32it/s] 33%|███▎      | 4209/12776 [44:37<1:40:57,  1.41it/s]                                                       33%|███▎      | 4209/12776 [44:37<1:40:57,  1.41it/s] 33%|███▎      | 4210/12776 [44:38<1:35:04,  1.50it/s]                                                       33%|███▎      | 4210/12776 [44:38<1:35:04,  1.50it/s] 33%|███▎      | 4211/12776 [44:38<1:29:34,  1.59it/s]                                                       33%|███▎      | 4211/12776 [44:38<1:29:34,  1.59it/s] 33%|███▎      | 4212/12776 [44:39<1:29:12,  1.60it/s]                                                       33%|███▎      | 4212/12776 [44:39<1:29:12,  1.60it/s] 33%|███▎      | 4213/12776 [44:39<1:23:19,  1.71it/s]                                                       33%|███▎      | 4213/12776 [44:39<1:23:19,  1.71it/s] 33%|███▎      | 4214/12776 [44:40<1:17:28,  1.84it/s]                                                       33%|███▎      | 4214/12776 [44:40<1:17:28,  1.84it/s] 33%|███▎      | 4215/12776 [44:40<1:15:17,  1.90it/s]                                                       33%|███▎      | 4215/12776 [44:40<1:15:17,  1.90it/s] 33%|███▎      | 4216/12776 [44:41<1:10:23,  2.03it/s]                                                       33%|███▎      | 4216/12776 [44:41<1:10:23,  2.03it/s] 33%|███▎      | 4217/12776 [44:41<1:09:09,  2.06it/s]                                                       33%|███▎      | 4217/12776 [44:41<1:09:09,  2.06it/s] 33%|███▎      | 4218/12776 [44:41<1:05:13,  2.19it/s]                                                       33%|███▎      | 4218/12776 [44:41<1:05:13,  2.19it/s] 33%|███▎      | 4219/12776 [44:42<1:01:53,  2.30it/s]                                                       33%|███▎      | 4219/12776 [44:42<1:01:53,  2.30it/s] 33%|███▎      | 4220/12776 [44:42<59:29,  2.40it/s]                                                       33%|███▎      | 4220/12776 [44:42<59:29,  2.40it/s] 33%|███▎      | 4221/12776 [44:43<56:36,  2.52it/s]                                                     33%|███▎      | 4221/12776 [44:43<56:36,  2.52it/s] 33%|███▎      | 4222/12776 [44:43<54:14,  2.63it/s]                                                     33%|███▎      | 4222/12776 [44:43<54:14,  2.63it/s] 33%|███▎      | 4223/12776 [44:43<56:24,  2.53it/s]                                                     33%|███▎      | 4223/12776 [44:43<56:24,  2.53it/s] 33%|███▎      | 4224/12776 [44:44<53:05,  2.68it/s]                                                     33%|███▎      | 4224/12776 [44:44<53:05,  2.68it/s] 33%|███▎      | 4225/12776 [44:44<50:09,  2.84it/s]                                                     33%|███▎      | 4225/12776 [44:44<50:09,  2.84it/s] 33%|███▎      | 4226/12776 [44:44<47:40,  2.99it/s]                                                     33%|███▎      | 4226/12776 [44:44<47:40,  2.99it/s] 33%|███▎      | 4227/12776 [44:45<48:25,  2.94it/s]                                                     33%|███▎      | 4227/12776 [44:45<48:25,  2.94it/s] 33%|███▎      | 4228/12776 [44:45<46:02,  3.09it/s]                                                     33%|███▎      | 4228/12776 [44:45<46:02,  3.09it/s] 33%|███▎      | 4229/12776 [44:45<43:47,  3.25it/s]                                                     33%|███▎      | 4229/12776 [44:45<43:47,  3.25it/s] 33%|███▎      | 4230/12776 [44:45<41:56,  3.40it/s]                                                     33%|███▎      | 4230/12776 [44:45<41:56,  3.40it/s] 33%|███▎      | 4231/12776 [44:46<44:15,  3.22it/s]                                                    {'loss': 0.4401, 'grad_norm': 0.7143043279647827, 'learning_rate': 0.00021094819159335286, 'epoch': 0.65}
+{'loss': 0.3095, 'grad_norm': 1.3243781328201294, 'learning_rate': 0.00021092375366568914, 'epoch': 0.65}
+{'loss': 0.3707, 'grad_norm': 0.8137784600257874, 'learning_rate': 0.0002108993157380254, 'epoch': 0.65}
+{'loss': 0.2581, 'grad_norm': 0.5884976983070374, 'learning_rate': 0.00021087487781036167, 'epoch': 0.65}
+{'loss': 0.4062, 'grad_norm': 0.9753175973892212, 'learning_rate': 0.00021085043988269795, 'epoch': 0.65}
+{'loss': 0.4112, 'grad_norm': 0.8055758476257324, 'learning_rate': 0.00021082600195503417, 'epoch': 0.65}
+{'loss': 0.5229, 'grad_norm': 0.7871150374412537, 'learning_rate': 0.00021080156402737045, 'epoch': 0.65}
+{'loss': 0.2359, 'grad_norm': 0.666612982749939, 'learning_rate': 0.00021077712609970673, 'epoch': 0.65}
+{'loss': 1.1647, 'grad_norm': 4.991723537445068, 'learning_rate': 0.00021075268817204298, 'epoch': 0.65}
+{'loss': 0.2791, 'grad_norm': 0.6733570098876953, 'learning_rate': 0.00021072825024437926, 'epoch': 0.65}
+{'loss': 0.4028, 'grad_norm': 1.0828582048416138, 'learning_rate': 0.00021070381231671554, 'epoch': 0.65}
+{'loss': 0.4959, 'grad_norm': 1.031664252281189, 'learning_rate': 0.0002106793743890518, 'epoch': 0.65}
+{'loss': 0.5123, 'grad_norm': 1.7756761312484741, 'learning_rate': 0.00021065493646138807, 'epoch': 0.65}
+{'loss': 0.5411, 'grad_norm': 1.409739375114441, 'learning_rate': 0.00021063049853372435, 'epoch': 0.65}
+{'loss': 0.5408, 'grad_norm': 0.8608100414276123, 'learning_rate': 0.00021060606060606057, 'epoch': 0.65}
+{'loss': 0.5228, 'grad_norm': 1.9020010232925415, 'learning_rate': 0.00021058162267839685, 'epoch': 0.65}
+{'loss': 0.4451, 'grad_norm': 1.2698709964752197, 'learning_rate': 0.00021055718475073313, 'epoch': 0.65}
+{'loss': 0.5889, 'grad_norm': 1.3573118448257446, 'learning_rate': 0.00021053274682306938, 'epoch': 0.65}
+{'loss': 0.4531, 'grad_norm': 1.1787749528884888, 'learning_rate': 0.00021050830889540566, 'epoch': 0.65}
+{'loss': 0.6581, 'grad_norm': 1.463240146636963, 'learning_rate': 0.00021048387096774194, 'epoch': 0.65}
+{'loss': 0.6735, 'grad_norm': 1.1334830522537231, 'learning_rate': 0.0002104594330400782, 'epoch': 0.65}
+{'loss': 0.5811, 'grad_norm': 0.9221059083938599, 'learning_rate': 0.00021043499511241444, 'epoch': 0.65}
+{'loss': 0.3367, 'grad_norm': 0.7749282717704773, 'learning_rate': 0.00021041055718475072, 'epoch': 0.65}
+{'loss': 0.744, 'grad_norm': 2.028270721435547, 'learning_rate': 0.00021038611925708697, 'epoch': 0.65}
+{'loss': 0.7978, 'grad_norm': 2.1838300228118896, 'learning_rate': 0.00021036168132942325, 'epoch': 0.65}
+{'loss': 0.7741, 'grad_norm': 2.014901876449585, 'learning_rate': 0.00021033724340175952, 'epoch': 0.65}
+{'loss': 0.8107, 'grad_norm': 2.365983009338379, 'learning_rate': 0.00021031280547409578, 'epoch': 0.65}
+{'loss': 1.2278, 'grad_norm': 2.7603774070739746, 'learning_rate': 0.00021028836754643205, 'epoch': 0.65}
+{'loss': 0.8463, 'grad_norm': 1.7093403339385986, 'learning_rate': 0.00021026392961876833, 'epoch': 0.65}
+{'loss': 0.8286, 'grad_norm': 1.8434052467346191, 'learning_rate': 0.00021023949169110456, 'epoch': 0.65}
+{'loss': 0.7018, 'grad_norm': 1.2847257852554321, 'learning_rate': 0.00021021505376344084, 'epoch': 0.65}
+{'loss': 0.8022, 'grad_norm': 1.887803077697754, 'learning_rate': 0.0002101906158357771, 'epoch': 0.66}
+{'loss': 0.5651, 'grad_norm': 2.067678928375244, 'learning_rate': 0.00021016617790811336, 'epoch': 0.66}
+{'loss': 0.9908, 'grad_norm': 1.9013502597808838, 'learning_rate': 0.00021014173998044964, 'epoch': 0.66}
+{'loss': 0.9834, 'grad_norm': 2.773240566253662, 'learning_rate': 0.00021011730205278592, 'epoch': 0.66}
+{'loss': 1.1264, 'grad_norm': 2.2640483379364014, 'learning_rate': 0.00021009286412512217, 'epoch': 0.66}
+{'loss': 1.1565, 'grad_norm': 1.875777006149292, 'learning_rate': 0.00021006842619745845, 'epoch': 0.66}
+{'loss': 1.6043, 'grad_norm': 4.020966529846191, 'learning_rate': 0.00021004398826979473, 'epoch': 0.66}
+{'loss': 1.0613, 'grad_norm': 2.4041779041290283, 'learning_rate': 0.00021001955034213095, 'epoch': 0.66}
+{'loss': 1.2259, 'grad_norm': 2.9603583812713623, 'learning_rate': 0.00020999511241446723, 'epoch': 0.66}
+{'loss': 0.9003, 'grad_norm': 3.0527658462524414, 'learning_rate': 0.0002099706744868035, 'epoch': 0.66}
+{'loss': 1.3881, 'grad_norm': 2.1919569969177246, 'learning_rate': 0.00020994623655913976, 'epoch': 0.66}
+{'loss': 0.6151, 'grad_norm': 2.547144889831543, 'learning_rate': 0.00020992179863147604, 'epoch': 0.66}
+{'loss': 0.7662, 'grad_norm': 1.940132975578308, 'learning_rate': 0.00020989736070381232, 'epoch': 0.66}
+{'loss': 0.6982, 'grad_norm': 1.2711700201034546, 'learning_rate': 0.00020987292277614854, 'epoch': 0.66}
+{'loss': 0.4992, 'grad_norm': 1.6666027307510376, 'learning_rate': 0.00020984848484848482, 'epoch': 0.66}
+{'loss': 1.2499, 'grad_norm': 4.075043201446533, 'learning_rate': 0.0002098240469208211, 'epoch': 0.66}
+{'loss': 0.2544, 'grad_norm': 0.5925846695899963, 'learning_rate': 0.00020979960899315735, 'epoch': 0.66}
+{'loss': 0.3609, 'grad_norm': 0.90831458568573, 'learning_rate': 0.00020977517106549363, 'epoch': 0.66}
+{'loss': 0.2574, 'grad_norm': 0.6981661319732666, 'learning_rate': 0.0002097507331378299, 'epoch': 0.66}
+{'loss': 0.2842, 'grad_norm': 0.8402687311172485, 'learning_rate': 0.00020972629521016616, 'epoch': 0.66}
+{'loss': 0.3959, 'grad_norm': 0.7427016496658325, 'learning_rate': 0.00020970185728250244, 'epoch': 0.66}
+{'loss': 0.4118, 'grad_norm': 0.7101203799247742, 'learning_rate': 0.00020967741935483871, 'epoch': 0.66}
+{'loss': 0.3418, 'grad_norm': 0.7009015083312988, 'learning_rate': 0.00020965298142717494, 'epoch': 0.66}
+{'loss': 0.3614, 'grad_norm': 0.8235505223274231, 'learning_rate': 0.00020962854349951122, 'epoch': 0.66}
+{'loss': 0.3866, 'grad_norm': 0.49669182300567627, 'learning_rate': 0.0002096041055718475, 'epoch': 0.66}
+{'loss': 0.8613, 'grad_norm': 8.812310218811035, 'learning_rate': 0.00020957966764418375, 'epoch': 0.66}
+{'loss': 1.7947, 'grad_norm': 6.276821613311768, 'learning_rate': 0.00020955522971652003, 'epoch': 0.66}
+{'loss': 0.5293, 'grad_norm': 2.7206177711486816, 'learning_rate': 0.0002095307917888563, 'epoch': 0.66}
+{'loss': 0.832, 'grad_norm': 2.726612091064453, 'learning_rate': 0.00020950635386119255, 'epoch': 0.66}
+{'loss': 0.556, 'grad_norm': 0.86571204662323, 'learning_rate': 0.00020948191593352883, 'epoch': 0.66}
+{'loss': 0.5059, 'grad_norm': 1.1770457029342651, 'learning_rate': 0.00020945747800586508, 'epoch': 0.66}
+{'loss': 0.6573, 'grad_norm': 1.2586264610290527, 'learning_rate': 0.00020943304007820134, 'epoch': 0.66}
+{'loss': 0.471, 'grad_norm': 1.2371019124984741, 'learning_rate': 0.00020940860215053761, 'epoch': 0.66}
+{'loss': 0.4126, 'grad_norm': 0.7235432863235474, 'learning_rate': 0.0002093841642228739, 'epoch': 0.66}
+{'loss': 0.9866, 'grad_norm': 1.9548500776290894, 'learning_rate': 0.00020935972629521014, 'epoch': 0.66}
+{'loss': 0.5777, 'grad_norm': 1.3150765895843506, 'learning_rate': 0.00020933528836754642, 'epoch': 0.66}
+{'loss': 0.7398, 'grad_norm': 1.9192475080490112, 'learning_rate': 0.0002093108504398827, 'epoch': 0.66}
+{'loss': 0.5958, 'grad_norm': 2.385049819946289, 'learning_rate': 0.00020928641251221892, 'epoch': 0.66}
+{'loss': 0.6067, 'grad_norm': 1.309512734413147, 'learning_rate': 0.0002092619745845552, 'epoch': 0.66}
+{'loss': 0.7334, 'grad_norm': 1.5893750190734863, 'learning_rate': 0.00020923753665689148, 'epoch': 0.66}
+{'loss': 0.8073, 'grad_norm': 2.3424973487854004, 'learning_rate': 0.00020921309872922773, 'epoch': 0.66}
+{'loss': 0.6511, 'grad_norm': 1.3689500093460083, 'learning_rate': 0.000209188660801564, 'epoch': 0.66}
+{'loss': 0.8244, 'grad_norm': 1.4567039012908936, 'learning_rate': 0.0002091642228739003, 'epoch': 0.66}
+{'loss': 0.774, 'grad_norm': 5.747929573059082, 'learning_rate': 0.00020913978494623654, 'epoch': 0.66}
+{'loss': 0.6109, 'grad_norm': 1.2225399017333984, 'learning_rate': 0.00020911534701857282, 'epoch': 0.66}
+{'loss': 0.8214, 'grad_norm': 1.25154709815979, 'learning_rate': 0.0002090909090909091, 'epoch': 0.66}
+ 33%|███▎      | 4231/12776 [44:46<44:15,  3.22it/s] 33%|███▎      | 4232/12776 [44:46<41:37,  3.42it/s]                                                     33%|███▎      | 4232/12776 [44:46<41:37,  3.42it/s] 33%|███▎      | 4233/12776 [44:46<39:44,  3.58it/s]                                                     33%|███▎      | 4233/12776 [44:46<39:44,  3.58it/s] 33%|███▎      | 4234/12776 [44:47<38:11,  3.73it/s]                                                     33%|███▎      | 4234/12776 [44:47<38:11,  3.73it/s] 33%|███▎      | 4235/12776 [44:47<41:57,  3.39it/s]                                                     33%|███▎      | 4235/12776 [44:47<41:57,  3.39it/s] 33%|███▎      | 4236/12776 [44:47<39:18,  3.62it/s]                                                     33%|███▎      | 4236/12776 [44:47<39:18,  3.62it/s] 33%|███▎      | 4237/12776 [44:47<37:14,  3.82it/s]                                                     33%|███▎      | 4237/12776 [44:47<37:14,  3.82it/s] 33%|███▎      | 4238/12776 [44:48<35:26,  4.01it/s]                                                     33%|███▎      | 4238/12776 [44:48<35:26,  4.01it/s] 33%|███▎      | 4239/12776 [44:48<37:56,  3.75it/s]                                                     33%|███▎      | 4239/12776 [44:48<37:56,  3.75it/s] 33%|███▎      | 4240/12776 [44:48<35:24,  4.02it/s]                                                     33%|███▎      | 4240/12776 [44:48<35:24,  4.02it/s] 33%|███▎      | 4241/12776 [44:48<33:44,  4.22it/s]                                                     33%|███▎      | 4241/12776 [44:48<33:44,  4.22it/s] 33%|███▎      | 4242/12776 [44:48<32:22,  4.39it/s]                                                     33%|███▎      | 4242/12776 [44:48<32:22,  4.39it/s] 33%|███▎      | 4243/12776 [44:49<31:18,  4.54it/s]                                                     33%|███▎      | 4243/12776 [44:49<31:18,  4.54it/s] 33%|███▎      | 4244/12776 [44:49<35:16,  4.03it/s]                                                     33%|███▎      | 4244/12776 [44:49<35:16,  4.03it/s] 33%|███▎      | 4245/12776 [44:49<33:08,  4.29it/s]                                                     33%|███▎      | 4245/12776 [44:49<33:08,  4.29it/s] 33%|███▎      | 4246/12776 [44:49<31:30,  4.51it/s]                                                     33%|███▎      | 4246/12776 [44:49<31:30,  4.51it/s] 33%|███▎      | 4247/12776 [44:50<30:16,  4.70it/s]                                                     33%|███▎      | 4247/12776 [44:50<30:16,  4.70it/s] 33%|███▎      | 4248/12776 [44:50<29:17,  4.85it/s]                                                     33%|███▎      | 4248/12776 [44:50<29:17,  4.85it/s] 33%|███▎      | 4249/12776 [44:50<28:21,  5.01it/s]                                                     33%|███▎      | 4249/12776 [44:50<28:21,  5.01it/s] 33%|███▎      | 4250/12776 [44:51<51:18,  2.77it/s]                                                     33%|███▎      | 4250/12776 [44:51<51:18,  2.77it/s] 33%|███▎      | 4251/12776 [44:52<1:44:26,  1.36it/s]                                                       33%|███▎      | 4251/12776 [44:52<1:44:26,  1.36it/s] 33%|███▎      | 4252/12776 [44:53<1:56:27,  1.22it/s]                                                       33%|███▎      | 4252/12776 [44:53<1:56:27,  1.22it/s] 33%|███▎      | 4253/12776 [44:54<1:58:42,  1.20it/s]                                                       33%|███▎      | 4253/12776 [44:54<1:58:42,  1.20it/s] 33%|███▎      | 4254/12776 [44:55<1:59:54,  1.18it/s]                                                       33%|███▎      | 4254/12776 [44:55<1:59:54,  1.18it/s] 33%|███▎      | 4255/12776 [44:56<1:59:20,  1.19it/s]                                                       33%|███▎      | 4255/12776 [44:56<1:59:20,  1.19it/s] 33%|███▎      | 4256/12776 [44:57<1:54:43,  1.24it/s]                                                       33%|███▎      | 4256/12776 [44:57<1:54:43,  1.24it/s] 33%|███▎      | 4257/12776 [44:57<1:51:59,  1.27it/s]                                                       33%|███▎      | 4257/12776 [44:57<1:51:59,  1.27it/s] 33%|███▎      | 4258/12776 [44:58<1:46:15,  1.34it/s]                                                       33%|███▎      | 4258/12776 [44:58<1:46:15,  1.34it/s] 33%|███▎      | 4259/12776 [44:59<1:41:31,  1.40it/s]                                                       33%|███▎      | 4259/12776 [44:59<1:41:31,  1.40it/s] 33%|███▎      | 4260/12776 [44:59<1:42:26,  1.39it/s]                                                       33%|███▎      | 4260/12776 [44:59<1:42:26,  1.39it/s] 33%|███▎      | 4261/12776 [45:00<1:35:57,  1.48it/s]                                                       33%|███▎      | 4261/12776 [45:00<1:35:57,  1.48it/s] 33%|███▎      | 4262/12776 [45:01<1:32:25,  1.54it/s]                                                       33%|███▎      | 4262/12776 [45:01<1:32:25,  1.54it/s] 33%|███▎      | 4263/12776 [45:01<1:26:59,  1.63it/s]                                                       33%|███▎      | 4263/12776 [45:01<1:26:59,  1.63it/s] 33%|███▎      | 4264/12776 [45:02<1:27:04,  1.63it/s]                                                       33%|███▎      | 4264/12776 [45:02<1:27:04,  1.63it/s] 33%|███▎      | 4265/12776 [45:02<1:21:22,  1.74it/s]                                                       33%|███▎      | 4265/12776 [45:02<1:21:22,  1.74it/s] 33%|███▎      | 4266/12776 [45:03<1:16:26,  1.86it/s]                                                       33%|███▎      | 4266/12776 [45:03<1:16:26,  1.86it/s] 33%|███▎      | 4267/12776 [45:03<1:14:15,  1.91it/s]                                                       33%|███▎      | 4267/12776 [45:03<1:14:15,  1.91it/s] 33%|███▎      | 4268/12776 [45:04<1:09:51,  2.03it/s]                                                       33%|███▎      | 4268/12776 [45:04<1:09:51,  2.03it/s] 33%|███▎      | 4269/12776 [45:04<1:08:18,  2.08it/s]                                                       33%|███▎      | 4269/12776 [45:04<1:08:18,  2.08it/s] 33%|███▎      | 4270/12776 [45:04<1:04:25,  2.20it/s]                                                       33%|███▎      | 4270/12776 [45:04<1:04:25,  2.20it/s] 33%|███▎      | 4271/12776 [45:05<1:01:16,  2.31it/s]                                                       33%|███▎      | 4271/12776 [45:05<1:01:16,  2.31it/s] 33%|███▎      | 4272/12776 [45:05<1:00:30,  2.34it/s]                                                       33%|███▎      | 4272/12776 [45:05<1:00:30,  2.34it/s] 33%|███▎      | 4273/12776 [45:06<57:25,  2.47it/s]                                                       33%|███▎      | 4273/12776 [45:06<57:25,  2.47it/s] 33%|███▎      | 4274/12776 [45:06<54:59,  2.58it/s]                                                     33%|███▎      | 4274/12776 [45:06<54:59,  2.58it/s] 33%|███▎      | 4275/12776 [45:06<58:03,  2.44it/s]                                                     33%|███▎      | 4275/12776 [45:06<58:03,  2.44it/s] 33%|███▎      | 4276/12776 [45:07<54:51,  2.58it/s]                                                     33%|███▎      | 4276/12776 [45:07<54:51,  2.58it/s] 33%|███▎      | 4277/12776 [45:07<52:13,  2.71it/s]                                                     33%|███▎      | 4277/12776 [45:07<52:13,  2.71it/s] 33%|███▎      | 4278/12776 [45:07<49:27,  2.86it/s]                                                     33%|███▎      | 4278/12776 [45:07<49:27,  2.86it/s] 33%|███▎      | 4279/12776 [45:08<48:38,  2.91it/s]                                                     33%|███▎      | 4279/12776 [45:08<48:38,  2.91it/s] 34%|███▎      | 4280/12776 [45:08<46:03,  3.07it/s]                                                     34%|███▎      | 4280/12776 [45:08<46:03,  3.07it/s] 34%|███▎      | 4281/12776 [45:08<43:43,  3.24it/s]                                                     34%|███▎      | 4281/12776 [45:08<43:43,  3.24it/s] 34%|███▎      | 4282/12776 [45:08<41:50,  3.38it/s]                                                     34%|███▎      | 4282/12776 [45:08<41:50,  3.38it/s] 34%|███▎      | 4283/12776 [45:09<42:57,  3.29it/s]                                                     34%|███▎      | 4283/12776 [45:09<42:57,  3.29it/s] 34%|███▎      | 4284/12776 [45:09<40:40,  3.48it/s]                                                     34%|███▎      | 4284/12776 [45:09<40:40,  3.48it/s] 34%|███▎      | 4285/12776 [45:09<38:54,  3.64it/s]                                                     34%|███▎      | 4285/12776 [45:09<38:54,  3.64it/s] 34%|███▎      | 4286/12776 [45:10<37:19,  3.79it/s]                                                     34%|███▎      | 4286/12776 [45:10<37:19,  3.79it/s] 34%|███▎      | 4287/12776 [45:10<41:18,  3.42it/s]                                                     34%|███▎      | 4287/12776 [45:10<41:18,  3.42it/s] 34%|███▎      | 4288/12776 [45:10<38:23,  3.68it/s]                                                     34%|███▎      | 4288/12776 [45:10<38:23,  3.68it/s] 34%|███▎      | 4289/12776 [45:10<36:06,  3.92it/s]                                                     34%|███▎      | 4289/12776 [45:10<36:06,  3.92it/s] 34%|███▎      | 4290/12776 [45:11<34:17,  4.12it/s]                                                     34%|███▎      | 4290/12776 [45:11<34:17,  4.12it/s] 34%|███▎      | 4291/12776 [45:11<32:48,  4.31it/s]                                                     34%|███▎      | 4291/12776 [45:11<32:48,  4.31it/s] 34%|███▎      | 4292/12776 [45:11<35:54,  3.94it/s]                                                     34%|███▎      | 4292/12776 [45:11<35:54,  3.94it/s] 34%|███▎      | 4293/12776 [45:11<33:41,  4.20it/s]                                                     34%|███▎      | 4293/12776 [45:11<33:41,  4.20it/s] 34%|███▎      | 4294/12776 [45:11<32:00,  4.42it/s]                                                     34%|███▎      | 4294/12776 [45:11<32:00,  4.42it/s] 34%|███▎      | 4295/12776 [45:12<30:42,  4.60it/s]                                                     34%|███▎      | 4295/12776 [45:12<30:42,  4.60it/s] 34%|███▎      | 4296/12776 [45:12<29:41,  4.76it/s]                                                     34%|███▎      | 4296/12776 [45:12<29:41,  4.76it/s] 34%|███▎      | 4297/12776 [45:12<34:17,  4.12it/s]                                                     34%|███▎      | 4297/12776 [45:12<34:17,  4.12it/s] 34%|███▎      | 4298/12776 [45:12<31:49,  4.44it/s]                                                     34%|███▎      | 4298/12776 [45:12<31:49,  4.44it/s] 34%|███▎      | 4299/12776 [45:13<30:06,  4.69it/s]                                                     34%|███▎      | 4299/12776 [45:13<30:06,  4.69it/s] 34%|███▎      | 4300/12776 [45:13<53:12,  2.66it/s]                                                     34%|███▎      | 4300/12776 [45:13<53:12,  2.66it/s] 34%|███▎      | 4301/12776 [45:15<1:35:43,  1.48it/s]                                                       34%|███▎      | 4301/12776 [45:15<1:35:43,  1.48it/s] 34%|███▎      | 4302/12776 [45:16<1:50:39,  1.28it/s]                                                       34%|███▎      | 4302/12776 [45:16<1:50:39,  1.28it/s] 34%|███▎      | 4303/12776 [45:17<1:57:35,  1.20it/s]                                                       34%|███▎      | 4303/12776 [45:17<1:57:35,  1.20it/s] 34%|███▎      | 4304/12776 [45:17<1:56:06,  1.22it/s]                                                       34%|███▎      | 4304/12776 [45:17<1:56:06,  1.22it/s] 34%|███▎      | 4305/12776 [45:18<1:54:00,  1.24it/s]                                                       34%|███▎      | 4305/12776 [45:18<1:54:00,  1.24it/s] 34%|███▎      | 4306/12776 [45:19<1:50:51,  1.27it/s]                                                       34%|███▎      | 4306/12776 [45:19<1:50:51,  1.27it/s] 34%|███▎      | 4307/12776 [45:20<1:45:51,  1.33it/s]                                                       34%|███▎      | 4307/12776 [45:20<1:45:51,  1.33it/s] 34%|███▎      | 4308/12776 [45:20<1:45:15,  1.34it/s]                                                      {'loss': 0.9792, 'grad_norm': 2.5937745571136475, 'learning_rate': 0.00020906647116324532, 'epoch': 0.66}
+{'loss': 0.6997, 'grad_norm': 4.251043796539307, 'learning_rate': 0.0002090420332355816, 'epoch': 0.66}
+{'loss': 0.6984, 'grad_norm': 2.7848551273345947, 'learning_rate': 0.00020901759530791788, 'epoch': 0.66}
+{'loss': 0.8251, 'grad_norm': 3.419072151184082, 'learning_rate': 0.00020899315738025413, 'epoch': 0.66}
+{'loss': 0.9583, 'grad_norm': 4.448485374450684, 'learning_rate': 0.0002089687194525904, 'epoch': 0.66}
+{'loss': 1.222, 'grad_norm': 3.649228572845459, 'learning_rate': 0.00020894428152492669, 'epoch': 0.66}
+{'loss': 1.2783, 'grad_norm': 2.3916118144989014, 'learning_rate': 0.00020891984359726294, 'epoch': 0.66}
+{'loss': 0.9857, 'grad_norm': 2.919701337814331, 'learning_rate': 0.00020889540566959922, 'epoch': 0.66}
+{'loss': 1.1059, 'grad_norm': 2.3915016651153564, 'learning_rate': 0.00020887096774193547, 'epoch': 0.66}
+{'loss': 1.2545, 'grad_norm': 4.138210773468018, 'learning_rate': 0.00020884652981427172, 'epoch': 0.66}
+{'loss': 1.5261, 'grad_norm': 4.440970420837402, 'learning_rate': 0.000208822091886608, 'epoch': 0.66}
+{'loss': 1.3974, 'grad_norm': 3.668529748916626, 'learning_rate': 0.00020879765395894427, 'epoch': 0.66}
+{'loss': 1.0031, 'grad_norm': 2.6934828758239746, 'learning_rate': 0.00020877321603128053, 'epoch': 0.66}
+{'loss': 1.5351, 'grad_norm': 3.754547119140625, 'learning_rate': 0.0002087487781036168, 'epoch': 0.66}
+{'loss': 1.2908, 'grad_norm': 2.1990721225738525, 'learning_rate': 0.00020872434017595308, 'epoch': 0.66}
+{'loss': 0.7223, 'grad_norm': 1.8534988164901733, 'learning_rate': 0.0002086999022482893, 'epoch': 0.66}
+{'loss': 0.7743, 'grad_norm': 1.668362021446228, 'learning_rate': 0.00020867546432062559, 'epoch': 0.66}
+{'loss': 1.1719, 'grad_norm': 3.3219528198242188, 'learning_rate': 0.00020865102639296186, 'epoch': 0.66}
+{'loss': 0.5382, 'grad_norm': 1.7605839967727661, 'learning_rate': 0.00020862658846529811, 'epoch': 0.67}
+{'loss': 1.0354, 'grad_norm': 2.431262731552124, 'learning_rate': 0.0002086021505376344, 'epoch': 0.67}
+{'loss': 0.3289, 'grad_norm': 0.6644131541252136, 'learning_rate': 0.00020857771260997067, 'epoch': 0.67}
+{'loss': 0.2793, 'grad_norm': 0.4004727303981781, 'learning_rate': 0.00020855327468230692, 'epoch': 0.67}
+{'loss': 0.2703, 'grad_norm': 0.5396392345428467, 'learning_rate': 0.0002085288367546432, 'epoch': 0.67}
+{'loss': 0.2565, 'grad_norm': 0.4889543056488037, 'learning_rate': 0.00020850439882697948, 'epoch': 0.67}
+{'loss': 0.294, 'grad_norm': 0.5937919616699219, 'learning_rate': 0.0002084799608993157, 'epoch': 0.67}
+{'loss': 0.4547, 'grad_norm': 1.0246959924697876, 'learning_rate': 0.00020845552297165198, 'epoch': 0.67}
+{'loss': 0.558, 'grad_norm': 0.8385992646217346, 'learning_rate': 0.00020843108504398826, 'epoch': 0.67}
+{'loss': 0.3394, 'grad_norm': 0.6174805760383606, 'learning_rate': 0.0002084066471163245, 'epoch': 0.67}
+{'loss': 0.2792, 'grad_norm': 0.82181715965271, 'learning_rate': 0.0002083822091886608, 'epoch': 0.67}
+{'loss': 0.3115, 'grad_norm': 0.5891256332397461, 'learning_rate': 0.00020835777126099707, 'epoch': 0.67}
+{'loss': 0.3959, 'grad_norm': 0.9402687549591064, 'learning_rate': 0.00020833333333333332, 'epoch': 0.67}
+{'loss': 0.3962, 'grad_norm': 0.887904942035675, 'learning_rate': 0.0002083088954056696, 'epoch': 0.67}
+{'loss': 0.4687, 'grad_norm': 1.0026453733444214, 'learning_rate': 0.00020828445747800585, 'epoch': 0.67}
+{'loss': 0.3504, 'grad_norm': 0.8495490550994873, 'learning_rate': 0.0002082600195503421, 'epoch': 0.67}
+{'loss': 0.4712, 'grad_norm': 1.067874789237976, 'learning_rate': 0.00020823558162267838, 'epoch': 0.67}
+{'loss': 0.5911, 'grad_norm': 1.5945181846618652, 'learning_rate': 0.00020821114369501466, 'epoch': 0.67}
+{'loss': 0.5162, 'grad_norm': 1.3359369039535522, 'learning_rate': 0.0002081867057673509, 'epoch': 0.67}
+{'loss': 0.5094, 'grad_norm': 1.5492264032363892, 'learning_rate': 0.0002081622678396872, 'epoch': 0.67}
+{'loss': 0.5471, 'grad_norm': 1.4912946224212646, 'learning_rate': 0.00020813782991202347, 'epoch': 0.67}
+{'loss': 0.3852, 'grad_norm': 0.7569031715393066, 'learning_rate': 0.0002081133919843597, 'epoch': 0.67}
+{'loss': 0.5473, 'grad_norm': 1.4911766052246094, 'learning_rate': 0.00020808895405669597, 'epoch': 0.67}
+{'loss': 0.4659, 'grad_norm': 1.7154449224472046, 'learning_rate': 0.00020806451612903225, 'epoch': 0.67}
+{'loss': 0.8146, 'grad_norm': 1.293839693069458, 'learning_rate': 0.0002080400782013685, 'epoch': 0.67}
+{'loss': 0.5831, 'grad_norm': 1.7432725429534912, 'learning_rate': 0.00020801564027370478, 'epoch': 0.67}
+{'loss': 0.7064, 'grad_norm': 1.6460049152374268, 'learning_rate': 0.00020799120234604105, 'epoch': 0.67}
+{'loss': 0.6556, 'grad_norm': 1.205126404762268, 'learning_rate': 0.0002079667644183773, 'epoch': 0.67}
+{'loss': 0.5786, 'grad_norm': 2.2884862422943115, 'learning_rate': 0.00020794232649071358, 'epoch': 0.67}
+{'loss': 0.916, 'grad_norm': 1.5183991193771362, 'learning_rate': 0.00020791788856304986, 'epoch': 0.67}
+{'loss': 0.6673, 'grad_norm': 1.8654146194458008, 'learning_rate': 0.00020789345063538609, 'epoch': 0.67}
+{'loss': 0.6066, 'grad_norm': 1.886267066001892, 'learning_rate': 0.00020786901270772236, 'epoch': 0.67}
+{'loss': 0.8144, 'grad_norm': 1.4289623498916626, 'learning_rate': 0.00020784457478005864, 'epoch': 0.67}
+{'loss': 0.879, 'grad_norm': 2.9784719944000244, 'learning_rate': 0.0002078201368523949, 'epoch': 0.67}
+{'loss': 1.1636, 'grad_norm': 3.7526068687438965, 'learning_rate': 0.00020779569892473117, 'epoch': 0.67}
+{'loss': 0.9424, 'grad_norm': 2.4929449558258057, 'learning_rate': 0.00020777126099706745, 'epoch': 0.67}
+{'loss': 0.8243, 'grad_norm': 3.227921962738037, 'learning_rate': 0.0002077468230694037, 'epoch': 0.67}
+{'loss': 0.8561, 'grad_norm': 1.7560867071151733, 'learning_rate': 0.00020772238514173995, 'epoch': 0.67}
+{'loss': 1.1016, 'grad_norm': 2.7342591285705566, 'learning_rate': 0.00020769794721407623, 'epoch': 0.67}
+{'loss': 1.1822, 'grad_norm': 1.8622885942459106, 'learning_rate': 0.00020767350928641248, 'epoch': 0.67}
+{'loss': 1.1509, 'grad_norm': 3.4083244800567627, 'learning_rate': 0.00020764907135874876, 'epoch': 0.67}
+{'loss': 0.8396, 'grad_norm': 2.9127471446990967, 'learning_rate': 0.00020762463343108504, 'epoch': 0.67}
+{'loss': 1.2386, 'grad_norm': 2.4927799701690674, 'learning_rate': 0.0002076001955034213, 'epoch': 0.67}
+{'loss': 1.2362, 'grad_norm': 2.8656435012817383, 'learning_rate': 0.00020757575757575757, 'epoch': 0.67}
+{'loss': 1.3128, 'grad_norm': 3.2088873386383057, 'learning_rate': 0.00020755131964809385, 'epoch': 0.67}
+{'loss': 1.6932, 'grad_norm': 3.3077950477600098, 'learning_rate': 0.00020752688172043007, 'epoch': 0.67}
+{'loss': 1.554, 'grad_norm': 4.320693016052246, 'learning_rate': 0.00020750244379276635, 'epoch': 0.67}
+{'loss': 1.0525, 'grad_norm': 3.5774495601654053, 'learning_rate': 0.00020747800586510263, 'epoch': 0.67}
+{'loss': 0.9957, 'grad_norm': 2.9771318435668945, 'learning_rate': 0.00020745356793743888, 'epoch': 0.67}
+{'loss': 0.903, 'grad_norm': 2.046910524368286, 'learning_rate': 0.00020742913000977516, 'epoch': 0.67}
+{'loss': 0.8665, 'grad_norm': 1.8642005920410156, 'learning_rate': 0.00020740469208211144, 'epoch': 0.67}
+{'loss': 0.5211, 'grad_norm': 1.3706653118133545, 'learning_rate': 0.0002073802541544477, 'epoch': 0.67}
+{'loss': 0.4051, 'grad_norm': 0.5348394513130188, 'learning_rate': 0.00020735581622678397, 'epoch': 0.67}
+{'loss': 0.2932, 'grad_norm': 0.43285202980041504, 'learning_rate': 0.00020733137829912024, 'epoch': 0.67}
+{'loss': 0.4606, 'grad_norm': 0.6725158095359802, 'learning_rate': 0.00020730694037145647, 'epoch': 0.67}
+{'loss': 0.3906, 'grad_norm': 0.676227867603302, 'learning_rate': 0.00020728250244379275, 'epoch': 0.67}
+{'loss': 0.5082, 'grad_norm': 0.6109701991081238, 'learning_rate': 0.00020725806451612903, 'epoch': 0.67}
+{'loss': 0.3191, 'grad_norm': 0.6717798113822937, 'learning_rate': 0.00020723362658846528, 'epoch': 0.67}
+{'loss': 0.5007, 'grad_norm': 0.5643997192382812, 'learning_rate': 0.00020720918866080155, 'epoch': 0.67}
+ 34%|███▎      | 4308/12776 [45:20<1:45:15,  1.34it/s] 34%|███▎      | 4309/12776 [45:21<1:38:50,  1.43it/s]                                                       34%|███▎      | 4309/12776 [45:21<1:38:50,  1.43it/s] 34%|███▎      | 4310/12776 [45:22<1:34:51,  1.49it/s]                                                       34%|███▎      | 4310/12776 [45:22<1:34:51,  1.49it/s] 34%|███▎      | 4311/12776 [45:22<1:29:21,  1.58it/s]                                                       34%|███▎      | 4311/12776 [45:22<1:29:21,  1.58it/s] 34%|███▍      | 4312/12776 [45:23<1:27:14,  1.62it/s]                                                       34%|███▍      | 4312/12776 [45:23<1:27:14,  1.62it/s] 34%|███▍      | 4313/12776 [45:23<1:21:44,  1.73it/s]                                                       34%|███▍      | 4313/12776 [45:23<1:21:44,  1.73it/s] 34%|███▍      | 4314/12776 [45:24<1:20:26,  1.75it/s]                                                       34%|███▍      | 4314/12776 [45:24<1:20:26,  1.75it/s] 34%|███▍      | 4315/12776 [45:24<1:14:48,  1.88it/s]                                                       34%|███▍      | 4315/12776 [45:24<1:14:48,  1.88it/s] 34%|███▍      | 4316/12776 [45:25<1:14:21,  1.90it/s]                                                       34%|███▍      | 4316/12776 [45:25<1:14:21,  1.90it/s] 34%|███▍      | 4317/12776 [45:25<1:09:05,  2.04it/s]                                                       34%|███▍      | 4317/12776 [45:25<1:09:05,  2.04it/s] 34%|███▍      | 4318/12776 [45:25<1:04:50,  2.17it/s]                                                       34%|███▍      | 4318/12776 [45:25<1:04:50,  2.17it/s] 34%|███▍      | 4319/12776 [45:26<1:02:32,  2.25it/s]                                                       34%|███▍      | 4319/12776 [45:26<1:02:32,  2.25it/s] 34%|███▍      | 4320/12776 [45:26<58:36,  2.40it/s]                                                       34%|███▍      | 4320/12776 [45:26<58:36,  2.40it/s] 34%|███▍      | 4321/12776 [45:27<55:35,  2.54it/s]                                                     34%|███▍      | 4321/12776 [45:27<55:35,  2.54it/s] 34%|███▍      | 4322/12776 [45:27<57:40,  2.44it/s]                                                     34%|███▍      | 4322/12776 [45:27<57:40,  2.44it/s] 34%|███▍      | 4323/12776 [45:27<54:16,  2.60it/s]                                                     34%|███▍      | 4323/12776 [45:27<54:16,  2.60it/s] 34%|███▍      | 4324/12776 [45:28<51:07,  2.76it/s]                                                     34%|███▍      | 4324/12776 [45:28<51:07,  2.76it/s] 34%|███▍      | 4325/12776 [45:28<48:36,  2.90it/s]                                                     34%|███▍      | 4325/12776 [45:28<48:36,  2.90it/s] 34%|███▍      | 4326/12776 [45:28<48:09,  2.92it/s]                                                     34%|███▍      | 4326/12776 [45:28<48:09,  2.92it/s] 34%|███▍      | 4327/12776 [45:29<45:41,  3.08it/s]                                                     34%|███▍      | 4327/12776 [45:29<45:41,  3.08it/s] 34%|███▍      | 4328/12776 [45:29<43:38,  3.23it/s]                                                     34%|███▍      | 4328/12776 [45:29<43:38,  3.23it/s] 34%|███▍      | 4329/12776 [45:29<41:53,  3.36it/s]                                                     34%|███▍      | 4329/12776 [45:29<41:53,  3.36it/s] 34%|███▍      | 4330/12776 [45:29<41:39,  3.38it/s]                                                     34%|███▍      | 4330/12776 [45:29<41:39,  3.38it/s] 34%|███▍      | 4331/12776 [45:30<39:51,  3.53it/s]                                                     34%|███▍      | 4331/12776 [45:30<39:51,  3.53it/s] 34%|███▍      | 4332/12776 [45:30<38:29,  3.66it/s]                                                     34%|███▍      | 4332/12776 [45:30<38:29,  3.66it/s] 34%|███▍      | 4333/12776 [45:30<37:09,  3.79it/s]                                                     34%|███▍      | 4333/12776 [45:30<37:09,  3.79it/s] 34%|███▍      | 4334/12776 [45:31<41:35,  3.38it/s]                                                     34%|███▍      | 4334/12776 [45:31<41:35,  3.38it/s] 34%|███▍      | 4335/12776 [45:31<38:56,  3.61it/s]                                                     34%|███▍      | 4335/12776 [45:31<38:56,  3.61it/s] 34%|███▍      | 4336/12776 [45:31<36:59,  3.80it/s]                                                     34%|███▍      | 4336/12776 [45:31<36:59,  3.80it/s] 34%|███▍      | 4337/12776 [45:31<35:11,  4.00it/s]                                                     34%|███▍      | 4337/12776 [45:31<35:11,  4.00it/s] 34%|███▍      | 4338/12776 [45:32<38:22,  3.66it/s]                                                     34%|███▍      | 4338/12776 [45:32<38:22,  3.66it/s] 34%|███▍      | 4339/12776 [45:32<35:41,  3.94it/s]                                                     34%|███▍      | 4339/12776 [45:32<35:41,  3.94it/s] 34%|███▍      | 4340/12776 [45:32<33:50,  4.15it/s]                                                     34%|███▍      | 4340/12776 [45:32<33:50,  4.15it/s] 34%|███▍      | 4341/12776 [45:32<32:18,  4.35it/s]                                                     34%|███▍      | 4341/12776 [45:32<32:18,  4.35it/s] 34%|███▍      | 4342/12776 [45:32<31:03,  4.53it/s]                                                     34%|███▍      | 4342/12776 [45:32<31:03,  4.53it/s] 34%|███▍      | 4343/12776 [45:33<35:02,  4.01it/s]                                                     34%|███▍      | 4343/12776 [45:33<35:02,  4.01it/s] 34%|███▍      | 4344/12776 [45:33<32:50,  4.28it/s]                                                     34%|███▍      | 4344/12776 [45:33<32:50,  4.28it/s] 34%|███▍      | 4345/12776 [45:33<31:16,  4.49it/s]                                                     34%|███▍      | 4345/12776 [45:33<31:16,  4.49it/s] 34%|███▍      | 4346/12776 [45:33<30:03,  4.67it/s]                                                     34%|███▍      | 4346/12776 [45:33<30:03,  4.67it/s] 34%|███▍      | 4347/12776 [45:33<29:04,  4.83it/s]                                                     34%|███▍      | 4347/12776 [45:33<29:04,  4.83it/s] 34%|███▍      | 4348/12776 [45:34<28:10,  4.98it/s]                                                     34%|███▍      | 4348/12776 [45:34<28:10,  4.98it/s] 34%|███▍      | 4349/12776 [45:34<31:57,  4.40it/s]                                                     34%|███▍      | 4349/12776 [45:34<31:57,  4.40it/s] 34%|███▍      | 4350/12776 [45:35<52:35,  2.67it/s]                                                     34%|███▍      | 4350/12776 [45:35<52:35,  2.67it/s] 34%|███▍      | 4351/12776 [45:36<1:39:38,  1.41it/s]                                                       34%|███▍      | 4351/12776 [45:36<1:39:38,  1.41it/s] 34%|███▍      | 4352/12776 [45:37<1:55:38,  1.21it/s]                                                       34%|███▍      | 4352/12776 [45:37<1:55:38,  1.21it/s] 34%|███▍      | 4353/12776 [45:38<1:56:41,  1.20it/s]                                                       34%|███▍      | 4353/12776 [45:38<1:56:41,  1.20it/s] 34%|███▍      | 4354/12776 [45:39<1:54:32,  1.23it/s]                                                       34%|███▍      | 4354/12776 [45:39<1:54:32,  1.23it/s] 34%|███▍      | 4355/12776 [45:40<1:51:14,  1.26it/s]                                                       34%|███▍      | 4355/12776 [45:40<1:51:14,  1.26it/s] 34%|███▍      | 4356/12776 [45:40<1:47:39,  1.30it/s]                                                       34%|███▍      | 4356/12776 [45:40<1:47:39,  1.30it/s] 34%|███▍      | 4357/12776 [45:41<1:44:30,  1.34it/s]                                                       34%|███▍      | 4357/12776 [45:41<1:44:30,  1.34it/s] 34%|███▍      | 4358/12776 [45:42<1:40:01,  1.40it/s]                                                       34%|███▍      | 4358/12776 [45:42<1:40:01,  1.40it/s] 34%|███▍      | 4359/12776 [45:42<1:36:00,  1.46it/s]                                                       34%|███▍      | 4359/12776 [45:42<1:36:00,  1.46it/s] 34%|███▍      | 4360/12776 [45:43<1:31:23,  1.53it/s]                                                       34%|███▍      | 4360/12776 [45:43<1:31:23,  1.53it/s] 34%|███▍      | 4361/12776 [45:43<1:27:13,  1.61it/s]                                                       34%|███▍      | 4361/12776 [45:43<1:27:13,  1.61it/s] 34%|███▍      | 4362/12776 [45:44<1:23:41,  1.68it/s]                                                       34%|███▍      | 4362/12776 [45:44<1:23:41,  1.68it/s] 34%|███▍      | 4363/12776 [45:44<1:22:10,  1.71it/s]                                                       34%|███▍      | 4363/12776 [45:44<1:22:10,  1.71it/s] 34%|███▍      | 4364/12776 [45:45<1:18:05,  1.80it/s]                                                       34%|███▍      | 4364/12776 [45:45<1:18:05,  1.80it/s] 34%|███▍      | 4365/12776 [45:46<1:17:32,  1.81it/s]                                                       34%|███▍      | 4365/12776 [45:46<1:17:32,  1.81it/s] 34%|███▍      | 4366/12776 [45:46<1:12:30,  1.93it/s]                                                       34%|███▍      | 4366/12776 [45:46<1:12:30,  1.93it/s] 34%|███▍      | 4367/12776 [45:46<1:12:24,  1.94it/s]                                                       34%|███▍      | 4367/12776 [45:46<1:12:24,  1.94it/s] 34%|███▍      | 4368/12776 [45:47<1:07:21,  2.08it/s]                                                       34%|███▍      | 4368/12776 [45:47<1:07:21,  2.08it/s] 34%|███▍      | 4369/12776 [45:47<1:03:14,  2.22it/s]                                                       34%|███▍      | 4369/12776 [45:47<1:03:14,  2.22it/s] 34%|███▍      | 4370/12776 [45:48<1:00:33,  2.31it/s]                                                       34%|███▍      | 4370/12776 [45:48<1:00:33,  2.31it/s] 34%|███▍      | 4371/12776 [45:48<57:08,  2.45it/s]                                                       34%|███▍      | 4371/12776 [45:48<57:08,  2.45it/s] 34%|███▍      | 4372/12776 [45:48<54:20,  2.58it/s]                                                     34%|███▍      | 4372/12776 [45:48<54:20,  2.58it/s] 34%|███▍      | 4373/12776 [45:49<56:18,  2.49it/s]                                                     34%|███▍      | 4373/12776 [45:49<56:18,  2.49it/s] 34%|███▍      | 4374/12776 [45:49<52:17,  2.68it/s]                                                     34%|███▍      | 4374/12776 [45:49<52:17,  2.68it/s] 34%|███▍      | 4375/12776 [45:49<49:21,  2.84it/s]                                                     34%|███▍      | 4375/12776 [45:49<49:21,  2.84it/s] 34%|███▍      | 4376/12776 [45:50<47:06,  2.97it/s]                                                     34%|███▍      | 4376/12776 [45:50<47:06,  2.97it/s] 34%|███▍      | 4377/12776 [45:50<48:24,  2.89it/s]                                                     34%|███▍      | 4377/12776 [45:50<48:24,  2.89it/s] 34%|███▍      | 4378/12776 [45:50<45:30,  3.08it/s]                                                     34%|███▍      | 4378/12776 [45:50<45:30,  3.08it/s] 34%|███▍      | 4379/12776 [45:51<43:12,  3.24it/s]                                                     34%|███▍      | 4379/12776 [45:51<43:12,  3.24it/s] 34%|███▍      | 4380/12776 [45:51<41:18,  3.39it/s]                                                     34%|███▍      | 4380/12776 [45:51<41:18,  3.39it/s] 34%|███▍      | 4381/12776 [45:51<42:07,  3.32it/s]                                                     34%|███▍      | 4381/12776 [45:51<42:07,  3.32it/s] 34%|███▍      | 4382/12776 [45:51<39:52,  3.51it/s]                                                     34%|███▍      | 4382/12776 [45:51<39:52,  3.51it/s] 34%|███▍      | 4383/12776 [45:52<38:12,  3.66it/s]                                                     34%|███▍      | 4383/12776 [45:52<38:12,  3.66it/s] 34%|███▍      | 4384/12776 [45:52<36:42,  3.81it/s]                                                     34%|███▍      | 4384/12776 [45:52<36:42,  3.81it/s] 34%|███▍      | 4385/12776 [45:52<41:25,  3.38it/s]                                                    {'loss': 0.3413, 'grad_norm': 0.6461310386657715, 'learning_rate': 0.00020718475073313783, 'epoch': 0.67}
+{'loss': 0.4067, 'grad_norm': 0.7936927080154419, 'learning_rate': 0.00020716031280547408, 'epoch': 0.67}
+{'loss': 0.4756, 'grad_norm': 0.7589799761772156, 'learning_rate': 0.00020713587487781034, 'epoch': 0.67}
+{'loss': 0.3809, 'grad_norm': 0.9731733202934265, 'learning_rate': 0.00020711143695014661, 'epoch': 0.67}
+{'loss': 0.4063, 'grad_norm': 0.774403989315033, 'learning_rate': 0.00020708699902248287, 'epoch': 0.68}
+{'loss': 0.3735, 'grad_norm': 1.026542067527771, 'learning_rate': 0.00020706256109481914, 'epoch': 0.68}
+{'loss': 0.501, 'grad_norm': 1.497208595275879, 'learning_rate': 0.00020703812316715542, 'epoch': 0.68}
+{'loss': 0.607, 'grad_norm': 1.3015997409820557, 'learning_rate': 0.00020701368523949167, 'epoch': 0.68}
+{'loss': 0.7354, 'grad_norm': 1.392624855041504, 'learning_rate': 0.00020698924731182795, 'epoch': 0.68}
+{'loss': 0.3351, 'grad_norm': 1.2120251655578613, 'learning_rate': 0.00020696480938416423, 'epoch': 0.68}
+{'loss': 0.5827, 'grad_norm': 1.3814586400985718, 'learning_rate': 0.00020694037145650045, 'epoch': 0.68}
+{'loss': 0.4963, 'grad_norm': 1.2726234197616577, 'learning_rate': 0.00020691593352883673, 'epoch': 0.68}
+{'loss': 0.6605, 'grad_norm': 2.120060682296753, 'learning_rate': 0.000206891495601173, 'epoch': 0.68}
+{'loss': 0.7302, 'grad_norm': 1.4398250579833984, 'learning_rate': 0.00020686705767350926, 'epoch': 0.68}
+{'loss': 0.5445, 'grad_norm': 1.0549432039260864, 'learning_rate': 0.00020684261974584554, 'epoch': 0.68}
+{'loss': 0.6731, 'grad_norm': 1.3273248672485352, 'learning_rate': 0.00020681818181818182, 'epoch': 0.68}
+{'loss': 1.0642, 'grad_norm': 1.9823793172836304, 'learning_rate': 0.00020679374389051807, 'epoch': 0.68}
+{'loss': 0.897, 'grad_norm': 1.5976402759552002, 'learning_rate': 0.00020676930596285435, 'epoch': 0.68}
+{'loss': 0.5817, 'grad_norm': 1.3136533498764038, 'learning_rate': 0.00020674486803519063, 'epoch': 0.68}
+{'loss': 0.67, 'grad_norm': 3.931715726852417, 'learning_rate': 0.00020672043010752685, 'epoch': 0.68}
+{'loss': 0.573, 'grad_norm': 1.4402580261230469, 'learning_rate': 0.00020669599217986313, 'epoch': 0.68}
+{'loss': 0.6495, 'grad_norm': 3.5042459964752197, 'learning_rate': 0.0002066715542521994, 'epoch': 0.68}
+{'loss': 0.689, 'grad_norm': 1.4907385110855103, 'learning_rate': 0.00020664711632453566, 'epoch': 0.68}
+{'loss': 0.8336, 'grad_norm': 1.5746352672576904, 'learning_rate': 0.00020662267839687194, 'epoch': 0.68}
+{'loss': 0.8657, 'grad_norm': 1.0189085006713867, 'learning_rate': 0.00020659824046920822, 'epoch': 0.68}
+{'loss': 1.1317, 'grad_norm': 2.78194522857666, 'learning_rate': 0.00020657380254154447, 'epoch': 0.68}
+{'loss': 0.7439, 'grad_norm': 1.732728362083435, 'learning_rate': 0.00020654936461388072, 'epoch': 0.68}
+{'loss': 1.0976, 'grad_norm': 3.1215643882751465, 'learning_rate': 0.000206524926686217, 'epoch': 0.68}
+{'loss': 0.7992, 'grad_norm': 2.303900957107544, 'learning_rate': 0.00020650048875855325, 'epoch': 0.68}
+{'loss': 1.4244, 'grad_norm': 2.1387338638305664, 'learning_rate': 0.00020647605083088953, 'epoch': 0.68}
+{'loss': 1.5699, 'grad_norm': 2.9875400066375732, 'learning_rate': 0.0002064516129032258, 'epoch': 0.68}
+{'loss': 0.9713, 'grad_norm': 1.6865546703338623, 'learning_rate': 0.00020642717497556206, 'epoch': 0.68}
+{'loss': 1.4294, 'grad_norm': 2.4787464141845703, 'learning_rate': 0.00020640273704789833, 'epoch': 0.68}
+{'loss': 0.7225, 'grad_norm': 2.0415139198303223, 'learning_rate': 0.0002063782991202346, 'epoch': 0.68}
+{'loss': 1.3035, 'grad_norm': 3.089674234390259, 'learning_rate': 0.00020635386119257084, 'epoch': 0.68}
+{'loss': 1.1855, 'grad_norm': 1.6210919618606567, 'learning_rate': 0.00020632942326490711, 'epoch': 0.68}
+{'loss': 1.2739, 'grad_norm': 3.708653450012207, 'learning_rate': 0.0002063049853372434, 'epoch': 0.68}
+{'loss': 1.0516, 'grad_norm': 2.1804747581481934, 'learning_rate': 0.00020628054740957964, 'epoch': 0.68}
+{'loss': 0.8218, 'grad_norm': 2.3872623443603516, 'learning_rate': 0.00020625610948191592, 'epoch': 0.68}
+{'loss': 0.6364, 'grad_norm': 2.274266242980957, 'learning_rate': 0.00020623167155425217, 'epoch': 0.68}
+{'loss': 0.6342, 'grad_norm': 4.519956588745117, 'learning_rate': 0.00020620723362658845, 'epoch': 0.68}
+{'loss': 1.1602, 'grad_norm': 1.9020459651947021, 'learning_rate': 0.00020618279569892473, 'epoch': 0.68}
+{'loss': 1.2329, 'grad_norm': 2.6886696815490723, 'learning_rate': 0.00020615835777126095, 'epoch': 0.68}
+{'loss': 0.2924, 'grad_norm': 0.6094191670417786, 'learning_rate': 0.00020613391984359723, 'epoch': 0.68}
+{'loss': 0.3504, 'grad_norm': 0.5536388158798218, 'learning_rate': 0.0002061094819159335, 'epoch': 0.68}
+{'loss': 0.3809, 'grad_norm': 0.5976225137710571, 'learning_rate': 0.00020608504398826976, 'epoch': 0.68}
+{'loss': 0.4363, 'grad_norm': 0.694050133228302, 'learning_rate': 0.00020606060606060604, 'epoch': 0.68}
+{'loss': 0.2675, 'grad_norm': 0.606235682964325, 'learning_rate': 0.00020603616813294232, 'epoch': 0.68}
+{'loss': 0.4803, 'grad_norm': 2.016716957092285, 'learning_rate': 0.00020601173020527857, 'epoch': 0.68}
+{'loss': 0.5548, 'grad_norm': 0.9147260785102844, 'learning_rate': 0.00020598729227761482, 'epoch': 0.68}
+{'loss': 0.4761, 'grad_norm': 0.6617628931999207, 'learning_rate': 0.0002059628543499511, 'epoch': 0.68}
+{'loss': 0.3996, 'grad_norm': 0.9042291045188904, 'learning_rate': 0.00020593841642228735, 'epoch': 0.68}
+{'loss': 0.6516, 'grad_norm': 1.5656142234802246, 'learning_rate': 0.00020591397849462363, 'epoch': 0.68}
+{'loss': 0.3016, 'grad_norm': 0.7753491401672363, 'learning_rate': 0.0002058895405669599, 'epoch': 0.68}
+{'loss': 0.2986, 'grad_norm': 0.5845422148704529, 'learning_rate': 0.00020586510263929616, 'epoch': 0.68}
+{'loss': 0.7511, 'grad_norm': 2.0732033252716064, 'learning_rate': 0.00020584066471163244, 'epoch': 0.68}
+{'loss': 0.5104, 'grad_norm': 1.0156924724578857, 'learning_rate': 0.00020581622678396872, 'epoch': 0.68}
+{'loss': 0.2841, 'grad_norm': 1.2725284099578857, 'learning_rate': 0.00020579178885630494, 'epoch': 0.68}
+{'loss': 0.3947, 'grad_norm': 0.7375205159187317, 'learning_rate': 0.00020576735092864122, 'epoch': 0.68}
+{'loss': 0.5679, 'grad_norm': 1.245803952217102, 'learning_rate': 0.0002057429130009775, 'epoch': 0.68}
+{'loss': 0.3995, 'grad_norm': 1.113433599472046, 'learning_rate': 0.00020571847507331375, 'epoch': 0.68}
+{'loss': 0.5824, 'grad_norm': 1.771336555480957, 'learning_rate': 0.00020569403714565003, 'epoch': 0.68}
+{'loss': 0.576, 'grad_norm': 0.9144507050514221, 'learning_rate': 0.0002056695992179863, 'epoch': 0.68}
+{'loss': 0.4776, 'grad_norm': 1.0565403699874878, 'learning_rate': 0.00020564516129032256, 'epoch': 0.68}
+{'loss': 0.9487, 'grad_norm': 2.380849838256836, 'learning_rate': 0.00020562072336265883, 'epoch': 0.68}
+{'loss': 0.5143, 'grad_norm': 1.3692196607589722, 'learning_rate': 0.0002055962854349951, 'epoch': 0.68}
+{'loss': 0.5443, 'grad_norm': 2.105653762817383, 'learning_rate': 0.00020557184750733134, 'epoch': 0.68}
+{'loss': 0.7253, 'grad_norm': 1.329337239265442, 'learning_rate': 0.00020554740957966762, 'epoch': 0.68}
+{'loss': 0.5775, 'grad_norm': 0.9854122400283813, 'learning_rate': 0.0002055229716520039, 'epoch': 0.69}
+{'loss': 0.6427, 'grad_norm': 1.4902323484420776, 'learning_rate': 0.00020549853372434014, 'epoch': 0.69}
+{'loss': 0.8651, 'grad_norm': 2.347221851348877, 'learning_rate': 0.00020547409579667642, 'epoch': 0.69}
+{'loss': 0.8242, 'grad_norm': 1.5970897674560547, 'learning_rate': 0.0002054496578690127, 'epoch': 0.69}
+{'loss': 0.6639, 'grad_norm': 1.8963264226913452, 'learning_rate': 0.00020542521994134895, 'epoch': 0.69}
+{'loss': 0.882, 'grad_norm': 1.861736536026001, 'learning_rate': 0.0002054007820136852, 'epoch': 0.69}
+{'loss': 0.5547, 'grad_norm': 2.6222732067108154, 'learning_rate': 0.00020537634408602148, 'epoch': 0.69}
+{'loss': 0.9057, 'grad_norm': 7.720530033111572, 'learning_rate': 0.00020535190615835773, 'epoch': 0.69}
+{'loss': 1.5026, 'grad_norm': 2.147437810897827, 'learning_rate': 0.000205327468230694, 'epoch': 0.69}
+ 34%|███▍      | 4385/12776 [45:52<41:25,  3.38it/s] 34%|███▍      | 4386/12776 [45:52<38:32,  3.63it/s]                                                     34%|███▍      | 4386/12776 [45:52<38:32,  3.63it/s] 34%|███▍      | 4387/12776 [45:53<36:08,  3.87it/s]                                                     34%|███▍      | 4387/12776 [45:53<36:08,  3.87it/s] 34%|███▍      | 4388/12776 [45:53<34:08,  4.09it/s]                                                     34%|███▍      | 4388/12776 [45:53<34:08,  4.09it/s] 34%|███▍      | 4389/12776 [45:53<32:36,  4.29it/s]                                                     34%|███▍      | 4389/12776 [45:53<32:36,  4.29it/s] 34%|███▍      | 4390/12776 [45:53<35:36,  3.93it/s]                                                     34%|███▍      | 4390/12776 [45:53<35:36,  3.93it/s] 34%|███▍      | 4391/12776 [45:54<33:27,  4.18it/s]                                                     34%|███▍      | 4391/12776 [45:54<33:27,  4.18it/s] 34%|███▍      | 4392/12776 [45:54<31:46,  4.40it/s]                                                     34%|███▍      | 4392/12776 [45:54<31:46,  4.40it/s] 34%|███▍      | 4393/12776 [45:54<30:34,  4.57it/s]                                                     34%|███▍      | 4393/12776 [45:54<30:34,  4.57it/s] 34%|███▍      | 4394/12776 [45:54<29:41,  4.70it/s]                                                     34%|███▍      | 4394/12776 [45:54<29:41,  4.70it/s] 34%|███▍      | 4395/12776 [45:55<32:51,  4.25it/s]                                                     34%|███▍      | 4395/12776 [45:55<32:51,  4.25it/s] 34%|███▍      | 4396/12776 [45:55<31:00,  4.50it/s]                                                     34%|███▍      | 4396/12776 [45:55<31:00,  4.50it/s] 34%|███▍      | 4397/12776 [45:55<29:46,  4.69it/s]                                                     34%|███▍      | 4397/12776 [45:55<29:46,  4.69it/s] 34%|███▍      | 4398/12776 [45:55<28:38,  4.87it/s]                                                     34%|███▍      | 4398/12776 [45:55<28:38,  4.87it/s] 34%|███▍      | 4399/12776 [45:55<27:53,  5.01it/s]                                                     34%|███▍      | 4399/12776 [45:55<27:53,  5.01it/s] 34%|███▍      | 4400/12776 [45:56<48:31,  2.88it/s]                                                     34%|███▍      | 4400/12776 [45:56<48:31,  2.88it/s]Saving model checkpoint to ./checkpoint-4400
+Configuration saved in ./checkpoint-4400/config.json
+Model weights saved in ./checkpoint-4400/model.safetensors
+Feature extractor saved in ./checkpoint-4400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-4400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-4400/special_tokens_map.json
+added tokens file saved in ./checkpoint-4400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-3200] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 34%|███▍      | 4401/12776 [46:03<5:08:53,  2.21s/it]                                                       34%|███▍      | 4401/12776 [46:03<5:08:53,  2.21s/it] 34%|███▍      | 4402/12776 [46:04<4:17:04,  1.84s/it]                                                       34%|███▍      | 4402/12776 [46:04<4:17:04,  1.84s/it] 34%|███▍      | 4403/12776 [46:04<3:33:53,  1.53s/it]                                                       34%|███▍      | 4403/12776 [46:04<3:33:53,  1.53s/it] 34%|███▍      | 4404/12776 [46:05<3:00:44,  1.30s/it]                                                       34%|███▍      | 4404/12776 [46:05<3:00:44,  1.30s/it] 34%|███▍      | 4405/12776 [46:06<2:37:58,  1.13s/it]                                                       34%|███▍      | 4405/12776 [46:06<2:37:58,  1.13s/it] 34%|███▍      | 4406/12776 [46:06<2:17:03,  1.02it/s]                                                       34%|███▍      | 4406/12776 [46:06<2:17:03,  1.02it/s] 34%|███▍      | 4407/12776 [46:07<2:07:24,  1.09it/s]                                                       34%|███▍      | 4407/12776 [46:07<2:07:24,  1.09it/s] 35%|███▍      | 4408/12776 [46:08<1:52:34,  1.24it/s]                                                       35%|███▍      | 4408/12776 [46:08<1:52:34,  1.24it/s] 35%|███▍      | 4409/12776 [46:08<1:43:17,  1.35it/s]                                                       35%|███▍      | 4409/12776 [46:08<1:43:17,  1.35it/s] 35%|███▍      | 4410/12776 [46:09<1:33:44,  1.49it/s]                                                       35%|███▍      | 4410/12776 [46:09<1:33:44,  1.49it/s] 35%|███▍      | 4411/12776 [46:09<1:30:13,  1.55it/s]                                                       35%|███▍      | 4411/12776 [46:09<1:30:13,  1.55it/s] 35%|███▍      | 4412/12776 [46:10<1:22:54,  1.68it/s]                                                       35%|███▍      | 4412/12776 [46:10<1:22:54,  1.68it/s] 35%|███▍      | 4413/12776 [46:10<1:19:05,  1.76it/s]                                                       35%|███▍      | 4413/12776 [46:10<1:19:05,  1.76it/s] 35%|███▍      | 4414/12776 [46:11<1:12:52,  1.91it/s]                                                       35%|███▍      | 4414/12776 [46:11<1:12:52,  1.91it/s] 35%|███▍      | 4415/12776 [46:11<1:07:48,  2.05it/s]                                                       35%|███▍      | 4415/12776 [46:11<1:07:48,  2.05it/s] 35%|███▍      | 4416/12776 [46:12<1:07:01,  2.08it/s]                                                       35%|███▍      | 4416/12776 [46:12<1:07:01,  2.08it/s] 35%|███▍      | 4417/12776 [46:12<1:02:26,  2.23it/s]                                                       35%|███▍      | 4417/12776 [46:12<1:02:26,  2.23it/s] 35%|███▍      | 4418/12776 [46:12<58:41,  2.37it/s]                                                       35%|███▍      | 4418/12776 [46:12<58:41,  2.37it/s] 35%|███▍      | 4419/12776 [46:13<1:01:14,  2.27it/s]                                                       35%|███▍      | 4419/12776 [46:13<1:01:14,  2.27it/s] 35%|███▍      | 4420/12776 [46:13<56:39,  2.46it/s]                                                       35%|███▍      | 4420/12776 [46:13<56:39,  2.46it/s] 35%|███▍      | 4421/12776 [46:14<53:18,  2.61it/s]                                                     35%|███▍      | 4421/12776 [46:14<53:18,  2.61it/s] 35%|███▍      | 4422/12776 [46:14<54:26,  2.56it/s]                                                     35%|███▍      | 4422/12776 [46:14<54:26,  2.56it/s] 35%|███▍      | 4423/12776 [46:14<50:51,  2.74it/s]                                                     35%|███▍      | 4423/12776 [46:14<50:51,  2.74it/s] 35%|███▍      | 4424/12776 [46:15<47:56,  2.90it/s]                                                     35%|███▍      | 4424/12776 [46:15<47:56,  2.90it/s] 35%|███▍      | 4425/12776 [46:15<50:46,  2.74it/s]                                                     35%|███▍      | 4425/12776 [46:15<50:46,  2.74it/s] 35%|███▍      | 4426/12776 [46:15<46:54,  2.97it/s]                                                     35%|███▍      | 4426/12776 [46:15<46:54,  2.97it/s] 35%|███▍      | 4427/12776 [46:16<44:03,  3.16it/s]                                                     35%|███▍      | 4427/12776 [46:16<44:03,  3.16it/s] 35%|███▍      | 4428/12776 [46:16<41:41,  3.34it/s]                                                     35%|███▍      | 4428/12776 [46:16<41:41,  3.34it/s] 35%|███▍      | 4429/12776 [46:16<43:48,  3.18it/s]                                                     35%|███▍      | 4429/12776 [46:16<43:48,  3.18it/s] 35%|███▍      | 4430/12776 [46:16<40:56,  3.40it/s]                                                     35%|███▍      | 4430/12776 [46:16<40:56,  3.40it/s] 35%|███▍      | 4431/12776 [46:17<38:48,  3.58it/s]                                                     35%|███▍      | 4431/12776 [46:17<38:48,  3.58it/s] 35%|███▍      | 4432/12776 [46:17<36:45,  3.78it/s]                                                     35%|███▍      | 4432/12776 [46:17<36:45,  3.78it/s] 35%|███▍      | 4433/12776 [46:17<35:11,  3.95it/s]                                                     35%|███▍      | 4433/12776 [46:17<35:11,  3.95it/s] 35%|███▍      | 4434/12776 [46:17<38:03,  3.65it/s]                                                     35%|███▍      | 4434/12776 [46:17<38:03,  3.65it/s] 35%|███▍      | 4435/12776 [46:18<35:39,  3.90it/s]                                                     35%|███▍      | 4435/12776 [46:18<35:39,  3.90it/s] 35%|███▍      | 4436/12776 [46:18<34:02,  4.08it/s]                                                     35%|███▍      | 4436/12776 [46:18<34:02,  4.08it/s] 35%|███▍      | 4437/12776 [46:18<32:12,  4.31it/s]                                                     35%|███▍      | 4437/12776 [46:18<32:12,  4.31it/s] 35%|███▍      | 4438/12776 [46:18<30:42,  4.53it/s]                                                     35%|███▍      | 4438/12776 [46:18<30:42,  4.53it/s] 35%|███▍      | 4439/12776 [46:19<34:57,  3.97it/s]                                                     35%|███▍      | 4439/12776 [46:19<34:57,  3.97it/s] 35%|███▍      | 4440/12776 [46:19<32:26,  4.28it/s]                                                     35%|███▍      | 4440/12776 [46:19<32:26,  4.28it/s] 35%|███▍      | 4441/12776 [46:19<30:19,  4.58it/s]                                                     35%|███▍      | 4441/12776 [46:19<30:19,  4.58it/s] 35%|███▍      | 4442/12776 [46:19<28:47,  4.82it/s]                                                     35%|███▍      | 4442/12776 [46:19<28:47,  4.82it/s] 35%|███▍      | 4443/12776 [46:19<27:30,  5.05it/s]                                                     35%|███▍      | 4443/12776 [46:19<27:30,  5.05it/s] 35%|███▍      | 4444/12776 [46:20<29:27,  4.71it/s]                                                     35%|███▍      | 4444/12776 [46:20<29:27,  4.71it/s] 35%|███▍      | 4445/12776 [46:20<27:50,  4.99it/s]                                                     35%|███▍      | 4445/12776 [46:20<27:50,  4.99it/s] 35%|███▍      | 4446/12776 [46:20<26:38,  5.21it/s]                                                     35%|███▍      | 4446/12776 [46:20<26:38,  5.21it/s] 35%|███▍      | 4447/12776 [46:20<25:34,  5.43it/s]                                                     35%|███▍      | 4447/12776 [46:20<25:34,  5.43it/s] 35%|███▍      | 4448/12776 [46:20<24:44,  5.61it/s]                                                     35%|███▍      | 4448/12776 [46:20<24:44,  5.61it/s] 35%|███▍      | 4449/12776 [46:20<24:03,  5.77it/s]                                                     35%|███▍      | 4449/12776 [46:20<24:03,  5.77it/s] 35%|███▍      | 4450/12776 [46:21<45:58,  3.02it/s]                                                     35%|███▍      | 4450/12776 [46:21<45:58,  3.02it/s] 35%|███▍      | 4451/12776 [46:23<1:30:46,  1.53it/s]                                                       35%|███▍      | 4451/12776 [46:23<1:30:46,  1.53it/s] 35%|███▍      | 4452/12776 [46:24<1:50:50,  1.25it/s]                                                       35%|███▍      | 4452/12776 [46:24<1:50:50,  1.25it/s] 35%|███▍      | 4453/12776 [46:25<1:56:45,  1.19it/s]                                                       35%|███▍      | 4453/12776 [46:25<1:56:45,  1.19it/s] 35%|███▍      | 4454/12776 [46:25<1:55:36,  1.20it/s]                                                       35%|███▍      | 4454/12776 [46:25<1:55:36,  1.20it/s] 35%|███▍      | 4455/12776 [46:26<1:51:38,  1.24it/s]                                                       35%|███▍      | 4455/12776 [46:26<1:51:38,  1.24it/s] 35%|███▍      | 4456/12776 [46:27<1:50:07,  1.26it/s]                                                       35%|███▍      | 4456/12776 [46:27<1:50:07,  1.26it/s] 35%|███▍      | 4457/12776 [46:28<1:48:41,  1.28it/s]                                                       35%|███▍      | 4457/12776 [46:28<1:48:41,  1.28it/s] 35%|███▍      | 4458/12776 [46:28<1:42:18,  1.36it/s]                                                       35%|███▍      | 4458/12776 [46:28<1:42:18,  1.36it/s] 35%|███▍      | 4459/12776 [46:29<1:35:20,  1.45it/s]                                                       35%|███▍      | 4459/12776 [46:29<1:35:20,  1.45it/s] 35%|███▍      | 4460/12776 [46:29<1:30:12,  1.54it/s]                                                       35%|███▍      | 4460/12776 [46:29<1:30:12,  1.54it/s] 35%|███▍      | 4461/12776 [46:30<1:25:17,  1.62it/s]                                                       35%|███▍      | 4461/12776 [46:30<1:25:17,  1.62it/s] 35%|███▍      | 4462/12776 [46:30<1:20:00,  1.73it/s]                                                       35%|███▍      | 4462/12776 [46:30<1:20:00,  1.73it/s] 35%|███▍      | 4463/12776 [46:31<1:15:02,  1.85it/s]                                                      {'loss': 1.0872, 'grad_norm': 1.9895732402801514, 'learning_rate': 0.0002053030303030303, 'epoch': 0.69}
+{'loss': 1.4365, 'grad_norm': 3.379164218902588, 'learning_rate': 0.00020527859237536654, 'epoch': 0.69}
+{'loss': 0.855, 'grad_norm': 3.2199456691741943, 'learning_rate': 0.00020525415444770282, 'epoch': 0.69}
+{'loss': 1.2534, 'grad_norm': 4.4102911949157715, 'learning_rate': 0.0002052297165200391, 'epoch': 0.69}
+{'loss': 0.9027, 'grad_norm': 1.3978711366653442, 'learning_rate': 0.00020520527859237532, 'epoch': 0.69}
+{'loss': 0.8522, 'grad_norm': 2.3685648441314697, 'learning_rate': 0.0002051808406647116, 'epoch': 0.69}
+{'loss': 1.2799, 'grad_norm': 2.080855369567871, 'learning_rate': 0.00020515640273704788, 'epoch': 0.69}
+{'loss': 1.1435, 'grad_norm': 2.1453280448913574, 'learning_rate': 0.00020513196480938413, 'epoch': 0.69}
+{'loss': 1.9692, 'grad_norm': 2.5470387935638428, 'learning_rate': 0.0002051075268817204, 'epoch': 0.69}
+{'loss': 1.7749, 'grad_norm': 2.8885648250579834, 'learning_rate': 0.0002050830889540567, 'epoch': 0.69}
+{'loss': 1.5839, 'grad_norm': 2.181378126144409, 'learning_rate': 0.00020505865102639294, 'epoch': 0.69}
+{'loss': 1.4834, 'grad_norm': 4.625649452209473, 'learning_rate': 0.00020503421309872922, 'epoch': 0.69}
+{'loss': 1.4019, 'grad_norm': 2.8574166297912598, 'learning_rate': 0.0002050097751710655, 'epoch': 0.69}
+{'loss': 1.1237, 'grad_norm': 2.1024327278137207, 'learning_rate': 0.00020498533724340172, 'epoch': 0.69}
+{'loss': 0.9635, 'grad_norm': 1.9445034265518188, 'learning_rate': 0.000204960899315738, 'epoch': 0.69}
+{'loss': 1.1126, 'grad_norm': 2.038120746612549, 'learning_rate': 0.00020493646138807428, 'epoch': 0.69}
+{'loss': 0.8911, 'grad_norm': 0.9593400955200195, 'learning_rate': 0.00020491202346041053, 'epoch': 0.69}
+{'loss': 0.306, 'grad_norm': 0.4626724421977997, 'learning_rate': 0.0002048875855327468, 'epoch': 0.69}
+{'loss': 0.387, 'grad_norm': 0.5017557144165039, 'learning_rate': 0.00020486314760508308, 'epoch': 0.69}
+{'loss': 0.2952, 'grad_norm': 0.7594949007034302, 'learning_rate': 0.0002048387096774193, 'epoch': 0.69}
+{'loss': 0.3796, 'grad_norm': 0.5855520367622375, 'learning_rate': 0.00020481427174975559, 'epoch': 0.69}
+{'loss': 0.3505, 'grad_norm': 0.7784630656242371, 'learning_rate': 0.00020478983382209186, 'epoch': 0.69}
+{'loss': 0.3887, 'grad_norm': 0.7261280417442322, 'learning_rate': 0.00020476539589442812, 'epoch': 0.69}
+{'loss': 0.3455, 'grad_norm': 0.5250700116157532, 'learning_rate': 0.0002047409579667644, 'epoch': 0.69}
+{'loss': 0.4713, 'grad_norm': 1.4963864088058472, 'learning_rate': 0.00020471652003910067, 'epoch': 0.69}
+{'loss': 0.3757, 'grad_norm': 0.7633784413337708, 'learning_rate': 0.00020469208211143692, 'epoch': 0.69}
+{'loss': 0.3691, 'grad_norm': 0.6645981669425964, 'learning_rate': 0.0002046676441837732, 'epoch': 0.69}
+{'loss': 0.4197, 'grad_norm': 0.977664589881897, 'learning_rate': 0.00020464320625610948, 'epoch': 0.69}
+{'loss': 0.4272, 'grad_norm': 0.827433168888092, 'learning_rate': 0.0002046187683284457, 'epoch': 0.69}
+{'loss': 0.546, 'grad_norm': 1.0397074222564697, 'learning_rate': 0.00020459433040078198, 'epoch': 0.69}
+{'loss': 0.4227, 'grad_norm': 0.9832563400268555, 'learning_rate': 0.00020456989247311826, 'epoch': 0.69}
+{'loss': 0.4485, 'grad_norm': 1.1884939670562744, 'learning_rate': 0.0002045454545454545, 'epoch': 0.69}
+{'loss': 0.5891, 'grad_norm': 1.7477940320968628, 'learning_rate': 0.0002045210166177908, 'epoch': 0.69}
+{'loss': 0.4141, 'grad_norm': 1.1390234231948853, 'learning_rate': 0.00020449657869012707, 'epoch': 0.69}
+{'loss': 0.5264, 'grad_norm': 2.4235641956329346, 'learning_rate': 0.00020447214076246332, 'epoch': 0.69}
+{'loss': 0.7413, 'grad_norm': 2.1330161094665527, 'learning_rate': 0.0002044477028347996, 'epoch': 0.69}
+{'loss': 0.5941, 'grad_norm': 1.950913429260254, 'learning_rate': 0.00020442326490713588, 'epoch': 0.69}
+{'loss': 0.6298, 'grad_norm': 1.4877159595489502, 'learning_rate': 0.0002043988269794721, 'epoch': 0.69}
+{'loss': 0.6594, 'grad_norm': 2.8081557750701904, 'learning_rate': 0.00020437438905180838, 'epoch': 0.69}
+{'loss': 0.8644, 'grad_norm': 3.2152159214019775, 'learning_rate': 0.00020434995112414466, 'epoch': 0.69}
+{'loss': 0.5714, 'grad_norm': 1.4229108095169067, 'learning_rate': 0.0002043255131964809, 'epoch': 0.69}
+{'loss': 0.7934, 'grad_norm': 2.301750421524048, 'learning_rate': 0.0002043010752688172, 'epoch': 0.69}
+{'loss': 0.7241, 'grad_norm': 1.235784888267517, 'learning_rate': 0.00020427663734115347, 'epoch': 0.69}
+{'loss': 1.0616, 'grad_norm': 1.9009183645248413, 'learning_rate': 0.0002042521994134897, 'epoch': 0.69}
+{'loss': 0.6872, 'grad_norm': 1.6725027561187744, 'learning_rate': 0.00020422776148582597, 'epoch': 0.69}
+{'loss': 1.1431, 'grad_norm': 2.6750926971435547, 'learning_rate': 0.00020420332355816225, 'epoch': 0.69}
+{'loss': 1.205, 'grad_norm': 3.524231195449829, 'learning_rate': 0.0002041788856304985, 'epoch': 0.69}
+{'loss': 0.9005, 'grad_norm': 1.7456165552139282, 'learning_rate': 0.00020415444770283478, 'epoch': 0.69}
+{'loss': 0.6964, 'grad_norm': 1.8678324222564697, 'learning_rate': 0.00020413000977517106, 'epoch': 0.69}
+{'loss': 1.397, 'grad_norm': 2.4247958660125732, 'learning_rate': 0.0002041055718475073, 'epoch': 0.69}
+{'loss': 0.9847, 'grad_norm': 3.6125667095184326, 'learning_rate': 0.00020408113391984358, 'epoch': 0.69}
+{'loss': 1.3243, 'grad_norm': 2.9348018169403076, 'learning_rate': 0.00020405669599217986, 'epoch': 0.69}
+{'loss': 1.0472, 'grad_norm': 1.7903168201446533, 'learning_rate': 0.0002040322580645161, 'epoch': 0.69}
+{'loss': 0.8571, 'grad_norm': 1.7623295783996582, 'learning_rate': 0.00020400782013685237, 'epoch': 0.69}
+{'loss': 0.6425, 'grad_norm': 1.318037986755371, 'learning_rate': 0.00020398338220918864, 'epoch': 0.69}
+{'loss': 1.0581, 'grad_norm': 2.563493013381958, 'learning_rate': 0.0002039589442815249, 'epoch': 0.7}
+{'loss': 1.1032, 'grad_norm': 2.8916120529174805, 'learning_rate': 0.00020393450635386117, 'epoch': 0.7}
+{'loss': 1.2579, 'grad_norm': 2.9763169288635254, 'learning_rate': 0.00020391006842619745, 'epoch': 0.7}
+{'loss': 1.4755, 'grad_norm': 1.8792319297790527, 'learning_rate': 0.0002038856304985337, 'epoch': 0.7}
+{'loss': 1.6926, 'grad_norm': 2.4958150386810303, 'learning_rate': 0.00020386119257086998, 'epoch': 0.7}
+{'loss': 0.8777, 'grad_norm': 3.446523904800415, 'learning_rate': 0.00020383675464320623, 'epoch': 0.7}
+{'loss': 0.5229, 'grad_norm': 1.1154353618621826, 'learning_rate': 0.00020381231671554248, 'epoch': 0.7}
+{'loss': 0.7689, 'grad_norm': 1.8287882804870605, 'learning_rate': 0.00020378787878787876, 'epoch': 0.7}
+{'loss': 0.6508, 'grad_norm': 2.325324535369873, 'learning_rate': 0.00020376344086021504, 'epoch': 0.7}
+{'loss': 0.4842, 'grad_norm': 1.61253821849823, 'learning_rate': 0.0002037390029325513, 'epoch': 0.7}
+{'loss': 1.2176, 'grad_norm': 3.248868703842163, 'learning_rate': 0.00020371456500488757, 'epoch': 0.7}
+{'loss': 0.4581, 'grad_norm': 0.74549800157547, 'learning_rate': 0.00020369012707722385, 'epoch': 0.7}
+{'loss': 0.2706, 'grad_norm': 0.5620818734169006, 'learning_rate': 0.00020366568914956007, 'epoch': 0.7}
+{'loss': 0.2828, 'grad_norm': 0.6187224984169006, 'learning_rate': 0.00020364125122189635, 'epoch': 0.7}
+{'loss': 0.5353, 'grad_norm': 1.0300192832946777, 'learning_rate': 0.00020361681329423263, 'epoch': 0.7}
+{'loss': 0.3123, 'grad_norm': 0.5869678854942322, 'learning_rate': 0.00020359237536656888, 'epoch': 0.7}
+{'loss': 0.4034, 'grad_norm': 0.944622814655304, 'learning_rate': 0.00020356793743890516, 'epoch': 0.7}
+{'loss': 0.5087, 'grad_norm': 0.7487533688545227, 'learning_rate': 0.00020354349951124144, 'epoch': 0.7}
+{'loss': 0.2688, 'grad_norm': 0.7087098956108093, 'learning_rate': 0.0002035190615835777, 'epoch': 0.7}
+{'loss': 0.4653, 'grad_norm': 1.030664324760437, 'learning_rate': 0.00020349462365591397, 'epoch': 0.7}
+{'loss': 0.4403, 'grad_norm': 1.0674512386322021, 'learning_rate': 0.00020347018572825025, 'epoch': 0.7}
+{'loss': 0.322, 'grad_norm': 0.9911973476409912, 'learning_rate': 0.00020344574780058647, 'epoch': 0.7}
+{'loss': 0.4332, 'grad_norm': 2.1932692527770996, 'learning_rate': 0.00020342130987292275, 'epoch': 0.7}
+ 35%|███▍      | 4463/12776 [46:31<1:15:02,  1.85it/s] 35%|███▍      | 4464/12776 [46:31<1:11:13,  1.94it/s]                                                       35%|███▍      | 4464/12776 [46:31<1:11:13,  1.94it/s] 35%|███▍      | 4465/12776 [46:32<1:06:53,  2.07it/s]                                                       35%|███▍      | 4465/12776 [46:32<1:06:53,  2.07it/s] 35%|███▍      | 4466/12776 [46:32<1:07:02,  2.07it/s]                                                       35%|███▍      | 4466/12776 [46:32<1:07:02,  2.07it/s] 35%|███▍      | 4467/12776 [46:33<1:02:34,  2.21it/s]                                                       35%|███▍      | 4467/12776 [46:33<1:02:34,  2.21it/s] 35%|███▍      | 4468/12776 [46:33<58:45,  2.36it/s]                                                       35%|███▍      | 4468/12776 [46:33<58:45,  2.36it/s] 35%|███▍      | 4469/12776 [46:33<58:22,  2.37it/s]                                                     35%|███▍      | 4469/12776 [46:33<58:22,  2.37it/s] 35%|███▍      | 4470/12776 [46:34<54:38,  2.53it/s]                                                     35%|███▍      | 4470/12776 [46:34<54:38,  2.53it/s] 35%|███▍      | 4471/12776 [46:34<51:24,  2.69it/s]                                                     35%|███▍      | 4471/12776 [46:34<51:24,  2.69it/s] 35%|███▌      | 4472/12776 [46:34<48:45,  2.84it/s]                                                     35%|███▌      | 4472/12776 [46:34<48:45,  2.84it/s] 35%|███▌      | 4473/12776 [46:35<49:11,  2.81it/s]                                                     35%|███▌      | 4473/12776 [46:35<49:11,  2.81it/s] 35%|███▌      | 4474/12776 [46:35<46:05,  3.00it/s]                                                     35%|███▌      | 4474/12776 [46:35<46:05,  3.00it/s] 35%|███▌      | 4475/12776 [46:35<43:28,  3.18it/s]                                                     35%|███▌      | 4475/12776 [46:35<43:28,  3.18it/s] 35%|███▌      | 4476/12776 [46:36<46:55,  2.95it/s]                                                     35%|███▌      | 4476/12776 [46:36<46:55,  2.95it/s] 35%|███▌      | 4477/12776 [46:36<43:32,  3.18it/s]                                                     35%|███▌      | 4477/12776 [46:36<43:32,  3.18it/s] 35%|███▌      | 4478/12776 [46:36<40:52,  3.38it/s]                                                     35%|███▌      | 4478/12776 [46:36<40:52,  3.38it/s] 35%|███▌      | 4479/12776 [46:36<38:57,  3.55it/s]                                                     35%|███▌      | 4479/12776 [46:36<38:57,  3.55it/s] 35%|███▌      | 4480/12776 [46:37<37:04,  3.73it/s]                                                     35%|███▌      | 4480/12776 [46:37<37:04,  3.73it/s] 35%|███▌      | 4481/12776 [46:37<38:21,  3.60it/s]                                                     35%|███▌      | 4481/12776 [46:37<38:21,  3.60it/s] 35%|███▌      | 4482/12776 [46:37<36:04,  3.83it/s]                                                     35%|███▌      | 4482/12776 [46:37<36:04,  3.83it/s] 35%|███▌      | 4483/12776 [46:37<34:18,  4.03it/s]                                                     35%|███▌      | 4483/12776 [46:37<34:18,  4.03it/s] 35%|███▌      | 4484/12776 [46:38<33:55,  4.07it/s]                                                     35%|███▌      | 4484/12776 [46:38<33:55,  4.07it/s] 35%|███▌      | 4485/12776 [46:38<36:25,  3.79it/s]                                                     35%|███▌      | 4485/12776 [46:38<36:25,  3.79it/s] 35%|███▌      | 4486/12776 [46:38<35:14,  3.92it/s]                                                     35%|███▌      | 4486/12776 [46:38<35:14,  3.92it/s] 35%|███▌      | 4487/12776 [46:38<33:52,  4.08it/s]                                                     35%|███▌      | 4487/12776 [46:38<33:52,  4.08it/s] 35%|███▌      | 4488/12776 [46:39<32:33,  4.24it/s]                                                     35%|███▌      | 4488/12776 [46:39<32:33,  4.24it/s] 35%|███▌      | 4489/12776 [46:39<31:27,  4.39it/s]                                                     35%|███▌      | 4489/12776 [46:39<31:27,  4.39it/s] 35%|███▌      | 4490/12776 [46:39<32:07,  4.30it/s]                                                     35%|███▌      | 4490/12776 [46:39<32:07,  4.30it/s] 35%|███▌      | 4491/12776 [46:39<31:02,  4.45it/s]                                                     35%|███▌      | 4491/12776 [46:39<31:02,  4.45it/s] 35%|███▌      | 4492/12776 [46:39<30:10,  4.57it/s]                                                     35%|███▌      | 4492/12776 [46:40<30:10,  4.57it/s] 35%|███▌      | 4493/12776 [46:40<29:21,  4.70it/s]                                                     35%|███▌      | 4493/12776 [46:40<29:21,  4.70it/s] 35%|███▌      | 4494/12776 [46:40<28:40,  4.81it/s]                                                     35%|███▌      | 4494/12776 [46:40<28:40,  4.81it/s] 35%|███▌      | 4495/12776 [46:40<32:28,  4.25it/s]                                                     35%|███▌      | 4495/12776 [46:40<32:28,  4.25it/s] 35%|███▌      | 4496/12776 [46:40<30:41,  4.50it/s]                                                     35%|███▌      | 4496/12776 [46:40<30:41,  4.50it/s] 35%|███▌      | 4497/12776 [46:41<29:38,  4.65it/s]                                                     35%|███▌      | 4497/12776 [46:41<29:38,  4.65it/s] 35%|███▌      | 4498/12776 [46:41<28:22,  4.86it/s]                                                     35%|███▌      | 4498/12776 [46:41<28:22,  4.86it/s] 35%|███▌      | 4499/12776 [46:41<27:24,  5.03it/s]                                                     35%|███▌      | 4499/12776 [46:41<27:24,  5.03it/s] 35%|███▌      | 4500/12776 [46:42<48:36,  2.84it/s]                                                     35%|███▌      | 4500/12776 [46:42<48:36,  2.84it/s] 35%|███▌      | 4501/12776 [46:43<1:36:21,  1.43it/s]                                                       35%|███▌      | 4501/12776 [46:43<1:36:21,  1.43it/s] 35%|███▌      | 4502/12776 [46:44<1:49:43,  1.26it/s]                                                       35%|███▌      | 4502/12776 [46:44<1:49:43,  1.26it/s] 35%|███▌      | 4503/12776 [46:45<1:54:59,  1.20it/s]                                                       35%|███▌      | 4503/12776 [46:45<1:54:59,  1.20it/s] 35%|███▌      | 4504/12776 [46:46<1:54:33,  1.20it/s]                                                       35%|███▌      | 4504/12776 [46:46<1:54:33,  1.20it/s] 35%|███▌      | 4505/12776 [46:47<1:50:10,  1.25it/s]                                                       35%|���██▌      | 4505/12776 [46:47<1:50:10,  1.25it/s] 35%|███▌      | 4506/12776 [46:47<1:47:23,  1.28it/s]                                                       35%|███▌      | 4506/12776 [46:47<1:47:23,  1.28it/s] 35%|███▌      | 4507/12776 [46:48<1:41:56,  1.35it/s]                                                       35%|███▌      | 4507/12776 [46:48<1:41:56,  1.35it/s] 35%|███▌      | 4508/12776 [46:49<1:36:49,  1.42it/s]                                                       35%|███▌      | 4508/12776 [46:49<1:36:49,  1.42it/s] 35%|███▌      | 4509/12776 [46:49<1:31:27,  1.51it/s]                                                       35%|███▌      | 4509/12776 [46:49<1:31:27,  1.51it/s] 35%|███▌      | 4510/12776 [46:50<1:27:28,  1.58it/s]                                                       35%|███▌      | 4510/12776 [46:50<1:27:28,  1.58it/s] 35%|███▌      | 4511/12776 [46:50<1:23:36,  1.65it/s]                                                       35%|███▌      | 4511/12776 [46:50<1:23:36,  1.65it/s] 35%|███▌      | 4512/12776 [46:51<1:21:13,  1.70it/s]                                                       35%|███▌      | 4512/12776 [46:51<1:21:13,  1.70it/s] 35%|███▌      | 4513/12776 [46:51<1:17:23,  1.78it/s]                                                       35%|███▌      | 4513/12776 [46:51<1:17:23,  1.78it/s] 35%|███▌      | 4514/12776 [46:52<1:15:04,  1.83it/s]                                                       35%|███▌      | 4514/12776 [46:52<1:15:04,  1.83it/s] 35%|███▌      | 4515/12776 [46:52<1:10:26,  1.95it/s]                                                       35%|███▌      | 4515/12776 [46:52<1:10:26,  1.95it/s] 35%|███▌      | 4516/12776 [46:53<1:11:15,  1.93it/s]                                                       35%|███▌      | 4516/12776 [46:53<1:11:15,  1.93it/s] 35%|███▌      | 4517/12776 [46:53<1:06:23,  2.07it/s]                                                       35%|███▌      | 4517/12776 [46:53<1:06:23,  2.07it/s] 35%|███▌      | 4518/12776 [46:54<1:02:24,  2.21it/s]                                                       35%|███▌      | 4518/12776 [46:54<1:02:24,  2.21it/s] 35%|███▌      | 4519/12776 [46:54<1:00:04,  2.29it/s]                                                       35%|███▌      | 4519/12776 [46:54<1:00:04,  2.29it/s] 35%|███▌      | 4520/12776 [46:54<56:51,  2.42it/s]                                                       35%|███▌      | 4520/12776 [46:54<56:51,  2.42it/s] 35%|███▌      | 4521/12776 [46:55<53:57,  2.55it/s]                                                     35%|███▌      | 4521/12776 [46:55<53:57,  2.55it/s] 35%|███▌      | 4522/12776 [46:55<55:46,  2.47it/s]                                                     35%|███▌      | 4522/12776 [46:55<55:46,  2.47it/s] 35%|███▌      | 4523/12776 [46:56<52:27,  2.62it/s]                                                     35%|███▌      | 4523/12776 [46:56<52:27,  2.62it/s] 35%|███▌      | 4524/12776 [46:56<49:23,  2.78it/s]                                                     35%|███▌      | 4524/12776 [46:56<49:23,  2.78it/s] 35%|███▌      | 4525/12776 [46:56<46:56,  2.93it/s]                                                     35%|███▌      | 4525/12776 [46:56<46:56,  2.93it/s] 35%|███▌      | 4526/12776 [46:56<47:00,  2.92it/s]                                                     35%|███▌      | 4526/12776 [46:56<47:00,  2.92it/s] 35%|███▌      | 4527/12776 [46:57<44:39,  3.08it/s]                                                     35%|███▌      | 4527/12776 [46:57<44:39,  3.08it/s] 35%|███▌      | 4528/12776 [46:57<42:45,  3.22it/s]                                                     35%|███▌      | 4528/12776 [46:57<42:45,  3.22it/s] 35%|███▌      | 4529/12776 [46:57<41:11,  3.34it/s]                                                     35%|███▌      | 4529/12776 [46:57<41:11,  3.34it/s] 35%|███▌      | 4530/12776 [46:58<41:53,  3.28it/s]                                                     35%|███▌      | 4530/12776 [46:58<41:53,  3.28it/s] 35%|███▌      | 4531/12776 [46:58<39:55,  3.44it/s]                                                     35%|███▌      | 4531/12776 [46:58<39:55,  3.44it/s] 35%|███▌      | 4532/12776 [46:58<38:23,  3.58it/s]                                                     35%|███▌      | 4532/12776 [46:58<38:23,  3.58it/s] 35%|███▌      | 4533/12776 [46:58<37:00,  3.71it/s]                                                     35%|███▌      | 4533/12776 [46:58<37:00,  3.71it/s] 35%|███▌      | 4534/12776 [46:59<40:05,  3.43it/s]                                                     35%|███▌      | 4534/12776 [46:59<40:05,  3.43it/s] 35%|███▌      | 4535/12776 [46:59<37:42,  3.64it/s]                                                     35%|███▌      | 4535/12776 [46:59<37:42,  3.64it/s] 36%|███▌      | 4536/12776 [46:59<35:53,  3.83it/s]                                                     36%|███▌      | 4536/12776 [46:59<35:53,  3.83it/s] 36%|███▌      | 4537/12776 [46:59<34:25,  3.99it/s]                                                     36%|███▌      | 4537/12776 [46:59<34:25,  3.99it/s] 36%|███▌      | 4538/12776 [47:00<37:04,  3.70it/s]                                                     36%|███▌      | 4538/12776 [47:00<37:04,  3.70it/s] 36%|███▌      | 4539/12776 [47:00<34:48,  3.94it/s]                                                     36%|███▌      | 4539/12776 [47:00<34:48,  3.94it/s] 36%|███▌      | 4540/12776 [47:00<33:32,  4.09it/s]                                                     36%|███▌      | 4540/12776 [47:00<33:32,  4.09it/s] 36%|███▌      | 4541/12776 [47:00<31:58,  4.29it/s]                                                    {'loss': 0.3078, 'grad_norm': 0.7170542478561401, 'learning_rate': 0.00020339687194525903, 'epoch': 0.7}
+{'loss': 0.3968, 'grad_norm': 1.0266473293304443, 'learning_rate': 0.00020337243401759528, 'epoch': 0.7}
+{'loss': 0.2991, 'grad_norm': 0.8614751100540161, 'learning_rate': 0.00020334799608993156, 'epoch': 0.7}
+{'loss': 0.6496, 'grad_norm': 0.9565255641937256, 'learning_rate': 0.00020332355816226783, 'epoch': 0.7}
+{'loss': 0.569, 'grad_norm': 1.1029607057571411, 'learning_rate': 0.00020329912023460409, 'epoch': 0.7}
+{'loss': 0.5768, 'grad_norm': 1.1075776815414429, 'learning_rate': 0.00020327468230694036, 'epoch': 0.7}
+{'loss': 0.9883, 'grad_norm': 1.6944200992584229, 'learning_rate': 0.00020325024437927661, 'epoch': 0.7}
+{'loss': 0.6218, 'grad_norm': 1.856938362121582, 'learning_rate': 0.00020322580645161287, 'epoch': 0.7}
+{'loss': 0.6972, 'grad_norm': 1.5838960409164429, 'learning_rate': 0.00020320136852394914, 'epoch': 0.7}
+{'loss': 0.8786, 'grad_norm': 1.5133699178695679, 'learning_rate': 0.00020317693059628542, 'epoch': 0.7}
+{'loss': 0.4001, 'grad_norm': 0.9104510545730591, 'learning_rate': 0.00020315249266862167, 'epoch': 0.7}
+{'loss': 0.4596, 'grad_norm': 1.3532549142837524, 'learning_rate': 0.00020312805474095795, 'epoch': 0.7}
+{'loss': 0.6143, 'grad_norm': 0.9898942708969116, 'learning_rate': 0.00020310361681329423, 'epoch': 0.7}
+{'loss': 0.9761, 'grad_norm': 1.7784004211425781, 'learning_rate': 0.00020307917888563045, 'epoch': 0.7}
+{'loss': 1.0946, 'grad_norm': 2.9220640659332275, 'learning_rate': 0.00020305474095796673, 'epoch': 0.7}
+{'loss': 0.6745, 'grad_norm': 1.6523720026016235, 'learning_rate': 0.000203030303030303, 'epoch': 0.7}
+{'loss': 0.5833, 'grad_norm': 2.706207275390625, 'learning_rate': 0.00020300586510263926, 'epoch': 0.7}
+{'loss': 0.752, 'grad_norm': 1.879623532295227, 'learning_rate': 0.00020298142717497554, 'epoch': 0.7}
+{'loss': 0.7268, 'grad_norm': 2.1002285480499268, 'learning_rate': 0.00020295698924731182, 'epoch': 0.7}
+{'loss': 0.9332, 'grad_norm': 2.6373350620269775, 'learning_rate': 0.00020293255131964807, 'epoch': 0.7}
+{'loss': 1.4244, 'grad_norm': 3.0177090167999268, 'learning_rate': 0.00020290811339198435, 'epoch': 0.7}
+{'loss': 0.6665, 'grad_norm': 2.546877384185791, 'learning_rate': 0.00020288367546432063, 'epoch': 0.7}
+{'loss': 1.2049, 'grad_norm': 1.9773567914962769, 'learning_rate': 0.00020285923753665685, 'epoch': 0.7}
+{'loss': 1.2953, 'grad_norm': 3.617976188659668, 'learning_rate': 0.00020283479960899313, 'epoch': 0.7}
+{'loss': 0.722, 'grad_norm': 2.691190719604492, 'learning_rate': 0.0002028103616813294, 'epoch': 0.7}
+{'loss': 0.8476, 'grad_norm': 2.3003101348876953, 'learning_rate': 0.00020278592375366566, 'epoch': 0.7}
+{'loss': 0.6616, 'grad_norm': 2.354837656021118, 'learning_rate': 0.00020276148582600194, 'epoch': 0.7}
+{'loss': 1.2246, 'grad_norm': 2.914757013320923, 'learning_rate': 0.00020273704789833822, 'epoch': 0.7}
+{'loss': 1.2729, 'grad_norm': 2.373718500137329, 'learning_rate': 0.00020271260997067447, 'epoch': 0.7}
+{'loss': 1.2303, 'grad_norm': 2.7361257076263428, 'learning_rate': 0.00020268817204301075, 'epoch': 0.7}
+{'loss': 0.6874, 'grad_norm': 1.035384178161621, 'learning_rate': 0.000202663734115347, 'epoch': 0.7}
+{'loss': 1.0254, 'grad_norm': 3.0830559730529785, 'learning_rate': 0.00020263929618768325, 'epoch': 0.7}
+{'loss': 0.7345, 'grad_norm': 5.6683478355407715, 'learning_rate': 0.00020261485826001953, 'epoch': 0.7}
+{'loss': 0.9819, 'grad_norm': 2.4761593341827393, 'learning_rate': 0.0002025904203323558, 'epoch': 0.7}
+{'loss': 0.9002, 'grad_norm': 4.851938724517822, 'learning_rate': 0.00020256598240469206, 'epoch': 0.7}
+{'loss': 0.7226, 'grad_norm': 1.542232871055603, 'learning_rate': 0.00020254154447702833, 'epoch': 0.7}
+{'loss': 1.0137, 'grad_norm': 1.4405412673950195, 'learning_rate': 0.0002025171065493646, 'epoch': 0.7}
+{'loss': 0.725, 'grad_norm': 1.6948432922363281, 'learning_rate': 0.00020249266862170084, 'epoch': 0.7}
+{'loss': 0.2645, 'grad_norm': 0.5752367973327637, 'learning_rate': 0.00020246823069403712, 'epoch': 0.7}
+{'loss': 0.3297, 'grad_norm': 0.5872207880020142, 'learning_rate': 0.0002024437927663734, 'epoch': 0.7}
+{'loss': 0.379, 'grad_norm': 0.5640816688537598, 'learning_rate': 0.00020241935483870965, 'epoch': 0.7}
+{'loss': 0.3643, 'grad_norm': 0.9761353731155396, 'learning_rate': 0.00020239491691104592, 'epoch': 0.71}
+{'loss': 0.3448, 'grad_norm': 0.6274861693382263, 'learning_rate': 0.0002023704789833822, 'epoch': 0.71}
+{'loss': 0.3694, 'grad_norm': 0.8410617709159851, 'learning_rate': 0.00020234604105571845, 'epoch': 0.71}
+{'loss': 0.3699, 'grad_norm': 0.7062875032424927, 'learning_rate': 0.00020232160312805473, 'epoch': 0.71}
+{'loss': 0.478, 'grad_norm': 0.783191442489624, 'learning_rate': 0.000202297165200391, 'epoch': 0.71}
+{'loss': 0.3735, 'grad_norm': 0.8087450265884399, 'learning_rate': 0.00020227272727272723, 'epoch': 0.71}
+{'loss': 0.4296, 'grad_norm': 0.7015910148620605, 'learning_rate': 0.0002022482893450635, 'epoch': 0.71}
+{'loss': 0.4157, 'grad_norm': 1.2119005918502808, 'learning_rate': 0.0002022238514173998, 'epoch': 0.71}
+{'loss': 0.3681, 'grad_norm': 0.7165002226829529, 'learning_rate': 0.00020219941348973604, 'epoch': 0.71}
+{'loss': 0.472, 'grad_norm': 0.9335975050926208, 'learning_rate': 0.00020217497556207232, 'epoch': 0.71}
+{'loss': 0.6244, 'grad_norm': 1.0434401035308838, 'learning_rate': 0.0002021505376344086, 'epoch': 0.71}
+{'loss': 0.5675, 'grad_norm': 1.2325775623321533, 'learning_rate': 0.00020212609970674485, 'epoch': 0.71}
+{'loss': 0.6639, 'grad_norm': 1.6948950290679932, 'learning_rate': 0.0002021016617790811, 'epoch': 0.71}
+{'loss': 0.6657, 'grad_norm': 1.0508677959442139, 'learning_rate': 0.00020207722385141738, 'epoch': 0.71}
+{'loss': 0.5776, 'grad_norm': 3.625051259994507, 'learning_rate': 0.00020205278592375363, 'epoch': 0.71}
+{'loss': 0.7766, 'grad_norm': 3.57965087890625, 'learning_rate': 0.0002020283479960899, 'epoch': 0.71}
+{'loss': 1.1872, 'grad_norm': 3.801496744155884, 'learning_rate': 0.0002020039100684262, 'epoch': 0.71}
+{'loss': 0.9458, 'grad_norm': 1.3635119199752808, 'learning_rate': 0.00020197947214076244, 'epoch': 0.71}
+{'loss': 0.8063, 'grad_norm': 3.277301788330078, 'learning_rate': 0.00020195503421309872, 'epoch': 0.71}
+{'loss': 0.552, 'grad_norm': 2.7568883895874023, 'learning_rate': 0.000201930596285435, 'epoch': 0.71}
+{'loss': 0.6122, 'grad_norm': 1.8860586881637573, 'learning_rate': 0.00020190615835777122, 'epoch': 0.71}
+{'loss': 0.7285, 'grad_norm': 1.4238359928131104, 'learning_rate': 0.0002018817204301075, 'epoch': 0.71}
+{'loss': 0.4278, 'grad_norm': 0.9961578845977783, 'learning_rate': 0.00020185728250244378, 'epoch': 0.71}
+{'loss': 1.2713, 'grad_norm': 1.8736709356307983, 'learning_rate': 0.00020183284457478003, 'epoch': 0.71}
+{'loss': 0.7608, 'grad_norm': 1.6112703084945679, 'learning_rate': 0.0002018084066471163, 'epoch': 0.71}
+{'loss': 0.6969, 'grad_norm': 1.8438599109649658, 'learning_rate': 0.00020178396871945258, 'epoch': 0.71}
+{'loss': 0.7564, 'grad_norm': 2.1331958770751953, 'learning_rate': 0.00020175953079178884, 'epoch': 0.71}
+{'loss': 0.5279, 'grad_norm': 1.4046313762664795, 'learning_rate': 0.00020173509286412511, 'epoch': 0.71}
+{'loss': 1.1941, 'grad_norm': 2.5683090686798096, 'learning_rate': 0.0002017106549364614, 'epoch': 0.71}
+{'loss': 1.0081, 'grad_norm': 2.2568979263305664, 'learning_rate': 0.00020168621700879762, 'epoch': 0.71}
+{'loss': 1.0977, 'grad_norm': 2.451312780380249, 'learning_rate': 0.0002016617790811339, 'epoch': 0.71}
+{'loss': 0.9091, 'grad_norm': 2.566115379333496, 'learning_rate': 0.00020163734115347017, 'epoch': 0.71}
+{'loss': 1.1188, 'grad_norm': 1.651110053062439, 'learning_rate': 0.00020161290322580642, 'epoch': 0.71}
+{'loss': 0.9653, 'grad_norm': 1.3296421766281128, 'learning_rate': 0.0002015884652981427, 'epoch': 0.71}
+{'loss': 1.0826, 'grad_norm': 4.0526862144470215, 'learning_rate': 0.00020156402737047898, 'epoch': 0.71}
+{'loss': 1.4499, 'grad_norm': 2.9230265617370605, 'learning_rate': 0.00020153958944281523, 'epoch': 0.71}
+{'loss': 0.9125, 'grad_norm': 3.018975257873535, 'learning_rate': 0.00020151515151515148, 'epoch': 0.71}
+ 36%|███▌      | 4541/12776 [47:00<31:58,  4.29it/s] 36%|███▌      | 4542/12776 [47:01<30:53,  4.44it/s]                                                     36%|███▌      | 4542/12776 [47:01<30:53,  4.44it/s] 36%|███▌      | 4543/12776 [47:01<32:46,  4.19it/s]                                                     36%|███▌      | 4543/12776 [47:01<32:46,  4.19it/s] 36%|███▌      | 4544/12776 [47:01<31:11,  4.40it/s]                                                     36%|███▌      | 4544/12776 [47:01<31:11,  4.40it/s] 36%|███▌      | 4545/12776 [47:01<29:57,  4.58it/s]                                                     36%|███▌      | 4545/12776 [47:01<29:57,  4.58it/s] 36%|███▌      | 4546/12776 [47:01<28:58,  4.73it/s]                                                     36%|███▌      | 4546/12776 [47:01<28:58,  4.73it/s] 36%|███▌      | 4547/12776 [47:02<28:10,  4.87it/s]                                                     36%|███▌      | 4547/12776 [47:02<28:10,  4.87it/s] 36%|███▌      | 4548/12776 [47:02<27:25,  5.00it/s]                                                     36%|███▌      | 4548/12776 [47:02<27:25,  5.00it/s] 36%|███▌      | 4549/12776 [47:02<30:58,  4.43it/s]                                                     36%|███▌      | 4549/12776 [47:02<30:58,  4.43it/s] 36%|███▌      | 4550/12776 [47:03<48:58,  2.80it/s]                                                     36%|███▌      | 4550/12776 [47:03<48:58,  2.80it/s] 36%|███▌      | 4551/12776 [47:04<1:25:39,  1.60it/s]                                                       36%|███▌      | 4551/12776 [47:04<1:25:39,  1.60it/s] 36%|███▌      | 4552/12776 [47:05<1:37:00,  1.41it/s]                                                       36%|███▌      | 4552/12776 [47:05<1:37:00,  1.41it/s] 36%|███▌      | 4553/12776 [47:06<1:42:34,  1.34it/s]                                                       36%|███▌      | 4553/12776 [47:06<1:42:34,  1.34it/s] 36%|███▌      | 4554/12776 [47:07<1:45:54,  1.29it/s]                                                       36%|███▌      | 4554/12776 [47:07<1:45:54,  1.29it/s] 36%|███▌      | 4555/12776 [47:07<1:43:45,  1.32it/s]                                                       36%|███▌      | 4555/12776 [47:07<1:43:45,  1.32it/s] 36%|███▌      | 4556/12776 [47:08<1:42:22,  1.34it/s]                                                       36%|███▌      | 4556/12776 [47:08<1:42:22,  1.34it/s] 36%|███▌      | 4557/12776 [47:09<1:37:29,  1.41it/s]                                                       36%|███▌      | 4557/12776 [47:09<1:37:29,  1.41it/s] 36%|███▌      | 4558/12776 [47:09<1:33:04,  1.47it/s]                                                       36%|███▌      | 4558/12776 [47:09<1:33:04,  1.47it/s] 36%|███▌      | 4559/12776 [47:10<1:28:20,  1.55it/s]                                                       36%|███▌      | 4559/12776 [47:10<1:28:20,  1.55it/s] 36%|███▌      | 4560/12776 [47:10<1:24:57,  1.61it/s]                                                       36%|███▌      | 4560/12776 [47:10<1:24:57,  1.61it/s] 36%|███▌      | 4561/12776 [47:11<1:20:09,  1.71it/s]                                                       36%|███▌      | 4561/12776 [47:11<1:20:09,  1.71it/s] 36%|███▌      | 4562/12776 [47:11<1:16:13,  1.80it/s]                                                       36%|███▌      | 4562/12776 [47:11<1:16:13,  1.80it/s] 36%|███▌      | 4563/12776 [47:12<1:11:46,  1.91it/s]                                                       36%|███▌      | 4563/12776 [47:12<1:11:46,  1.91it/s] 36%|███▌      | 4564/12776 [47:12<1:12:39,  1.88it/s]                                                       36%|███▌      | 4564/12776 [47:12<1:12:39,  1.88it/s] 36%|███▌      | 4565/12776 [47:13<1:08:02,  2.01it/s]                                                       36%|███▌      | 4565/12776 [47:13<1:08:02,  2.01it/s] 36%|███▌      | 4566/12776 [47:13<1:03:45,  2.15it/s]                                                       36%|███▌      | 4566/12776 [47:13<1:03:45,  2.15it/s] 36%|███▌      | 4567/12776 [47:14<1:04:36,  2.12it/s]                                                       36%|███▌      | 4567/12776 [47:14<1:04:36,  2.12it/s] 36%|███▌      | 4568/12776 [47:14<59:50,  2.29it/s]                                                       36%|███▌      | 4568/12776 [47:14<59:50,  2.29it/s] 36%|███▌      | 4569/12776 [47:14<56:14,  2.43it/s]                                                     36%|███▌      | 4569/12776 [47:14<56:14,  2.43it/s] 36%|███▌      | 4570/12776 [47:15<57:00,  2.40it/s]                                                     36%|███▌      | 4570/12776 [47:15<57:00,  2.40it/s] 36%|███▌      | 4571/12776 [47:15<53:34,  2.55it/s]                                                     36%|███▌      | 4571/12776 [47:15<53:34,  2.55it/s] 36%|███▌      | 4572/12776 [47:15<50:57,  2.68it/s]                                                     36%|███▌      | 4572/12776 [47:15<50:57,  2.68it/s] 36%|███▌      | 4573/12776 [47:16<49:59,  2.73it/s]                                                     36%|███▌      | 4573/12776 [47:16<49:59,  2.73it/s] 36%|███▌      | 4574/12776 [47:16<47:31,  2.88it/s]                                                     36%|███▌      | 4574/12776 [47:16<47:31,  2.88it/s] 36%|███▌      | 4575/12776 [47:16<45:20,  3.01it/s]                                                     36%|███▌      | 4575/12776 [47:16<45:20,  3.01it/s] 36%|███▌      | 4576/12776 [47:17<43:56,  3.11it/s]                                                     36%|███▌      | 4576/12776 [47:17<43:56,  3.11it/s] 36%|███▌      | 4577/12776 [47:17<46:42,  2.93it/s]                                                     36%|███▌      | 4577/12776 [47:17<46:42,  2.93it/s] 36%|███▌      | 4578/12776 [47:17<44:07,  3.10it/s]                                                     36%|███▌      | 4578/12776 [47:17<44:07,  3.10it/s] 36%|███▌      | 4579/12776 [47:18<41:55,  3.26it/s]                                                     36%|███▌      | 4579/12776 [47:18<41:55,  3.26it/s] 36%|███▌      | 4580/12776 [47:18<40:14,  3.39it/s]                                                     36%|███▌      | 4580/12776 [47:18<40:14,  3.39it/s] 36%|███▌      | 4581/12776 [47:18<42:32,  3.21it/s]                                                     36%|███▌      | 4581/12776 [47:18<42:32,  3.21it/s] 36%|███▌      | 4582/12776 [47:19<40:10,  3.40it/s]                                                     36%|███▌      | 4582/12776 [47:19<40:10,  3.40it/s] 36%|███▌      | 4583/12776 [47:19<38:12,  3.57it/s]                                                     36%|███▌      | 4583/12776 [47:19<38:12,  3.57it/s] 36%|███▌      | 4584/12776 [47:19<36:35,  3.73it/s]                                                     36%|███▌      | 4584/12776 [47:19<36:35,  3.73it/s] 36%|███▌      | 4585/12776 [47:19<38:53,  3.51it/s]                                                     36%|███▌      | 4585/12776 [47:19<38:53,  3.51it/s] 36%|███▌      | 4586/12776 [47:20<36:34,  3.73it/s]                                                     36%|███▌      | 4586/12776 [47:20<36:34,  3.73it/s] 36%|███▌      | 4587/12776 [47:20<34:48,  3.92it/s]                                                     36%|███▌      | 4587/12776 [47:20<34:48,  3.92it/s] 36%|███▌      | 4588/12776 [47:20<33:15,  4.10it/s]                                                     36%|███▌      | 4588/12776 [47:20<33:15,  4.10it/s] 36%|███▌      | 4589/12776 [47:20<36:36,  3.73it/s]                                                     36%|███▌      | 4589/12776 [47:20<36:36,  3.73it/s] 36%|███▌      | 4590/12776 [47:21<34:26,  3.96it/s]                                                     36%|███▌      | 4590/12776 [47:21<34:26,  3.96it/s] 36%|███▌      | 4591/12776 [47:21<32:39,  4.18it/s]                                                     36%|███▌      | 4591/12776 [47:21<32:39,  4.18it/s] 36%|███▌      | 4592/12776 [47:21<31:15,  4.36it/s]                                                     36%|███▌      | 4592/12776 [47:21<31:15,  4.36it/s] 36%|███▌      | 4593/12776 [47:21<30:07,  4.53it/s]                                                     36%|███▌      | 4593/12776 [47:21<30:07,  4.53it/s] 36%|███▌      | 4594/12776 [47:21<34:18,  3.97it/s]                                                     36%|███▌      | 4594/12776 [47:21<34:18,  3.97it/s] 36%|███▌      | 4595/12776 [47:22<32:04,  4.25it/s]                                                     36%|███▌      | 4595/12776 [47:22<32:04,  4.25it/s] 36%|███▌      | 4596/12776 [47:22<30:30,  4.47it/s]                                                     36%|███▌      | 4596/12776 [47:22<30:30,  4.47it/s] 36%|███▌      | 4597/12776 [47:22<29:11,  4.67it/s]                                                     36%|███▌      | 4597/12776 [47:22<29:11,  4.67it/s] 36%|███▌      | 4598/12776 [47:22<28:10,  4.84it/s]                                                     36%|███▌      | 4598/12776 [47:22<28:10,  4.84it/s] 36%|███▌      | 4599/12776 [47:22<27:18,  4.99it/s]                                                     36%|███▌      | 4599/12776 [47:22<27:18,  4.99it/s] 36%|███▌      | 4600/12776 [47:23<47:16,  2.88it/s]                                                     36%|███▌      | 4600/12776 [47:23<47:16,  2.88it/s] 36%|███▌      | 4601/12776 [47:25<1:30:23,  1.51it/s]                                                       36%|███▌      | 4601/12776 [47:25<1:30:23,  1.51it/s] 36%|███▌      | 4602/12776 [47:25<1:41:37,  1.34it/s]                                                       36%|███▌      | 4602/12776 [47:25<1:41:37,  1.34it/s] 36%|███▌      | 4603/12776 [47:26<1:51:35,  1.22it/s]                                                       36%|███▌      | 4603/12776 [47:26<1:51:35,  1.22it/s] 36%|███▌      | 4604/12776 [47:27<1:51:57,  1.22it/s]                                                       36%|███▌      | 4604/12776 [47:27<1:51:57,  1.22it/s] 36%|███▌      | 4605/12776 [47:28<1:47:36,  1.27it/s]                                                       36%|███▌      | 4605/12776 [47:28<1:47:36,  1.27it/s] 36%|███▌      | 4606/12776 [47:29<1:44:56,  1.30it/s]                                                       36%|███▌      | 4606/12776 [47:29<1:44:56,  1.30it/s] 36%|███▌      | 4607/12776 [47:29<1:39:14,  1.37it/s]                                                       36%|███▌      | 4607/12776 [47:29<1:39:14,  1.37it/s] 36%|███▌      | 4608/12776 [47:30<1:32:59,  1.46it/s]                                                       36%|███▌      | 4608/12776 [47:30<1:32:59,  1.46it/s] 36%|███▌      | 4609/12776 [47:31<1:28:16,  1.54it/s]                                                       36%|███▌      | 4609/12776 [47:31<1:28:16,  1.54it/s] 36%|███▌      | 4610/12776 [47:31<1:23:59,  1.62it/s]                                                       36%|███▌      | 4610/12776 [47:31<1:23:59,  1.62it/s] 36%|███▌      | 4611/12776 [47:32<1:19:10,  1.72it/s]                                                       36%|███▌      | 4611/12776 [47:32<1:19:10,  1.72it/s] 36%|███▌      | 4612/12776 [47:32<1:16:38,  1.78it/s]                                                       36%|███▌      | 4612/12776 [47:32<1:16:38,  1.78it/s] 36%|███▌      | 4613/12776 [47:33<1:12:00,  1.89it/s]                                                       36%|███▌      | 4613/12776 [47:33<1:12:00,  1.89it/s] 36%|███▌      | 4614/12776 [47:33<1:10:31,  1.93it/s]                                                       36%|███▌      | 4614/12776 [47:33<1:10:31,  1.93it/s] 36%|███▌      | 4615/12776 [47:33<1:06:38,  2.04it/s]                                                       36%|███▌      | 4615/12776 [47:33<1:06:38,  2.04it/s] 36%|███▌      | 4616/12776 [47:34<1:03:05,  2.16it/s]                                                       36%|███▌      | 4616/12776 [47:34<1:03:05,  2.16it/s] 36%|███▌      | 4617/12776 [47:34<1:04:34,  2.11it/s]                                                       36%|███▌      | 4617/12776 [47:34<1:04:34,  2.11it/s] 36%|███▌      | 4618/12776 [47:35<1:00:54,  2.23it/s]                                                      {'loss': 1.3237, 'grad_norm': 1.6214097738265991, 'learning_rate': 0.00020149071358748776, 'epoch': 0.71}
+{'loss': 1.8457, 'grad_norm': 2.642343521118164, 'learning_rate': 0.000201466275659824, 'epoch': 0.71}
+{'loss': 1.279, 'grad_norm': 2.0636792182922363, 'learning_rate': 0.0002014418377321603, 'epoch': 0.71}
+{'loss': 1.4645, 'grad_norm': 3.2463300228118896, 'learning_rate': 0.00020141739980449657, 'epoch': 0.71}
+{'loss': 1.3413, 'grad_norm': 2.8252651691436768, 'learning_rate': 0.00020139296187683282, 'epoch': 0.71}
+{'loss': 0.7135, 'grad_norm': 0.8941448330879211, 'learning_rate': 0.0002013685239491691, 'epoch': 0.71}
+{'loss': 0.9046, 'grad_norm': 2.6736679077148438, 'learning_rate': 0.00020134408602150538, 'epoch': 0.71}
+{'loss': 1.2271, 'grad_norm': 2.5625529289245605, 'learning_rate': 0.0002013196480938416, 'epoch': 0.71}
+{'loss': 0.3858, 'grad_norm': 0.8302935361862183, 'learning_rate': 0.00020129521016617788, 'epoch': 0.71}
+{'loss': 1.4059, 'grad_norm': 3.529597043991089, 'learning_rate': 0.00020127077223851416, 'epoch': 0.71}
+{'loss': 0.7232, 'grad_norm': 1.3789228200912476, 'learning_rate': 0.0002012463343108504, 'epoch': 0.71}
+{'loss': 0.2367, 'grad_norm': 0.34966397285461426, 'learning_rate': 0.0002012218963831867, 'epoch': 0.71}
+{'loss': 0.2883, 'grad_norm': 0.5851748585700989, 'learning_rate': 0.00020119745845552297, 'epoch': 0.71}
+{'loss': 0.3685, 'grad_norm': 0.4715137779712677, 'learning_rate': 0.00020117302052785922, 'epoch': 0.71}
+{'loss': 0.3928, 'grad_norm': 0.7511829733848572, 'learning_rate': 0.0002011485826001955, 'epoch': 0.71}
+{'loss': 0.3393, 'grad_norm': 0.8073253035545349, 'learning_rate': 0.00020112414467253177, 'epoch': 0.71}
+{'loss': 0.3951, 'grad_norm': 1.0048470497131348, 'learning_rate': 0.000201099706744868, 'epoch': 0.71}
+{'loss': 0.3419, 'grad_norm': 0.5629849433898926, 'learning_rate': 0.00020107526881720428, 'epoch': 0.71}
+{'loss': 0.3908, 'grad_norm': 1.2211352586746216, 'learning_rate': 0.00020105083088954056, 'epoch': 0.71}
+{'loss': 0.395, 'grad_norm': 0.7062564492225647, 'learning_rate': 0.0002010263929618768, 'epoch': 0.71}
+{'loss': 0.4042, 'grad_norm': 1.036588430404663, 'learning_rate': 0.00020100195503421309, 'epoch': 0.71}
+{'loss': 0.7025, 'grad_norm': 1.1843818426132202, 'learning_rate': 0.00020097751710654936, 'epoch': 0.71}
+{'loss': 0.6033, 'grad_norm': 1.298744559288025, 'learning_rate': 0.0002009530791788856, 'epoch': 0.71}
+{'loss': 0.4368, 'grad_norm': 1.004136085510254, 'learning_rate': 0.00020092864125122187, 'epoch': 0.71}
+{'loss': 0.2452, 'grad_norm': 0.6956773996353149, 'learning_rate': 0.00020090420332355814, 'epoch': 0.71}
+{'loss': 0.7385, 'grad_norm': 1.565514087677002, 'learning_rate': 0.0002008797653958944, 'epoch': 0.71}
+{'loss': 0.5778, 'grad_norm': 1.181185007095337, 'learning_rate': 0.00020085532746823067, 'epoch': 0.71}
+{'loss': 0.6435, 'grad_norm': 1.010599970817566, 'learning_rate': 0.00020083088954056695, 'epoch': 0.72}
+{'loss': 0.6996, 'grad_norm': 1.560499668121338, 'learning_rate': 0.0002008064516129032, 'epoch': 0.72}
+{'loss': 0.5439, 'grad_norm': 2.0916194915771484, 'learning_rate': 0.00020078201368523948, 'epoch': 0.72}
+{'loss': 0.8417, 'grad_norm': 2.5257811546325684, 'learning_rate': 0.00020075757575757576, 'epoch': 0.72}
+{'loss': 0.7435, 'grad_norm': 1.5999330282211304, 'learning_rate': 0.00020073313782991198, 'epoch': 0.72}
+{'loss': 0.5395, 'grad_norm': 1.6199560165405273, 'learning_rate': 0.00020070869990224826, 'epoch': 0.72}
+{'loss': 1.2339, 'grad_norm': 6.264113426208496, 'learning_rate': 0.00020068426197458454, 'epoch': 0.72}
+{'loss': 0.6285, 'grad_norm': 1.9628108739852905, 'learning_rate': 0.0002006598240469208, 'epoch': 0.72}
+{'loss': 0.5511, 'grad_norm': 2.0946178436279297, 'learning_rate': 0.00020063538611925707, 'epoch': 0.72}
+{'loss': 0.8856, 'grad_norm': 1.8806025981903076, 'learning_rate': 0.00020061094819159335, 'epoch': 0.72}
+{'loss': 0.9052, 'grad_norm': 1.6844736337661743, 'learning_rate': 0.0002005865102639296, 'epoch': 0.72}
+{'loss': 0.7984, 'grad_norm': 1.8848754167556763, 'learning_rate': 0.00020056207233626588, 'epoch': 0.72}
+{'loss': 0.7998, 'grad_norm': 1.9630393981933594, 'learning_rate': 0.00020053763440860216, 'epoch': 0.72}
+{'loss': 0.6739, 'grad_norm': 1.6129907369613647, 'learning_rate': 0.00020051319648093838, 'epoch': 0.72}
+{'loss': 0.8227, 'grad_norm': 1.1391911506652832, 'learning_rate': 0.00020048875855327466, 'epoch': 0.72}
+{'loss': 0.717, 'grad_norm': 2.1303088665008545, 'learning_rate': 0.00020046432062561094, 'epoch': 0.72}
+{'loss': 1.046, 'grad_norm': 2.0801308155059814, 'learning_rate': 0.0002004398826979472, 'epoch': 0.72}
+{'loss': 1.8212, 'grad_norm': 3.0255331993103027, 'learning_rate': 0.00020041544477028347, 'epoch': 0.72}
+{'loss': 0.8383, 'grad_norm': 1.748745322227478, 'learning_rate': 0.00020039100684261975, 'epoch': 0.72}
+{'loss': 0.6246, 'grad_norm': 2.0003836154937744, 'learning_rate': 0.00020036656891495597, 'epoch': 0.72}
+{'loss': 0.9589, 'grad_norm': 2.874161720275879, 'learning_rate': 0.00020034213098729225, 'epoch': 0.72}
+{'loss': 1.201, 'grad_norm': 3.0715625286102295, 'learning_rate': 0.00020031769305962853, 'epoch': 0.72}
+{'loss': 1.5703, 'grad_norm': 2.9870784282684326, 'learning_rate': 0.00020029325513196478, 'epoch': 0.72}
+{'loss': 0.8619, 'grad_norm': 2.393979549407959, 'learning_rate': 0.00020026881720430106, 'epoch': 0.72}
+{'loss': 1.3913, 'grad_norm': 3.002680540084839, 'learning_rate': 0.00020024437927663733, 'epoch': 0.72}
+{'loss': 1.3388, 'grad_norm': 2.3127334117889404, 'learning_rate': 0.00020021994134897359, 'epoch': 0.72}
+{'loss': 0.948, 'grad_norm': 1.3608214855194092, 'learning_rate': 0.00020019550342130986, 'epoch': 0.72}
+{'loss': 1.2869, 'grad_norm': 3.9520657062530518, 'learning_rate': 0.00020017106549364614, 'epoch': 0.72}
+{'loss': 1.098, 'grad_norm': 2.2881007194519043, 'learning_rate': 0.00020014662756598237, 'epoch': 0.72}
+{'loss': 0.5111, 'grad_norm': 1.3503066301345825, 'learning_rate': 0.00020012218963831864, 'epoch': 0.72}
+{'loss': 0.8472, 'grad_norm': 2.778672695159912, 'learning_rate': 0.00020009775171065492, 'epoch': 0.72}
+{'loss': 0.7971, 'grad_norm': 3.3491392135620117, 'learning_rate': 0.00020007331378299117, 'epoch': 0.72}
+{'loss': 0.8447, 'grad_norm': 2.288565158843994, 'learning_rate': 0.00020004887585532745, 'epoch': 0.72}
+{'loss': 0.3159, 'grad_norm': 0.5646255612373352, 'learning_rate': 0.00020002443792766373, 'epoch': 0.72}
+{'loss': 0.3544, 'grad_norm': 0.8426042199134827, 'learning_rate': 0.00019999999999999998, 'epoch': 0.72}
+{'loss': 0.2904, 'grad_norm': 0.6321278214454651, 'learning_rate': 0.00019997556207233626, 'epoch': 0.72}
+{'loss': 0.274, 'grad_norm': 0.8084971904754639, 'learning_rate': 0.0001999511241446725, 'epoch': 0.72}
+{'loss': 0.2366, 'grad_norm': 0.5742786526679993, 'learning_rate': 0.00019992668621700876, 'epoch': 0.72}
+{'loss': 0.324, 'grad_norm': 1.111324667930603, 'learning_rate': 0.00019990224828934504, 'epoch': 0.72}
+{'loss': 0.3936, 'grad_norm': 0.9967480301856995, 'learning_rate': 0.00019987781036168132, 'epoch': 0.72}
+{'loss': 0.3368, 'grad_norm': 0.802562952041626, 'learning_rate': 0.00019985337243401757, 'epoch': 0.72}
+{'loss': 0.342, 'grad_norm': 0.7982587218284607, 'learning_rate': 0.00019982893450635385, 'epoch': 0.72}
+{'loss': 0.3102, 'grad_norm': 0.8451456427574158, 'learning_rate': 0.00019980449657869013, 'epoch': 0.72}
+{'loss': 0.8298, 'grad_norm': 2.590470314025879, 'learning_rate': 0.00019978005865102635, 'epoch': 0.72}
+{'loss': 0.5124, 'grad_norm': 1.0624793767929077, 'learning_rate': 0.00019975562072336263, 'epoch': 0.72}
+{'loss': 0.4405, 'grad_norm': 0.9378711581230164, 'learning_rate': 0.0001997311827956989, 'epoch': 0.72}
+{'loss': 0.4996, 'grad_norm': 1.3992924690246582, 'learning_rate': 0.00019970674486803516, 'epoch': 0.72}
+{'loss': 0.5062, 'grad_norm': 0.9326929450035095, 'learning_rate': 0.00019968230694037144, 'epoch': 0.72}
+{'loss': 0.5575, 'grad_norm': 1.211406946182251, 'learning_rate': 0.00019965786901270772, 'epoch': 0.72}
+{'loss': 0.2156, 'grad_norm': 0.5172481536865234, 'learning_rate': 0.00019963343108504397, 'epoch': 0.72}
+ 36%|███▌      | 4618/12776 [47:35<1:00:54,  2.23it/s] 36%|███▌      | 4619/12776 [47:35<57:41,  2.36it/s]                                                       36%|███▌      | 4619/12776 [47:35<57:41,  2.36it/s] 36%|███▌      | 4620/12776 [47:36<58:10,  2.34it/s]                                                     36%|███▌      | 4620/12776 [47:36<58:10,  2.34it/s] 36%|███▌      | 4621/12776 [47:36<54:52,  2.48it/s]                                                     36%|███▌      | 4621/12776 [47:36<54:52,  2.48it/s] 36%|███▌      | 4622/12776 [47:36<52:16,  2.60it/s]                                                     36%|███▌      | 4622/12776 [47:36<52:16,  2.60it/s] 36%|███▌      | 4623/12776 [47:37<52:32,  2.59it/s]                                                     36%|███▌      | 4623/12776 [47:37<52:32,  2.59it/s] 36%|███▌      | 4624/12776 [47:37<49:51,  2.73it/s]                                                     36%|███▌      | 4624/12776 [47:37<49:51,  2.73it/s] 36%|███▌      | 4625/12776 [47:37<47:23,  2.87it/s]                                                     36%|███▌      | 4625/12776 [47:37<47:23,  2.87it/s] 36%|███▌      | 4626/12776 [47:38<47:06,  2.88it/s]                                                     36%|███▌      | 4626/12776 [47:38<47:06,  2.88it/s] 36%|███▌      | 4627/12776 [47:38<44:44,  3.04it/s]                                                     36%|███▌      | 4627/12776 [47:38<44:44,  3.04it/s] 36%|███▌      | 4628/12776 [47:38<42:53,  3.17it/s]                                                     36%|███▌      | 4628/12776 [47:38<42:53,  3.17it/s] 36%|███▌      | 4629/12776 [47:38<41:14,  3.29it/s]                                                     36%|███▌      | 4629/12776 [47:38<41:14,  3.29it/s] 36%|███▌      | 4630/12776 [47:39<40:29,  3.35it/s]                                                     36%|███▌      | 4630/12776 [47:39<40:29,  3.35it/s] 36%|███▌      | 4631/12776 [47:39<38:47,  3.50it/s]                                                     36%|███▌      | 4631/12776 [47:39<38:47,  3.50it/s] 36%|███▋      | 4632/12776 [47:39<37:24,  3.63it/s]                                                     36%|███▋      | 4632/12776 [47:39<37:24,  3.63it/s] 36%|███▋      | 4633/12776 [47:39<36:14,  3.74it/s]                                                     36%|███▋      | 4633/12776 [47:39<36:14,  3.74it/s] 36%|███▋      | 4634/12776 [47:40<41:26,  3.27it/s]                                                     36%|███▋      | 4634/12776 [47:40<41:26,  3.27it/s] 36%|███▋      | 4635/12776 [47:40<38:50,  3.49it/s]                                                     36%|███▋      | 4635/12776 [47:40<38:50,  3.49it/s] 36%|███▋      | 4636/12776 [47:40<36:41,  3.70it/s]                                                     36%|███▋      | 4636/12776 [47:40<36:41,  3.70it/s] 36%|███▋      | 4637/12776 [47:41<34:57,  3.88it/s]                                                     36%|███▋      | 4637/12776 [47:41<34:57,  3.88it/s] 36%|███▋      | 4638/12776 [47:41<33:29,  4.05it/s]                                                     36%|███▋      | 4638/12776 [47:41<33:29,  4.05it/s] 36%|███▋      | 4639/12776 [47:41<36:00,  3.77it/s]                                                     36%|███▋      | 4639/12776 [47:41<36:00,  3.77it/s] 36%|███▋      | 4640/12776 [47:41<33:40,  4.03it/s]                                                     36%|███▋      | 4640/12776 [47:41<33:40,  4.03it/s] 36%|███▋      | 4641/12776 [47:42<32:13,  4.21it/s]                                                     36%|███▋      | 4641/12776 [47:42<32:13,  4.21it/s] 36%|███▋      | 4642/12776 [47:42<30:53,  4.39it/s]                                                     36%|███▋      | 4642/12776 [47:42<30:53,  4.39it/s] 36%|███▋      | 4643/12776 [47:42<29:48,  4.55it/s]                                                     36%|███▋      | 4643/12776 [47:42<29:48,  4.55it/s] 36%|███▋      | 4644/12776 [47:42<33:06,  4.09it/s]                                                     36%|███▋      | 4644/12776 [47:42<33:06,  4.09it/s] 36%|███▋      | 4645/12776 [47:42<31:10,  4.35it/s]                                                     36%|███▋      | 4645/12776 [47:42<31:10,  4.35it/s] 36%|███▋      | 4646/12776 [47:43<29:47,  4.55it/s]                                                     36%|███▋      | 4646/12776 [47:43<29:47,  4.55it/s] 36%|███▋      | 4647/12776 [47:43<28:39,  4.73it/s]                                                     36%|███▋      | 4647/12776 [47:43<28:39,  4.73it/s] 36%|███▋      | 4648/12776 [47:43<27:34,  4.91it/s]                                                     36%|███▋      | 4648/12776 [47:43<27:34,  4.91it/s] 36%|███▋      | 4649/12776 [47:43<29:20,  4.62it/s]                                                     36%|███▋      | 4649/12776 [47:43<29:20,  4.62it/s] 36%|███▋      | 4650/12776 [47:44<48:29,  2.79it/s]                                                     36%|███▋      | 4650/12776 [47:44<48:29,  2.79it/s] 36%|███▋      | 4651/12776 [47:46<1:41:29,  1.33it/s]                                                       36%|███▋      | 4651/12776 [47:46<1:41:29,  1.33it/s] 36%|███▋      | 4652/12776 [47:47<1:55:19,  1.17it/s]                                                       36%|███▋      | 4652/12776 [47:47<1:55:19,  1.17it/s] 36%|███▋      | 4653/12776 [47:48<1:55:20,  1.17it/s]                                                       36%|███▋      | 4653/12776 [47:48<1:55:20,  1.17it/s] 36%|███▋      | 4654/12776 [47:48<1:51:56,  1.21it/s]                                                       36%|███▋      | 4654/12776 [47:48<1:51:56,  1.21it/s] 36%|███▋      | 4655/12776 [47:49<1:47:33,  1.26it/s]                                                       36%|███▋      | 4655/12776 [47:49<1:47:33,  1.26it/s] 36%|███▋      | 4656/12776 [47:50<1:42:51,  1.32it/s]                                                       36%|███▋      | 4656/12776 [47:50<1:42:51,  1.32it/s] 36%|███▋      | 4657/12776 [47:50<1:40:48,  1.34it/s]                                                       36%|███▋      | 4657/12776 [47:50<1:40:48,  1.34it/s] 36%|███▋      | 4658/12776 [47:51<1:35:49,  1.41it/s]                                                       36%|███▋      | 4658/12776 [47:51<1:35:49,  1.41it/s] 36%|███▋      | 4659/12776 [47:52<1:30:47,  1.49it/s]                                                       36%|███▋      | 4659/12776 [47:52<1:30:47,  1.49it/s] 36%|███▋      | 4660/12776 [47:52<1:25:41,  1.58it/s]                                                       36%|███▋      | 4660/12776 [47:52<1:25:41,  1.58it/s] 36%|███▋      | 4661/12776 [47:53<1:21:56,  1.65it/s]                                                       36%|███▋      | 4661/12776 [47:53<1:21:56,  1.65it/s] 36%|███▋      | 4662/12776 [47:53<1:17:26,  1.75it/s]                                                       36%|███▋      | 4662/12776 [47:53<1:17:26,  1.75it/s] 36%|███▋      | 4663/12776 [47:54<1:14:54,  1.81it/s]                                                       36%|███▋      | 4663/12776 [47:54<1:14:54,  1.81it/s] 37%|███▋      | 4664/12776 [47:54<1:10:44,  1.91it/s]                                                       37%|███▋      | 4664/12776 [47:54<1:10:44,  1.91it/s] 37%|███▋      | 4665/12776 [47:55<1:09:51,  1.94it/s]                                                       37%|███▋      | 4665/12776 [47:55<1:09:51,  1.94it/s] 37%|███▋      | 4666/12776 [47:55<1:06:08,  2.04it/s]                                                       37%|███▋      | 4666/12776 [47:55<1:06:08,  2.04it/s] 37%|███▋      | 4667/12776 [47:56<1:02:40,  2.16it/s]                                                       37%|███▋      | 4667/12776 [47:56<1:02:40,  2.16it/s] 37%|███▋      | 4668/12776 [47:56<1:03:48,  2.12it/s]                                                       37%|███▋      | 4668/12776 [47:56<1:03:48,  2.12it/s] 37%|███▋      | 4669/12776 [47:56<1:00:09,  2.25it/s]                                                       37%|███▋      | 4669/12776 [47:56<1:00:09,  2.25it/s] 37%|███▋      | 4670/12776 [47:57<56:44,  2.38it/s]                                                       37%|███▋      | 4670/12776 [47:57<56:44,  2.38it/s] 37%|███▋      | 4671/12776 [47:57<58:54,  2.29it/s]                                                     37%|█��█▋      | 4671/12776 [47:57<58:54,  2.29it/s] 37%|███▋      | 4672/12776 [47:58<54:51,  2.46it/s]                                                     37%|███▋      | 4672/12776 [47:58<54:51,  2.46it/s] 37%|███▋      | 4673/12776 [47:58<52:02,  2.60it/s]                                                     37%|███▋      | 4673/12776 [47:58<52:02,  2.60it/s] 37%|███▋      | 4674/12776 [47:58<52:29,  2.57it/s]                                                     37%|███▋      | 4674/12776 [47:58<52:29,  2.57it/s] 37%|███▋      | 4675/12776 [47:59<48:47,  2.77it/s]                                                     37%|███▋      | 4675/12776 [47:59<48:47,  2.77it/s] 37%|███▋      | 4676/12776 [47:59<45:56,  2.94it/s]                                                     37%|███▋      | 4676/12776 [47:59<45:56,  2.94it/s] 37%|███▋      | 4677/12776 [47:59<47:29,  2.84it/s]                                                     37%|███▋      | 4677/12776 [47:59<47:29,  2.84it/s] 37%|███▋      | 4678/12776 [48:00<44:29,  3.03it/s]                                                     37%|███▋      | 4678/12776 [48:00<44:29,  3.03it/s] 37%|███▋      | 4679/12776 [48:00<41:59,  3.21it/s]                                                     37%|███▋      | 4679/12776 [48:00<41:59,  3.21it/s] 37%|███▋      | 4680/12776 [48:00<40:10,  3.36it/s]                                                     37%|███▋      | 4680/12776 [48:00<40:10,  3.36it/s] 37%|███▋      | 4681/12776 [48:00<41:22,  3.26it/s]                                                     37%|███▋      | 4681/12776 [48:00<41:22,  3.26it/s] 37%|███▋      | 4682/12776 [48:01<39:15,  3.44it/s]                                                     37%|███▋      | 4682/12776 [48:01<39:15,  3.44it/s] 37%|███▋      | 4683/12776 [48:01<37:21,  3.61it/s]                                                     37%|███▋      | 4683/12776 [48:01<37:21,  3.61it/s] 37%|███▋      | 4684/12776 [48:01<35:52,  3.76it/s]                                                     37%|███▋      | 4684/12776 [48:01<35:52,  3.76it/s] 37%|███▋      | 4685/12776 [48:01<34:28,  3.91it/s]                                                     37%|███▋      | 4685/12776 [48:01<34:28,  3.91it/s] 37%|███▋      | 4686/12776 [48:02<35:08,  3.84it/s]                                                     37%|███▋      | 4686/12776 [48:02<35:08,  3.84it/s] 37%|███▋      | 4687/12776 [48:02<33:18,  4.05it/s]                                                     37%|███▋      | 4687/12776 [48:02<33:18,  4.05it/s] 37%|███▋      | 4688/12776 [48:02<31:51,  4.23it/s]                                                     37%|███▋      | 4688/12776 [48:02<31:51,  4.23it/s] 37%|███▋      | 4689/12776 [48:02<30:41,  4.39it/s]                                                     37%|███▋      | 4689/12776 [48:02<30:41,  4.39it/s] 37%|███▋      | 4690/12776 [48:02<30:02,  4.49it/s]                                                     37%|███▋      | 4690/12776 [48:02<30:02,  4.49it/s] 37%|███▋      | 4691/12776 [48:03<31:25,  4.29it/s]                                                     37%|███▋      | 4691/12776 [48:03<31:25,  4.29it/s] 37%|███▋      | 4692/12776 [48:03<30:05,  4.48it/s]                                                     37%|███▋      | 4692/12776 [48:03<30:05,  4.48it/s] 37%|███▋      | 4693/12776 [48:03<29:08,  4.62it/s]                                                     37%|███▋      | 4693/12776 [48:03<29:08,  4.62it/s] 37%|███▋      | 4694/12776 [48:03<28:15,  4.77it/s]                                                     37%|███▋      | 4694/12776 [48:03<28:15,  4.77it/s] 37%|███▋      | 4695/12776 [48:04<27:36,  4.88it/s]                                                    {'loss': 0.8636, 'grad_norm': 1.6718294620513916, 'learning_rate': 0.00019960899315738025, 'epoch': 0.72}
+{'loss': 0.4851, 'grad_norm': 1.0531017780303955, 'learning_rate': 0.00019958455522971652, 'epoch': 0.72}
+{'loss': 0.4803, 'grad_norm': 1.2837929725646973, 'learning_rate': 0.00019956011730205275, 'epoch': 0.72}
+{'loss': 0.5956, 'grad_norm': 1.3181400299072266, 'learning_rate': 0.00019953567937438903, 'epoch': 0.72}
+{'loss': 0.6616, 'grad_norm': 1.248836874961853, 'learning_rate': 0.0001995112414467253, 'epoch': 0.72}
+{'loss': 0.2288, 'grad_norm': 1.2771812677383423, 'learning_rate': 0.00019948680351906156, 'epoch': 0.72}
+{'loss': 0.6624, 'grad_norm': 1.3296245336532593, 'learning_rate': 0.00019946236559139784, 'epoch': 0.72}
+{'loss': 0.6186, 'grad_norm': 1.7813029289245605, 'learning_rate': 0.0001994379276637341, 'epoch': 0.72}
+{'loss': 0.4384, 'grad_norm': 1.5180063247680664, 'learning_rate': 0.00019941348973607036, 'epoch': 0.72}
+{'loss': 0.6051, 'grad_norm': 1.4584667682647705, 'learning_rate': 0.00019938905180840664, 'epoch': 0.72}
+{'loss': 0.5041, 'grad_norm': 2.218289852142334, 'learning_rate': 0.0001993646138807429, 'epoch': 0.72}
+{'loss': 0.7868, 'grad_norm': 2.1602261066436768, 'learning_rate': 0.00019934017595307915, 'epoch': 0.72}
+{'loss': 0.72, 'grad_norm': 1.7537661790847778, 'learning_rate': 0.00019931573802541542, 'epoch': 0.72}
+{'loss': 1.0954, 'grad_norm': 3.2519748210906982, 'learning_rate': 0.0001992913000977517, 'epoch': 0.72}
+{'loss': 1.229, 'grad_norm': 2.7691283226013184, 'learning_rate': 0.00019926686217008795, 'epoch': 0.73}
+{'loss': 0.9008, 'grad_norm': 2.6838536262512207, 'learning_rate': 0.00019924242424242423, 'epoch': 0.73}
+{'loss': 1.4006, 'grad_norm': 2.501833915710449, 'learning_rate': 0.0001992179863147605, 'epoch': 0.73}
+{'loss': 0.8523, 'grad_norm': 3.2722952365875244, 'learning_rate': 0.00019919354838709673, 'epoch': 0.73}
+{'loss': 0.9759, 'grad_norm': 2.0954771041870117, 'learning_rate': 0.000199169110459433, 'epoch': 0.73}
+{'loss': 0.8356, 'grad_norm': 3.6169707775115967, 'learning_rate': 0.0001991446725317693, 'epoch': 0.73}
+{'loss': 0.3973, 'grad_norm': 1.3619242906570435, 'learning_rate': 0.00019912023460410554, 'epoch': 0.73}
+{'loss': 1.3629, 'grad_norm': 2.543351650238037, 'learning_rate': 0.00019909579667644182, 'epoch': 0.73}
+{'loss': 0.9438, 'grad_norm': 2.047996759414673, 'learning_rate': 0.0001990713587487781, 'epoch': 0.73}
+{'loss': 1.1399, 'grad_norm': 1.984495997428894, 'learning_rate': 0.00019904692082111435, 'epoch': 0.73}
+{'loss': 0.7897, 'grad_norm': 2.7449281215667725, 'learning_rate': 0.00019902248289345063, 'epoch': 0.73}
+{'loss': 0.9229, 'grad_norm': 2.5032405853271484, 'learning_rate': 0.0001989980449657869, 'epoch': 0.73}
+{'loss': 1.2118, 'grad_norm': 2.4411966800689697, 'learning_rate': 0.00019897360703812313, 'epoch': 0.73}
+{'loss': 0.4972, 'grad_norm': 2.0347883701324463, 'learning_rate': 0.0001989491691104594, 'epoch': 0.73}
+{'loss': 1.3068, 'grad_norm': 2.888296604156494, 'learning_rate': 0.0001989247311827957, 'epoch': 0.73}
+{'loss': 0.9385, 'grad_norm': 1.6684255599975586, 'learning_rate': 0.00019890029325513194, 'epoch': 0.73}
+{'loss': 1.1736, 'grad_norm': 3.1219706535339355, 'learning_rate': 0.00019887585532746822, 'epoch': 0.73}
+{'loss': 0.7565, 'grad_norm': 2.7629895210266113, 'learning_rate': 0.0001988514173998045, 'epoch': 0.73}
+{'loss': 1.5535, 'grad_norm': 2.8487377166748047, 'learning_rate': 0.00019882697947214075, 'epoch': 0.73}
+{'loss': 0.3549, 'grad_norm': 0.5729645490646362, 'learning_rate': 0.00019880254154447703, 'epoch': 0.73}
+{'loss': 0.2756, 'grad_norm': 0.7285500764846802, 'learning_rate': 0.00019877810361681328, 'epoch': 0.73}
+{'loss': 0.3588, 'grad_norm': 0.9222913384437561, 'learning_rate': 0.00019875366568914953, 'epoch': 0.73}
+{'loss': 0.3541, 'grad_norm': 0.7123448848724365, 'learning_rate': 0.0001987292277614858, 'epoch': 0.73}
+{'loss': 0.339, 'grad_norm': 0.6087605357170105, 'learning_rate': 0.00019870478983382208, 'epoch': 0.73}
+{'loss': 0.5141, 'grad_norm': 0.6646626591682434, 'learning_rate': 0.00019868035190615834, 'epoch': 0.73}
+{'loss': 0.3261, 'grad_norm': 0.6121235489845276, 'learning_rate': 0.00019865591397849461, 'epoch': 0.73}
+{'loss': 0.3063, 'grad_norm': 0.7372736930847168, 'learning_rate': 0.0001986314760508309, 'epoch': 0.73}
+{'loss': 0.4006, 'grad_norm': 2.8387651443481445, 'learning_rate': 0.00019860703812316712, 'epoch': 0.73}
+{'loss': 0.3821, 'grad_norm': 0.7166696190834045, 'learning_rate': 0.0001985826001955034, 'epoch': 0.73}
+{'loss': 0.3255, 'grad_norm': 0.9727258682250977, 'learning_rate': 0.00019855816226783967, 'epoch': 0.73}
+{'loss': 0.3748, 'grad_norm': 0.5781895518302917, 'learning_rate': 0.00019853372434017592, 'epoch': 0.73}
+{'loss': 0.3613, 'grad_norm': 0.8979054093360901, 'learning_rate': 0.0001985092864125122, 'epoch': 0.73}
+{'loss': 0.5699, 'grad_norm': 1.2119108438491821, 'learning_rate': 0.00019848484848484848, 'epoch': 0.73}
+{'loss': 0.4611, 'grad_norm': 1.14756441116333, 'learning_rate': 0.00019846041055718473, 'epoch': 0.73}
+{'loss': 0.3666, 'grad_norm': 0.9472888708114624, 'learning_rate': 0.000198435972629521, 'epoch': 0.73}
+{'loss': 2.9419, 'grad_norm': 4.8855695724487305, 'learning_rate': 0.0001984115347018573, 'epoch': 0.73}
+{'loss': 0.7695, 'grad_norm': 1.7084994316101074, 'learning_rate': 0.0001983870967741935, 'epoch': 0.73}
+{'loss': 0.5826, 'grad_norm': 1.1830832958221436, 'learning_rate': 0.0001983626588465298, 'epoch': 0.73}
+{'loss': 0.6607, 'grad_norm': 1.2845669984817505, 'learning_rate': 0.00019833822091886607, 'epoch': 0.73}
+{'loss': 0.7845, 'grad_norm': 1.136726975440979, 'learning_rate': 0.00019831378299120232, 'epoch': 0.73}
+{'loss': 0.5098, 'grad_norm': 1.3713886737823486, 'learning_rate': 0.0001982893450635386, 'epoch': 0.73}
+{'loss': 0.6415, 'grad_norm': 1.7736725807189941, 'learning_rate': 0.00019826490713587488, 'epoch': 0.73}
+{'loss': 0.5017, 'grad_norm': 1.0796916484832764, 'learning_rate': 0.00019824046920821113, 'epoch': 0.73}
+{'loss': 0.7534, 'grad_norm': 1.7601935863494873, 'learning_rate': 0.00019821603128054738, 'epoch': 0.73}
+{'loss': 0.7588, 'grad_norm': 1.5599554777145386, 'learning_rate': 0.00019819159335288366, 'epoch': 0.73}
+{'loss': 0.9686, 'grad_norm': 3.366999626159668, 'learning_rate': 0.0001981671554252199, 'epoch': 0.73}
+{'loss': 0.8723, 'grad_norm': 2.5471243858337402, 'learning_rate': 0.0001981427174975562, 'epoch': 0.73}
+{'loss': 0.6776, 'grad_norm': 1.7258960008621216, 'learning_rate': 0.00019811827956989247, 'epoch': 0.73}
+{'loss': 0.821, 'grad_norm': 2.3678998947143555, 'learning_rate': 0.00019809384164222872, 'epoch': 0.73}
+{'loss': 1.0205, 'grad_norm': 2.2252848148345947, 'learning_rate': 0.000198069403714565, 'epoch': 0.73}
+{'loss': 0.8085, 'grad_norm': 1.9169201850891113, 'learning_rate': 0.00019804496578690127, 'epoch': 0.73}
+{'loss': 0.6308, 'grad_norm': 2.3197524547576904, 'learning_rate': 0.0001980205278592375, 'epoch': 0.73}
+{'loss': 0.9189, 'grad_norm': 2.232663631439209, 'learning_rate': 0.00019799608993157378, 'epoch': 0.73}
+{'loss': 0.8978, 'grad_norm': 3.014984130859375, 'learning_rate': 0.00019797165200391006, 'epoch': 0.73}
+{'loss': 0.7204, 'grad_norm': 2.799757480621338, 'learning_rate': 0.0001979472140762463, 'epoch': 0.73}
+{'loss': 0.8103, 'grad_norm': 4.892414569854736, 'learning_rate': 0.00019792277614858259, 'epoch': 0.73}
+{'loss': 1.2881, 'grad_norm': 2.7434020042419434, 'learning_rate': 0.00019789833822091886, 'epoch': 0.73}
+{'loss': 1.6406, 'grad_norm': 2.714355945587158, 'learning_rate': 0.00019787390029325511, 'epoch': 0.73}
+{'loss': 1.2144, 'grad_norm': 2.0118868350982666, 'learning_rate': 0.0001978494623655914, 'epoch': 0.73}
+{'loss': 0.742, 'grad_norm': 1.1020734310150146, 'learning_rate': 0.00019782502443792767, 'epoch': 0.73}
+{'loss': 1.6325, 'grad_norm': 4.144432544708252, 'learning_rate': 0.0001978005865102639, 'epoch': 0.73}
+{'loss': 0.9883, 'grad_norm': 1.8533861637115479, 'learning_rate': 0.00019777614858260017, 'epoch': 0.73}
+{'loss': 1.4588, 'grad_norm': 2.328321695327759, 'learning_rate': 0.00019775171065493645, 'epoch': 0.73}
+ 37%|███▋      | 4695/12776 [48:04<27:36,  4.88it/s] 37%|███▋      | 4696/12776 [48:04<31:50,  4.23it/s]                                                     37%|███▋      | 4696/12776 [48:04<31:50,  4.23it/s] 37%|███▋      | 4697/12776 [48:04<29:58,  4.49it/s]                                                     37%|███▋      | 4697/12776 [48:04<29:58,  4.49it/s] 37%|███▋      | 4698/12776 [48:04<28:30,  4.72it/s]                                                     37%|███▋      | 4698/12776 [48:04<28:30,  4.72it/s] 37%|███▋      | 4699/12776 [48:04<27:18,  4.93it/s]                                                     37%|███▋      | 4699/12776 [48:04<27:18,  4.93it/s] 37%|███▋      | 4700/12776 [48:05<44:50,  3.00it/s]                                                     37%|███▋      | 4700/12776 [48:05<44:50,  3.00it/s] 37%|███▋      | 4701/12776 [48:06<1:26:16,  1.56it/s]                                                       37%|███▋      | 4701/12776 [48:06<1:26:16,  1.56it/s] 37%|███▋      | 4702/12776 [48:07<1:35:06,  1.41it/s]                                                       37%|███▋      | 4702/12776 [48:07<1:35:06,  1.41it/s] 37%|███▋      | 4703/12776 [48:08<1:39:39,  1.35it/s]                                                       37%|███▋      | 4703/12776 [48:08<1:39:39,  1.35it/s] 37%|███▋      | 4704/12776 [48:09<1:40:15,  1.34it/s]                                                       37%|███▋      | 4704/12776 [48:09<1:40:15,  1.34it/s] 37%|███▋      | 4705/12776 [48:10<1:38:49,  1.36it/s]                                                       37%|███▋      | 4705/12776 [48:10<1:38:49,  1.36it/s] 37%|███▋      | 4706/12776 [48:10<1:36:28,  1.39it/s]                                                       37%|███▋      | 4706/12776 [48:10<1:36:28,  1.39it/s] 37%|███▋      | 4707/12776 [48:11<1:34:20,  1.43it/s]                                                       37%|███▋      | 4707/12776 [48:11<1:34:20,  1.43it/s] 37%|███▋      | 4708/12776 [48:11<1:30:24,  1.49it/s]                                                       37%|███▋      | 4708/12776 [48:11<1:30:24,  1.49it/s] 37%|███▋      | 4709/12776 [48:12<1:26:22,  1.56it/s]                                                       37%|███▋      | 4709/12776 [48:12<1:26:22,  1.56it/s] 37%|███▋      | 4710/12776 [48:13<1:22:06,  1.64it/s]                                                       37%|███▋      | 4710/12776 [48:13<1:22:06,  1.64it/s] 37%|███▋      | 4711/12776 [48:13<1:21:14,  1.65it/s]                                                       37%|███▋      | 4711/12776 [48:13<1:21:14,  1.65it/s] 37%|███▋      | 4712/12776 [48:14<1:16:42,  1.75it/s]                                                       37%|███▋      | 4712/12776 [48:14<1:16:42,  1.75it/s] 37%|███▋      | 4713/12776 [48:14<1:14:42,  1.80it/s]                                                       37%|███▋      | 4713/12776 [48:14<1:14:42,  1.80it/s] 37%|███▋      | 4714/12776 [48:15<1:10:07,  1.92it/s]                                                       37%|███▋      | 4714/12776 [48:15<1:10:07,  1.92it/s] 37%|███▋      | 4715/12776 [48:15<1:09:32,  1.93it/s]                                                       37%|███▋      | 4715/12776 [48:15<1:09:32,  1.93it/s] 37%|███▋      | 4716/12776 [48:16<1:04:56,  2.07it/s]                                                       37%|███▋      | 4716/12776 [48:16<1:04:56,  2.07it/s] 37%|███▋      | 4717/12776 [48:16<1:01:05,  2.20it/s]                                                       37%|███▋      | 4717/12776 [48:16<1:01:05,  2.20it/s] 37%|███▋      | 4718/12776 [48:16<1:03:35,  2.11it/s]                                                       37%|███▋      | 4718/12776 [48:16<1:03:35,  2.11it/s] 37%|███▋      | 4719/12776 [48:17<59:08,  2.27it/s]                                                       37%|███▋      | 4719/12776 [48:17<59:08,  2.27it/s] 37%|███▋      | 4720/12776 [48:17<55:42,  2.41it/s]                                                     37%|███▋      | 4720/12776 [48:17<55:42,  2.41it/s] 37%|███▋      | 4721/12776 [48:18<56:30,  2.38it/s]                                                     37%|███▋      | 4721/12776 [48:18<56:30,  2.38it/s] 37%|███▋      | 4722/12776 [48:18<53:06,  2.53it/s]                                                     37%|███▋      | 4722/12776 [48:18<53:06,  2.53it/s] 37%|███▋      | 4723/12776 [48:18<50:19,  2.67it/s]                                                     37%|███▋      | 4723/12776 [48:18<50:19,  2.67it/s] 37%|███▋      | 4724/12776 [48:19<51:51,  2.59it/s]                                                     37%|███▋      | 4724/12776 [48:19<51:51,  2.59it/s] 37%|███▋      | 4725/12776 [48:19<48:09,  2.79it/s]                                                     37%|███▋      | 4725/12776 [48:19<48:09,  2.79it/s] 37%|███▋      | 4726/12776 [48:19<45:26,  2.95it/s]                                                     37%|███▋      | 4726/12776 [48:19<45:26,  2.95it/s] 37%|███▋      | 4727/12776 [48:20<47:26,  2.83it/s]                                                     37%|███▋      | 4727/12776 [48:20<47:26,  2.83it/s] 37%|███▋      | 4728/12776 [48:20<44:10,  3.04it/s]                                                     37%|███▋      | 4728/12776 [48:20<44:10,  3.04it/s] 37%|███▋      | 4729/12776 [48:20<41:42,  3.22it/s]                                                     37%|███▋      | 4729/12776 [48:20<41:42,  3.22it/s] 37%|███▋      | 4730/12776 [48:20<39:28,  3.40it/s]                                                     37%|███▋      | 4730/12776 [48:20<39:28,  3.40it/s] 37%|███▋      | 4731/12776 [48:21<40:22,  3.32it/s]                                                     37%|███▋      | 4731/12776 [48:21<40:22,  3.32it/s] 37%|███▋      | 4732/12776 [48:21<38:11,  3.51it/s]                                                     37%|███▋      | 4732/12776 [48:21<38:11,  3.51it/s] 37%|███▋      | 4733/12776 [48:21<36:28,  3.68it/s]                                                     37%|███▋      | 4733/12776 [48:21<36:28,  3.68it/s] 37%|███▋      | 4734/12776 [48:22<35:05,  3.82it/s]                                                     37%|███▋      | 4734/12776 [48:22<35:05,  3.82it/s] 37%|███▋      | 4735/12776 [48:22<33:45,  3.97it/s]                                                     37%|███▋      | 4735/12776 [48:22<33:45,  3.97it/s] 37%|███▋      | 4736/12776 [48:22<35:13,  3.80it/s]                                                     37%|███▋      | 4736/12776 [48:22<35:13,  3.80it/s] 37%|███▋      | 4737/12776 [48:22<33:25,  4.01it/s]                                                     37%|███▋      | 4737/12776 [48:22<33:25,  4.01it/s] 37%|███▋      | 4738/12776 [48:22<31:55,  4.20it/s]                                                     37%|███▋      | 4738/12776 [48:22<31:55,  4.20it/s] 37%|███▋      | 4739/12776 [48:23<30:43,  4.36it/s]                                                     37%|███▋      | 4739/12776 [48:23<30:43,  4.36it/s] 37%|███▋      | 4740/12776 [48:23<29:58,  4.47it/s]                                                     37%|███▋      | 4740/12776 [48:23<29:58,  4.47it/s] 37%|███▋      | 4741/12776 [48:23<32:56,  4.07it/s]                                                     37%|███▋      | 4741/12776 [48:23<32:56,  4.07it/s] 37%|███▋      | 4742/12776 [48:23<31:14,  4.29it/s]                                                     37%|███▋      | 4742/12776 [48:23<31:14,  4.29it/s] 37%|███▋      | 4743/12776 [48:24<29:53,  4.48it/s]                                                     37%|███▋      | 4743/12776 [48:24<29:53,  4.48it/s] 37%|███▋      | 4744/12776 [48:24<28:55,  4.63it/s]                                                     37%|███▋      | 4744/12776 [48:24<28:55,  4.63it/s] 37%|███▋      | 4745/12776 [48:24<28:05,  4.76it/s]                                                     37%|███▋      | 4745/12776 [48:24<28:05,  4.76it/s] 37%|███▋      | 4746/12776 [48:24<33:09,  4.04it/s]                                                     37%|███▋      | 4746/12776 [48:24<33:09,  4.04it/s] 37%|███▋      | 4747/12776 [48:24<30:52,  4.33it/s]                                                     37%|███▋      | 4747/12776 [48:24<30:52,  4.33it/s] 37%|███▋      | 4748/12776 [48:25<29:02,  4.61it/s]                                                     37%|███▋      | 4748/12776 [48:25<29:02,  4.61it/s] 37%|███▋      | 4749/12776 [48:25<27:45,  4.82it/s]                                                     37%|███▋      | 4749/12776 [48:25<27:45,  4.82it/s] 37%|███▋      | 4750/12776 [48:26<47:24,  2.82it/s]                                                     37%|███▋      | 4750/12776 [48:26<47:24,  2.82it/s] 37%|███▋      | 4751/12776 [48:27<1:32:20,  1.45it/s]                                                       37%|███▋      | 4751/12776 [48:27<1:32:20,  1.45it/s] 37%|███▋      | 4752/12776 [48:28<1:44:29,  1.28it/s]                                                       37%|███▋      | 4752/12776 [48:28<1:44:29,  1.28it/s] 37%|███▋      | 4753/12776 [48:29<1:47:19,  1.25it/s]                                                       37%|███▋      | 4753/12776 [48:29<1:47:19,  1.25it/s] 37%|███▋      | 4754/12776 [48:30<1:47:25,  1.24it/s]                                                       37%|███▋      | 4754/12776 [48:30<1:47:25,  1.24it/s] 37%|███▋      | 4755/12776 [48:30<1:45:44,  1.26it/s]                                                       37%|███▋      | 4755/12776 [48:30<1:45:44,  1.26it/s] 37%|███▋      | 4756/12776 [48:31<1:42:44,  1.30it/s]                                                       37%|███▋      | 4756/12776 [48:31<1:42:44,  1.30it/s] 37%|███▋      | 4757/12776 [48:32<1:38:35,  1.36it/s]                                                       37%|███▋      | 4757/12776 [48:32<1:38:35,  1.36it/s] 37%|███▋      | 4758/12776 [48:33<1:37:45,  1.37it/s]                                                       37%|███▋      | 4758/12776 [48:33<1:37:45,  1.37it/s] 37%|███▋      | 4759/12776 [48:33<1:32:56,  1.44it/s]                                                       37%|███▋      | 4759/12776 [48:33<1:32:56,  1.44it/s] 37%|███▋      | 4760/12776 [48:34<1:28:42,  1.51it/s]                                                       37%|███▋      | 4760/12776 [48:34<1:28:42,  1.51it/s] 37%|███▋      | 4761/12776 [48:34<1:24:15,  1.59it/s]                                                       37%|███▋      | 4761/12776 [48:34<1:24:15,  1.59it/s] 37%|███▋      | 4762/12776 [48:35<1:21:04,  1.65it/s]                                                       37%|███▋      | 4762/12776 [48:35<1:21:04,  1.65it/s] 37%|███▋      | 4763/12776 [48:35<1:16:56,  1.74it/s]                                                       37%|███▋      | 4763/12776 [48:35<1:16:56,  1.74it/s] 37%|███▋      | 4764/12776 [48:36<1:14:56,  1.78it/s]                                                       37%|███▋      | 4764/12776 [48:36<1:14:56,  1.78it/s] 37%|███▋      | 4765/12776 [48:36<1:10:07,  1.90it/s]                                                       37%|███▋      | 4765/12776 [48:36<1:10:07,  1.90it/s] 37%|███▋      | 4766/12776 [48:37<1:10:17,  1.90it/s]                                                       37%|███▋      | 4766/12776 [48:37<1:10:17,  1.90it/s] 37%|███▋      | 4767/12776 [48:37<1:05:24,  2.04it/s]                                                       37%|███▋      | 4767/12776 [48:37<1:05:24,  2.04it/s] 37%|███▋      | 4768/12776 [48:38<1:01:29,  2.17it/s]                                                       37%|███▋      | 4768/12776 [48:38<1:01:29,  2.17it/s] 37%|███▋      | 4769/12776 [48:38<1:03:51,  2.09it/s]                                                       37%|███▋      | 4769/12776 [48:38<1:03:51,  2.09it/s] 37%|███▋      | 4770/12776 [48:39<58:54,  2.26it/s]                                                       37%|███▋      | 4770/12776 [48:39<58:54,  2.26it/s] 37%|███▋      | 4771/12776 [48:39<55:15,  2.41it/s]                                                     37%|███▋      | 4771/12776 [48:39<55:15,  2.41it/s] 37%|███▋      | 4772/12776 [48:39<55:50,  2.39it/s]                                                    {'loss': 0.7339, 'grad_norm': 1.688111424446106, 'learning_rate': 0.0001977272727272727, 'epoch': 0.73}
+{'loss': 0.7413, 'grad_norm': 1.9149725437164307, 'learning_rate': 0.00019770283479960898, 'epoch': 0.74}
+{'loss': 0.5104, 'grad_norm': 1.6234050989151, 'learning_rate': 0.00019767839687194526, 'epoch': 0.74}
+{'loss': 1.3501, 'grad_norm': 2.461130142211914, 'learning_rate': 0.0001976539589442815, 'epoch': 0.74}
+{'loss': 0.9382, 'grad_norm': 1.6503095626831055, 'learning_rate': 0.00019762952101661776, 'epoch': 0.74}
+{'loss': 1.4864, 'grad_norm': 3.062802314758301, 'learning_rate': 0.00019760508308895404, 'epoch': 0.74}
+{'loss': 0.2212, 'grad_norm': 0.5734681487083435, 'learning_rate': 0.0001975806451612903, 'epoch': 0.74}
+{'loss': 0.2945, 'grad_norm': 0.5626726150512695, 'learning_rate': 0.00019755620723362657, 'epoch': 0.74}
+{'loss': 0.2841, 'grad_norm': 0.5201683044433594, 'learning_rate': 0.00019753176930596285, 'epoch': 0.74}
+{'loss': 0.5189, 'grad_norm': 0.7159972190856934, 'learning_rate': 0.0001975073313782991, 'epoch': 0.74}
+{'loss': 0.3791, 'grad_norm': 0.732814610004425, 'learning_rate': 0.00019748289345063538, 'epoch': 0.74}
+{'loss': 0.4011, 'grad_norm': 0.7638669610023499, 'learning_rate': 0.00019745845552297166, 'epoch': 0.74}
+{'loss': 0.3322, 'grad_norm': 0.7000142931938171, 'learning_rate': 0.00019743401759530788, 'epoch': 0.74}
+{'loss': 0.2646, 'grad_norm': 0.7838113307952881, 'learning_rate': 0.00019740957966764416, 'epoch': 0.74}
+{'loss': 0.4024, 'grad_norm': 0.7795481085777283, 'learning_rate': 0.00019738514173998044, 'epoch': 0.74}
+{'loss': 0.381, 'grad_norm': 0.6289814114570618, 'learning_rate': 0.0001973607038123167, 'epoch': 0.74}
+{'loss': 0.4522, 'grad_norm': 1.3623909950256348, 'learning_rate': 0.00019733626588465297, 'epoch': 0.74}
+{'loss': 0.4841, 'grad_norm': 1.1188793182373047, 'learning_rate': 0.00019731182795698925, 'epoch': 0.74}
+{'loss': 0.8086, 'grad_norm': 1.1815077066421509, 'learning_rate': 0.0001972873900293255, 'epoch': 0.74}
+{'loss': 0.6665, 'grad_norm': 1.2694467306137085, 'learning_rate': 0.00019726295210166178, 'epoch': 0.74}
+{'loss': 0.5785, 'grad_norm': 1.3122878074645996, 'learning_rate': 0.00019723851417399805, 'epoch': 0.74}
+{'loss': 0.4062, 'grad_norm': 1.089698314666748, 'learning_rate': 0.00019721407624633428, 'epoch': 0.74}
+{'loss': 0.455, 'grad_norm': 0.9487230181694031, 'learning_rate': 0.00019718963831867056, 'epoch': 0.74}
+{'loss': 0.6051, 'grad_norm': 1.9743801355361938, 'learning_rate': 0.00019716520039100683, 'epoch': 0.74}
+{'loss': 0.5583, 'grad_norm': 1.2036941051483154, 'learning_rate': 0.00019714076246334309, 'epoch': 0.74}
+{'loss': 0.562, 'grad_norm': 1.425595760345459, 'learning_rate': 0.00019711632453567936, 'epoch': 0.74}
+{'loss': 0.4964, 'grad_norm': 0.8784242272377014, 'learning_rate': 0.00019709188660801564, 'epoch': 0.74}
+{'loss': 0.411, 'grad_norm': 3.1630759239196777, 'learning_rate': 0.00019706744868035187, 'epoch': 0.74}
+{'loss': 0.2949, 'grad_norm': 1.366860270500183, 'learning_rate': 0.00019704301075268815, 'epoch': 0.74}
+{'loss': 0.7963, 'grad_norm': 1.7974003553390503, 'learning_rate': 0.00019701857282502442, 'epoch': 0.74}
+{'loss': 0.7553, 'grad_norm': 2.272477388381958, 'learning_rate': 0.00019699413489736067, 'epoch': 0.74}
+{'loss': 0.7692, 'grad_norm': 1.5582921504974365, 'learning_rate': 0.00019696969696969695, 'epoch': 0.74}
+{'loss': 0.6417, 'grad_norm': 2.499539613723755, 'learning_rate': 0.00019694525904203323, 'epoch': 0.74}
+{'loss': 0.8532, 'grad_norm': 2.7339725494384766, 'learning_rate': 0.00019692082111436948, 'epoch': 0.74}
+{'loss': 0.6366, 'grad_norm': 1.615174651145935, 'learning_rate': 0.00019689638318670576, 'epoch': 0.74}
+{'loss': 1.0497, 'grad_norm': 2.133688449859619, 'learning_rate': 0.00019687194525904204, 'epoch': 0.74}
+{'loss': 0.7695, 'grad_norm': 1.7480201721191406, 'learning_rate': 0.00019684750733137826, 'epoch': 0.74}
+{'loss': 1.0978, 'grad_norm': 3.059267282485962, 'learning_rate': 0.00019682306940371454, 'epoch': 0.74}
+{'loss': 1.1363, 'grad_norm': 2.2188549041748047, 'learning_rate': 0.00019679863147605082, 'epoch': 0.74}
+{'loss': 0.6612, 'grad_norm': 1.8795181512832642, 'learning_rate': 0.00019677419354838707, 'epoch': 0.74}
+{'loss': 0.987, 'grad_norm': 1.7549196481704712, 'learning_rate': 0.00019674975562072335, 'epoch': 0.74}
+{'loss': 1.1045, 'grad_norm': 2.692260265350342, 'learning_rate': 0.00019672531769305963, 'epoch': 0.74}
+{'loss': 1.2261, 'grad_norm': 2.230609893798828, 'learning_rate': 0.00019670087976539588, 'epoch': 0.74}
+{'loss': 1.0071, 'grad_norm': 3.0220465660095215, 'learning_rate': 0.00019667644183773216, 'epoch': 0.74}
+{'loss': 1.1151, 'grad_norm': 1.864869236946106, 'learning_rate': 0.00019665200391006844, 'epoch': 0.74}
+{'loss': 1.2847, 'grad_norm': 5.262318134307861, 'learning_rate': 0.00019662756598240466, 'epoch': 0.74}
+{'loss': 1.131, 'grad_norm': 2.782365083694458, 'learning_rate': 0.00019660312805474094, 'epoch': 0.74}
+{'loss': 1.2404, 'grad_norm': 1.8691052198410034, 'learning_rate': 0.00019657869012707722, 'epoch': 0.74}
+{'loss': 0.7896, 'grad_norm': 1.3026301860809326, 'learning_rate': 0.00019655425219941347, 'epoch': 0.74}
+{'loss': 1.0242, 'grad_norm': 2.631027936935425, 'learning_rate': 0.00019652981427174975, 'epoch': 0.74}
+{'loss': 1.4005, 'grad_norm': 2.3272483348846436, 'learning_rate': 0.00019650537634408603, 'epoch': 0.74}
+{'loss': 0.7136, 'grad_norm': 2.3887293338775635, 'learning_rate': 0.00019648093841642225, 'epoch': 0.74}
+{'loss': 0.6281, 'grad_norm': 1.8265565633773804, 'learning_rate': 0.00019645650048875853, 'epoch': 0.74}
+{'loss': 1.017, 'grad_norm': 2.172302007675171, 'learning_rate': 0.0001964320625610948, 'epoch': 0.74}
+{'loss': 1.1234, 'grad_norm': 2.7405848503112793, 'learning_rate': 0.00019640762463343106, 'epoch': 0.74}
+{'loss': 0.8493, 'grad_norm': 1.8919093608856201, 'learning_rate': 0.00019638318670576734, 'epoch': 0.74}
+{'loss': 0.338, 'grad_norm': 0.5198003053665161, 'learning_rate': 0.00019635874877810361, 'epoch': 0.74}
+{'loss': 0.253, 'grad_norm': 0.49822479486465454, 'learning_rate': 0.00019633431085043987, 'epoch': 0.74}
+{'loss': 0.2911, 'grad_norm': 0.4993680417537689, 'learning_rate': 0.00019630987292277614, 'epoch': 0.74}
+{'loss': 0.3687, 'grad_norm': 0.7572706341743469, 'learning_rate': 0.00019628543499511242, 'epoch': 0.74}
+{'loss': 0.2211, 'grad_norm': 0.4129246175289154, 'learning_rate': 0.00019626099706744865, 'epoch': 0.74}
+{'loss': 0.3346, 'grad_norm': 0.4937713146209717, 'learning_rate': 0.00019623655913978492, 'epoch': 0.74}
+{'loss': 0.3343, 'grad_norm': 0.7381402850151062, 'learning_rate': 0.0001962121212121212, 'epoch': 0.74}
+{'loss': 0.3558, 'grad_norm': 0.9552004933357239, 'learning_rate': 0.00019618768328445745, 'epoch': 0.74}
+{'loss': 0.3339, 'grad_norm': 0.7461534142494202, 'learning_rate': 0.00019616324535679373, 'epoch': 0.74}
+{'loss': 0.4365, 'grad_norm': 0.7386569976806641, 'learning_rate': 0.00019613880742913, 'epoch': 0.75}
+{'loss': 0.3719, 'grad_norm': 0.824906587600708, 'learning_rate': 0.00019611436950146626, 'epoch': 0.75}
+{'loss': 0.3857, 'grad_norm': 2.553926467895508, 'learning_rate': 0.00019608993157380254, 'epoch': 0.75}
+{'loss': 0.6648, 'grad_norm': 1.7398136854171753, 'learning_rate': 0.0001960654936461388, 'epoch': 0.75}
+{'loss': 0.5358, 'grad_norm': 1.5790472030639648, 'learning_rate': 0.00019604105571847504, 'epoch': 0.75}
+{'loss': 0.3689, 'grad_norm': 0.9220511317253113, 'learning_rate': 0.00019601661779081132, 'epoch': 0.75}
+{'loss': 1.2123, 'grad_norm': 2.0262274742126465, 'learning_rate': 0.0001959921798631476, 'epoch': 0.75}
+{'loss': 0.5389, 'grad_norm': 0.8574962615966797, 'learning_rate': 0.00019596774193548385, 'epoch': 0.75}
+{'loss': 0.5364, 'grad_norm': 0.822786271572113, 'learning_rate': 0.00019594330400782013, 'epoch': 0.75}
+{'loss': 0.8335, 'grad_norm': 1.0409291982650757, 'learning_rate': 0.0001959188660801564, 'epoch': 0.75}
+{'loss': 0.5052, 'grad_norm': 0.9173699617385864, 'learning_rate': 0.00019589442815249263, 'epoch': 0.75}
+{'loss': 0.6807, 'grad_norm': 1.4677929878234863, 'learning_rate': 0.0001958699902248289, 'epoch': 0.75}
+ 37%|███▋      | 4772/12776 [48:39<55:50,  2.39it/s] 37%|███▋      | 4773/12776 [48:40<52:21,  2.55it/s]                                                     37%|███▋      | 4773/12776 [48:40<52:21,  2.55it/s] 37%|███▋      | 4774/12776 [48:40<49:04,  2.72it/s]                                                     37%|███▋      | 4774/12776 [48:40<49:04,  2.72it/s] 37%|███▋      | 4775/12776 [48:40<47:49,  2.79it/s]                                                     37%|███▋      | 4775/12776 [48:40<47:49,  2.79it/s] 37%|███▋      | 4776/12776 [48:41<45:22,  2.94it/s]                                                     37%|███▋      | 4776/12776 [48:41<45:22,  2.94it/s] 37%|███▋      | 4777/12776 [48:41<43:16,  3.08it/s]                                                     37%|███▋      | 4777/12776 [48:41<43:16,  3.08it/s] 37%|███▋      | 4778/12776 [48:41<41:25,  3.22it/s]                                                     37%|███▋      | 4778/12776 [48:41<41:25,  3.22it/s] 37%|███▋      | 4779/12776 [48:42<45:02,  2.96it/s]                                                     37%|███▋      | 4779/12776 [48:42<45:02,  2.96it/s] 37%|███▋      | 4780/12776 [48:42<42:06,  3.17it/s]                                                     37%|███▋      | 4780/12776 [48:42<42:06,  3.17it/s] 37%|███▋      | 4781/12776 [48:42<39:35,  3.37it/s]                                                     37%|███▋      | 4781/12776 [48:42<39:35,  3.37it/s] 37%|███▋      | 4782/12776 [48:42<37:40,  3.54it/s]                                                     37%|███▋      | 4782/12776 [48:42<37:40,  3.54it/s] 37%|███▋      | 4783/12776 [48:43<39:56,  3.34it/s]                                                     37%|███▋      | 4783/12776 [48:43<39:56,  3.34it/s] 37%|███▋      | 4784/12776 [48:43<37:29,  3.55it/s]                                                     37%|███▋      | 4784/12776 [48:43<37:29,  3.55it/s] 37%|███▋      | 4785/12776 [48:43<35:31,  3.75it/s]                                                     37%|███▋      | 4785/12776 [48:43<35:31,  3.75it/s] 37%|███▋      | 4786/12776 [48:43<33:56,  3.92it/s]                                                     37%|███▋      | 4786/12776 [48:43<33:56,  3.92it/s] 37%|███▋      | 4787/12776 [48:44<36:35,  3.64it/s]                                                     37%|███▋      | 4787/12776 [48:44<36:35,  3.64it/s] 37%|███▋      | 4788/12776 [48:44<34:10,  3.90it/s]                                                     37%|███▋      | 4788/12776 [48:44<34:10,  3.90it/s] 37%|███▋      | 4789/12776 [48:44<32:19,  4.12it/s]                                                     37%|███▋      | 4789/12776 [48:44<32:19,  4.12it/s] 37%|███▋      | 4790/12776 [48:44<31:03,  4.29it/s]                                                     37%|███▋      | 4790/12776 [48:44<31:03,  4.29it/s] 38%|███▊      | 4791/12776 [48:45<29:59,  4.44it/s]                                                     38%|███▊      | 4791/12776 [48:45<29:59,  4.44it/s] 38%|███▊      | 4792/12776 [48:45<33:36,  3.96it/s]                                                     38%|███▊      | 4792/12776 [48:45<33:36,  3.96it/s] 38%|███▊      | 4793/12776 [48:45<31:29,  4.22it/s]                                                     38%|███▊      | 4793/12776 [48:45<31:29,  4.22it/s] 38%|███▊      | 4794/12776 [48:45<29:55,  4.45it/s]                                                     38%|███▊      | 4794/12776 [48:45<29:55,  4.45it/s] 38%|███▊      | 4795/12776 [48:45<28:48,  4.62it/s]                                                     38%|███▊      | 4795/12776 [48:45<28:48,  4.62it/s] 38%|███▊      | 4796/12776 [48:46<27:52,  4.77it/s]                                                     38%|███▊      | 4796/12776 [48:46<27:52,  4.77it/s] 38%|███▊      | 4797/12776 [48:46<27:09,  4.90it/s]                                                     38%|███▊      | 4797/12776 [48:46<27:09,  4.90it/s] 38%|███▊      | 4798/12776 [48:46<30:24,  4.37it/s]                                                     38%|███▊      | 4798/12776 [48:46<30:24,  4.37it/s] 38%|███▊      | 4799/12776 [48:46<28:41,  4.63it/s]                                                     38%|███▊      | 4799/12776 [48:46<28:41,  4.63it/s] 38%|███▊      | 4800/12776 [48:47<53:42,  2.48it/s]                                                     38%|███▊      | 4800/12776 [48:47<53:42,  2.48it/s]Saving model checkpoint to ./checkpoint-4800
+Configuration saved in ./checkpoint-4800/config.json
+Model weights saved in ./checkpoint-4800/model.safetensors
+Feature extractor saved in ./checkpoint-4800/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-4800/tokenizer_config.json
+Special tokens file saved in ./checkpoint-4800/special_tokens_map.json
+added tokens file saved in ./checkpoint-4800/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-3600] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 38%|███▊      | 4801/12776 [48:54<5:14:37,  2.37s/it]                                                       38%|███▊      | 4801/12776 [48:54<5:14:37,  2.37s/it] 38%|███▊      | 4802/12776 [48:55<4:23:34,  1.98s/it]                                                       38%|███▊      | 4802/12776 [48:55<4:23:34,  1.98s/it] 38%|███▊      | 4803/12776 [48:56<3:37:03,  1.63s/it]                                                       38%|███▊      | 4803/12776 [48:56<3:37:03,  1.63s/it] 38%|███▊      | 4804/12776 [48:57<3:01:24,  1.37s/it]                                                       38%|███▊      | 4804/12776 [48:57<3:01:24,  1.37s/it] 38%|███▊      | 4805/12776 [48:57<2:36:25,  1.18s/it]                                                       38%|███▊      | 4805/12776 [48:57<2:36:25,  1.18s/it] 38%|███▊      | 4806/12776 [48:58<2:15:42,  1.02s/it]                                                       38%|███▊      | 4806/12776 [48:58<2:15:42,  1.02s/it] 38%|███▊      | 4807/12776 [48:59<2:04:22,  1.07it/s]                                                       38%|███▊      | 4807/12776 [48:59<2:04:22,  1.07it/s] 38%|███▊      | 4808/12776 [48:59<1:51:05,  1.20it/s]                                                       38%|███▊      | 4808/12776 [48:59<1:51:05,  1.20it/s] 38%|███▊      | 4809/12776 [49:00<1:43:20,  1.28it/s]                                                       38%|███▊      | 4809/12776 [49:00<1:43:20,  1.28it/s] 38%|███▊      | 4810/12776 [49:01<1:34:04,  1.41it/s]                                                       38%|███▊      | 4810/12776 [49:01<1:34:04,  1.41it/s] 38%|███▊      | 4811/12776 [49:01<1:28:38,  1.50it/s]                                                       38%|███▊      | 4811/12776 [49:01<1:28:38,  1.50it/s] 38%|███▊      | 4812/12776 [49:02<1:22:23,  1.61it/s]                                                       38%|███▊      | 4812/12776 [49:02<1:22:23,  1.61it/s] 38%|███▊      | 4813/12776 [49:02<1:19:08,  1.68it/s]                                                       38%|███▊      | 4813/12776 [49:02<1:19:08,  1.68it/s] 38%|███▊      | 4814/12776 [49:03<1:13:45,  1.80it/s]                                                       38%|███▊      | 4814/12776 [49:03<1:13:45,  1.80it/s] 38%|███▊      | 4815/12776 [49:03<1:11:16,  1.86it/s]                                                       38%|███▊      | 4815/12776 [49:03<1:11:16,  1.86it/s] 38%|███▊      | 4816/12776 [49:04<1:06:05,  2.01it/s]                                                       38%|███▊      | 4816/12776 [49:04<1:06:05,  2.01it/s] 38%|███▊      | 4817/12776 [49:04<1:02:01,  2.14it/s]                                                       38%|███▊      | 4817/12776 [49:04<1:02:01,  2.14it/s] 38%|███▊      | 4818/12776 [49:05<1:03:42,  2.08it/s]                                                       38%|███▊      | 4818/12776 [49:05<1:03:42,  2.08it/s] 38%|███▊      | 4819/12776 [49:05<58:54,  2.25it/s]                                                       38%|███▊      | 4819/12776 [49:05<58:54,  2.25it/s] 38%|███▊      | 4820/12776 [49:05<55:05,  2.41it/s]                                                     38%|███▊      | 4820/12776 [49:05<55:05,  2.41it/s] 38%|███▊      | 4821/12776 [49:06<55:08,  2.40it/s]                                                     38%|███▊      | 4821/12776 [49:06<55:08,  2.40it/s] 38%|███▊      | 4822/12776 [49:06<51:19,  2.58it/s]                                                     38%|███▊      | 4822/12776 [49:06<51:19,  2.58it/s] 38%|███▊      | 4823/12776 [49:06<48:14,  2.75it/s]                                                     38%|███▊      | 4823/12776 [49:06<48:14,  2.75it/s] 38%|███▊      | 4824/12776 [49:07<49:23,  2.68it/s]                                                     38%|███▊      | 4824/12776 [49:07<49:23,  2.68it/s] 38%|███▊      | 4825/12776 [49:07<45:52,  2.89it/s]                                                     38%|███▊      | 4825/12776 [49:07<45:52,  2.89it/s] 38%|███▊      | 4826/12776 [49:07<43:12,  3.07it/s]                                                     38%|███▊      | 4826/12776 [49:07<43:12,  3.07it/s] 38%|███▊      | 4827/12776 [49:08<40:54,  3.24it/s]                                                     38%|███▊      | 4827/12776 [49:08<40:54,  3.24it/s] 38%|███▊      | 4828/12776 [49:08<41:41,  3.18it/s]                                                     38%|███▊      | 4828/12776 [49:08<41:41,  3.18it/s] 38%|███▊      | 4829/12776 [49:08<39:34,  3.35it/s]                                                     38%|███▊      | 4829/12776 [49:08<39:34,  3.35it/s] 38%|███▊      | 4830/12776 [49:08<37:42,  3.51it/s]                                                     38%|███▊      | 4830/12776 [49:08<37:42,  3.51it/s] 38%|███▊      | 4831/12776 [49:09<36:03,  3.67it/s]                                                     38%|███▊      | 4831/12776 [49:09<36:03,  3.67it/s] 38%|███▊      | 4832/12776 [49:09<34:31,  3.84it/s]                                                     38%|███▊      | 4832/12776 [49:09<34:31,  3.84it/s] 38%|███▊      | 4833/12776 [49:09<35:07,  3.77it/s]                                                     38%|███▊      | 4833/12776 [49:09<35:07,  3.77it/s] 38%|███▊      | 4834/12776 [49:09<33:22,  3.97it/s]                                                     38%|███▊      | 4834/12776 [49:09<33:22,  3.97it/s] 38%|███▊      | 4835/12776 [49:10<31:57,  4.14it/s]                                                     38%|███▊      | 4835/12776 [49:10<31:57,  4.14it/s] 38%|███▊      | 4836/12776 [49:10<30:40,  4.31it/s]                                                     38%|███▊      | 4836/12776 [49:10<30:40,  4.31it/s] 38%|███▊      | 4837/12776 [49:10<29:24,  4.50it/s]                                                     38%|███▊      | 4837/12776 [49:10<29:24,  4.50it/s] 38%|███▊      | 4838/12776 [49:10<31:32,  4.19it/s]                                                     38%|███▊      | 4838/12776 [49:10<31:32,  4.19it/s] 38%|███▊      | 4839/12776 [49:10<29:42,  4.45it/s]                                                     38%|███▊      | 4839/12776 [49:10<29:42,  4.45it/s] 38%|███▊      | 4840/12776 [49:11<28:24,  4.65it/s]                                                     38%|███▊      | 4840/12776 [49:11<28:24,  4.65it/s] 38%|███▊      | 4841/12776 [49:11<27:22,  4.83it/s]                                                     38%|███▊      | 4841/12776 [49:11<27:22,  4.83it/s] 38%|███▊      | 4842/12776 [49:11<26:29,  4.99it/s]                                                     38%|███▊      | 4842/12776 [49:11<26:29,  4.99it/s] 38%|███▊      | 4843/12776 [49:11<29:58,  4.41it/s]                                                     38%|███▊      | 4843/12776 [49:11<29:58,  4.41it/s] 38%|███▊      | 4844/12776 [49:11<28:02,  4.72it/s]                                                     38%|███▊      | 4844/12776 [49:11<28:02,  4.72it/s] 38%|███▊      | 4845/12776 [49:12<26:32,  4.98it/s]                                                     38%|███▊      | 4845/12776 [49:12<26:32,  4.98it/s] 38%|███▊      | 4846/12776 [49:12<25:25,  5.20it/s]                                                     38%|███▊      | 4846/12776 [49:12<25:25,  5.20it/s] 38%|███▊      | 4847/12776 [49:12<24:38,  5.36it/s]                                                     38%|███▊      | 4847/12776 [49:12<24:38,  5.36it/s] 38%|███▊      | 4848/12776 [49:12<23:50,  5.54it/s]                                                     38%|███▊      | 4848/12776 [49:12<23:50,  5.54it/s] 38%|███▊      | 4849/12776 [49:12<27:20,  4.83it/s]                                                    {'loss': 0.6056, 'grad_norm': 1.5798442363739014, 'learning_rate': 0.0001958455522971652, 'epoch': 0.75}
+{'loss': 0.5881, 'grad_norm': 1.009047508239746, 'learning_rate': 0.00019582111436950144, 'epoch': 0.75}
+{'loss': 0.5304, 'grad_norm': 2.4256644248962402, 'learning_rate': 0.00019579667644183772, 'epoch': 0.75}
+{'loss': 0.6864, 'grad_norm': 2.2748332023620605, 'learning_rate': 0.000195772238514174, 'epoch': 0.75}
+{'loss': 0.7061, 'grad_norm': 2.184377431869507, 'learning_rate': 0.00019574780058651025, 'epoch': 0.75}
+{'loss': 0.8436, 'grad_norm': 2.4698145389556885, 'learning_rate': 0.00019572336265884653, 'epoch': 0.75}
+{'loss': 0.4022, 'grad_norm': 0.8937560319900513, 'learning_rate': 0.0001956989247311828, 'epoch': 0.75}
+{'loss': 0.6108, 'grad_norm': 1.8450487852096558, 'learning_rate': 0.00019567448680351903, 'epoch': 0.75}
+{'loss': 0.7764, 'grad_norm': 1.5226956605911255, 'learning_rate': 0.0001956500488758553, 'epoch': 0.75}
+{'loss': 0.9701, 'grad_norm': 3.9673829078674316, 'learning_rate': 0.00019562561094819159, 'epoch': 0.75}
+{'loss': 0.887, 'grad_norm': 1.7222696542739868, 'learning_rate': 0.00019560117302052784, 'epoch': 0.75}
+{'loss': 0.993, 'grad_norm': 3.377755641937256, 'learning_rate': 0.00019557673509286411, 'epoch': 0.75}
+{'loss': 1.0556, 'grad_norm': 2.456941604614258, 'learning_rate': 0.0001955522971652004, 'epoch': 0.75}
+{'loss': 1.115, 'grad_norm': 3.0926427841186523, 'learning_rate': 0.00019552785923753664, 'epoch': 0.75}
+{'loss': 1.5124, 'grad_norm': 5.375258445739746, 'learning_rate': 0.00019550342130987292, 'epoch': 0.75}
+{'loss': 1.1732, 'grad_norm': 2.7964589595794678, 'learning_rate': 0.00019547898338220917, 'epoch': 0.75}
+{'loss': 1.0657, 'grad_norm': 1.498062014579773, 'learning_rate': 0.00019545454545454543, 'epoch': 0.75}
+{'loss': 1.1077, 'grad_norm': 6.085139751434326, 'learning_rate': 0.0001954301075268817, 'epoch': 0.75}
+{'loss': 1.4877, 'grad_norm': 3.144690990447998, 'learning_rate': 0.00019540566959921798, 'epoch': 0.75}
+{'loss': 1.4845, 'grad_norm': 3.460711717605591, 'learning_rate': 0.00019538123167155423, 'epoch': 0.75}
+{'loss': 1.4173, 'grad_norm': 2.4659714698791504, 'learning_rate': 0.0001953567937438905, 'epoch': 0.75}
+{'loss': 1.5278, 'grad_norm': 2.4354169368743896, 'learning_rate': 0.0001953323558162268, 'epoch': 0.75}
+{'loss': 1.5933, 'grad_norm': 4.406135559082031, 'learning_rate': 0.00019530791788856301, 'epoch': 0.75}
+{'loss': 1.278, 'grad_norm': 2.1102359294891357, 'learning_rate': 0.0001952834799608993, 'epoch': 0.75}
+{'loss': 0.439, 'grad_norm': 1.0757032632827759, 'learning_rate': 0.00019525904203323557, 'epoch': 0.75}
+{'loss': 0.5876, 'grad_norm': 1.6576645374298096, 'learning_rate': 0.00019523460410557182, 'epoch': 0.75}
+{'loss': 1.1348, 'grad_norm': 2.0612480640411377, 'learning_rate': 0.0001952101661779081, 'epoch': 0.75}
+{'loss': 1.1104, 'grad_norm': 4.78619384765625, 'learning_rate': 0.00019518572825024438, 'epoch': 0.75}
+{'loss': 1.3444, 'grad_norm': 2.2526986598968506, 'learning_rate': 0.00019516129032258063, 'epoch': 0.75}
+{'loss': 0.322, 'grad_norm': 0.41874295473098755, 'learning_rate': 0.0001951368523949169, 'epoch': 0.75}
+{'loss': 0.2551, 'grad_norm': 0.5169926285743713, 'learning_rate': 0.0001951124144672532, 'epoch': 0.75}
+{'loss': 0.3832, 'grad_norm': 0.8094984292984009, 'learning_rate': 0.0001950879765395894, 'epoch': 0.75}
+{'loss': 0.2254, 'grad_norm': 0.5564207434654236, 'learning_rate': 0.0001950635386119257, 'epoch': 0.75}
+{'loss': 0.3473, 'grad_norm': 0.42472612857818604, 'learning_rate': 0.00019503910068426197, 'epoch': 0.75}
+{'loss': 0.3752, 'grad_norm': 0.7204293012619019, 'learning_rate': 0.00019501466275659822, 'epoch': 0.75}
+{'loss': 0.4892, 'grad_norm': 1.2864282131195068, 'learning_rate': 0.0001949902248289345, 'epoch': 0.75}
+{'loss': 0.4192, 'grad_norm': 0.8378409743309021, 'learning_rate': 0.00019496578690127078, 'epoch': 0.75}
+{'loss': 0.3945, 'grad_norm': 0.6147698163986206, 'learning_rate': 0.00019494134897360703, 'epoch': 0.75}
+{'loss': 0.2257, 'grad_norm': 0.4717334508895874, 'learning_rate': 0.0001949169110459433, 'epoch': 0.75}
+{'loss': 0.597, 'grad_norm': 1.3184202909469604, 'learning_rate': 0.00019489247311827956, 'epoch': 0.75}
+{'loss': 0.4335, 'grad_norm': 1.2259258031845093, 'learning_rate': 0.0001948680351906158, 'epoch': 0.75}
+{'loss': 0.3843, 'grad_norm': 0.6783806085586548, 'learning_rate': 0.00019484359726295209, 'epoch': 0.75}
+{'loss': 0.3306, 'grad_norm': 1.3408708572387695, 'learning_rate': 0.00019481915933528836, 'epoch': 0.75}
+{'loss': 0.4586, 'grad_norm': 0.9002304673194885, 'learning_rate': 0.00019479472140762462, 'epoch': 0.75}
+{'loss': 0.5046, 'grad_norm': 1.12655770778656, 'learning_rate': 0.0001947702834799609, 'epoch': 0.75}
+{'loss': 0.5049, 'grad_norm': 0.9550591111183167, 'learning_rate': 0.00019474584555229717, 'epoch': 0.75}
+{'loss': 0.5001, 'grad_norm': 1.1619162559509277, 'learning_rate': 0.0001947214076246334, 'epoch': 0.75}
+{'loss': 0.3228, 'grad_norm': 0.7153265476226807, 'learning_rate': 0.00019469696969696967, 'epoch': 0.75}
+{'loss': 0.3079, 'grad_norm': 0.8718234300613403, 'learning_rate': 0.00019467253176930595, 'epoch': 0.75}
+{'loss': 0.5131, 'grad_norm': 1.3940988779067993, 'learning_rate': 0.0001946480938416422, 'epoch': 0.75}
+{'loss': 0.6426, 'grad_norm': 1.4768911600112915, 'learning_rate': 0.00019462365591397848, 'epoch': 0.75}
+{'loss': 0.5, 'grad_norm': 1.1347852945327759, 'learning_rate': 0.00019459921798631476, 'epoch': 0.76}
+{'loss': 0.7089, 'grad_norm': 2.5851755142211914, 'learning_rate': 0.000194574780058651, 'epoch': 0.76}
+{'loss': 0.7053, 'grad_norm': 1.3372689485549927, 'learning_rate': 0.0001945503421309873, 'epoch': 0.76}
+{'loss': 0.6847, 'grad_norm': 2.5522773265838623, 'learning_rate': 0.00019452590420332357, 'epoch': 0.76}
+{'loss': 0.5898, 'grad_norm': 1.1774206161499023, 'learning_rate': 0.0001945014662756598, 'epoch': 0.76}
+{'loss': 0.529, 'grad_norm': 1.0767072439193726, 'learning_rate': 0.00019447702834799607, 'epoch': 0.76}
+{'loss': 0.4727, 'grad_norm': 1.9729124307632446, 'learning_rate': 0.00019445259042033235, 'epoch': 0.76}
+{'loss': 1.1988, 'grad_norm': 5.122827053070068, 'learning_rate': 0.0001944281524926686, 'epoch': 0.76}
+{'loss': 0.8946, 'grad_norm': 1.9116922616958618, 'learning_rate': 0.00019440371456500488, 'epoch': 0.76}
+{'loss': 1.0248, 'grad_norm': 2.4748942852020264, 'learning_rate': 0.00019437927663734116, 'epoch': 0.76}
+{'loss': 1.1779, 'grad_norm': 2.4819397926330566, 'learning_rate': 0.0001943548387096774, 'epoch': 0.76}
+{'loss': 0.7033, 'grad_norm': 2.278400421142578, 'learning_rate': 0.00019433040078201366, 'epoch': 0.76}
+{'loss': 0.8813, 'grad_norm': 2.6924045085906982, 'learning_rate': 0.00019430596285434994, 'epoch': 0.76}
+{'loss': 0.9923, 'grad_norm': 1.8537712097167969, 'learning_rate': 0.0001942815249266862, 'epoch': 0.76}
+{'loss': 0.8471, 'grad_norm': 3.0504987239837646, 'learning_rate': 0.00019425708699902247, 'epoch': 0.76}
+{'loss': 1.0216, 'grad_norm': 3.1439967155456543, 'learning_rate': 0.00019423264907135875, 'epoch': 0.76}
+{'loss': 0.8951, 'grad_norm': 4.802983283996582, 'learning_rate': 0.000194208211143695, 'epoch': 0.76}
+{'loss': 1.1434, 'grad_norm': 2.6184773445129395, 'learning_rate': 0.00019418377321603128, 'epoch': 0.76}
+{'loss': 1.3546, 'grad_norm': 4.851845741271973, 'learning_rate': 0.00019415933528836755, 'epoch': 0.76}
+{'loss': 1.5022, 'grad_norm': 2.406771421432495, 'learning_rate': 0.00019413489736070378, 'epoch': 0.76}
+{'loss': 1.6779, 'grad_norm': 3.594590663909912, 'learning_rate': 0.00019411045943304006, 'epoch': 0.76}
+{'loss': 1.8704, 'grad_norm': 2.279587984085083, 'learning_rate': 0.00019408602150537634, 'epoch': 0.76}
+{'loss': 1.3369, 'grad_norm': 2.240976333618164, 'learning_rate': 0.0001940615835777126, 'epoch': 0.76}
+{'loss': 0.6641, 'grad_norm': 2.4125685691833496, 'learning_rate': 0.00019403714565004886, 'epoch': 0.76}
+{'loss': 0.9713, 'grad_norm': 3.437535285949707, 'learning_rate': 0.00019401270772238514, 'epoch': 0.76}
+{'loss': 0.6326, 'grad_norm': 5.112056732177734, 'learning_rate': 0.0001939882697947214, 'epoch': 0.76}
+ 38%|███▊      | 4849/12776 [49:12<27:20,  4.83it/s] 38%|███▊      | 4850/12776 [49:13<44:43,  2.95it/s]                                                     38%|███▊      | 4850/12776 [49:13<44:43,  2.95it/s] 38%|███▊      | 4851/12776 [49:15<1:31:33,  1.44it/s]                                                       38%|███▊      | 4851/12776 [49:15<1:31:33,  1.44it/s] 38%|███▊      | 4852/12776 [49:15<1:37:51,  1.35it/s]                                                       38%|███▊      | 4852/12776 [49:15<1:37:51,  1.35it/s] 38%|███▊      | 4853/12776 [49:16<1:44:19,  1.27it/s]                                                       38%|███▊      | 4853/12776 [49:16<1:44:19,  1.27it/s] 38%|███▊      | 4854/12776 [49:17<1:44:07,  1.27it/s]                                                       38%|███▊      | 4854/12776 [49:17<1:44:07,  1.27it/s] 38%|███▊      | 4855/12776 [49:18<1:41:19,  1.30it/s]                                                       38%|███▊      | 4855/12776 [49:18<1:41:19,  1.30it/s] 38%|███▊      | 4856/12776 [49:19<1:38:32,  1.34it/s]                                                       38%|███▊      | 4856/12776 [49:19<1:38:32,  1.34it/s] 38%|███▊      | 4857/12776 [49:19<1:33:06,  1.42it/s]                                                       38%|███▊      | 4857/12776 [49:19<1:33:06,  1.42it/s] 38%|███▊      | 4858/12776 [49:20<1:28:22,  1.49it/s]                                                       38%|███▊      | 4858/12776 [49:20<1:28:22,  1.49it/s] 38%|███▊      | 4859/12776 [49:20<1:23:22,  1.58it/s]                                                       38%|███▊      | 4859/12776 [49:20<1:23:22,  1.58it/s] 38%|███▊      | 4860/12776 [49:21<1:19:40,  1.66it/s]                                                       38%|███▊      | 4860/12776 [49:21<1:19:40,  1.66it/s] 38%|███▊      | 4861/12776 [49:21<1:15:01,  1.76it/s]                                                       38%|███▊      | 4861/12776 [49:21<1:15:01,  1.76it/s] 38%|███▊      | 4862/12776 [49:22<1:14:59,  1.76it/s]                                                       38%|███▊      | 4862/12776 [49:22<1:14:59,  1.76it/s] 38%|███▊      | 4863/12776 [49:22<1:09:31,  1.90it/s]                                                       38%|███▊      | 4863/12776 [49:22<1:09:31,  1.90it/s] 38%|███▊      | 4864/12776 [49:23<1:09:43,  1.89it/s]                                                       38%|███▊      | 4864/12776 [49:23<1:09:43,  1.89it/s] 38%|███▊      | 4865/12776 [49:23<1:04:55,  2.03it/s]                                                       38%|███▊      | 4865/12776 [49:23<1:04:55,  2.03it/s] 38%|███▊      | 4866/12776 [49:24<1:01:10,  2.15it/s]                                                       38%|███▊      | 4866/12776 [49:24<1:01:10,  2.15it/s] 38%|███▊      | 4867/12776 [49:24<1:02:53,  2.10it/s]                                                       38%|███▊      | 4867/12776 [49:24<1:02:53,  2.10it/s] 38%|███▊      | 4868/12776 [49:25<59:38,  2.21it/s]                                                       38%|███▊      | 4868/12776 [49:25<59:38,  2.21it/s] 38%|███▊      | 4869/12776 [49:25<56:52,  2.32it/s]                                                     38%|███▊      | 4869/12776 [49:25<56:52,  2.32it/s] 38%|███▊      | 4870/12776 [49:25<55:53,  2.36it/s]                                                     38%|███▊      | 4870/12776 [49:25<55:53,  2.36it/s] 38%|███▊      | 4871/12776 [49:26<53:05,  2.48it/s]                                                     38%|███▊      | 4871/12776 [49:26<53:05,  2.48it/s] 38%|███▊      | 4872/12776 [49:26<50:44,  2.60it/s]                                                     38%|███▊      | 4872/12776 [49:26<50:44,  2.60it/s] 38%|███▊      | 4873/12776 [49:26<51:28,  2.56it/s]                                                     38%|███▊      | 4873/12776 [49:26<51:28,  2.56it/s] 38%|███▊      | 4874/12776 [49:27<49:03,  2.68it/s]                                                     38%|███▊      | 4874/12776 [49:27<49:03,  2.68it/s] 38%|███▊      | 4875/12776 [49:27<46:54,  2.81it/s]                                                     38%|███▊      | 4875/12776 [49:27<46:54,  2.81it/s] 38%|███▊      | 4876/12776 [49:27<48:29,  2.72it/s]                                                     38%|███▊      | 4876/12776 [49:27<48:29,  2.72it/s] 38%|███▊      | 4877/12776 [49:28<45:32,  2.89it/s]                                                     38%|███▊      | 4877/12776 [49:28<45:32,  2.89it/s] 38%|███▊      | 4878/12776 [49:28<43:07,  3.05it/s]                                                     38%|███▊      | 4878/12776 [49:28<43:07,  3.05it/s] 38%|███▊      | 4879/12776 [49:28<45:40,  2.88it/s]                                                     38%|███▊      | 4879/12776 [49:28<45:40,  2.88it/s] 38%|███▊      | 4880/12776 [49:29<42:34,  3.09it/s]                                                     38%|███▊      | 4880/12776 [49:29<42:34,  3.09it/s] 38%|███▊      | 4881/12776 [49:29<39:53,  3.30it/s]                                                     38%|███▊      | 4881/12776 [49:29<39:53,  3.30it/s] 38%|███▊      | 4882/12776 [49:29<37:45,  3.48it/s]                                                     38%|███▊      | 4882/12776 [49:29<37:45,  3.48it/s] 38%|███▊      | 4883/12776 [49:30<40:31,  3.25it/s]                                                     38%|███▊      | 4883/12776 [49:30<40:31,  3.25it/s] 38%|███▊      | 4884/12776 [49:30<37:42,  3.49it/s]                                                     38%|███▊      | 4884/12776 [49:30<37:42,  3.49it/s] 38%|███▊      | 4885/12776 [49:30<35:30,  3.70it/s]                                                     38%|███▊      | 4885/12776 [49:30<35:30,  3.70it/s] 38%|███▊      | 4886/12776 [49:30<34:49,  3.78it/s]                                                     38%|███▊      | 4886/12776 [49:30<34:49,  3.78it/s] 38%|███▊      | 4887/12776 [49:31<32:54,  4.00it/s]                                                     38%|███▊      | 4887/12776 [49:31<32:54,  4.00it/s] 38%|███▊      | 4888/12776 [49:31<35:14,  3.73it/s]                                                     38%|███▊      | 4888/12776 [49:31<35:14,  3.73it/s] 38%|███▊      | 4889/12776 [49:31<32:57,  3.99it/s]                                                     38%|███▊      | 4889/12776 [49:31<32:57,  3.99it/s] 38%|███▊      | 4890/12776 [49:31<31:12,  4.21it/s]                                                     38%|███▊      | 4890/12776 [49:31<31:12,  4.21it/s] 38%|███▊      | 4891/12776 [49:31<29:52,  4.40it/s]                                                     38%|███▊      | 4891/12776 [49:31<29:52,  4.40it/s] 38%|███▊      | 4892/12776 [49:32<28:52,  4.55it/s]                                                     38%|███▊      | 4892/12776 [49:32<28:52,  4.55it/s] 38%|███▊      | 4893/12776 [49:32<32:20,  4.06it/s]                                                     38%|███▊      | 4893/12776 [49:32<32:20,  4.06it/s] 38%|███▊      | 4894/12776 [49:32<30:30,  4.30it/s]                                                     38%|███▊      | 4894/12776 [49:32<30:30,  4.30it/s] 38%|███▊      | 4895/12776 [49:32<29:04,  4.52it/s]                                                     38%|███▊      | 4895/12776 [49:32<29:04,  4.52it/s] 38%|███▊      | 4896/12776 [49:33<27:58,  4.70it/s]                                                     38%|███▊      | 4896/12776 [49:33<27:58,  4.70it/s] 38%|███▊      | 4897/12776 [49:33<25:34,  5.13it/s]                                                     38%|███▊      | 4897/12776 [49:33<25:34,  5.13it/s] 38%|███▊      | 4898/12776 [49:33<28:26,  4.62it/s]                                                     38%|███▊      | 4898/12776 [49:33<28:26,  4.62it/s] 38%|███▊      | 4899/12776 [49:33<27:04,  4.85it/s]                                                     38%|███▊      | 4899/12776 [49:33<27:04,  4.85it/s] 38%|███▊      | 4900/12776 [49:34<47:26,  2.77it/s]                                                     38%|███▊      | 4900/12776 [49:34<47:26,  2.77it/s] 38%|███▊      | 4901/12776 [49:35<1:26:38,  1.51it/s]                                                       38%|███▊      | 4901/12776 [49:35<1:26:38,  1.51it/s] 38%|███▊      | 4902/12776 [49:36<1:38:54,  1.33it/s]                                                       38%|███▊      | 4902/12776 [49:36<1:38:54,  1.33it/s] 38%|███▊      | 4903/12776 [49:37<1:42:05,  1.29it/s]                                                       38%|███▊      | 4903/12776 [49:37<1:42:05,  1.29it/s] 38%|███▊      | 4904/12776 [49:38<1:43:22,  1.27it/s]                                                       38%|███▊      | 4904/12776 [49:38<1:43:22,  1.27it/s] 38%|███▊      | 4905/12776 [49:39<1:45:13,  1.25it/s]                                                       38%|███▊      | 4905/12776 [49:39<1:45:13,  1.25it/s] 38%|███▊      | 4906/12776 [49:39<1:40:19,  1.31it/s]                                                       38%|███▊      | 4906/12776 [49:39<1:40:19,  1.31it/s] 38%|███▊      | 4907/12776 [49:40<1:39:08,  1.32it/s]                                                       38%|███▊      | 4907/12776 [49:40<1:39:08,  1.32it/s] 38%|███▊      | 4908/12776 [49:41<1:33:59,  1.40it/s]                                                       38%|███▊      | 4908/12776 [49:41<1:33:59,  1.40it/s] 38%|███▊      | 4909/12776 [49:41<1:29:29,  1.47it/s]                                                       38%|███▊      | 4909/12776 [49:41<1:29:29,  1.47it/s] 38%|███▊      | 4910/12776 [49:42<1:24:45,  1.55it/s]                                                       38%|███▊      | 4910/12776 [49:42<1:24:45,  1.55it/s] 38%|███▊      | 4911/12776 [49:42<1:20:54,  1.62it/s]                                                       38%|███▊      | 4911/12776 [49:42<1:20:54,  1.62it/s] 38%|███▊      | 4912/12776 [49:43<1:17:17,  1.70it/s]                                                       38%|███▊      | 4912/12776 [49:43<1:17:17,  1.70it/s] 38%|███▊      | 4913/12776 [49:44<1:16:32,  1.71it/s]                                                       38%|███▊      | 4913/12776 [49:44<1:16:32,  1.71it/s] 38%|███▊      | 4914/12776 [49:44<1:11:36,  1.83it/s]                                                       38%|███▊      | 4914/12776 [49:44<1:11:36,  1.83it/s] 38%|███▊      | 4915/12776 [49:45<1:11:47,  1.82it/s]                                                       38%|███▊      | 4915/12776 [49:45<1:11:47,  1.82it/s] 38%|███▊      | 4916/12776 [49:45<1:07:18,  1.95it/s]                                                       38%|███▊      | 4916/12776 [49:45<1:07:18,  1.95it/s] 38%|███▊      | 4917/12776 [49:45<1:06:44,  1.96it/s]                                                       38%|███▊      | 4917/12776 [49:45<1:06:44,  1.96it/s] 38%|███▊      | 4918/12776 [49:46<1:02:32,  2.09it/s]                                                       38%|███▊      | 4918/12776 [49:46<1:02:32,  2.09it/s] 39%|███▊      | 4919/12776 [49:46<59:25,  2.20it/s]                                                       39%|███▊      | 4919/12776 [49:46<59:25,  2.20it/s] 39%|███▊      | 4920/12776 [49:47<1:01:32,  2.13it/s]                                                       39%|███▊      | 4920/12776 [49:47<1:01:32,  2.13it/s] 39%|███▊      | 4921/12776 [49:47<56:55,  2.30it/s]                                                       39%|███▊      | 4921/12776 [49:47<56:55,  2.30it/s] 39%|███▊      | 4922/12776 [49:47<53:17,  2.46it/s]                                                     39%|███▊      | 4922/12776 [49:47<53:17,  2.46it/s] 39%|███▊      | 4923/12776 [49:48<54:28,  2.40it/s]                                                     39%|███▊      | 4923/12776 [49:48<54:28,  2.40it/s] 39%|███▊      | 4924/12776 [49:48<50:35,  2.59it/s]                                                     39%|███▊      | 4924/12776 [49:48<50:35,  2.59it/s] 39%|███▊      | 4925/12776 [49:49<47:13,  2.77it/s]                                                     39%|███▊      | 4925/12776 [49:49<47:13,  2.77it/s] 39%|███▊      | 4926/12776 [49:49<47:11,  2.77it/s]                                                    {'loss': 1.0366, 'grad_norm': 3.5413787364959717, 'learning_rate': 0.00019396383186705767, 'epoch': 0.76}
+{'loss': 0.7756, 'grad_norm': 1.493313193321228, 'learning_rate': 0.00019393939393939395, 'epoch': 0.76}
+{'loss': 0.2853, 'grad_norm': 0.4888933598995209, 'learning_rate': 0.00019391495601173018, 'epoch': 0.76}
+{'loss': 0.3158, 'grad_norm': 0.5343114137649536, 'learning_rate': 0.00019389051808406645, 'epoch': 0.76}
+{'loss': 0.322, 'grad_norm': 0.571465790271759, 'learning_rate': 0.00019386608015640273, 'epoch': 0.76}
+{'loss': 0.2126, 'grad_norm': 0.4364553689956665, 'learning_rate': 0.00019384164222873898, 'epoch': 0.76}
+{'loss': 0.5031, 'grad_norm': 1.6458876132965088, 'learning_rate': 0.00019381720430107526, 'epoch': 0.76}
+{'loss': 0.3538, 'grad_norm': 0.8052377104759216, 'learning_rate': 0.00019379276637341154, 'epoch': 0.76}
+{'loss': 0.2654, 'grad_norm': 0.7143148183822632, 'learning_rate': 0.0001937683284457478, 'epoch': 0.76}
+{'loss': 0.3274, 'grad_norm': 1.215345859527588, 'learning_rate': 0.00019374389051808404, 'epoch': 0.76}
+{'loss': 0.4571, 'grad_norm': 1.0219749212265015, 'learning_rate': 0.0001937194525904203, 'epoch': 0.76}
+{'loss': 0.469, 'grad_norm': 1.1042121648788452, 'learning_rate': 0.00019369501466275657, 'epoch': 0.76}
+{'loss': 0.5121, 'grad_norm': 0.9635404944419861, 'learning_rate': 0.00019367057673509285, 'epoch': 0.76}
+{'loss': 0.4804, 'grad_norm': 1.2368090152740479, 'learning_rate': 0.0001936461388074291, 'epoch': 0.76}
+{'loss': 0.5622, 'grad_norm': 1.2510402202606201, 'learning_rate': 0.00019362170087976538, 'epoch': 0.76}
+{'loss': 0.495, 'grad_norm': 0.9782634973526001, 'learning_rate': 0.00019359726295210166, 'epoch': 0.76}
+{'loss': 0.5429, 'grad_norm': 1.6762943267822266, 'learning_rate': 0.00019357282502443788, 'epoch': 0.76}
+{'loss': 0.3212, 'grad_norm': 0.6857059597969055, 'learning_rate': 0.00019354838709677416, 'epoch': 0.76}
+{'loss': 0.4868, 'grad_norm': 1.0604544878005981, 'learning_rate': 0.00019352394916911044, 'epoch': 0.76}
+{'loss': 0.7897, 'grad_norm': 1.5608930587768555, 'learning_rate': 0.0001934995112414467, 'epoch': 0.76}
+{'loss': 0.6437, 'grad_norm': 1.5711655616760254, 'learning_rate': 0.00019347507331378297, 'epoch': 0.76}
+{'loss': 0.4996, 'grad_norm': 1.5553734302520752, 'learning_rate': 0.00019345063538611925, 'epoch': 0.76}
+{'loss': 0.5887, 'grad_norm': 1.692755937576294, 'learning_rate': 0.0001934261974584555, 'epoch': 0.76}
+{'loss': 0.4608, 'grad_norm': 1.0537468194961548, 'learning_rate': 0.00019340175953079178, 'epoch': 0.76}
+{'loss': 0.4771, 'grad_norm': 1.4170693159103394, 'learning_rate': 0.00019337732160312806, 'epoch': 0.76}
+{'loss': 0.8474, 'grad_norm': 2.4024274349212646, 'learning_rate': 0.00019335288367546428, 'epoch': 0.76}
+{'loss': 0.559, 'grad_norm': 1.5059177875518799, 'learning_rate': 0.00019332844574780056, 'epoch': 0.76}
+{'loss': 0.4221, 'grad_norm': 1.0502533912658691, 'learning_rate': 0.00019330400782013684, 'epoch': 0.76}
+{'loss': 0.8188, 'grad_norm': 2.7585763931274414, 'learning_rate': 0.0001932795698924731, 'epoch': 0.76}
+{'loss': 0.584, 'grad_norm': 1.459269642829895, 'learning_rate': 0.00019325513196480937, 'epoch': 0.76}
+{'loss': 1.2836, 'grad_norm': 3.582939624786377, 'learning_rate': 0.00019323069403714564, 'epoch': 0.76}
+{'loss': 1.0656, 'grad_norm': 2.4551584720611572, 'learning_rate': 0.0001932062561094819, 'epoch': 0.76}
+{'loss': 0.8687, 'grad_norm': 1.6925396919250488, 'learning_rate': 0.00019318181818181815, 'epoch': 0.76}
+{'loss': 1.13, 'grad_norm': 2.231828451156616, 'learning_rate': 0.00019315738025415442, 'epoch': 0.76}
+{'loss': 1.1292, 'grad_norm': 2.1356468200683594, 'learning_rate': 0.00019313294232649068, 'epoch': 0.76}
+{'loss': 0.9482, 'grad_norm': 2.5488100051879883, 'learning_rate': 0.00019310850439882695, 'epoch': 0.76}
+{'loss': 1.275, 'grad_norm': 2.787428617477417, 'learning_rate': 0.00019308406647116323, 'epoch': 0.76}
+{'loss': 1.1905, 'grad_norm': 2.909212112426758, 'learning_rate': 0.00019305962854349948, 'epoch': 0.76}
+{'loss': 0.9273, 'grad_norm': 1.801330804824829, 'learning_rate': 0.00019303519061583576, 'epoch': 0.77}
+{'loss': 1.3595, 'grad_norm': 1.8224399089813232, 'learning_rate': 0.00019301075268817204, 'epoch': 0.77}
+{'loss': 1.1251, 'grad_norm': 1.63428795337677, 'learning_rate': 0.00019298631476050826, 'epoch': 0.77}
+{'loss': 1.0573, 'grad_norm': 2.024421215057373, 'learning_rate': 0.00019296187683284454, 'epoch': 0.77}
+{'loss': 1.2419, 'grad_norm': 2.4601943492889404, 'learning_rate': 0.00019293743890518082, 'epoch': 0.77}
+{'loss': 0.7137, 'grad_norm': 1.7658562660217285, 'learning_rate': 0.00019291300097751707, 'epoch': 0.77}
+{'loss': 1.0605, 'grad_norm': 2.124553680419922, 'learning_rate': 0.00019288856304985335, 'epoch': 0.77}
+{'loss': 1.3344, 'grad_norm': 1.8370262384414673, 'learning_rate': 0.00019286412512218963, 'epoch': 0.77}
+{'loss': 1.9377, 'grad_norm': 1.633594036102295, 'learning_rate': 0.00019283968719452588, 'epoch': 0.77}
+{'loss': 0.7699, 'grad_norm': 1.5041015148162842, 'learning_rate': 0.00019281524926686216, 'epoch': 0.77}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.00019281524926686216, 'epoch': 0.77}
+{'loss': 0.6776, 'grad_norm': 2.5734617710113525, 'learning_rate': 0.00019279081133919844, 'epoch': 0.77}
+{'loss': 0.8875, 'grad_norm': 2.468716859817505, 'learning_rate': 0.00019276637341153466, 'epoch': 0.77}
+{'loss': 0.6044, 'grad_norm': 2.735874891281128, 'learning_rate': 0.00019274193548387094, 'epoch': 0.77}
+{'loss': 0.3667, 'grad_norm': 0.6753793954849243, 'learning_rate': 0.00019271749755620722, 'epoch': 0.77}
+{'loss': 0.3415, 'grad_norm': 0.5195480585098267, 'learning_rate': 0.00019269305962854347, 'epoch': 0.77}
+{'loss': 0.3182, 'grad_norm': 0.5045526623725891, 'learning_rate': 0.00019266862170087975, 'epoch': 0.77}
+{'loss': 0.5985, 'grad_norm': 2.755859136581421, 'learning_rate': 0.00019264418377321603, 'epoch': 0.77}
+{'loss': 0.2805, 'grad_norm': 0.4260087013244629, 'learning_rate': 0.00019261974584555228, 'epoch': 0.77}
+{'loss': 0.4325, 'grad_norm': 0.8395310044288635, 'learning_rate': 0.00019259530791788853, 'epoch': 0.77}
+{'loss': 0.2764, 'grad_norm': 0.5676664710044861, 'learning_rate': 0.0001925708699902248, 'epoch': 0.77}
+{'loss': 0.3626, 'grad_norm': 0.5876914262771606, 'learning_rate': 0.00019254643206256106, 'epoch': 0.77}
+{'loss': 0.3014, 'grad_norm': 0.6870219707489014, 'learning_rate': 0.00019252199413489734, 'epoch': 0.77}
+{'loss': 0.5405, 'grad_norm': 0.8017184138298035, 'learning_rate': 0.00019249755620723362, 'epoch': 0.77}
+{'loss': 0.4282, 'grad_norm': 0.7471238374710083, 'learning_rate': 0.00019247311827956987, 'epoch': 0.77}
+{'loss': 0.466, 'grad_norm': 0.7291784286499023, 'learning_rate': 0.00019244868035190614, 'epoch': 0.77}
+{'loss': 0.4488, 'grad_norm': 0.8841153383255005, 'learning_rate': 0.00019242424242424242, 'epoch': 0.77}
+{'loss': 0.3224, 'grad_norm': 0.8288737535476685, 'learning_rate': 0.00019239980449657865, 'epoch': 0.77}
+{'loss': 0.5345, 'grad_norm': 2.8311057090759277, 'learning_rate': 0.00019237536656891493, 'epoch': 0.77}
+{'loss': 0.3726, 'grad_norm': 0.9499149322509766, 'learning_rate': 0.0001923509286412512, 'epoch': 0.77}
+{'loss': 0.4571, 'grad_norm': 1.6338462829589844, 'learning_rate': 0.00019232649071358746, 'epoch': 0.77}
+{'loss': 0.6984, 'grad_norm': 1.347562313079834, 'learning_rate': 0.00019230205278592373, 'epoch': 0.77}
+{'loss': 0.5362, 'grad_norm': 1.349605679512024, 'learning_rate': 0.00019227761485826, 'epoch': 0.77}
+{'loss': 0.3536, 'grad_norm': 0.9548114538192749, 'learning_rate': 0.00019225317693059626, 'epoch': 0.77}
+{'loss': 0.593, 'grad_norm': 1.3998003005981445, 'learning_rate': 0.00019222873900293254, 'epoch': 0.77}
+{'loss': 0.652, 'grad_norm': 2.3617327213287354, 'learning_rate': 0.00019220430107526882, 'epoch': 0.77}
+{'loss': 0.8062, 'grad_norm': 2.4933969974517822, 'learning_rate': 0.00019217986314760504, 'epoch': 0.77}
+{'loss': 0.545, 'grad_norm': 1.7823599576950073, 'learning_rate': 0.00019215542521994132, 'epoch': 0.77}
+{'loss': 0.4175, 'grad_norm': 1.9176454544067383, 'learning_rate': 0.0001921309872922776, 'epoch': 0.77}
+ 39%|███▊      | 4926/12776 [49:49<47:11,  2.77it/s] 39%|███▊      | 4927/12776 [49:49<44:09,  2.96it/s]                                                     39%|███▊      | 4927/12776 [49:49<44:09,  2.96it/s] 39%|███▊      | 4928/12776 [49:49<41:38,  3.14it/s]                                                     39%|███▊      | 4928/12776 [49:49<41:38,  3.14it/s] 39%|███▊      | 4929/12776 [49:50<39:37,  3.30it/s]                                                     39%|███▊      | 4929/12776 [49:50<39:37,  3.30it/s] 39%|███▊      | 4930/12776 [49:50<40:08,  3.26it/s]                                                     39%|███▊      | 4930/12776 [49:50<40:08,  3.26it/s] 39%|███▊      | 4931/12776 [49:50<38:13,  3.42it/s]                                                     39%|███▊      | 4931/12776 [49:50<38:13,  3.42it/s] 39%|███▊      | 4932/12776 [49:51<36:42,  3.56it/s]                                                     39%|███▊      | 4932/12776 [49:51<36:42,  3.56it/s] 39%|███▊      | 4933/12776 [49:51<35:32,  3.68it/s]                                                     39%|███▊      | 4933/12776 [49:51<35:32,  3.68it/s] 39%|███▊      | 4934/12776 [49:51<39:32,  3.31it/s]                                                     39%|███▊      | 4934/12776 [49:51<39:32,  3.31it/s] 39%|███▊      | 4935/12776 [49:51<37:31,  3.48it/s]                                                     39%|███▊      | 4935/12776 [49:51<37:31,  3.48it/s] 39%|███▊      | 4936/12776 [49:52<35:17,  3.70it/s]                                                     39%|███▊      | 4936/12776 [49:52<35:17,  3.70it/s] 39%|███▊      | 4937/12776 [49:52<33:31,  3.90it/s]                                                     39%|███▊      | 4937/12776 [49:52<33:31,  3.90it/s] 39%|███▊      | 4938/12776 [49:52<32:02,  4.08it/s]                                                     39%|███▊      | 4938/12776 [49:52<32:02,  4.08it/s] 39%|███▊      | 4939/12776 [49:52<35:30,  3.68it/s]                                                     39%|███▊      | 4939/12776 [49:52<35:30,  3.68it/s] 39%|███▊      | 4940/12776 [49:53<33:03,  3.95it/s]                                                     39%|███▊      | 4940/12776 [49:53<33:03,  3.95it/s] 39%|███▊      | 4941/12776 [49:53<31:24,  4.16it/s]                                                     39%|███▊      | 4941/12776 [49:53<31:24,  4.16it/s] 39%|███▊      | 4942/12776 [49:53<30:00,  4.35it/s]                                                     39%|███▊      | 4942/12776 [49:53<30:00,  4.35it/s] 39%|███▊      | 4943/12776 [49:53<28:57,  4.51it/s]                                                     39%|███▊      | 4943/12776 [49:53<28:57,  4.51it/s] 39%|███▊      | 4944/12776 [49:54<32:35,  4.00it/s]                                                     39%|███▊      | 4944/12776 [49:54<32:35,  4.00it/s] 39%|███▊      | 4945/12776 [49:54<30:31,  4.28it/s]                                                     39%|███▊      | 4945/12776 [49:54<30:31,  4.28it/s] 39%|███▊      | 4946/12776 [49:54<29:01,  4.50it/s]                                                     39%|███▊      | 4946/12776 [49:54<29:01,  4.50it/s] 39%|███▊      | 4947/12776 [49:54<27:47,  4.69it/s]                                                     39%|███▊      | 4947/12776 [49:54<27:47,  4.69it/s] 39%|███▊      | 4948/12776 [49:54<26:42,  4.88it/s]                                                     39%|███▊      | 4948/12776 [49:54<26:42,  4.88it/s] 39%|███▊      | 4949/12776 [49:55<28:39,  4.55it/s]                                                     39%|███▊      | 4949/12776 [49:55<28:39,  4.55it/s] 39%|███▊      | 4950/12776 [49:55<46:15,  2.82it/s]                                                     39%|███▊      | 4950/12776 [49:55<46:15,  2.82it/s] 39%|███▉      | 4951/12776 [49:57<1:27:10,  1.50it/s]                                                       39%|███▉      | 4951/12776 [49:57<1:27:10,  1.50it/s] 39%|███▉      | 4952/12776 [49:58<1:36:56,  1.35it/s]                                                       39%|███▉      | 4952/12776 [49:58<1:36:56,  1.35it/s] 39%|███▉      | 4953/12776 [49:58<1:39:54,  1.31it/s]                                                       39%|███▉      | 4953/12776 [49:58<1:39:54,  1.31it/s] 39%|███▉      | 4954/12776 [49:59<1:42:06,  1.28it/s]                                                       39%|███▉      | 4954/12776 [49:59<1:42:06,  1.28it/s] 39%|███▉      | 4955/12776 [50:00<1:38:38,  1.32it/s]                                                       39%|███▉      | 4955/12776 [50:00<1:38:38,  1.32it/s] 39%|███▉      | 4956/12776 [50:01<1:37:55,  1.33it/s]                                                       39%|███▉      | 4956/12776 [50:01<1:37:55,  1.33it/s] 39%|███▉      | 4957/12776 [50:01<1:33:27,  1.39it/s]                                                       39%|███▉      | 4957/12776 [50:01<1:33:27,  1.39it/s] 39%|███▉      | 4958/12776 [50:02<1:29:38,  1.45it/s]                                                       39%|███▉      | 4958/12776 [50:02<1:29:38,  1.45it/s] 39%|███▉      | 4959/12776 [50:03<1:25:32,  1.52it/s]                                                       39%|███▉      | 4959/12776 [50:03<1:25:32,  1.52it/s] 39%|███▉      | 4960/12776 [50:03<1:24:05,  1.55it/s]                                                       39%|███▉      | 4960/12776 [50:03<1:24:05,  1.55it/s] 39%|███▉      | 4961/12776 [50:04<1:20:20,  1.62it/s]                                                       39%|███▉      | 4961/12776 [50:04<1:20:20,  1.62it/s] 39%|███▉      | 4962/12776 [50:04<1:17:27,  1.68it/s]                                                       39%|███▉      | 4962/12776 [50:04<1:17:27,  1.68it/s] 39%|███▉      | 4963/12776 [50:05<1:13:36,  1.77it/s]                                                       39%|███▉      | 4963/12776 [50:05<1:13:36,  1.77it/s] 39%|███▉      | 4964/12776 [50:05<1:12:02,  1.81it/s]                                                       39%|███▉      | 4964/12776 [50:05<1:12:02,  1.81it/s] 39%|███▉      | 4965/12776 [50:06<1:07:27,  1.93it/s]                                                       39%|███▉      | 4965/12776 [50:06<1:07:27,  1.93it/s] 39%|███▉      | 4966/12776 [50:06<1:08:44,  1.89it/s]                                                       39%|███▉      | 4966/12776 [50:06<1:08:44,  1.89it/s] 39%|███▉      | 4967/12776 [50:07<1:04:19,  2.02it/s]                                                       39%|███▉      | 4967/12776 [50:07<1:04:19,  2.02it/s] 39%|███▉      | 4968/12776 [50:07<1:00:24,  2.15it/s]                                                       39%|███▉      | 4968/12776 [50:07<1:00:24,  2.15it/s] 39%|███▉      | 4969/12776 [50:08<1:02:47,  2.07it/s]                                                       39%|███▉      | 4969/12776 [50:08<1:02:47,  2.07it/s] 39%|███▉      | 4970/12776 [50:08<58:30,  2.22it/s]                                                       39%|███▉      | 4970/12776 [50:08<58:30,  2.22it/s] 39%|███▉      | 4971/12776 [50:08<54:54,  2.37it/s]                                                     39%|███▉      | 4971/12776 [50:08<54:54,  2.37it/s] 39%|███▉      | 4972/12776 [50:09<55:15,  2.35it/s]                                                     39%|███▉      | 4972/12776 [50:09<55:15,  2.35it/s] 39%|███▉      | 4973/12776 [50:09<51:57,  2.50it/s]                                                     39%|███▉      | 4973/12776 [50:09<51:57,  2.50it/s] 39%|███▉      | 4974/12776 [50:09<49:28,  2.63it/s]                                                     39%|███▉      | 4974/12776 [50:09<49:28,  2.63it/s] 39%|███▉      | 4975/12776 [50:10<49:31,  2.63it/s]                                                     39%|███▉      | 4975/12776 [50:10<49:31,  2.63it/s] 39%|███▉      | 4976/12776 [50:10<46:04,  2.82it/s]                                                     39%|███▉      | 4976/12776 [50:10<46:04,  2.82it/s] 39%|███▉      | 4977/12776 [50:10<43:29,  2.99it/s]                                                     39%|███▉      | 4977/12776 [50:10<43:29,  2.99it/s] 39%|███▉      | 4978/12776 [50:11<45:05,  2.88it/s]                                                     39%|███▉      | 4978/12776 [50:11<45:05,  2.88it/s] 39%|███▉      | 4979/12776 [50:11<42:36,  3.05it/s]                                                     39%|███▉      | 4979/12776 [50:11<42:36,  3.05it/s] 39%|███▉      | 4980/12776 [50:11<39:56,  3.25it/s]                                                     39%|███▉      | 4980/12776 [50:11<39:56,  3.25it/s] 39%|███▉      | 4981/12776 [50:12<37:44,  3.44it/s]                                                     39%|███▉      | 4981/12776 [50:12<37:44,  3.44it/s] 39%|███▉      | 4982/12776 [50:12<39:50,  3.26it/s]                                                     39%|███▉      | 4982/12776 [50:12<39:50,  3.26it/s] 39%|███▉      | 4983/12776 [50:12<37:21,  3.48it/s]                                                     39%|███▉      | 4983/12776 [50:12<37:21,  3.48it/s] 39%|███▉      | 4984/12776 [50:12<35:17,  3.68it/s]                                                     39%|███▉      | 4984/12776 [50:12<35:17,  3.68it/s] 39%|███▉      | 4985/12776 [50:13<33:31,  3.87it/s]                                                     39%|███▉      | 4985/12776 [50:13<33:31,  3.87it/s] 39%|███▉      | 4986/12776 [50:13<32:07,  4.04it/s]                                                     39%|███▉      | 4986/12776 [50:13<32:07,  4.04it/s] 39%|███▉      | 4987/12776 [50:13<35:13,  3.69it/s]                                                     39%|███▉      | 4987/12776 [50:13<35:13,  3.69it/s] 39%|███▉      | 4988/12776 [50:13<32:49,  3.95it/s]                                                     39%|███▉      | 4988/12776 [50:13<32:49,  3.95it/s] 39%|███▉      | 4989/12776 [50:14<31:02,  4.18it/s]                                                     39%|███▉      | 4989/12776 [50:14<31:02,  4.18it/s] 39%|███▉      | 4990/12776 [50:14<30:07,  4.31it/s]                                                     39%|███▉      | 4990/12776 [50:14<30:07,  4.31it/s] 39%|███▉      | 4991/12776 [50:14<29:07,  4.46it/s]                                                     39%|███▉      | 4991/12776 [50:14<29:07,  4.46it/s] 39%|███▉      | 4992/12776 [50:14<31:55,  4.06it/s]                                                     39%|███▉      | 4992/12776 [50:14<31:55,  4.06it/s] 39%|███▉      | 4993/12776 [50:14<30:01,  4.32it/s]                                                     39%|███▉      | 4993/12776 [50:14<30:01,  4.32it/s] 39%|███▉      | 4994/12776 [50:15<29:02,  4.47it/s]                                                     39%|███▉      | 4994/12776 [50:15<29:02,  4.47it/s] 39%|███▉      | 4995/12776 [50:15<27:49,  4.66it/s]                                                     39%|███▉      | 4995/12776 [50:15<27:49,  4.66it/s] 39%|███▉      | 4996/12776 [50:15<26:50,  4.83it/s]                                                     39%|███▉      | 4996/12776 [50:15<26:50,  4.83it/s] 39%|███▉      | 4997/12776 [50:15<31:19,  4.14it/s]                                                     39%|███▉      | 4997/12776 [50:15<31:19,  4.14it/s] 39%|███▉      | 4998/12776 [50:16<29:05,  4.46it/s]                                                     39%|███▉      | 4998/12776 [50:16<29:05,  4.46it/s] 39%|███▉      | 4999/12776 [50:16<28:06,  4.61it/s]                                                     39%|███▉      | 4999/12776 [50:16<28:06,  4.61it/s] 39%|███▉      | 5000/12776 [50:17<49:29,  2.62it/s]                                                     39%|███▉      | 5000/12776 [50:17<49:29,  2.62it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.7757, 'grad_norm': 1.5695958137512207, 'learning_rate': 0.00019210654936461385, 'epoch': 0.77}
+{'loss': 0.9986, 'grad_norm': 1.9228142499923706, 'learning_rate': 0.00019208211143695013, 'epoch': 0.77}
+{'loss': 1.1804, 'grad_norm': 2.260345458984375, 'learning_rate': 0.0001920576735092864, 'epoch': 0.77}
+{'loss': 0.7815, 'grad_norm': 1.4471839666366577, 'learning_rate': 0.00019203323558162266, 'epoch': 0.77}
+{'loss': 0.6585, 'grad_norm': 2.359945774078369, 'learning_rate': 0.0001920087976539589, 'epoch': 0.77}
+{'loss': 0.8588, 'grad_norm': 2.4294509887695312, 'learning_rate': 0.0001919843597262952, 'epoch': 0.77}
+{'loss': 1.2107, 'grad_norm': 2.964313268661499, 'learning_rate': 0.00019195992179863144, 'epoch': 0.77}
+{'loss': 0.5851, 'grad_norm': 2.60548996925354, 'learning_rate': 0.00019193548387096772, 'epoch': 0.77}
+{'loss': 0.9809, 'grad_norm': 2.0704550743103027, 'learning_rate': 0.000191911045943304, 'epoch': 0.77}
+{'loss': 0.8783, 'grad_norm': 3.5938777923583984, 'learning_rate': 0.00019188660801564025, 'epoch': 0.77}
+{'loss': 0.856, 'grad_norm': 1.8894193172454834, 'learning_rate': 0.00019186217008797653, 'epoch': 0.77}
+{'loss': 1.0566, 'grad_norm': 2.43099308013916, 'learning_rate': 0.0001918377321603128, 'epoch': 0.77}
+{'loss': 1.2289, 'grad_norm': 2.183492660522461, 'learning_rate': 0.00019181329423264903, 'epoch': 0.77}
+{'loss': 0.7877, 'grad_norm': 1.2107994556427002, 'learning_rate': 0.0001917888563049853, 'epoch': 0.77}
+{'loss': 0.9841, 'grad_norm': 1.4150274991989136, 'learning_rate': 0.00019176441837732159, 'epoch': 0.77}
+{'loss': 1.1294, 'grad_norm': 1.461196780204773, 'learning_rate': 0.00019173998044965784, 'epoch': 0.77}
+{'loss': 1.2065, 'grad_norm': 2.6352620124816895, 'learning_rate': 0.00019171554252199412, 'epoch': 0.77}
+{'loss': 1.1227, 'grad_norm': 2.3676352500915527, 'learning_rate': 0.0001916911045943304, 'epoch': 0.77}
+{'loss': 1.377, 'grad_norm': 2.5080063343048096, 'learning_rate': 0.00019166666666666665, 'epoch': 0.77}
+{'loss': 1.71, 'grad_norm': 2.701669216156006, 'learning_rate': 0.00019164222873900292, 'epoch': 0.77}
+{'loss': 0.5902, 'grad_norm': 1.4313629865646362, 'learning_rate': 0.0001916177908113392, 'epoch': 0.77}
+{'loss': 1.0311, 'grad_norm': 2.563730478286743, 'learning_rate': 0.00019159335288367543, 'epoch': 0.77}
+{'loss': 0.8272, 'grad_norm': 2.521282434463501, 'learning_rate': 0.0001915689149560117, 'epoch': 0.77}
+{'loss': 0.8662, 'grad_norm': 2.894930601119995, 'learning_rate': 0.00019154447702834798, 'epoch': 0.77}
+{'loss': 0.5643, 'grad_norm': 1.747538685798645, 'learning_rate': 0.00019152003910068423, 'epoch': 0.77}
+{'loss': 0.3981, 'grad_norm': 0.750394880771637, 'learning_rate': 0.0001914956011730205, 'epoch': 0.78}
+{'loss': 0.711, 'grad_norm': 0.8471237421035767, 'learning_rate': 0.0001914711632453568, 'epoch': 0.78}
+{'loss': 0.4321, 'grad_norm': 0.6519607901573181, 'learning_rate': 0.00019144672531769302, 'epoch': 0.78}
+{'loss': 0.2508, 'grad_norm': 0.5569426417350769, 'learning_rate': 0.0001914222873900293, 'epoch': 0.78}
+{'loss': 0.5589, 'grad_norm': 0.8768931031227112, 'learning_rate': 0.00019139784946236557, 'epoch': 0.78}
+{'loss': 0.4621, 'grad_norm': 0.6277607083320618, 'learning_rate': 0.00019137341153470182, 'epoch': 0.78}
+{'loss': 0.4333, 'grad_norm': 0.6822944283485413, 'learning_rate': 0.0001913489736070381, 'epoch': 0.78}
+{'loss': 0.3288, 'grad_norm': 0.45079341530799866, 'learning_rate': 0.00019132453567937438, 'epoch': 0.78}
+{'loss': 0.4235, 'grad_norm': 0.6848064661026001, 'learning_rate': 0.00019130009775171063, 'epoch': 0.78}
+{'loss': 0.2753, 'grad_norm': 0.524940013885498, 'learning_rate': 0.0001912756598240469, 'epoch': 0.78}
+{'loss': 0.421, 'grad_norm': 1.9908454418182373, 'learning_rate': 0.0001912512218963832, 'epoch': 0.78}
+{'loss': 0.383, 'grad_norm': 0.7081311345100403, 'learning_rate': 0.0001912267839687194, 'epoch': 0.78}
+{'loss': 0.5341, 'grad_norm': 1.0572980642318726, 'learning_rate': 0.0001912023460410557, 'epoch': 0.78}
+{'loss': 1.4755, 'grad_norm': 7.10352087020874, 'learning_rate': 0.00019117790811339197, 'epoch': 0.78}
+{'loss': 0.5192, 'grad_norm': 1.0377283096313477, 'learning_rate': 0.00019115347018572822, 'epoch': 0.78}
+{'loss': 0.426, 'grad_norm': 1.1436519622802734, 'learning_rate': 0.0001911290322580645, 'epoch': 0.78}
+{'loss': 0.6584, 'grad_norm': 2.1015119552612305, 'learning_rate': 0.00019110459433040078, 'epoch': 0.78}
+{'loss': 0.3944, 'grad_norm': 0.8442532420158386, 'learning_rate': 0.00019108015640273703, 'epoch': 0.78}
+{'loss': 0.5206, 'grad_norm': 1.5063536167144775, 'learning_rate': 0.0001910557184750733, 'epoch': 0.78}
+{'loss': 0.492, 'grad_norm': 1.2549412250518799, 'learning_rate': 0.00019103128054740958, 'epoch': 0.78}
+{'loss': 0.6607, 'grad_norm': 1.9004099369049072, 'learning_rate': 0.0001910068426197458, 'epoch': 0.78}
+{'loss': 0.4599, 'grad_norm': 0.9570836424827576, 'learning_rate': 0.0001909824046920821, 'epoch': 0.78}
+{'loss': 0.4858, 'grad_norm': 1.3211475610733032, 'learning_rate': 0.00019095796676441837, 'epoch': 0.78}
+{'loss': 0.4007, 'grad_norm': 1.0831687450408936, 'learning_rate': 0.00019093352883675462, 'epoch': 0.78}
+{'loss': 0.5252, 'grad_norm': 1.4682847261428833, 'learning_rate': 0.0001909090909090909, 'epoch': 0.78}
+{'loss': 1.4791, 'grad_norm': 3.620468854904175, 'learning_rate': 0.00019088465298142717, 'epoch': 0.78}
+{'loss': 0.7857, 'grad_norm': 2.0373001098632812, 'learning_rate': 0.0001908602150537634, 'epoch': 0.78}
+{'loss': 0.8352, 'grad_norm': 2.083617925643921, 'learning_rate': 0.00019083577712609968, 'epoch': 0.78}
+{'loss': 1.4002, 'grad_norm': 1.7250630855560303, 'learning_rate': 0.00019081133919843595, 'epoch': 0.78}
+{'loss': 0.588, 'grad_norm': 1.6049273014068604, 'learning_rate': 0.0001907869012707722, 'epoch': 0.78}
+{'loss': 0.9482, 'grad_norm': 1.8046834468841553, 'learning_rate': 0.00019076246334310848, 'epoch': 0.78}
+{'loss': 0.9811, 'grad_norm': 2.0542006492614746, 'learning_rate': 0.00019073802541544476, 'epoch': 0.78}
+{'loss': 0.5584, 'grad_norm': 1.8561007976531982, 'learning_rate': 0.000190713587487781, 'epoch': 0.78}
+{'loss': 1.2283, 'grad_norm': 4.412578582763672, 'learning_rate': 0.0001906891495601173, 'epoch': 0.78}
+{'loss': 0.8703, 'grad_norm': 2.7626380920410156, 'learning_rate': 0.00019066471163245357, 'epoch': 0.78}
+{'loss': 1.096, 'grad_norm': 3.2905735969543457, 'learning_rate': 0.0001906402737047898, 'epoch': 0.78}
+{'loss': 1.0584, 'grad_norm': 1.68156099319458, 'learning_rate': 0.00019061583577712607, 'epoch': 0.78}
+{'loss': 1.3791, 'grad_norm': 3.365527629852295, 'learning_rate': 0.00019059139784946235, 'epoch': 0.78}
+{'loss': 1.0837, 'grad_norm': 3.2215206623077393, 'learning_rate': 0.0001905669599217986, 'epoch': 0.78}
+{'loss': 1.0125, 'grad_norm': 2.5578112602233887, 'learning_rate': 0.00019054252199413488, 'epoch': 0.78}
+{'loss': 1.3229, 'grad_norm': 4.676730632781982, 'learning_rate': 0.00019051808406647116, 'epoch': 0.78}
+{'loss': 1.6003, 'grad_norm': 3.1872496604919434, 'learning_rate': 0.0001904936461388074, 'epoch': 0.78}
+{'loss': 0.9255, 'grad_norm': 2.7677407264709473, 'learning_rate': 0.0001904692082111437, 'epoch': 0.78}
+{'loss': 1.177, 'grad_norm': 1.499570608139038, 'learning_rate': 0.00019044477028347994, 'epoch': 0.78}
+{'loss': 1.2654, 'grad_norm': 2.0705745220184326, 'learning_rate': 0.0001904203323558162, 'epoch': 0.78}
+{'loss': 0.7452, 'grad_norm': 3.193979263305664, 'learning_rate': 0.00019039589442815247, 'epoch': 0.78}
+{'loss': 0.5481, 'grad_norm': 2.0789194107055664, 'learning_rate': 0.00019037145650048875, 'epoch': 0.78}
+{'loss': 1.0294, 'grad_norm': 2.25919771194458, 'learning_rate': 0.000190347018572825, 'epoch': 0.78}
+{'loss': 1.025, 'grad_norm': 1.2901616096496582, 'learning_rate': 0.00019032258064516128, 'epoch': 0.78}
+{'loss': 0.8492, 'grad_norm': 1.560736894607544, 'learning_rate': 0.00019029814271749756, 'epoch': 0.78}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:03,  6.27it/s][A
+  0%|          | 3/774 [00:00<02:45,  4.66it/s][A
+  1%|          | 4/774 [00:00<03:12,  4.00it/s][A
+  1%|          | 5/774 [00:01<03:13,  3.98it/s][A
+  1%|          | 6/774 [00:01<03:28,  3.69it/s][A
+  1%|          | 7/774 [00:01<03:25,  3.73it/s][A
+  1%|          | 8/774 [00:02<03:27,  3.68it/s][A
+  1%|          | 9/774 [00:02<03:16,  3.89it/s][A
+  1%|▏         | 10/774 [00:02<03:16,  3.90it/s][A
+  1%|▏         | 11/774 [00:02<03:31,  3.60it/s][A
+  2%|▏         | 12/774 [00:03<03:20,  3.81it/s][A
+  2%|▏         | 13/774 [00:03<03:13,  3.93it/s][A
+  2%|▏         | 14/774 [00:03<03:25,  3.70it/s][A
+  2%|▏         | 15/774 [00:03<03:44,  3.39it/s][A
+  2%|▏         | 16/774 [00:04<03:40,  3.44it/s][A
+  2%|▏         | 17/774 [00:04<03:16,  3.85it/s][A
+  2%|▏         | 18/774 [00:04<03:08,  4.00it/s][A
+  2%|▏         | 19/774 [00:04<03:18,  3.80it/s][A
+  3%|▎         | 20/774 [00:05<03:14,  3.87it/s][A
+  3%|▎         | 21/774 [00:05<03:18,  3.79it/s][A
+  3%|▎         | 22/774 [00:05<03:23,  3.70it/s][A
+  3%|▎         | 23/774 [00:06<03:34,  3.50it/s][A
+  3%|▎         | 24/774 [00:06<03:33,  3.52it/s][A
+  3%|▎         | 25/774 [00:06<03:39,  3.42it/s][A
+  3%|▎         | 26/774 [00:06<03:37,  3.44it/s][A
+  3%|▎         | 27/774 [00:07<03:37,  3.44it/s][A
+  4%|▎         | 28/774 [00:07<03:45,  3.31it/s][A
+  4%|▎         | 29/774 [00:07<03:53,  3.20it/s][A
+  4%|▍         | 30/774 [00:08<03:41,  3.35it/s][A
+  4%|▍         | 31/774 [00:08<03:40,  3.37it/s][A
+  4%|▍         | 32/774 [00:08<04:27,  2.77it/s][A
+  4%|▍         | 33/774 [00:09<04:21,  2.83it/s][A
+  4%|▍         | 34/774 [00:09<04:01,  3.06it/s][A
+  5%|▍         | 35/774 [00:09<04:07,  2.99it/s][A
+  5%|▍         | 36/774 [00:10<04:03,  3.03it/s][A
+  5%|▍         | 37/774 [00:10<04:03,  3.02it/s][A
+  5%|▍         | 38/774 [00:10<03:55,  3.12it/s][A
+  5%|▌         | 39/774 [00:11<03:36,  3.40it/s][A
+  5%|▌         | 40/774 [00:11<03:41,  3.32it/s][A
+  5%|▌         | 41/774 [00:11<03:35,  3.39it/s][A
+  5%|▌         | 42/774 [00:11<03:23,  3.60it/s][A
+  6%|▌         | 43/774 [00:12<03:33,  3.43it/s][A
+  6%|▌         | 44/774 [00:12<03:34,  3.40it/s][A
+  6%|▌         | 45/774 [00:12<03:21,  3.61it/s][A
+  6%|▌         | 46/774 [00:13<03:05,  3.93it/s][A
+  6%|▌         | 47/774 [00:13<02:53,  4.20it/s][A
+  6%|▌         | 48/774 [00:13<02:53,  4.19it/s][A
+  6%|▋         | 49/774 [00:13<02:54,  4.14it/s][A
+  6%|▋         | 50/774 [00:13<02:57,  4.08it/s][A
+  7%|▋         | 51/774 [00:14<02:58,  4.04it/s][A
+  7%|▋         | 52/774 [00:14<02:57,  4.06it/s][A
+  7%|▋         | 53/774 [00:14<03:06,  3.87it/s][A
+  7%|▋         | 54/774 [00:15<03:10,  3.78it/s][A
+  7%|▋         | 55/774 [00:15<03:19,  3.61it/s][A
+  7%|▋         | 56/774 [00:15<03:19,  3.60it/s][A
+  7%|▋         | 57/774 [00:15<03:23,  3.52it/s][A
+  7%|▋         | 58/774 [00:16<03:23,  3.51it/s][A
+  8%|▊         | 59/774 [00:16<03:07,  3.80it/s][A
+  8%|▊         | 60/774 [00:16<02:54,  4.09it/s][A
+  8%|▊         | 61/774 [00:16<02:32,  4.66it/s][A
+  8%|▊         | 62/774 [00:16<02:30,  4.73it/s][A
+  8%|▊         | 63/774 [00:17<02:55,  4.04it/s][A
+  8%|▊         | 64/774 [00:17<02:47,  4.25it/s][A
+  8%|▊         | 65/774 [00:17<02:48,  4.20it/s][A
+  9%|▊         | 66/774 [00:17<02:46,  4.26it/s][A
+  9%|▊         | 67/774 [00:18<02:37,  4.47it/s][A
+  9%|▉         | 68/774 [00:18<02:34,  4.56it/s][A
+  9%|▉         | 69/774 [00:18<02:26,  4.80it/s][A
+  9%|▉         | 70/774 [00:18<02:35,  4.54it/s][A
+  9%|▉         | 71/774 [00:18<02:30,  4.68it/s][A
+  9%|▉         | 72/774 [00:19<02:40,  4.36it/s][A
+  9%|▉         | 73/774 [00:19<02:51,  4.10it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.93it/s][A
+ 10%|▉         | 75/774 [00:20<03:04,  3.78it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.86it/s][A
+ 10%|▉         | 77/774 [00:20<03:13,  3.61it/s][A
+ 10%|█         | 78/774 [00:20<02:54,  3.99it/s][A
+ 10%|█         | 79/774 [00:21<02:42,  4.28it/s][A
+ 10%|█         | 80/774 [00:21<02:39,  4.36it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.05it/s][A
+ 11%|█         | 82/774 [00:21<02:18,  5.01it/s][A
+ 11%|█         | 83/774 [00:21<02:21,  4.88it/s][A
+ 11%|█         | 84/774 [00:22<02:27,  4.68it/s][A
+ 11%|█         | 85/774 [00:22<02:36,  4.41it/s][A
+ 11%|█         | 86/774 [00:22<02:43,  4.21it/s][A
+ 11%|█         | 87/774 [00:22<02:44,  4.18it/s][A
+ 11%|█▏        | 88/774 [00:23<02:32,  4.49it/s][A
+ 11%|█▏        | 89/774 [00:23<02:26,  4.67it/s][A
+ 12%|█▏        | 90/774 [00:23<02:34,  4.42it/s][A
+ 12%|█▏        | 91/774 [00:23<02:49,  4.04it/s][A
+ 12%|█▏        | 92/774 [00:24<03:02,  3.74it/s][A
+ 12%|█▏        | 93/774 [00:24<02:58,  3.81it/s][A
+ 12%|█▏        | 94/774 [00:24<03:02,  3.73it/s][A
+ 12%|█▏        | 95/774 [00:24<03:00,  3.76it/s][A
+ 12%|█▏        | 96/774 [00:25<02:56,  3.85it/s][A
+ 13%|█▎        | 97/774 [00:25<02:39,  4.26it/s][A
+ 13%|█▎        | 98/774 [00:25<02:32,  4.44it/s][A
+ 13%|█▎        | 99/774 [00:25<02:45,  4.08it/s][A
+ 13%|█▎        | 100/774 [00:26<02:57,  3.81it/s][A
+ 13%|█▎        | 101/774 [00:26<03:02,  3.69it/s][A
+ 13%|█▎        | 102/774 [00:26<03:15,  3.44it/s][A
+ 13%|█▎        | 103/774 [00:27<03:17,  3.40it/s][A
+ 13%|█▎        | 104/774 [00:27<03:15,  3.42it/s][A
+ 14%|█▎        | 105/774 [00:27<03:14,  3.43it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.11it/s][A
+ 14%|█▍        | 107/774 [00:28<03:46,  2.95it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.06it/s][A
+ 14%|█▍        | 109/774 [00:28<03:34,  3.10it/s][A
+ 14%|█▍        | 110/774 [00:29<03:24,  3.25it/s][A
+ 14%|█▍        | 111/774 [00:29<03:23,  3.26it/s][A
+ 14%|█▍        | 112/774 [00:29<03:13,  3.42it/s][A
+ 15%|█▍        | 113/774 [00:30<03:19,  3.31it/s][A
+ 15%|█▍        | 114/774 [00:30<03:23,  3.25it/s][A
+ 15%|█▍        | 115/774 [00:30<03:16,  3.35it/s][A
+ 15%|█▍        | 116/774 [00:30<03:01,  3.62it/s][A
+ 15%|█▌        | 117/774 [00:31<03:07,  3.51it/s][A
+ 15%|█▌        | 118/774 [00:31<03:05,  3.54it/s][A
+ 15%|█▌        | 119/774 [00:31<02:58,  3.68it/s][A
+ 16%|█▌        | 120/774 [00:32<03:08,  3.47it/s][A
+ 16%|█▌        | 121/774 [00:32<03:02,  3.59it/s][A
+ 16%|█▌        | 122/774 [00:32<03:05,  3.52it/s][A
+ 16%|█▌        | 123/774 [00:32<02:57,  3.67it/s][A
+ 16%|█▌        | 124/774 [00:33<03:00,  3.60it/s][A
+ 16%|█▌        | 125/774 [00:33<03:01,  3.57it/s][A
+ 16%|█▋        | 126/774 [00:33<03:09,  3.42it/s][A
+ 16%|█▋        | 127/774 [00:34<03:21,  3.21it/s][A
+ 17%|█▋        | 128/774 [00:34<03:10,  3.39it/s][A
+ 17%|█▋        | 129/774 [00:34<03:11,  3.37it/s][A
+ 17%|█▋        | 130/774 [00:35<03:18,  3.25it/s][A
+ 17%|█▋        | 131/774 [00:35<03:09,  3.40it/s][A
+ 17%|█▋        | 132/774 [00:35<03:09,  3.38it/s][A
+ 17%|█▋        | 133/774 [00:35<03:05,  3.46it/s][A
+ 17%|█▋        | 134/774 [00:36<03:04,  3.46it/s][A
+ 17%|█▋        | 135/774 [00:36<03:20,  3.18it/s][A
+ 18%|█▊        | 136/774 [00:36<03:28,  3.06it/s][A
+ 18%|█▊        | 137/774 [00:37<03:26,  3.09it/s][A
+ 18%|█▊        | 138/774 [00:37<03:22,  3.14it/s][A
+ 18%|█▊        | 139/774 [00:37<03:22,  3.13it/s][A
+ 18%|█▊        | 140/774 [00:38<03:18,  3.19it/s][A
+ 18%|█▊        | 141/774 [00:38<03:11,  3.31it/s][A
+ 18%|█▊        | 142/774 [00:38<03:21,  3.13it/s][A
+ 18%|█▊        | 143/774 [00:39<03:19,  3.17it/s][A
+ 19%|█▊        | 144/774 [00:39<03:08,  3.34it/s][A
+ 19%|█▊        | 145/774 [00:39<03:01,  3.46it/s][A
+ 19%|█▉        | 146/774 [00:39<02:50,  3.68it/s][A
+ 19%|█▉        | 147/774 [00:40<02:41,  3.87it/s][A
+ 19%|█▉        | 148/774 [00:40<02:51,  3.65it/s][A
+ 19%|█▉        | 149/774 [00:40<03:03,  3.40it/s][A
+ 19%|█▉        | 150/774 [00:41<03:06,  3.35it/s][A
+ 20%|█▉        | 151/774 [00:41<02:56,  3.54it/s][A
+ 20%|█▉        | 152/774 [00:41<02:49,  3.67it/s][A
+ 20%|█▉        | 153/774 [00:41<02:54,  3.56it/s][A
+ 20%|█▉        | 154/774 [00:42<02:50,  3.64it/s][A
+ 20%|██        | 155/774 [00:42<02:47,  3.69it/s][A
+ 20%|██        | 156/774 [00:42<02:42,  3.81it/s][A
+ 20%|██        | 157/774 [00:42<02:35,  3.98it/s][A
+ 20%|██        | 158/774 [00:43<02:38,  3.88it/s][A
+ 21%|██        | 159/774 [00:43<02:41,  3.82it/s][A
+ 21%|██        | 160/774 [00:43<02:33,  4.00it/s][A
+ 21%|██        | 161/774 [00:43<02:43,  3.75it/s][A
+ 21%|██        | 162/774 [00:44<02:47,  3.65it/s][A
+ 21%|██        | 163/774 [00:44<02:47,  3.66it/s][A
+ 21%|██        | 164/774 [00:44<02:40,  3.79it/s][A
+ 21%|██▏       | 165/774 [00:44<02:38,  3.83it/s][A
+ 21%|██▏       | 166/774 [00:45<02:42,  3.73it/s][A
+ 22%|██▏       | 167/774 [00:45<02:45,  3.67it/s][A
+ 22%|██▏       | 168/774 [00:45<02:36,  3.88it/s][A
+ 22%|██▏       | 169/774 [00:45<02:28,  4.07it/s][A
+ 22%|██▏       | 170/774 [00:46<02:37,  3.85it/s][A
+ 22%|██▏       | 171/774 [00:46<02:46,  3.63it/s][A
+ 22%|██▏       | 172/774 [00:46<02:54,  3.45it/s][A
+ 22%|██▏       | 173/774 [00:47<02:50,  3.52it/s][A
+ 22%|██▏       | 174/774 [00:47<02:44,  3.66it/s][A
+ 23%|██▎       | 175/774 [00:47<02:43,  3.66it/s][A
+ 23%|██▎       | 176/774 [00:47<02:37,  3.81it/s][A
+ 23%|██▎       | 177/774 [00:48<02:50,  3.50it/s][A
+ 23%|██▎       | 178/774 [00:48<02:35,  3.84it/s][A
+ 23%|██▎       | 179/774 [00:48<02:22,  4.18it/s][A
+ 23%|██▎       | 180/774 [00:48<02:15,  4.38it/s][A
+ 23%|██▎       | 181/774 [00:49<02:19,  4.24it/s][A
+ 24%|██▎       | 182/774 [00:49<02:23,  4.12it/s][A
+ 24%|██▎       | 183/774 [00:49<02:24,  4.08it/s][A
+ 24%|██▍       | 184/774 [00:49<02:35,  3.80it/s][A
+ 24%|██▍       | 185/774 [00:50<02:44,  3.59it/s][A
+ 24%|██▍       | 186/774 [00:50<02:42,  3.63it/s][A
+ 24%|██▍       | 187/774 [00:50<02:35,  3.78it/s][A
+ 24%|██▍       | 188/774 [00:50<02:34,  3.78it/s][A
+ 24%|██▍       | 189/774 [00:51<02:30,  3.88it/s][A
+ 25%|██▍       | 190/774 [00:51<02:26,  3.98it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.83it/s][A
+ 25%|██▍       | 192/774 [00:52<02:36,  3.72it/s][A
+ 25%|██▍       | 193/774 [00:52<02:40,  3.63it/s][A
+ 25%|██▌       | 194/774 [00:52<02:49,  3.43it/s][A
+ 25%|██▌       | 195/774 [00:53<02:57,  3.27it/s][A
+ 25%|██▌       | 196/774 [00:53<02:57,  3.26it/s][A
+ 25%|██▌       | 197/774 [00:53<02:53,  3.32it/s][A
+ 26%|██▌       | 198/774 [00:53<02:44,  3.50it/s][A
+ 26%|██▌       | 199/774 [00:54<02:45,  3.48it/s][A
+ 26%|██▌       | 200/774 [00:54<02:40,  3.58it/s][A
+ 26%|██▌       | 201/774 [00:54<02:36,  3.66it/s][A
+ 26%|██▌       | 202/774 [00:54<02:33,  3.73it/s][A
+ 26%|██▌       | 203/774 [00:55<02:26,  3.90it/s][A
+ 26%|██▋       | 204/774 [00:55<02:30,  3.78it/s][A
+ 26%|██▋       | 205/774 [00:55<02:41,  3.53it/s][A
+ 27%|██▋       | 206/774 [00:56<02:37,  3.60it/s][A
+ 27%|██▋       | 207/774 [00:56<02:34,  3.66it/s][A
+ 27%|██▋       | 208/774 [00:56<02:34,  3.66it/s][A
+ 27%|██▋       | 209/774 [00:56<02:33,  3.69it/s][A
+ 27%|██▋       | 210/774 [00:57<02:30,  3.74it/s][A
+ 27%|██▋       | 211/774 [00:57<02:27,  3.82it/s][A
+ 27%|██▋       | 212/774 [00:57<02:16,  4.11it/s][A
+ 28%|██▊       | 213/774 [00:57<02:01,  4.62it/s][A
+ 28%|██▊       | 214/774 [00:57<02:03,  4.53it/s][A
+ 28%|██▊       | 215/774 [00:58<02:02,  4.55it/s][A
+ 28%|██▊       | 216/774 [00:58<02:00,  4.62it/s][A
+ 28%|██▊       | 217/774 [00:58<02:04,  4.47it/s][A
+ 28%|██▊       | 218/774 [00:58<02:10,  4.25it/s][A
+ 28%|██▊       | 219/774 [00:59<02:20,  3.95it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.98it/s][A
+ 29%|██▊       | 221/774 [00:59<02:24,  3.83it/s][A
+ 29%|██▊       | 222/774 [00:59<02:33,  3.60it/s][A
+ 29%|██▉       | 223/774 [01:00<02:50,  3.23it/s][A
+ 29%|██▉       | 224/774 [01:00<02:59,  3.06it/s][A
+ 29%|██▉       | 225/774 [01:01<03:10,  2.87it/s][A
+ 29%|██▉       | 226/774 [01:01<03:15,  2.81it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.86it/s][A
+ 29%|██▉       | 228/774 [01:02<03:04,  2.96it/s][A
+ 30%|██▉       | 229/774 [01:02<03:19,  2.74it/s][A
+ 30%|██▉       | 230/774 [01:02<03:04,  2.95it/s][A
+ 30%|██▉       | 231/774 [01:03<03:01,  2.99it/s][A
+ 30%|██▉       | 232/774 [01:03<02:53,  3.13it/s][A
+ 30%|███       | 233/774 [01:03<03:08,  2.87it/s][A
+ 30%|███       | 234/774 [01:04<03:12,  2.81it/s][A
+ 30%|███       | 235/774 [01:04<03:11,  2.82it/s][A
+ 30%|███       | 236/774 [01:04<03:14,  2.76it/s][A
+ 31%|███       | 237/774 [01:05<03:10,  2.82it/s][A
+ 31%|███       | 238/774 [01:05<03:01,  2.96it/s][A
+ 31%|███       | 239/774 [01:05<03:00,  2.97it/s][A
+ 31%|███       | 240/774 [01:06<03:00,  2.96it/s][A
+ 31%|███       | 241/774 [01:06<03:03,  2.91it/s][A
+ 31%|███▏      | 242/774 [01:07<03:13,  2.75it/s][A
+ 31%|███▏      | 243/774 [01:07<03:23,  2.61it/s][A
+ 32%|███▏      | 244/774 [01:07<03:17,  2.69it/s][A
+ 32%|███▏      | 245/774 [01:08<03:09,  2.80it/s][A
+ 32%|███▏      | 246/774 [01:08<03:08,  2.81it/s][A
+ 32%|███▏      | 247/774 [01:09<03:46,  2.33it/s][A
+ 32%|███▏      | 248/774 [01:09<03:51,  2.27it/s][A
+ 32%|███▏      | 249/774 [01:09<03:26,  2.54it/s][A
+ 32%|███▏      | 250/774 [01:10<03:20,  2.61it/s][A
+ 32%|███▏      | 251/774 [01:10<03:18,  2.64it/s][A
+ 33%|███▎      | 252/774 [01:10<03:14,  2.69it/s][A
+ 33%|███▎      | 253/774 [01:11<03:12,  2.71it/s][A
+ 33%|███▎      | 254/774 [01:11<03:07,  2.78it/s][A
+ 33%|███▎      | 255/774 [01:11<03:03,  2.83it/s][A
+ 33%|███▎      | 256/774 [01:12<03:00,  2.87it/s][A
+ 33%|███▎      | 257/774 [01:12<02:58,  2.90it/s][A
+ 33%|███▎      | 258/774 [01:12<02:43,  3.16it/s][A
+ 33%|███▎      | 259/774 [01:13<02:25,  3.54it/s][A
+ 34%|███▎      | 260/774 [01:13<02:23,  3.59it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.45it/s][A
+ 34%|███▍      | 262/774 [01:13<02:13,  3.82it/s][A
+ 34%|███▍      | 263/774 [01:14<02:06,  4.04it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.76it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.92it/s][A
+ 34%|███▍      | 266/774 [01:14<02:03,  4.12it/s][A
+ 34%|███▍      | 267/774 [01:15<02:01,  4.17it/s][A
+ 35%|███▍      | 268/774 [01:15<02:08,  3.92it/s][A
+ 35%|███▍      | 269/774 [01:15<02:14,  3.76it/s][A
+ 35%|███▍      | 270/774 [01:15<02:19,  3.60it/s][A
+ 35%|███▌      | 271/774 [01:16<02:16,  3.69it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  3.99it/s][A
+ 35%|███▌      | 273/774 [01:16<02:02,  4.10it/s][A
+ 35%|███▌      | 274/774 [01:16<02:05,  3.98it/s][A
+ 36%|███▌      | 275/774 [01:17<01:59,  4.17it/s][A
+ 36%|███▌      | 276/774 [01:17<01:53,  4.39it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.24it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.14it/s][A
+ 36%|███▌      | 279/774 [01:18<01:52,  4.38it/s][A
+ 36%|███▌      | 280/774 [01:18<01:54,  4.31it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.94it/s][A
+ 36%|███▋      | 282/774 [01:18<02:16,  3.60it/s][A
+ 37%|███▋      | 283/774 [01:19<02:11,  3.73it/s][A
+ 37%|███▋      | 284/774 [01:19<02:12,  3.70it/s][A
+ 37%|███▋      | 285/774 [01:19<02:05,  3.90it/s][A
+ 37%|███▋      | 286/774 [01:19<02:00,  4.05it/s][A
+ 37%|███▋      | 287/774 [01:20<02:12,  3.69it/s][A
+ 37%|███▋      | 288/774 [01:20<02:15,  3.58it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.64it/s][A
+ 37%|███▋      | 290/774 [01:21<02:09,  3.74it/s][A
+ 38%|███▊      | 291/774 [01:21<02:08,  3.76it/s][A
+ 38%|███▊      | 292/774 [01:21<02:04,  3.87it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.21it/s][A
+ 38%|███▊      | 294/774 [01:21<01:50,  4.33it/s][A
+ 38%|███▊      | 295/774 [01:22<01:49,  4.38it/s][A
+ 38%|███▊      | 296/774 [01:22<01:44,  4.59it/s][A
+ 38%|███▊      | 297/774 [01:22<01:39,  4.80it/s][A
+ 39%|███▊      | 298/774 [01:22<01:43,  4.60it/s][A
+ 39%|███▊      | 299/774 [01:23<01:47,  4.40it/s][A
+ 39%|███▉      | 300/774 [01:23<01:55,  4.12it/s][A
+ 39%|███▉      | 301/774 [01:23<01:48,  4.36it/s][A
+ 39%|███▉      | 302/774 [01:23<01:42,  4.62it/s][A
+ 39%|███▉      | 303/774 [01:23<01:38,  4.80it/s][A
+ 39%|███▉      | 304/774 [01:24<01:27,  5.35it/s][A
+ 39%|███▉      | 305/774 [01:24<01:27,  5.35it/s][A
+ 40%|███▉      | 306/774 [01:24<01:39,  4.72it/s][A
+ 40%|███▉      | 307/774 [01:24<01:43,  4.51it/s][A
+ 40%|███▉      | 308/774 [01:24<01:38,  4.75it/s][A
+ 40%|███▉      | 309/774 [01:25<01:37,  4.77it/s][A
+ 40%|████      | 310/774 [01:25<01:43,  4.49it/s][A
+ 40%|████      | 311/774 [01:25<01:42,  4.54it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.65it/s][A
+ 40%|████      | 313/774 [01:26<01:39,  4.62it/s][A
+ 41%|████      | 314/774 [01:26<01:41,  4.55it/s][A
+ 41%|████      | 315/774 [01:26<01:50,  4.17it/s][A
+ 41%|████      | 316/774 [01:26<01:40,  4.54it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.87it/s][A
+ 41%|████      | 318/774 [01:27<01:37,  4.69it/s][A
+ 41%|████      | 319/774 [01:27<01:39,  4.58it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.54it/s][A
+ 41%|████▏     | 321/774 [01:27<01:31,  4.94it/s][A
+ 42%|████▏     | 322/774 [01:27<01:26,  5.24it/s][A
+ 42%|████▏     | 323/774 [01:28<01:17,  5.81it/s][A
+ 42%|████▏     | 324/774 [01:28<01:25,  5.29it/s][A
+ 42%|████▏     | 325/774 [01:28<01:29,  5.02it/s][A
+ 42%|████▏     | 326/774 [01:28<01:25,  5.24it/s][A
+ 42%|████▏     | 327/774 [01:28<01:28,  5.04it/s][A
+ 42%|████▏     | 328/774 [01:29<01:26,  5.17it/s][A
+ 43%|████▎     | 329/774 [01:29<01:34,  4.69it/s][A
+ 43%|████▎     | 330/774 [01:29<01:30,  4.90it/s][A
+ 43%|████▎     | 331/774 [01:29<01:22,  5.38it/s][A
+ 43%|████▎     | 332/774 [01:29<01:19,  5.58it/s][A
+ 43%|████▎     | 333/774 [01:30<01:22,  5.35it/s][A
+ 43%|████▎     | 334/774 [01:30<01:26,  5.09it/s][A
+ 43%|████▎     | 335/774 [01:30<01:27,  5.02it/s][A
+ 43%|████▎     | 336/774 [01:30<01:26,  5.07it/s][A
+ 44%|████▎     | 337/774 [01:30<01:20,  5.45it/s][A
+ 44%|████▎     | 338/774 [01:30<01:13,  5.90it/s][A
+ 44%|████▍     | 339/774 [01:31<01:09,  6.25it/s][A
+ 44%|████▍     | 340/774 [01:31<01:10,  6.18it/s][A
+ 44%|████▍     | 341/774 [01:31<01:27,  4.94it/s][A
+ 44%|████▍     | 342/774 [01:31<01:36,  4.46it/s][A
+ 44%|████▍     | 343/774 [01:32<01:37,  4.42it/s][A
+ 44%|████▍     | 344/774 [01:32<01:41,  4.23it/s][A
+ 45%|████▍     | 345/774 [01:32<01:45,  4.08it/s][A
+ 45%|████▍     | 346/774 [01:32<01:47,  3.99it/s][A
+ 45%|████▍     | 347/774 [01:33<01:44,  4.10it/s][A
+ 45%|████▍     | 348/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 349/774 [01:33<01:35,  4.47it/s][A
+ 45%|████▌     | 350/774 [01:33<01:38,  4.32it/s][A
+ 45%|████▌     | 351/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 352/774 [01:34<01:34,  4.48it/s][A
+ 46%|████▌     | 353/774 [01:34<01:34,  4.45it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.47it/s][A
+ 46%|████▌     | 355/774 [01:34<01:39,  4.23it/s][A
+ 46%|████▌     | 356/774 [01:35<01:49,  3.83it/s][A
+ 46%|████▌     | 357/774 [01:35<02:05,  3.33it/s][A
+ 46%|████▋     | 358/774 [01:35<02:09,  3.22it/s][A
+ 46%|████▋     | 359/774 [01:36<02:08,  3.22it/s][A
+ 47%|████▋     | 360/774 [01:36<02:08,  3.21it/s][A
+ 47%|████▋     | 361/774 [01:36<02:02,  3.38it/s][A
+ 47%|████▋     | 362/774 [01:37<02:08,  3.21it/s][A
+ 47%|████▋     | 363/774 [01:37<02:07,  3.23it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.19it/s][A
+ 47%|████▋     | 365/774 [01:38<02:03,  3.30it/s][A
+ 47%|████▋     | 366/774 [01:38<01:55,  3.54it/s][A
+ 47%|████▋     | 367/774 [01:38<01:49,  3.70it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.78it/s][A
+ 48%|████▊     | 369/774 [01:39<01:54,  3.54it/s][A
+ 48%|████▊     | 370/774 [01:39<02:08,  3.14it/s][A
+ 48%|████▊     | 371/774 [01:39<02:00,  3.36it/s][A
+ 48%|████▊     | 372/774 [01:40<02:01,  3.32it/s][A
+ 48%|████▊     | 373/774 [01:40<01:58,  3.38it/s][A
+ 48%|████▊     | 374/774 [01:40<01:55,  3.46it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.45it/s][A
+ 49%|████▊     | 376/774 [01:41<02:00,  3.30it/s][A
+ 49%|████▊     | 377/774 [01:41<02:11,  3.01it/s][A
+ 49%|████▉     | 378/774 [01:41<02:12,  2.98it/s][A
+ 49%|████▉     | 379/774 [01:42<02:03,  3.19it/s][A
+ 49%|████▉     | 380/774 [01:42<01:53,  3.48it/s][A
+ 49%|████▉     | 381/774 [01:42<01:45,  3.72it/s][A
+ 49%|████▉     | 382/774 [01:42<01:42,  3.84it/s][A
+ 49%|████▉     | 383/774 [01:43<01:40,  3.87it/s][A
+ 50%|████▉     | 384/774 [01:43<01:48,  3.58it/s][A
+ 50%|████▉     | 385/774 [01:43<01:56,  3.33it/s][A
+ 50%|████▉     | 386/774 [01:44<01:50,  3.52it/s][A
+ 50%|█████     | 387/774 [01:44<01:42,  3.77it/s][A
+ 50%|█████     | 388/774 [01:44<01:47,  3.59it/s][A
+ 50%|█████     | 389/774 [01:44<01:43,  3.73it/s][A
+ 50%|█████     | 390/774 [01:45<01:56,  3.30it/s][A
+ 51%|█████     | 391/774 [01:45<01:57,  3.26it/s][A
+ 51%|█████     | 392/774 [01:45<01:47,  3.54it/s][A
+ 51%|█████     | 393/774 [01:46<01:39,  3.84it/s][A
+ 51%|█████     | 394/774 [01:46<01:39,  3.82it/s][A
+ 51%|█████     | 395/774 [01:46<01:46,  3.54it/s][A
+ 51%|█████     | 396/774 [01:46<01:44,  3.62it/s][A
+ 51%|█████▏    | 397/774 [01:47<01:48,  3.48it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:42,  3.68it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:40,  3.72it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:33,  4.00it/s][A
+ 52%|█████▏    | 401/774 [01:48<01:30,  4.14it/s][A
+ 52%|█████▏    | 402/774 [01:48<01:29,  4.15it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:33,  3.99it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:39,  3.73it/s][A
+ 52%|█████▏    | 405/774 [01:49<01:35,  3.87it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:38,  3.75it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:43,  3.55it/s][A
+ 53%|█████▎    | 408/774 [01:50<01:39,  3.67it/s][A
+ 53%|█████▎    | 409/774 [01:50<01:36,  3.78it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:38,  3.70it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:38,  3.69it/s][A
+ 53%|█████▎    | 412/774 [01:51<01:39,  3.64it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:37,  3.72it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:34,  3.81it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:23,  4.30it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:23,  4.31it/s][A
+ 54%|█████▍    | 417/774 [01:52<01:21,  4.36it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:15,  4.69it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:30,  3.93it/s][A
+ 54%|█████▍    | 420/774 [01:53<01:34,  3.74it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:34,  3.74it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:34,  3.74it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:35,  3.68it/s][A
+ 55%|█████▍    | 424/774 [01:54<01:33,  3.75it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:22,  4.22it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:16,  4.53it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:13,  4.75it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:14,  4.63it/s][A
+ 55%|█████▌    | 429/774 [01:55<01:17,  4.48it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:21,  4.22it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:34,  3.64it/s][A
+ 56%|█████▌    | 432/774 [01:56<01:32,  3.68it/s][A
+ 56%|█████▌    | 433/774 [01:56<01:26,  3.95it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:21,  4.18it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:19,  4.24it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:21,  4.14it/s][A
+ 56%|█████▋    | 437/774 [01:57<01:18,  4.29it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:14,  4.51it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:17,  4.32it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:20,  4.13it/s][A
+ 57%|█████▋    | 441/774 [01:58<01:25,  3.90it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:27,  3.81it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:25,  3.89it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:23,  3.93it/s][A
+ 57%|█████▋    | 445/774 [01:59<01:24,  3.91it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:21,  4.01it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:20,  4.07it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:13,  4.45it/s][A
+ 58%|█████▊    | 449/774 [02:00<01:14,  4.38it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:17,  4.20it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.30it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:11,  4.52it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:09,  4.60it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:15,  4.26it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:19,  4.00it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:23,  3.80it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:17,  4.08it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:17,  4.09it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:14,  4.20it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:20,  3.90it/s][A
+ 60%|█████▉    | 461/774 [02:03<01:27,  3.58it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:24,  3.68it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:21,  3.80it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:22,  3.77it/s][A
+ 60%|██████    | 465/774 [02:04<01:13,  4.18it/s][A
+ 60%|██████    | 466/774 [02:04<01:11,  4.29it/s][A
+ 60%|██████    | 467/774 [02:04<01:07,  4.55it/s][A
+ 60%|██████    | 468/774 [02:04<01:07,  4.51it/s][A
+ 61%|██████    | 469/774 [02:04<01:01,  4.97it/s][A
+ 61%|██████    | 470/774 [02:05<00:58,  5.16it/s][A
+ 61%|██████    | 471/774 [02:05<01:01,  4.95it/s][A
+ 61%|██████    | 472/774 [02:05<01:06,  4.57it/s][A
+ 61%|██████    | 473/774 [02:05<01:08,  4.37it/s][A
+ 61%|██████    | 474/774 [02:05<01:07,  4.47it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:08,  4.36it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:19,  3.75it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:33,  3.16it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:34,  3.14it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:32,  3.20it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:28,  3.33it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:29,  3.28it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:26,  3.37it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:24,  3.44it/s][A
+ 63%|██████▎   | 484/774 [02:09<01:25,  3.38it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:27,  3.29it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:23,  3.45it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:25,  3.37it/s][A
+ 63%|██████▎   | 488/774 [02:10<01:22,  3.45it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:18,  3.65it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:18,  3.64it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:16,  3.68it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.59it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:19,  3.55it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:17,  3.59it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:17,  3.58it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:23,  3.34it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.30it/s][A
+ 64%|██████▍   | 498/774 [02:13<01:22,  3.35it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:20,  3.42it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:18,  3.51it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:15,  3.64it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:14,  3.65it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:19,  3.39it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:22,  3.29it/s][A
+ 65%|██████▌   | 505/774 [02:15<01:19,  3.40it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:18,  3.40it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:23,  3.19it/s][A
+ 66%|██████▌   | 508/774 [02:16<01:21,  3.26it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:19,  3.32it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.41it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:12,  3.60it/s][A
+ 66%|██████▌   | 512/774 [02:17<01:11,  3.68it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.52it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:18<01:22,  3.13it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:17,  3.35it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.64it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:08,  3.76it/s][A
+ 67%|██████▋   | 519/774 [02:19<01:10,  3.61it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:09,  3.64it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.75it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:03,  3.96it/s][A
+ 68%|██████▊   | 523/774 [02:20<01:02,  4.04it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:05,  3.79it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:07,  3.72it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:09,  3.56it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:11,  3.47it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.51it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:06,  3.69it/s][A
+ 68%|██████▊   | 530/774 [02:22<01:05,  3.73it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:05,  3.74it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:03,  3.84it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.03it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:56,  4.23it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:59,  4.03it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.86it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:01,  3.83it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.58it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:05,  3.60it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.62it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:02,  3.76it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.74it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.64it/s][A
+ 70%|███████   | 544/774 [02:25<01:03,  3.63it/s][A
+ 70%|███████   | 545/774 [02:26<01:01,  3.75it/s][A
+ 71%|███████   | 546/774 [02:26<00:57,  4.00it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.12it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.16it/s][A
+ 71%|███████   | 549/774 [02:26<00:55,  4.08it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.84it/s][A
+ 71%|███████   | 551/774 [02:27<01:00,  3.66it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:04,  3.46it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:08,  3.25it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:07,  3.28it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:06,  3.28it/s][A
+ 72%|███████▏  | 556/774 [02:29<01:03,  3.45it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:06,  3.25it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:00,  3.56it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:55,  3.84it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.52it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.71it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.01it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:50,  4.20it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:52,  4.01it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:54,  3.84it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:50,  4.13it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:46,  4.47it/s][A
+ 73%|███████▎  | 568/774 [02:32<00:47,  4.35it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.26it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.24it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:52,  3.90it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.73it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.73it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.81it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:52,  3.77it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:57,  3.44it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:55,  3.55it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:54,  3.61it/s][A
+ 75%|███████▍  | 579/774 [02:35<00:56,  3.46it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.48it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:54,  3.51it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:53,  3.62it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:50,  3.76it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.79it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:51,  3.65it/s][A
+ 76%|███████▌  | 586/774 [02:37<00:52,  3.58it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.64it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.72it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:48,  3.79it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.05it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:46,  3.90it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.65it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:50,  3.59it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:49,  3.61it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:53,  3.33it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:56,  3.15it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.13it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.04it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:58,  3.00it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:58,  2.99it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:58,  2.95it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:58,  2.94it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:57,  2.97it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.95it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:56,  3.00it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:57,  2.91it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.98it/s][A
+ 79%|███████▉  | 611/774 [02:45<00:59,  2.74it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.62it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:57,  2.82it/s][A
+ 79%|███████▉  | 614/774 [02:46<00:55,  2.89it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.04it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.09it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:50,  3.10it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:47,  3.28it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.43it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.46it/s][A
+ 80%|████████  | 621/774 [02:48<00:41,  3.73it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.98it/s][A
+ 80%|████████  | 623/774 [02:48<00:37,  3.98it/s][A
+ 81%|████████  | 624/774 [02:48<00:41,  3.60it/s][A
+ 81%|████████  | 625/774 [02:49<00:41,  3.55it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.30it/s][A
+ 81%|████████  | 627/774 [02:49<00:45,  3.23it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.22it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.32it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.54it/s][A
+ 82%|████████▏ | 631/774 [02:50<00:38,  3.75it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:37,  3.74it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.56it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:40,  3.46it/s][A
+ 82%|████████▏ | 635/774 [02:51<00:39,  3.55it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:39,  3.47it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:39,  3.51it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:39,  3.48it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.11it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:49,  2.69it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.70it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.90it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.91it/s][A
+ 83%|████████▎ | 644/774 [02:55<00:41,  3.12it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:37,  3.41it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.65it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.92it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:30,  4.07it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.09it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.31it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:28,  4.26it/s][A
+ 84%|████████▍ | 652/774 [02:56<00:29,  4.17it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.89it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.12it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:26,  4.42it/s][A
+ 85%|████████▍ | 656/774 [02:57<00:27,  4.23it/s][A
+ 85%|████████▍ | 657/774 [02:57<00:26,  4.47it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.24it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.88it/s][A
+ 85%|████████▌ | 660/774 [02:58<00:30,  3.74it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.70it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.87it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.65it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.62it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:28,  3.88it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:24,  4.35it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.62it/s][A
+ 86%|████████▋ | 668/774 [03:00<00:23,  4.46it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.20it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.33it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:25,  3.98it/s][A
+ 87%|████████▋ | 672/774 [03:01<00:25,  4.04it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:24,  4.13it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.06it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.27it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:21,  4.48it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:21,  4.42it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.48it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.27it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.19it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:20,  4.46it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.50it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:22,  4.13it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:23,  3.88it/s][A
+ 89%|████████▊ | 685/774 [03:04<00:24,  3.70it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:22,  3.85it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.05it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  4.05it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:20,  4.21it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.31it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.44it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.50it/s][A
+ 90%|████████▉ | 693/774 [03:06<00:18,  4.50it/s][A
+ 90%|████████▉ | 694/774 [03:06<00:19,  4.21it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.86it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.98it/s][A
+ 90%|█████████ | 697/774 [03:07<00:19,  3.97it/s][A
+ 90%|█████████ | 698/774 [03:07<00:17,  4.37it/s][A
+ 90%|█████████ | 699/774 [03:08<00:15,  4.72it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.34it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.43it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.42it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.41it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.25it/s][A
+ 91%|█████████ | 705/774 [03:09<00:14,  4.61it/s][A
+ 91%|█████████ | 706/774 [03:09<00:14,  4.78it/s][A
+ 91%|█████████▏| 707/774 [03:09<00:14,  4.73it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  4.96it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.78it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.68it/s][A
+ 92%|█████████▏| 711/774 [03:10<00:12,  4.85it/s][A
+ 92%|█████████▏| 712/774 [03:10<00:12,  5.09it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.98it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.66it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.76it/s][A
+ 93%|█████████▎| 716/774 [03:11<00:11,  5.25it/s][A
+ 93%|█████████▎| 717/774 [03:11<00:10,  5.32it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:11,  4.74it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.62it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.92it/s][A
+ 93%|█████████▎| 721/774 [03:12<00:10,  5.19it/s][A
+ 93%|█████████▎| 722/774 [03:12<00:09,  5.59it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.38it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.33it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.59it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.54it/s][A
+ 94%|█████████▍| 727/774 [03:13<00:08,  5.32it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.78it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.08it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.37it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:07,  5.48it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.60it/s][A
+ 95%|█████████���| 733/774 [03:14<00:07,  5.58it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.60it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.72it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.80it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.70it/s][A
+ 95%|█████████▌| 738/774 [03:15<00:06,  5.61it/s][A
+ 95%|█████████▌| 739/774 [03:15<00:06,  5.53it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.43it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.11it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:06,  5.30it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.62it/s][A
+ 96%|█████████▌| 744/774 [03:16<00:05,  5.44it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.50it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.89it/s][A
+ 97%|█████████▋| 747/774 [03:17<00:06,  4.08it/s][A
+ 97%|█████████▋| 748/774 [03:17<00:06,  4.29it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.64it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.29it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.49it/s][A
+ 97%|█████████▋| 752/774 [03:18<00:04,  4.43it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.71it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.34it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.70it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.51it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.30it/s][A
+ 98%|█████████▊| 758/774 [03:19<00:03,  5.20it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.44it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.40it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.90it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:02,  5.99it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.17it/s][A
+ 99%|█████████▊| 764/774 [03:20<00:01,  6.27it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.18it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.30it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.47it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.52it/s][A
+ 99%|█████████▉| 769/774 [03:21<00:00,  5.21it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.06it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.30it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  4.93it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.82it/s][A                                                    
+                                                 [A 39%|███▉      | 5000/12776 [53:42<49:29,  2.62it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.82it/s][A
+                                                 [A 39%|███▉      | 5001/12776 [53:44<134:49:10, 62.42s/it]                                                         39%|███▉      | 5001/12776 [53:44<134:49:10, 62.42s/it] 39%|███▉      | 5002/12776 [53:45<95:01:00, 44.00s/it]                                                         39%|███▉      | 5002/12776 [53:45<95:01:00, 44.00s/it] 39%|███▉      | 5003/12776 [53:46<67:03:10, 31.06s/it]                                                        39%|███▉      | 5003/12776 [53:46<67:03:10, 31.06s/it] 39%|███▉      | 5004/12776 [53:46<47:26:06, 21.97s/it]                                                        39%|███▉      | 5004/12776 [53:46<47:26:06, 21.97s/it] 39%|███▉      | 5005/12776 [53:47<33:40:52, 15.60s/it]                                                        39%|███▉      | 5005/12776 [53:47<33:40:52, 15.60s/it] 39%|███▉      | 5006/12776 [53:48<24:00:12, 11.12s/it]                                                        39%|███▉      | 5006/12776 [53:48<24:00:12, 11.12s/it] 39%|███▉      | 5007/12776 [53:48<17:12:39,  7.98s/it]                                                        39%|███▉      | 5007/12776 [53:48<17:12:39,  7.98s/it] 39%|███▉      | 5008/12776 [53:49<12:31:43,  5.81s/it]                                                        39%|███▉      | 5008/12776 [53:49<12:31:43,  5.81s/it] 39%|███▉      | 5009/12776 [53:50<9:08:35,  4.24s/it]                                                        39%|███▉      | 5009/12776 [53:50<9:08:35,  4.24s/it] 39%|███▉      | 5010/12776 [53:50<6:48:25,  3.16s/it]                                                       39%|███▉      | 5010/12776 [53:50<6:48:25,  3.16s/it] 39%|███▉      | 5011/12776 [53:51<5:06:35,  2.37s/it]                                                       39%|███▉      | 5011/12776 [53:51<5:06:35,  2.37s/it] 39%|███▉      | 5012/12776 [53:51<3:56:24,  1.83s/it]                                                       39%|███▉      | 5012/12776 [53:51<3:56:24,  1.83s/it] 39%|███▉      | 5013/12776 [53:52<3:04:29,  1.43s/it]                                                       39%|███▉      | 5013/12776 [53:52<3:04:29,  1.43s/it] 39%|███▉      | 5014/12776 [53:52<2:30:48,  1.17s/it]                                                       39%|███▉      | 5014/12776 [53:52<2:30:48,  1.17s/it] 39%|███▉      | 5015/12776 [53:53<2:02:44,  1.05it/s]                                                       39%|███▉      | 5015/12776 [53:53<2:02:44,  1.05it/s] 39%|███▉      | 5016/12776 [53:53<1:45:16,  1.23it/s]                                                       39%|███▉      | 5016/12776 [53:53<1:45:16,  1.23it/s] 39%|███▉      | 5017/12776 [53:54<1:29:47,  1.44it/s]                                                       39%|███▉      | 5017/12776 [53:54<1:29:47,  1.44it/s] 39%|███▉      | 5018/12776 [53:54<1:18:24,  1.65it/s]                                                       39%|███▉      | 5018/12776 [53:54<1:18:24,  1.65it/s] 39%|███▉      | 5019/12776 [53:55<1:16:15,  1.70it/s]                                                       39%|███▉      | 5019/12776 [53:55<1:16:15,  1.70it/s] 39%|███▉      | 5020/12776 [53:55<1:07:39,  1.91it/s]                                                       39%|███▉      | 5020/12776 [53:55<1:07:39,  1.91it/s] 39%|███▉      | 5021/12776 [53:56<1:02:31,  2.07it/s]                                                       39%|███▉      | 5021/12776 [53:56<1:02:31,  2.07it/s] 39%|███▉      | 5022/12776 [53:56<58:47,  2.20it/s]                                                       39%|███▉      | 5022/12776 [53:56<58:47,  2.20it/s] 39%|███▉      | 5023/12776 [53:56<55:03,  2.35it/s]                                                     39%|███▉      | 5023/12776 [53:56<55:03,  2.35it/s] 39%|███▉      | 5024/12776 [53:57<51:27,  2.51it/s]                                                     39%|███▉      | 5024/12776 [53:57<51:27,  2.51it/s] 39%|███▉      | 5025/12776 [53:57<49:31,  2.61it/s]                                                     39%|███▉      | 5025/12776 [53:57<49:31,  2.61it/s] 39%|███▉      | 5026/12776 [53:57<46:15,  2.79it/s]                                                     39%|███▉      | 5026/12776 [53:57<46:15,  2.79it/s] 39%|███▉      | 5027/12776 [53:58<44:05,  2.93it/s]                                                     39%|███▉      | 5027/12776 [53:58<44:05,  2.93it/s] 39%|███▉      | 5028/12776 [53:58<45:32,  2.84it/s]                                                     39%|███▉      | 5028/12776 [53:58<45:32,  2.84it/s] 39%|███▉      | 5029/12776 [53:58<42:22,  3.05it/s]                                                     39%|███▉      | 5029/12776 [53:58<42:22,  3.05it/s] 39%|███▉      | 5030/12776 [53:59<39:55,  3.23it/s]                                                     39%|███▉      | 5030/12776 [53:59<39:55,  3.23it/s] 39%|███▉      | 5031/12776 [53:59<37:51,  3.41it/s]                                                     39%|███▉      | 5031/12776 [53:59<37:51,  3.41it/s] 39%|███▉      | 5032/12776 [53:59<38:50,  3.32it/s]                                                     39%|███▉      | 5032/12776 [53:59<38:50,  3.32it/s] 39%|███▉      | 5033/12776 [53:59<36:45,  3.51it/s]                                                     39%|███▉      | 5033/12776 [53:59<36:45,  3.51it/s] 39%|███▉      | 5034/12776 [54:00<35:14,  3.66it/s]                                                     39%|███▉      | 5034/12776 [54:00<35:14,  3.66it/s] 39%|███▉      | 5035/12776 [54:00<33:51,  3.81it/s]                                                     39%|███▉      | 5035/12776 [54:00<33:51,  3.81it/s] 39%|███▉      | 5036/12776 [54:00<32:41,  3.95it/s]                                                     39%|███▉      | 5036/12776 [54:00<32:41,  3.95it/s] 39%|███▉      | 5037/12776 [54:00<35:12,  3.66it/s]                                                     39%|███▉      | 5037/12776 [54:00<35:12,  3.66it/s] 39%|███▉      | 5038/12776 [54:01<33:09,  3.89it/s]                                                     39%|███▉      | 5038/12776 [54:01<33:09,  3.89it/s] 39%|███▉      | 5039/12776 [54:01<31:33,  4.09it/s]                                                     39%|███▉      | 5039/12776 [54:01<31:33,  4.09it/s] 39%|███▉      | 5040/12776 [54:01<30:20,  4.25it/s]                                                     39%|███▉      | 5040/12776 [54:01<30:20,  4.25it/s] 39%|███▉      | 5041/12776 [54:01<29:23,  4.39it/s]                                                     39%|███▉      | 5041/12776 [54:01<29:23,  4.39it/s] 39%|███▉      | 5042/12776 [54:01<30:28,  4.23it/s]                                                     39%|███▉      | 5042/12776 [54:01<30:28,  4.23it/s] 39%|███▉      | 5043/12776 [54:02<29:23,  4.38it/s]                                                     39%|███▉      | 5043/12776 [54:02<29:23,  4.38it/s] 39%|███▉      | 5044/12776 [54:02<28:17,  4.56it/s]                                                     39%|███▉      | 5044/12776 [54:02<28:17,  4.56it/s] 39%|███▉      | 5045/12776 [54:02<27:25,  4.70it/s]                                                     39%|███▉      | 5045/12776 [54:02<27:25,  4.70it/s] 39%|███▉      | 5046/12776 [54:02<26:44,  4.82it/s]                                                     39%|███▉      | 5046/12776 [54:02<26:44,  4.82it/s] 40%|███▉      | 5047/12776 [54:03<30:08,  4.27it/s]                                                     40%|███▉      | 5047/12776 [54:03<30:08,  4.27it/s] 40%|███▉      | 5048/12776 [54:03<28:13,  4.56it/s]                                                     40%|███▉      | 5048/12776 [54:03<28:13,  4.56it/s] 40%|███▉      | 5049/12776 [54:03<27:05,  4.75it/s]                                                     40%|███▉      | 5049/12776 [54:03<27:05,  4.75it/s] 40%|███▉      | 5050/12776 [54:04<49:01,  2.63it/s]                                                     40%|███▉      | 5050/12776 [54:04<49:01,  2.63it/s] 40%|███▉      | 5051/12776 [54:05<1:33:20,  1.38it/s]                                                       40%|███▉      | 5051/12776 [54:05<1:33:20,  1.38it/s] 40%|███▉      | 5052/12776 [54:06<1:48:21,  1.19it/s]                                                       40%|███▉      | 5052/12776 [54:06<1:48:21,  1.19it/s] 40%|███▉      | 5053/12776 [54:07<1:47:41,  1.20it/s]                                                       40%|███▉      | 5053/12776 [54:07<1:47:41,  1.20it/s] 40%|███▉      | 5054/12776 [54:08<1:44:20,  1.23it/s]                                                       40%|███▉      | 5054/12776 [54:08<1:44:20,  1.23it/s] 40%|███▉      | 5055/12776 [54:09<1:42:48,  1.25it/s]                                                       40%|███▉      | 5055/12776 [54:09<1:42:48,  1.25it/s] 40%|███▉      | 5056/12776 [54:09<1:40:28,  1.28it/s]                                                       40%|███▉      | 5056/12776 [54:09<1:40:28,  1.28it/s] 40%|███▉      | 5057/12776 [54:10<1:34:38,  1.36it/s]                                                       40%|███▉      | 5057/12776 [54:10<1:34:38,  1.36it/s] 40%|███▉      | 5058/12776 [54:11<1:28:12,  1.46it/s]                                                       40%|███▉      | 5058/12776 [54:11<1:28:12,  1.46it/s] 40%|███▉      | 5059/12776 [54:11<1:23:51,  1.53it/s]                                                       40%|███▉      | 5059/12776 [54:11<1:23:51,  1.53it/s] 40%|███▉      | 5060/12776 [54:12<1:22:06,  1.57it/s]                                                       40%|███▉      | 5060/12776 [54:12<1:22:06,  1.57it/s] 40%|███▉      | 5061/12776 [54:12<1:17:55,  1.65it/s]                                                       40%|███▉      | 5061/12776 [54:12<1:17:55,  1.65it/s] 40%|███▉      | 5062/12776 [54:13<1:18:36,  1.64it/s]                                                       40%|███▉      | 5062/12776 [54:13<1:18:36,  1.64it/s] 40%|███▉      | 5063/12776 [54:13<1:13:43,  1.74it/s]                                                       40%|███▉      | 5063/12776 [54:13<1:13:43,  1.74it/s] 40%|███▉      | 5064/12776 [54:14<1:08:56,  1.86it/s]                                                       40%|███▉      | 5064/12776 [54:14<1:08:56,  1.86it/s] 40%|███▉      | 5065/12776 [54:14<1:08:04,  1.89it/s]                                                       40%|███▉      | 5065/12776 [54:14<1:08:04,  1.89it/s] 40%|███▉      | 5066/12776 [54:15<1:03:34,  2.02it/s]                                                       40%|███▉      | 5066/12776 [54:15<1:03:34,  2.02it/s] 40%|███▉      | 5067/12776 [54:15<1:02:28,  2.06it/s]                                                       40%|███▉      | 5067/12776 [54:15<1:02:28,  2.06it/s] 40%|███▉      | 5068/12776 [54:16<58:43,  2.19it/s]                                                       40%|███▉      | 5068/12776 [54:16<58:43,  2.19it/s] 40%|███▉      | 5069/12776 [54:16<55:48,  2.30it/s]                                                     40%|███▉      | 5069/12776 [54:16<55:48,  2.30it/s] 40%|███▉      | 5070/12776 [54:16<54:08,  2.37it/s]                                                     40%|███▉      | 5070/12776 [54:16<54:08,  2.37it/s] 40%|███▉      | 5071/12776 [54:17<51:33,  2.49it/s]                                                     40%|███▉      | 5071/12776 [54:17<51:33,  2.49it/s] 40%|███▉      | 5072/12776 [54:17<49:27,  2.60it/s]                                                     40%|███▉      | 5072/12776 [54:17<49:27,  2.60it/s] 40%|███▉      | 5073/12776 [54:18<50:13,  2.56it/s]                                                     40%|███▉      | 5073/12776 [54:18<50:13,  2.56it/s] 40%|███▉      | 5074/12776 [54:18<47:56,  2.68it/s]                                                     40%|███▉      | 5074/12776 [54:18<47:56,  2.68it/s] 40%|███▉      | 5075/12776 [54:18<45:53,  2.80it/s]                                                     40%|███▉      | 5075/12776 [54:18<45:53,  2.80it/s] 40%|███▉      | 5076/12776 [54:19<43:54,  2.92it/s]                                                     40%|███▉      | 5076/12776 [54:19<43:54,  2.92it/s] 40%|███▉      | 5077/12776 [54:19<43:36,  2.94it/s]                                                    {'eval_loss': 0.5757540464401245, 'eval_wer': 0.36954119472129543, 'eval_runtime': 205.5697, 'eval_samples_per_second': 60.237, 'eval_steps_per_second': 3.765, 'epoch': 0.78}
+{'loss': 0.5492, 'grad_norm': 1.0069044828414917, 'learning_rate': 0.00019027370478983378, 'epoch': 0.78}
+{'loss': 0.2823, 'grad_norm': 0.5260990262031555, 'learning_rate': 0.00019024926686217006, 'epoch': 0.78}
+{'loss': 0.2471, 'grad_norm': 0.48100632429122925, 'learning_rate': 0.00019022482893450634, 'epoch': 0.78}
+{'loss': 0.2494, 'grad_norm': 0.6281126141548157, 'learning_rate': 0.0001902003910068426, 'epoch': 0.78}
+{'loss': 0.3811, 'grad_norm': 0.869832456111908, 'learning_rate': 0.00019017595307917887, 'epoch': 0.78}
+{'loss': 0.2062, 'grad_norm': 0.6189180612564087, 'learning_rate': 0.00019015151515151514, 'epoch': 0.78}
+{'loss': 0.3194, 'grad_norm': 1.0378501415252686, 'learning_rate': 0.0001901270772238514, 'epoch': 0.78}
+{'loss': 0.3416, 'grad_norm': 0.7968530654907227, 'learning_rate': 0.00019010263929618767, 'epoch': 0.78}
+{'loss': 0.4465, 'grad_norm': 1.4084452390670776, 'learning_rate': 0.00019007820136852395, 'epoch': 0.78}
+{'loss': 0.3285, 'grad_norm': 0.9723837375640869, 'learning_rate': 0.00019005376344086018, 'epoch': 0.78}
+{'loss': 0.3684, 'grad_norm': 1.0934810638427734, 'learning_rate': 0.00019002932551319645, 'epoch': 0.78}
+{'loss': 0.7612, 'grad_norm': 1.6922805309295654, 'learning_rate': 0.00019000488758553273, 'epoch': 0.78}
+{'loss': 0.7402, 'grad_norm': 1.0072715282440186, 'learning_rate': 0.00018998044965786898, 'epoch': 0.78}
+{'loss': 0.4451, 'grad_norm': 1.367684245109558, 'learning_rate': 0.00018995601173020526, 'epoch': 0.78}
+{'loss': 0.4766, 'grad_norm': 1.189140796661377, 'learning_rate': 0.00018993157380254154, 'epoch': 0.79}
+{'loss': 0.6411, 'grad_norm': 1.3974438905715942, 'learning_rate': 0.0001899071358748778, 'epoch': 0.79}
+{'loss': 0.3786, 'grad_norm': 1.6327006816864014, 'learning_rate': 0.00018988269794721407, 'epoch': 0.79}
+{'loss': 0.4677, 'grad_norm': 0.9266005754470825, 'learning_rate': 0.00018985826001955032, 'epoch': 0.79}
+{'loss': 0.5805, 'grad_norm': 0.8187036514282227, 'learning_rate': 0.00018983382209188657, 'epoch': 0.79}
+{'loss': 0.6537, 'grad_norm': 1.5598969459533691, 'learning_rate': 0.00018980938416422285, 'epoch': 0.79}
+{'loss': 0.6621, 'grad_norm': 1.5645040273666382, 'learning_rate': 0.00018978494623655913, 'epoch': 0.79}
+{'loss': 0.4445, 'grad_norm': 1.342915654182434, 'learning_rate': 0.00018976050830889538, 'epoch': 0.79}
+{'loss': 0.7038, 'grad_norm': 1.3394932746887207, 'learning_rate': 0.00018973607038123166, 'epoch': 0.79}
+{'loss': 0.5493, 'grad_norm': 1.4908233880996704, 'learning_rate': 0.00018971163245356794, 'epoch': 0.79}
+{'loss': 0.6794, 'grad_norm': 1.5220710039138794, 'learning_rate': 0.00018968719452590416, 'epoch': 0.79}
+{'loss': 1.0358, 'grad_norm': 6.862061023712158, 'learning_rate': 0.00018966275659824044, 'epoch': 0.79}
+{'loss': 0.6504, 'grad_norm': 1.3390004634857178, 'learning_rate': 0.00018963831867057672, 'epoch': 0.79}
+{'loss': 0.9927, 'grad_norm': 1.7340364456176758, 'learning_rate': 0.00018961388074291297, 'epoch': 0.79}
+{'loss': 0.7016, 'grad_norm': 2.199751377105713, 'learning_rate': 0.00018958944281524925, 'epoch': 0.79}
+{'loss': 0.3266, 'grad_norm': 1.6952979564666748, 'learning_rate': 0.00018956500488758553, 'epoch': 0.79}
+{'loss': 0.6416, 'grad_norm': 2.159717082977295, 'learning_rate': 0.00018954056695992178, 'epoch': 0.79}
+{'loss': 0.8744, 'grad_norm': 1.6389358043670654, 'learning_rate': 0.00018951612903225806, 'epoch': 0.79}
+{'loss': 1.0329, 'grad_norm': 2.425839424133301, 'learning_rate': 0.00018949169110459433, 'epoch': 0.79}
+{'loss': 0.8958, 'grad_norm': 3.5907609462738037, 'learning_rate': 0.00018946725317693056, 'epoch': 0.79}
+{'loss': 0.7585, 'grad_norm': 1.7229177951812744, 'learning_rate': 0.00018944281524926684, 'epoch': 0.79}
+{'loss': 1.1827, 'grad_norm': 5.789181232452393, 'learning_rate': 0.00018941837732160312, 'epoch': 0.79}
+{'loss': 1.1366, 'grad_norm': 6.507972717285156, 'learning_rate': 0.00018939393939393937, 'epoch': 0.79}
+{'loss': 0.904, 'grad_norm': 2.509685754776001, 'learning_rate': 0.00018936950146627565, 'epoch': 0.79}
+{'loss': 0.7319, 'grad_norm': 3.2579421997070312, 'learning_rate': 0.00018934506353861192, 'epoch': 0.79}
+{'loss': 1.0652, 'grad_norm': 2.354316234588623, 'learning_rate': 0.00018932062561094817, 'epoch': 0.79}
+{'loss': 1.0612, 'grad_norm': 1.7672300338745117, 'learning_rate': 0.00018929618768328443, 'epoch': 0.79}
+{'loss': 1.1822, 'grad_norm': 2.5716187953948975, 'learning_rate': 0.0001892717497556207, 'epoch': 0.79}
+{'loss': 1.7569, 'grad_norm': 1.6928919553756714, 'learning_rate': 0.00018924731182795696, 'epoch': 0.79}
+{'loss': 1.1707, 'grad_norm': 2.918978452682495, 'learning_rate': 0.00018922287390029323, 'epoch': 0.79}
+{'loss': 1.4186, 'grad_norm': 2.933793783187866, 'learning_rate': 0.0001891984359726295, 'epoch': 0.79}
+{'loss': 1.0802, 'grad_norm': 3.004098653793335, 'learning_rate': 0.00018917399804496576, 'epoch': 0.79}
+{'loss': 1.2112, 'grad_norm': 4.242359638214111, 'learning_rate': 0.00018914956011730204, 'epoch': 0.79}
+{'loss': 0.6445, 'grad_norm': 1.2052658796310425, 'learning_rate': 0.00018912512218963832, 'epoch': 0.79}
+{'loss': 1.21, 'grad_norm': 2.339385986328125, 'learning_rate': 0.00018910068426197454, 'epoch': 0.79}
+{'loss': 0.8448, 'grad_norm': 1.4080181121826172, 'learning_rate': 0.00018907624633431082, 'epoch': 0.79}
+{'loss': 0.2988, 'grad_norm': 0.7067837119102478, 'learning_rate': 0.0001890518084066471, 'epoch': 0.79}
+{'loss': 0.2903, 'grad_norm': 0.7016892433166504, 'learning_rate': 0.00018902737047898335, 'epoch': 0.79}
+{'loss': 0.3338, 'grad_norm': 0.7686540484428406, 'learning_rate': 0.00018900293255131963, 'epoch': 0.79}
+{'loss': 0.4192, 'grad_norm': 0.7467711567878723, 'learning_rate': 0.0001889784946236559, 'epoch': 0.79}
+{'loss': 0.3853, 'grad_norm': 1.1654859781265259, 'learning_rate': 0.00018895405669599216, 'epoch': 0.79}
+{'loss': 0.3844, 'grad_norm': 0.7265309691429138, 'learning_rate': 0.00018892961876832844, 'epoch': 0.79}
+{'loss': 0.3022, 'grad_norm': 0.5267289280891418, 'learning_rate': 0.00018890518084066472, 'epoch': 0.79}
+{'loss': 0.4393, 'grad_norm': 0.6257277727127075, 'learning_rate': 0.00018888074291300094, 'epoch': 0.79}
+{'loss': 0.8329, 'grad_norm': 2.329874277114868, 'learning_rate': 0.00018885630498533722, 'epoch': 0.79}
+{'loss': 0.3971, 'grad_norm': 0.7630550861358643, 'learning_rate': 0.0001888318670576735, 'epoch': 0.79}
+{'loss': 0.3248, 'grad_norm': 0.726796567440033, 'learning_rate': 0.00018880742913000975, 'epoch': 0.79}
+{'loss': 0.3606, 'grad_norm': 0.9759939312934875, 'learning_rate': 0.00018878299120234603, 'epoch': 0.79}
+{'loss': 0.5576, 'grad_norm': 1.23695969581604, 'learning_rate': 0.0001887585532746823, 'epoch': 0.79}
+{'loss': 0.7945, 'grad_norm': 1.5264012813568115, 'learning_rate': 0.00018873411534701856, 'epoch': 0.79}
+{'loss': 0.5012, 'grad_norm': 0.8853788375854492, 'learning_rate': 0.0001887096774193548, 'epoch': 0.79}
+{'loss': 0.4011, 'grad_norm': 0.9085557460784912, 'learning_rate': 0.0001886852394916911, 'epoch': 0.79}
+{'loss': 0.3467, 'grad_norm': 0.9665614366531372, 'learning_rate': 0.00018866080156402734, 'epoch': 0.79}
+{'loss': 0.6318, 'grad_norm': 1.1219550371170044, 'learning_rate': 0.00018863636363636362, 'epoch': 0.79}
+{'loss': 0.5886, 'grad_norm': 1.8163543939590454, 'learning_rate': 0.0001886119257086999, 'epoch': 0.79}
+{'loss': 0.764, 'grad_norm': 1.5584337711334229, 'learning_rate': 0.00018858748778103615, 'epoch': 0.79}
+{'loss': 1.0698, 'grad_norm': 2.26932692527771, 'learning_rate': 0.00018856304985337242, 'epoch': 0.79}
+{'loss': 0.4875, 'grad_norm': 1.0103601217269897, 'learning_rate': 0.0001885386119257087, 'epoch': 0.79}
+{'loss': 0.7125, 'grad_norm': 2.160799980163574, 'learning_rate': 0.00018851417399804493, 'epoch': 0.79}
+{'loss': 0.7405, 'grad_norm': 1.7695913314819336, 'learning_rate': 0.0001884897360703812, 'epoch': 0.79}
+{'loss': 0.6548, 'grad_norm': 1.6188089847564697, 'learning_rate': 0.00018846529814271748, 'epoch': 0.79}
+{'loss': 1.2027, 'grad_norm': 2.61811900138855, 'learning_rate': 0.00018844086021505373, 'epoch': 0.79}
+ 40%|███▉      | 5077/12776 [54:19<43:36,  2.94it/s] 40%|███▉      | 5078/12776 [54:19<41:12,  3.11it/s]                                                     40%|███▉      | 5078/12776 [54:19<41:12,  3.11it/s] 40%|███▉      | 5079/12776 [54:19<39:08,  3.28it/s]                                                     40%|███▉      | 5079/12776 [54:19<39:08,  3.28it/s] 40%|███▉      | 5080/12776 [54:20<37:26,  3.43it/s]                                                     40%|███▉      | 5080/12776 [54:20<37:26,  3.43it/s] 40%|███▉      | 5081/12776 [54:20<39:25,  3.25it/s]                                                     40%|███▉      | 5081/12776 [54:20<39:25,  3.25it/s] 40%|███▉      | 5082/12776 [54:20<37:10,  3.45it/s]                                                     40%|███▉      | 5082/12776 [54:20<37:10,  3.45it/s] 40%|███▉      | 5083/12776 [54:21<35:22,  3.62it/s]                                                     40%|███▉      | 5083/12776 [54:21<35:22,  3.62it/s] 40%|███▉      | 5084/12776 [54:21<34:05,  3.76it/s]                                                     40%|███▉      | 5084/12776 [54:21<34:05,  3.76it/s] 40%|███▉      | 5085/12776 [54:21<37:18,  3.44it/s]                                                     40%|███▉      | 5085/12776 [54:21<37:18,  3.44it/s] 40%|███▉      | 5086/12776 [54:21<34:55,  3.67it/s]                                                     40%|███▉      | 5086/12776 [54:21<34:55,  3.67it/s] 40%|███▉      | 5087/12776 [54:22<33:05,  3.87it/s]                                                     40%|███▉      | 5087/12776 [54:22<33:05,  3.87it/s] 40%|███▉      | 5088/12776 [54:22<31:29,  4.07it/s]                                                     40%|███▉      | 5088/12776 [54:22<31:29,  4.07it/s] 40%|███▉      | 5089/12776 [54:22<35:01,  3.66it/s]                                                     40%|███▉      | 5089/12776 [54:22<35:01,  3.66it/s] 40%|███▉      | 5090/12776 [54:22<32:28,  3.95it/s]                                                     40%|███▉      | 5090/12776 [54:22<32:28,  3.95it/s] 40%|███▉      | 5091/12776 [54:23<30:43,  4.17it/s]                                                     40%|███▉      | 5091/12776 [54:23<30:43,  4.17it/s] 40%|███▉      | 5092/12776 [54:23<29:21,  4.36it/s]                                                     40%|███▉      | 5092/12776 [54:23<29:21,  4.36it/s] 40%|███▉      | 5093/12776 [54:23<28:17,  4.53it/s]                                                     40%|███▉      | 5093/12776 [54:23<28:17,  4.53it/s] 40%|███▉      | 5094/12776 [54:23<31:59,  4.00it/s]                                                     40%|███▉      | 5094/12776 [54:23<31:59,  4.00it/s] 40%|███▉      | 5095/12776 [54:23<29:53,  4.28it/s]                                                     40%|███▉      | 5095/12776 [54:23<29:53,  4.28it/s] 40%|███▉      | 5096/12776 [54:24<28:20,  4.52it/s]                                                     40%|███▉      | 5096/12776 [54:24<28:20,  4.52it/s] 40%|███▉      | 5097/12776 [54:24<27:08,  4.72it/s]                                                     40%|███▉      | 5097/12776 [54:24<27:08,  4.72it/s] 40%|███▉      | 5098/12776 [54:24<26:06,  4.90it/s]                                                     40%|███▉      | 5098/12776 [54:24<26:06,  4.90it/s] 40%|███▉      | 5099/12776 [54:24<25:21,  5.05it/s]                                                     40%|███▉      | 5099/12776 [54:24<25:21,  5.05it/s] 40%|███▉      | 5100/12776 [54:25<45:15,  2.83it/s]                                                     40%|███▉      | 5100/12776 [54:25<45:15,  2.83it/s] 40%|███▉      | 5101/12776 [54:26<1:25:42,  1.49it/s]                                                       40%|███▉      | 5101/12776 [54:26<1:25:42,  1.49it/s] 40%|███▉      | 5102/12776 [54:27<1:36:17,  1.33it/s]                                                       40%|███▉      | 5102/12776 [54:27<1:36:17,  1.33it/s] 40%|███▉      | 5103/12776 [54:28<1:43:30,  1.24it/s]                                                       40%|███▉      | 5103/12776 [54:28<1:43:30,  1.24it/s] 40%|███▉      | 5104/12776 [54:29<1:43:08,  1.24it/s]                                                       40%|███▉      | 5104/12776 [54:29<1:43:08,  1.24it/s] 40%|███▉      | 5105/12776 [54:30<1:40:02,  1.28it/s]                                                       40%|███▉      | 5105/12776 [54:30<1:40:02,  1.28it/s] 40%|███▉      | 5106/12776 [54:30<1:37:12,  1.31it/s]                                                       40%|███▉      | 5106/12776 [54:30<1:37:12,  1.31it/s] 40%|███▉      | 5107/12776 [54:31<1:32:39,  1.38it/s]                                                       40%|███▉      | 5107/12776 [54:31<1:32:39,  1.38it/s] 40%|███▉      | 5108/12776 [54:32<1:28:07,  1.45it/s]                                                       40%|███▉      | 5108/12776 [54:32<1:28:07,  1.45it/s] 40%|███▉      | 5109/12776 [54:32<1:23:42,  1.53it/s]                                                       40%|███▉      | 5109/12776 [54:32<1:23:42,  1.53it/s] 40%|███▉      | 5110/12776 [54:33<1:19:30,  1.61it/s]                                                       40%|███▉      | 5110/12776 [54:33<1:19:30,  1.61it/s] 40%|████      | 5111/12776 [54:33<1:15:48,  1.69it/s]                                                       40%|████      | 5111/12776 [54:33<1:15:48,  1.69it/s] 40%|████      | 5112/12776 [54:34<1:14:45,  1.71it/s]                                                       40%|████      | 5112/12776 [54:34<1:14:45,  1.71it/s] 40%|████      | 5113/12776 [54:34<1:10:41,  1.81it/s]                                                       40%|████      | 5113/12776 [54:34<1:10:41,  1.81it/s] 40%|████      | 5114/12776 [54:35<1:10:32,  1.81it/s]                                                       40%|████      | 5114/12776 [54:35<1:10:32,  1.81it/s] 40%|████      | 5115/12776 [54:35<1:06:17,  1.93it/s]                                                       40%|████      | 5115/12776 [54:35<1:06:17,  1.93it/s] 40%|████      | 5116/12776 [54:36<1:06:04,  1.93it/s]                                                       40%|████      | 5116/12776 [54:36<1:06:04,  1.93it/s] 40%|████      | 5117/12776 [54:36<1:01:47,  2.07it/s]                                                       40%|████      | 5117/12776 [54:36<1:01:47,  2.07it/s] 40%|████      | 5118/12776 [54:37<58:04,  2.20it/s]                                                       40%|████      | 5118/12776 [54:37<58:04,  2.20it/s] 40%|████      | 5119/12776 [54:37<59:11,  2.16it/s]                                                     40%|████      | 5119/12776 [54:37<59:11,  2.16it/s] 40%|████      | 5120/12776 [54:38<55:17,  2.31it/s]                                                     40%|████      | 5120/12776 [54:38<55:17,  2.31it/s] 40%|████      | 5121/12776 [54:38<52:04,  2.45it/s]                                                     40%|████      | 5121/12776 [54:38<52:04,  2.45it/s] 40%|████      | 5122/12776 [54:38<52:04,  2.45it/s]                                                     40%|████      | 5122/12776 [54:38<52:04,  2.45it/s] 40%|████      | 5123/12776 [54:39<49:01,  2.60it/s]                                                     40%|████      | 5123/12776 [54:39<49:01,  2.60it/s] 40%|████      | 5124/12776 [54:39<46:00,  2.77it/s]                                                     40%|████      | 5124/12776 [54:39<46:00,  2.77it/s] 40%|████      | 5125/12776 [54:39<46:15,  2.76it/s]                                                     40%|████      | 5125/12776 [54:39<46:15,  2.76it/s] 40%|████      | 5126/12776 [54:40<45:17,  2.82it/s]                                                     40%|████      | 5126/12776 [54:40<45:17,  2.82it/s] 40%|████      | 5127/12776 [54:40<42:48,  2.98it/s]                                                     40%|████      | 5127/12776 [54:40<42:48,  2.98it/s] 40%|████      | 5128/12776 [54:40<40:31,  3.15it/s]                                                     40%|████      | 5128/12776 [54:40<40:31,  3.15it/s] 40%|████      | 5129/12776 [54:41<41:44,  3.05it/s]                                                     40%|████      | 5129/12776 [54:41<41:44,  3.05it/s] 40%|████      | 5130/12776 [54:41<38:57,  3.27it/s]                                                     40%|████      | 5130/12776 [54:41<38:57,  3.27it/s] 40%|████      | 5131/12776 [54:41<36:46,  3.46it/s]                                                     40%|████      | 5131/12776 [54:41<36:46,  3.46it/s] 40%|████      | 5132/12776 [54:41<35:03,  3.63it/s]                                                     40%|████      | 5132/12776 [54:41<35:03,  3.63it/s] 40%|████      | 5133/12776 [54:42<38:04,  3.34it/s]                                                     40%|████      | 5133/12776 [54:42<38:04,  3.34it/s] 40%|████      | 5134/12776 [54:42<35:32,  3.58it/s]                                                     40%|████      | 5134/12776 [54:42<35:32,  3.58it/s] 40%|████      | 5135/12776 [54:42<33:33,  3.79it/s]                                                     40%|████      | 5135/12776 [54:42<33:33,  3.79it/s] 40%|████      | 5136/12776 [54:42<32:02,  3.97it/s]                                                     40%|████      | 5136/12776 [54:42<32:02,  3.97it/s] 40%|████      | 5137/12776 [54:43<34:50,  3.65it/s]                                                     40%|████      | 5137/12776 [54:43<34:50,  3.65it/s] 40%|████      | 5138/12776 [54:43<32:26,  3.92it/s]                                                     40%|████      | 5138/12776 [54:43<32:26,  3.92it/s] 40%|████      | 5139/12776 [54:43<30:37,  4.16it/s]                                                     40%|████      | 5139/12776 [54:43<30:37,  4.16it/s] 40%|████      | 5140/12776 [54:43<29:14,  4.35it/s]                                                     40%|████      | 5140/12776 [54:43<29:14,  4.35it/s] 40%|████      | 5141/12776 [54:43<28:12,  4.51it/s]                                                     40%|████      | 5141/12776 [54:43<28:12,  4.51it/s] 40%|████      | 5142/12776 [54:44<31:25,  4.05it/s]                                                     40%|████      | 5142/12776 [54:44<31:25,  4.05it/s] 40%|████      | 5143/12776 [54:44<29:37,  4.29it/s]                                                     40%|████      | 5143/12776 [54:44<29:37,  4.29it/s] 40%|████      | 5144/12776 [54:44<28:18,  4.49it/s]                                                     40%|████      | 5144/12776 [54:44<28:18,  4.49it/s] 40%|████      | 5145/12776 [54:44<27:17,  4.66it/s]                                                     40%|████      | 5145/12776 [54:44<27:17,  4.66it/s] 40%|████      | 5146/12776 [54:45<26:31,  4.80it/s]                                                     40%|████      | 5146/12776 [54:45<26:31,  4.80it/s] 40%|████      | 5147/12776 [54:45<26:00,  4.89it/s]                                                     40%|████      | 5147/12776 [54:45<26:00,  4.89it/s] 40%|████      | 5148/12776 [54:45<28:59,  4.39it/s]                                                     40%|████      | 5148/12776 [54:45<28:59,  4.39it/s] 40%|████      | 5149/12776 [54:45<27:18,  4.65it/s]                                                     40%|████      | 5149/12776 [54:45<27:18,  4.65it/s] 40%|████      | 5150/12776 [54:46<49:38,  2.56it/s]                                                     40%|████      | 5150/12776 [54:46<49:38,  2.56it/s] 40%|████      | 5151/12776 [54:48<1:31:31,  1.39it/s]                                                       40%|████      | 5151/12776 [54:48<1:31:31,  1.39it/s] 40%|████      | 5152/12776 [54:49<1:41:26,  1.25it/s]                                                       40%|████      | 5152/12776 [54:49<1:41:26,  1.25it/s] 40%|████      | 5153/12776 [54:49<1:45:11,  1.21it/s]                                                       40%|████      | 5153/12776 [54:49<1:45:11,  1.21it/s] 40%|████      | 5154/12776 [54:50<1:43:54,  1.22it/s]                                                       40%|████      | 5154/12776 [54:50<1:43:54,  1.22it/s] 40%|████      | 5155/12776 [54:51<1:44:18,  1.22it/s]                                                      {'loss': 0.9647, 'grad_norm': 2.0806033611297607, 'learning_rate': 0.00018841642228739, 'epoch': 0.79}
+{'loss': 0.73, 'grad_norm': 1.6380743980407715, 'learning_rate': 0.0001883919843597263, 'epoch': 0.79}
+{'loss': 0.5505, 'grad_norm': 1.2052991390228271, 'learning_rate': 0.00018836754643206254, 'epoch': 0.8}
+{'loss': 0.6914, 'grad_norm': 1.2171603441238403, 'learning_rate': 0.00018834310850439882, 'epoch': 0.8}
+{'loss': 0.6407, 'grad_norm': 1.6889578104019165, 'learning_rate': 0.0001883186705767351, 'epoch': 0.8}
+{'loss': 0.9845, 'grad_norm': 2.487717866897583, 'learning_rate': 0.00018829423264907132, 'epoch': 0.8}
+{'loss': 0.7382, 'grad_norm': 2.156898021697998, 'learning_rate': 0.0001882697947214076, 'epoch': 0.8}
+{'loss': 0.6193, 'grad_norm': 2.150857448577881, 'learning_rate': 0.00018824535679374388, 'epoch': 0.8}
+{'loss': 0.6914, 'grad_norm': 2.0828707218170166, 'learning_rate': 0.00018822091886608013, 'epoch': 0.8}
+{'loss': 0.9627, 'grad_norm': 2.405583381652832, 'learning_rate': 0.0001881964809384164, 'epoch': 0.8}
+{'loss': 1.0168, 'grad_norm': 2.8909096717834473, 'learning_rate': 0.0001881720430107527, 'epoch': 0.8}
+{'loss': 0.8169, 'grad_norm': 2.1869802474975586, 'learning_rate': 0.00018814760508308894, 'epoch': 0.8}
+{'loss': 1.6036, 'grad_norm': 2.3138229846954346, 'learning_rate': 0.0001881231671554252, 'epoch': 0.8}
+{'loss': 1.9535, 'grad_norm': 5.03582763671875, 'learning_rate': 0.00018809872922776147, 'epoch': 0.8}
+{'loss': 1.4739, 'grad_norm': 2.7218527793884277, 'learning_rate': 0.00018807429130009772, 'epoch': 0.8}
+{'loss': 0.9925, 'grad_norm': 2.0012664794921875, 'learning_rate': 0.000188049853372434, 'epoch': 0.8}
+{'loss': 1.289, 'grad_norm': 3.092926502227783, 'learning_rate': 0.00018802541544477028, 'epoch': 0.8}
+{'loss': 1.1165, 'grad_norm': 1.7731401920318604, 'learning_rate': 0.00018800097751710653, 'epoch': 0.8}
+{'loss': 1.2458, 'grad_norm': 3.9922702312469482, 'learning_rate': 0.0001879765395894428, 'epoch': 0.8}
+{'loss': 0.6469, 'grad_norm': 1.2292232513427734, 'learning_rate': 0.00018795210166177908, 'epoch': 0.8}
+{'loss': 0.4321, 'grad_norm': 1.4839969873428345, 'learning_rate': 0.0001879276637341153, 'epoch': 0.8}
+{'loss': 0.7269, 'grad_norm': 1.8727073669433594, 'learning_rate': 0.0001879032258064516, 'epoch': 0.8}
+{'loss': 0.4632, 'grad_norm': 1.444756269454956, 'learning_rate': 0.00018787878787878787, 'epoch': 0.8}
+{'loss': 0.7119, 'grad_norm': 1.8207427263259888, 'learning_rate': 0.00018785434995112412, 'epoch': 0.8}
+{'loss': 0.7385, 'grad_norm': 2.3908092975616455, 'learning_rate': 0.0001878299120234604, 'epoch': 0.8}
+{'loss': 0.4004, 'grad_norm': 0.8618811964988708, 'learning_rate': 0.00018780547409579667, 'epoch': 0.8}
+{'loss': 0.3438, 'grad_norm': 0.7508620023727417, 'learning_rate': 0.00018778103616813292, 'epoch': 0.8}
+{'loss': 0.3142, 'grad_norm': 0.8002655506134033, 'learning_rate': 0.0001877565982404692, 'epoch': 0.8}
+{'loss': 0.2769, 'grad_norm': 0.8202148675918579, 'learning_rate': 0.00018773216031280548, 'epoch': 0.8}
+{'loss': 0.2992, 'grad_norm': 0.5834996700286865, 'learning_rate': 0.0001877077223851417, 'epoch': 0.8}
+{'loss': 0.2415, 'grad_norm': 0.6148617267608643, 'learning_rate': 0.00018768328445747798, 'epoch': 0.8}
+{'loss': 0.3005, 'grad_norm': 1.0197445154190063, 'learning_rate': 0.00018765884652981426, 'epoch': 0.8}
+{'loss': 0.4335, 'grad_norm': 0.8099470138549805, 'learning_rate': 0.00018763440860215051, 'epoch': 0.8}
+{'loss': 0.4103, 'grad_norm': 0.9061375260353088, 'learning_rate': 0.0001876099706744868, 'epoch': 0.8}
+{'loss': 0.4025, 'grad_norm': 0.8547300696372986, 'learning_rate': 0.00018758553274682307, 'epoch': 0.8}
+{'loss': 0.8325, 'grad_norm': 5.366735935211182, 'learning_rate': 0.0001875610948191593, 'epoch': 0.8}
+{'loss': 0.3426, 'grad_norm': 0.9143383502960205, 'learning_rate': 0.00018753665689149557, 'epoch': 0.8}
+{'loss': 0.3594, 'grad_norm': 0.7643350958824158, 'learning_rate': 0.00018751221896383185, 'epoch': 0.8}
+{'loss': 0.3918, 'grad_norm': 1.4406952857971191, 'learning_rate': 0.0001874877810361681, 'epoch': 0.8}
+{'loss': 0.441, 'grad_norm': 1.629372477531433, 'learning_rate': 0.00018746334310850438, 'epoch': 0.8}
+{'loss': 0.5043, 'grad_norm': 2.073367118835449, 'learning_rate': 0.00018743890518084066, 'epoch': 0.8}
+{'loss': 0.5772, 'grad_norm': 1.0843485593795776, 'learning_rate': 0.0001874144672531769, 'epoch': 0.8}
+{'loss': 0.4383, 'grad_norm': 1.2174686193466187, 'learning_rate': 0.0001873900293255132, 'epoch': 0.8}
+{'loss': 0.9853, 'grad_norm': 1.9818603992462158, 'learning_rate': 0.00018736559139784947, 'epoch': 0.8}
+{'loss': 0.7447, 'grad_norm': 1.621567726135254, 'learning_rate': 0.0001873411534701857, 'epoch': 0.8}
+{'loss': 0.745, 'grad_norm': 1.1117850542068481, 'learning_rate': 0.00018731671554252197, 'epoch': 0.8}
+{'loss': 0.673, 'grad_norm': 1.9802541732788086, 'learning_rate': 0.00018729227761485825, 'epoch': 0.8}
+{'loss': 0.7147, 'grad_norm': 2.39848256111145, 'learning_rate': 0.0001872678396871945, 'epoch': 0.8}
+{'loss': 0.8749, 'grad_norm': 2.0519936084747314, 'learning_rate': 0.00018724340175953078, 'epoch': 0.8}
+{'loss': 0.6292, 'grad_norm': 3.485672950744629, 'learning_rate': 0.00018721896383186706, 'epoch': 0.8}
+{'loss': 0.6966, 'grad_norm': 1.4344536066055298, 'learning_rate': 0.0001871945259042033, 'epoch': 0.8}
+{'loss': 0.7884, 'grad_norm': 1.8065955638885498, 'learning_rate': 0.00018717008797653959, 'epoch': 0.8}
+{'loss': 1.0263, 'grad_norm': 3.6765334606170654, 'learning_rate': 0.00018714565004887586, 'epoch': 0.8}
+{'loss': 0.5238, 'grad_norm': 2.0539987087249756, 'learning_rate': 0.0001871212121212121, 'epoch': 0.8}
+{'loss': 1.0184, 'grad_norm': 2.6825201511383057, 'learning_rate': 0.00018709677419354837, 'epoch': 0.8}
+{'loss': 0.8083, 'grad_norm': 3.3649113178253174, 'learning_rate': 0.00018707233626588464, 'epoch': 0.8}
+{'loss': 0.7746, 'grad_norm': 2.283419370651245, 'learning_rate': 0.0001870478983382209, 'epoch': 0.8}
+{'loss': 0.9586, 'grad_norm': 2.9016916751861572, 'learning_rate': 0.00018702346041055717, 'epoch': 0.8}
+{'loss': 1.083, 'grad_norm': 3.5919530391693115, 'learning_rate': 0.00018699902248289345, 'epoch': 0.8}
+{'loss': 1.187, 'grad_norm': 5.343000888824463, 'learning_rate': 0.00018697458455522968, 'epoch': 0.8}
+{'loss': 1.5941, 'grad_norm': 2.6350221633911133, 'learning_rate': 0.00018695014662756596, 'epoch': 0.8}
+{'loss': 0.6295, 'grad_norm': 1.6021567583084106, 'learning_rate': 0.00018692570869990223, 'epoch': 0.8}
+{'loss': 1.3454, 'grad_norm': 4.3423075675964355, 'learning_rate': 0.00018690127077223848, 'epoch': 0.8}
+{'loss': 1.0692, 'grad_norm': 3.026441812515259, 'learning_rate': 0.00018687683284457476, 'epoch': 0.8}
+{'loss': 0.7581, 'grad_norm': 2.199367046356201, 'learning_rate': 0.00018685239491691104, 'epoch': 0.8}
+{'loss': 0.8282, 'grad_norm': 1.787598967552185, 'learning_rate': 0.0001868279569892473, 'epoch': 0.8}
+{'loss': 1.3213, 'grad_norm': 3.308527708053589, 'learning_rate': 0.00018680351906158357, 'epoch': 0.81}
+{'loss': 1.6773, 'grad_norm': 2.4407522678375244, 'learning_rate': 0.00018677908113391985, 'epoch': 0.81}
+{'loss': 0.6245, 'grad_norm': 0.9789965748786926, 'learning_rate': 0.00018675464320625607, 'epoch': 0.81}
+{'loss': 0.976, 'grad_norm': 1.3967782258987427, 'learning_rate': 0.00018673020527859235, 'epoch': 0.81}
+{'loss': 0.8646, 'grad_norm': 1.6715110540390015, 'learning_rate': 0.00018670576735092863, 'epoch': 0.81}
+{'loss': 0.6652, 'grad_norm': 2.1324195861816406, 'learning_rate': 0.00018668132942326488, 'epoch': 0.81}
+{'loss': 0.8102, 'grad_norm': 1.6442357301712036, 'learning_rate': 0.00018665689149560116, 'epoch': 0.81}
+{'loss': 0.9856, 'grad_norm': 2.52543044090271, 'learning_rate': 0.00018663245356793744, 'epoch': 0.81}
+{'loss': 0.284, 'grad_norm': 0.6359297633171082, 'learning_rate': 0.0001866080156402737, 'epoch': 0.81}
+{'loss': 0.4191, 'grad_norm': 0.6601283550262451, 'learning_rate': 0.00018658357771260997, 'epoch': 0.81}
+{'loss': 0.3558, 'grad_norm': 0.5493868589401245, 'learning_rate': 0.00018655913978494622, 'epoch': 0.81}
+{'loss': 0.355, 'grad_norm': 0.6157817244529724, 'learning_rate': 0.00018653470185728247, 'epoch': 0.81}
+ 40%|████      | 5155/12776 [54:51<1:44:18,  1.22it/s] 40%|████      | 5156/12776 [54:52<1:38:41,  1.29it/s]                                                       40%|████      | 5156/12776 [54:52<1:38:41,  1.29it/s] 40%|████      | 5157/12776 [54:52<1:33:33,  1.36it/s]                                                       40%|████      | 5157/12776 [54:52<1:33:33,  1.36it/s] 40%|████      | 5158/12776 [54:53<1:33:18,  1.36it/s]                                                       40%|████      | 5158/12776 [54:53<1:33:18,  1.36it/s] 40%|████      | 5159/12776 [54:54<1:27:08,  1.46it/s]                                                       40%|████      | 5159/12776 [54:54<1:27:08,  1.46it/s] 40%|████      | 5160/12776 [54:54<1:23:45,  1.52it/s]                                                       40%|████      | 5160/12776 [54:54<1:23:45,  1.52it/s] 40%|████      | 5161/12776 [54:55<1:18:56,  1.61it/s]                                                       40%|████      | 5161/12776 [54:55<1:18:56,  1.61it/s] 40%|████      | 5162/12776 [54:55<1:19:20,  1.60it/s]                                                       40%|████      | 5162/12776 [54:55<1:19:20,  1.60it/s] 40%|████      | 5163/12776 [54:56<1:13:24,  1.73it/s]                                                       40%|████      | 5163/12776 [54:56<1:13:24,  1.73it/s] 40%|████      | 5164/12776 [54:56<1:08:18,  1.86it/s]                                                       40%|████      | 5164/12776 [54:56<1:08:18,  1.86it/s] 40%|████      | 5165/12776 [54:57<1:06:21,  1.91it/s]                                                       40%|████      | 5165/12776 [54:57<1:06:21,  1.91it/s] 40%|████      | 5166/12776 [54:57<1:01:51,  2.05it/s]                                                       40%|████      | 5166/12776 [54:57<1:01:51,  2.05it/s] 40%|████      | 5167/12776 [54:58<1:00:37,  2.09it/s]                                                       40%|████      | 5167/12776 [54:58<1:00:37,  2.09it/s] 40%|████      | 5168/12776 [54:58<56:45,  2.23it/s]                                                       40%|████      | 5168/12776 [54:58<56:45,  2.23it/s] 40%|████      | 5169/12776 [54:58<53:23,  2.37it/s]                                                     40%|████      | 5169/12776 [54:58<53:23,  2.37it/s] 40%|████      | 5170/12776 [54:59<53:07,  2.39it/s]                                                     40%|████      | 5170/12776 [54:59<53:07,  2.39it/s] 40%|████      | 5171/12776 [54:59<50:05,  2.53it/s]                                                     40%|████      | 5171/12776 [54:59<50:05,  2.53it/s] 40%|████      | 5172/12776 [55:00<47:26,  2.67it/s]                                                     40%|████      | 5172/12776 [55:00<47:26,  2.67it/s] 40%|████      | 5173/12776 [55:00<44:54,  2.82it/s]                                                     40%|████      | 5173/12776 [55:00<44:54,  2.82it/s] 40%|████      | 5174/12776 [55:00<43:54,  2.89it/s]                                                     40%|████      | 5174/12776 [55:00<43:54,  2.89it/s] 41%|████      | 5175/12776 [55:00<41:45,  3.03it/s]                                                     41%|████      | 5175/12776 [55:00<41:45,  3.03it/s] 41%|████      | 5176/12776 [55:01<39:55,  3.17it/s]                                                     41%|████      | 5176/12776 [55:01<39:55,  3.17it/s] 41%|████      | 5177/12776 [55:01<38:22,  3.30it/s]                                                     41%|████      | 5177/12776 [55:01<38:22,  3.30it/s] 41%|████      | 5178/12776 [55:01<37:46,  3.35it/s]                                                     41%|████      | 5178/12776 [55:01<37:46,  3.35it/s] 41%|████      | 5179/12776 [55:02<36:22,  3.48it/s]                                                     41%|████      | 5179/12776 [55:02<36:22,  3.48it/s] 41%|████      | 5180/12776 [55:02<35:07,  3.60it/s]                                                     41%|████      | 5180/12776 [55:02<35:07,  3.60it/s] 41%|████      | 5181/12776 [55:02<34:02,  3.72it/s]                                                     41%|████      | 5181/12776 [55:02<34:02,  3.72it/s] 41%|████      | 5182/12776 [55:02<37:03,  3.41it/s]                                                     41%|████      | 5182/12776 [55:02<37:03,  3.41it/s] 41%|████      | 5183/12776 [55:03<35:04,  3.61it/s]                                                     41%|████      | 5183/12776 [55:03<35:04,  3.61it/s] 41%|████      | 5184/12776 [55:03<33:21,  3.79it/s]                                                     41%|████      | 5184/12776 [55:03<33:21,  3.79it/s] 41%|████      | 5185/12776 [55:03<32:01,  3.95it/s]                                                     41%|████      | 5185/12776 [55:03<32:01,  3.95it/s] 41%|████      | 5186/12776 [55:03<33:21,  3.79it/s]                                                     41%|████      | 5186/12776 [55:03<33:21,  3.79it/s] 41%|████      | 5187/12776 [55:04<31:25,  4.02it/s]                                                     41%|████      | 5187/12776 [55:04<31:25,  4.02it/s] 41%|████      | 5188/12776 [55:04<29:59,  4.22it/s]                                                     41%|████      | 5188/12776 [55:04<29:59,  4.22it/s] 41%|████      | 5189/12776 [55:04<29:01,  4.36it/s]                                                     41%|████      | 5189/12776 [55:04<29:01,  4.36it/s] 41%|████      | 5190/12776 [55:04<28:25,  4.45it/s]                                                     41%|████      | 5190/12776 [55:04<28:25,  4.45it/s] 41%|████      | 5191/12776 [55:05<31:05,  4.07it/s]                                                     41%|████      | 5191/12776 [55:05<31:05,  4.07it/s] 41%|████      | 5192/12776 [55:05<29:35,  4.27it/s]                                                     41%|████      | 5192/12776 [55:05<29:35,  4.27it/s] 41%|████      | 5193/12776 [55:05<28:44,  4.40it/s]                                                     41%|████      | 5193/12776 [55:05<28:44,  4.40it/s] 41%|████      | 5194/12776 [55:05<27:44,  4.56it/s]                                                     41%|████      | 5194/12776 [55:05<27:44,  4.56it/s] 41%|████      | 5195/12776 [55:05<26:55,  4.69it/s]                                                     41%|████      | 5195/12776 [55:05<26:55,  4.69it/s] 41%|████      | 5196/12776 [55:06<29:49,  4.24it/s]                                                     41%|████      | 5196/12776 [55:06<29:49,  4.24it/s] 41%|████      | 5197/12776 [55:06<28:14,  4.47it/s]                                                     41%|████      | 5197/12776 [55:06<28:14,  4.47it/s] 41%|████      | 5198/12776 [55:06<26:57,  4.68it/s]                                                     41%|████      | 5198/12776 [55:06<26:57,  4.68it/s] 41%|████      | 5199/12776 [55:06<25:55,  4.87it/s]                                                     41%|████      | 5199/12776 [55:06<25:55,  4.87it/s] 41%|████      | 5200/12776 [55:07<45:01,  2.80it/s]                                                     41%|████      | 5200/12776 [55:07<45:01,  2.80it/s]Saving model checkpoint to ./checkpoint-5200
+Configuration saved in ./checkpoint-5200/config.json
+Model weights saved in ./checkpoint-5200/model.safetensors
+Feature extractor saved in ./checkpoint-5200/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-5200/tokenizer_config.json
+Special tokens file saved in ./checkpoint-5200/special_tokens_map.json
+added tokens file saved in ./checkpoint-5200/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-4000] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 41%|████      | 5201/12776 [55:14<4:49:32,  2.29s/it]                                                       41%|████      | 5201/12776 [55:14<4:49:32,  2.29s/it] 41%|████      | 5202/12776 [55:15<4:00:19,  1.90s/it]                                                       41%|████      | 5202/12776 [55:15<4:00:19,  1.90s/it] 41%|████      | 5203/12776 [55:16<3:19:32,  1.58s/it]                                                       41%|████      | 5203/12776 [55:16<3:19:32,  1.58s/it] 41%|████      | 5204/12776 [55:16<2:48:06,  1.33s/it]                                                       41%|████      | 5204/12776 [55:16<2:48:06,  1.33s/it] 41%|████      | 5205/12776 [55:17<2:24:22,  1.14s/it]                                                       41%|████      | 5205/12776 [55:17<2:24:22,  1.14s/it] 41%|████      | 5206/12776 [55:18<2:05:46,  1.00it/s]                                                       41%|████      | 5206/12776 [55:18<2:05:46,  1.00it/s] 41%|████      | 5207/12776 [55:18<1:56:24,  1.08it/s]                                                       41%|████      | 5207/12776 [55:18<1:56:24,  1.08it/s] 41%|████      | 5208/12776 [55:19<1:43:51,  1.21it/s]                                                       41%|████      | 5208/12776 [55:19<1:43:51,  1.21it/s] 41%|████      | 5209/12776 [55:20<1:35:00,  1.33it/s]                                                       41%|████      | 5209/12776 [55:20<1:35:00,  1.33it/s] 41%|████      | 5210/12776 [55:20<1:26:11,  1.46it/s]                                                       41%|████      | 5210/12776 [55:20<1:26:11,  1.46it/s] 41%|████      | 5211/12776 [55:21<1:22:12,  1.53it/s]                                                       41%|████      | 5211/12776 [55:21<1:22:12,  1.53it/s] 41%|████      | 5212/12776 [55:21<1:15:20,  1.67it/s]                                                       41%|████      | 5212/12776 [55:21<1:15:20,  1.67it/s] 41%|████      | 5213/12776 [55:22<1:14:29,  1.69it/s]                                                       41%|████      | 5213/12776 [55:22<1:14:29,  1.69it/s] 41%|████      | 5214/12776 [55:22<1:08:28,  1.84it/s]                                                       41%|████      | 5214/12776 [55:22<1:08:28,  1.84it/s] 41%|████      | 5215/12776 [55:23<1:07:06,  1.88it/s]                                                       41%|████      | 5215/12776 [55:23<1:07:06,  1.88it/s] 41%|████      | 5216/12776 [55:23<1:01:52,  2.04it/s]                                                       41%|████      | 5216/12776 [55:23<1:01:52,  2.04it/s] 41%|████      | 5217/12776 [55:23<57:38,  2.19it/s]                                                       41%|████      | 5217/12776 [55:23<57:38,  2.19it/s] 41%|████      | 5218/12776 [55:24<55:34,  2.27it/s]                                                     41%|████      | 5218/12776 [55:24<55:34,  2.27it/s] 41%|████      | 5219/12776 [55:24<52:21,  2.41it/s]                                                     41%|████      | 5219/12776 [55:24<52:21,  2.41it/s] 41%|████      | 5220/12776 [55:25<49:16,  2.56it/s]                                                     41%|████      | 5220/12776 [55:25<49:16,  2.56it/s] 41%|████      | 5221/12776 [55:25<51:00,  2.47it/s]                                                     41%|████      | 5221/12776 [55:25<51:00,  2.47it/s] 41%|████      | 5222/12776 [55:25<47:47,  2.63it/s]                                                     41%|████      | 5222/12776 [55:25<47:47,  2.63it/s] 41%|████      | 5223/12776 [55:26<45:10,  2.79it/s]                                                     41%|████      | 5223/12776 [55:26<45:10,  2.79it/s] 41%|████      | 5224/12776 [55:26<43:05,  2.92it/s]                                                     41%|████      | 5224/12776 [55:26<43:05,  2.92it/s] 41%|████      | 5225/12776 [55:26<43:06,  2.92it/s]                                                     41%|████      | 5225/12776 [55:26<43:06,  2.92it/s] 41%|████      | 5226/12776 [55:27<40:32,  3.10it/s]                                                     41%|████      | 5226/12776 [55:27<40:32,  3.10it/s] 41%|████      | 5227/12776 [55:27<38:39,  3.26it/s]                                                     41%|████      | 5227/12776 [55:27<38:39,  3.26it/s] 41%|████      | 5228/12776 [55:27<36:54,  3.41it/s]                                                     41%|████      | 5228/12776 [55:27<36:54,  3.41it/s] 41%|████      | 5229/12776 [55:27<37:06,  3.39it/s]                                                     41%|████      | 5229/12776 [55:27<37:06,  3.39it/s] 41%|████      | 5230/12776 [55:28<35:21,  3.56it/s]                                                     41%|████      | 5230/12776 [55:28<35:21,  3.56it/s] 41%|████      | 5231/12776 [55:28<33:50,  3.72it/s]                                                     41%|████      | 5231/12776 [55:28<33:50,  3.72it/s] 41%|████      | 5232/12776 [55:28<32:28,  3.87it/s]                                                    {'loss': 0.3663, 'grad_norm': 0.5704903602600098, 'learning_rate': 0.00018651026392961875, 'epoch': 0.81}
+{'loss': 0.5071, 'grad_norm': 1.178697943687439, 'learning_rate': 0.00018648582600195503, 'epoch': 0.81}
+{'loss': 0.4386, 'grad_norm': 0.8343028426170349, 'learning_rate': 0.00018646138807429128, 'epoch': 0.81}
+{'loss': 0.2441, 'grad_norm': 0.592948317527771, 'learning_rate': 0.00018643695014662756, 'epoch': 0.81}
+{'loss': 0.3678, 'grad_norm': 0.7243725061416626, 'learning_rate': 0.00018641251221896383, 'epoch': 0.81}
+{'loss': 0.3722, 'grad_norm': 0.8186553120613098, 'learning_rate': 0.00018638807429130006, 'epoch': 0.81}
+{'loss': 0.2819, 'grad_norm': 1.4765008687973022, 'learning_rate': 0.00018636363636363634, 'epoch': 0.81}
+{'loss': 0.3065, 'grad_norm': 1.4610421657562256, 'learning_rate': 0.00018633919843597262, 'epoch': 0.81}
+{'loss': 0.3565, 'grad_norm': 1.0413181781768799, 'learning_rate': 0.00018631476050830887, 'epoch': 0.81}
+{'loss': 0.9926, 'grad_norm': 9.378120422363281, 'learning_rate': 0.00018629032258064515, 'epoch': 0.81}
+{'loss': 0.5953, 'grad_norm': 1.3265831470489502, 'learning_rate': 0.00018626588465298142, 'epoch': 0.81}
+{'loss': 0.4746, 'grad_norm': 2.425917863845825, 'learning_rate': 0.00018624144672531768, 'epoch': 0.81}
+{'loss': 0.4585, 'grad_norm': 1.6301259994506836, 'learning_rate': 0.00018621700879765395, 'epoch': 0.81}
+{'loss': 1.0197, 'grad_norm': 3.910301446914673, 'learning_rate': 0.00018619257086999023, 'epoch': 0.81}
+{'loss': 0.5156, 'grad_norm': 1.4134447574615479, 'learning_rate': 0.00018616813294232646, 'epoch': 0.81}
+{'loss': 1.0762, 'grad_norm': 1.712280035018921, 'learning_rate': 0.00018614369501466273, 'epoch': 0.81}
+{'loss': 0.4157, 'grad_norm': 1.478226900100708, 'learning_rate': 0.000186119257086999, 'epoch': 0.81}
+{'loss': 0.4726, 'grad_norm': 1.3933120965957642, 'learning_rate': 0.00018609481915933526, 'epoch': 0.81}
+{'loss': 0.4534, 'grad_norm': 1.6714564561843872, 'learning_rate': 0.00018607038123167154, 'epoch': 0.81}
+{'loss': 0.689, 'grad_norm': 1.5796337127685547, 'learning_rate': 0.00018604594330400782, 'epoch': 0.81}
+{'loss': 0.8875, 'grad_norm': 1.966103196144104, 'learning_rate': 0.00018602150537634407, 'epoch': 0.81}
+{'loss': 0.7517, 'grad_norm': 1.9266483783721924, 'learning_rate': 0.00018599706744868035, 'epoch': 0.81}
+{'loss': 0.6377, 'grad_norm': 2.138370990753174, 'learning_rate': 0.0001859726295210166, 'epoch': 0.81}
+{'loss': 0.7177, 'grad_norm': 2.1694908142089844, 'learning_rate': 0.00018594819159335285, 'epoch': 0.81}
+{'loss': 0.7425, 'grad_norm': 2.8311069011688232, 'learning_rate': 0.00018592375366568913, 'epoch': 0.81}
+{'loss': 1.0659, 'grad_norm': 3.3369836807250977, 'learning_rate': 0.0001858993157380254, 'epoch': 0.81}
+{'loss': 0.6171, 'grad_norm': 1.3343548774719238, 'learning_rate': 0.00018587487781036166, 'epoch': 0.81}
+{'loss': 1.0402, 'grad_norm': 3.1208863258361816, 'learning_rate': 0.00018585043988269794, 'epoch': 0.81}
+{'loss': 0.8502, 'grad_norm': 2.674014091491699, 'learning_rate': 0.00018582600195503422, 'epoch': 0.81}
+{'loss': 0.7894, 'grad_norm': 2.035822868347168, 'learning_rate': 0.00018580156402737044, 'epoch': 0.81}
+{'loss': 1.1385, 'grad_norm': 3.007331371307373, 'learning_rate': 0.00018577712609970672, 'epoch': 0.81}
+{'loss': 0.7571, 'grad_norm': 1.874494194984436, 'learning_rate': 0.000185752688172043, 'epoch': 0.81}
+{'loss': 1.2572, 'grad_norm': 2.905369758605957, 'learning_rate': 0.00018572825024437925, 'epoch': 0.81}
+{'loss': 1.2583, 'grad_norm': 3.7571399211883545, 'learning_rate': 0.00018570381231671553, 'epoch': 0.81}
+{'loss': 1.8115, 'grad_norm': 2.9104902744293213, 'learning_rate': 0.0001856793743890518, 'epoch': 0.81}
+{'loss': 1.3185, 'grad_norm': 2.058563709259033, 'learning_rate': 0.00018565493646138806, 'epoch': 0.81}
+{'loss': 1.1396, 'grad_norm': 3.4244906902313232, 'learning_rate': 0.00018563049853372434, 'epoch': 0.81}
+{'loss': 1.322, 'grad_norm': 1.9940974712371826, 'learning_rate': 0.00018560606060606061, 'epoch': 0.81}
+{'loss': 1.2332, 'grad_norm': 2.131615161895752, 'learning_rate': 0.00018558162267839684, 'epoch': 0.81}
+{'loss': 0.9671, 'grad_norm': 2.615813970565796, 'learning_rate': 0.00018555718475073312, 'epoch': 0.81}
+{'loss': 1.0884, 'grad_norm': 1.4775357246398926, 'learning_rate': 0.0001855327468230694, 'epoch': 0.81}
+{'loss': 0.7112, 'grad_norm': 5.091252326965332, 'learning_rate': 0.00018550830889540565, 'epoch': 0.81}
+{'loss': 0.4947, 'grad_norm': 1.498961329460144, 'learning_rate': 0.00018548387096774192, 'epoch': 0.81}
+{'loss': 0.8125, 'grad_norm': 2.671917676925659, 'learning_rate': 0.0001854594330400782, 'epoch': 0.81}
+{'loss': 0.6217, 'grad_norm': 1.2073392868041992, 'learning_rate': 0.00018543499511241445, 'epoch': 0.81}
+{'loss': 1.5128, 'grad_norm': 2.6718733310699463, 'learning_rate': 0.0001854105571847507, 'epoch': 0.81}
+{'loss': 0.2958, 'grad_norm': 0.4015403687953949, 'learning_rate': 0.00018538611925708698, 'epoch': 0.81}
+{'loss': 0.211, 'grad_norm': 0.7443885803222656, 'learning_rate': 0.00018536168132942323, 'epoch': 0.81}
+{'loss': 0.3133, 'grad_norm': 0.7921191453933716, 'learning_rate': 0.0001853372434017595, 'epoch': 0.81}
+{'loss': 0.2662, 'grad_norm': 0.7802867293357849, 'learning_rate': 0.0001853128054740958, 'epoch': 0.81}
+{'loss': 0.3204, 'grad_norm': 0.7072981595993042, 'learning_rate': 0.00018528836754643204, 'epoch': 0.81}
+{'loss': 0.2573, 'grad_norm': 0.6935706734657288, 'learning_rate': 0.00018526392961876832, 'epoch': 0.81}
+{'loss': 0.305, 'grad_norm': 0.6448320746421814, 'learning_rate': 0.0001852394916911046, 'epoch': 0.82}
+{'loss': 0.2683, 'grad_norm': 0.5979820489883423, 'learning_rate': 0.00018521505376344082, 'epoch': 0.82}
+{'loss': 0.3316, 'grad_norm': 0.9508888125419617, 'learning_rate': 0.0001851906158357771, 'epoch': 0.82}
+{'loss': 0.2239, 'grad_norm': 0.6390425562858582, 'learning_rate': 0.00018516617790811338, 'epoch': 0.82}
+{'loss': 0.4099, 'grad_norm': 0.9599819779396057, 'learning_rate': 0.00018514173998044963, 'epoch': 0.82}
+{'loss': 0.3733, 'grad_norm': 1.198017954826355, 'learning_rate': 0.0001851173020527859, 'epoch': 0.82}
+{'loss': 0.3879, 'grad_norm': 1.6387499570846558, 'learning_rate': 0.0001850928641251222, 'epoch': 0.82}
+{'loss': 0.6261, 'grad_norm': 1.1071934700012207, 'learning_rate': 0.00018506842619745844, 'epoch': 0.82}
+{'loss': 0.2503, 'grad_norm': 0.7644991278648376, 'learning_rate': 0.00018504398826979472, 'epoch': 0.82}
+{'loss': 0.5133, 'grad_norm': 0.9139405488967896, 'learning_rate': 0.000185019550342131, 'epoch': 0.82}
+{'loss': 0.4415, 'grad_norm': 0.9907679557800293, 'learning_rate': 0.00018499511241446722, 'epoch': 0.82}
+{'loss': 0.491, 'grad_norm': 0.9434964060783386, 'learning_rate': 0.0001849706744868035, 'epoch': 0.82}
+{'loss': 0.673, 'grad_norm': 1.1994105577468872, 'learning_rate': 0.00018494623655913978, 'epoch': 0.82}
+{'loss': 0.5795, 'grad_norm': 1.1028125286102295, 'learning_rate': 0.00018492179863147603, 'epoch': 0.82}
+{'loss': 0.5664, 'grad_norm': 0.9780545234680176, 'learning_rate': 0.0001848973607038123, 'epoch': 0.82}
+{'loss': 0.6478, 'grad_norm': 2.6875715255737305, 'learning_rate': 0.00018487292277614859, 'epoch': 0.82}
+{'loss': 0.6378, 'grad_norm': 1.439847469329834, 'learning_rate': 0.00018484848484848484, 'epoch': 0.82}
+{'loss': 0.8491, 'grad_norm': 1.5552678108215332, 'learning_rate': 0.0001848240469208211, 'epoch': 0.82}
+{'loss': 0.8801, 'grad_norm': 2.3292577266693115, 'learning_rate': 0.00018479960899315737, 'epoch': 0.82}
+{'loss': 0.7684, 'grad_norm': 1.8641091585159302, 'learning_rate': 0.00018477517106549362, 'epoch': 0.82}
+{'loss': 0.5417, 'grad_norm': 1.5867455005645752, 'learning_rate': 0.0001847507331378299, 'epoch': 0.82}
+{'loss': 1.1114, 'grad_norm': 3.227484941482544, 'learning_rate': 0.00018472629521016617, 'epoch': 0.82}
+{'loss': 0.6013, 'grad_norm': 1.3463890552520752, 'learning_rate': 0.00018470185728250243, 'epoch': 0.82}
+{'loss': 0.82, 'grad_norm': 2.245208501815796, 'learning_rate': 0.0001846774193548387, 'epoch': 0.82}
+{'loss': 0.6866, 'grad_norm': 1.4095137119293213, 'learning_rate': 0.00018465298142717498, 'epoch': 0.82}
+ 41%|████      | 5232/12776 [55:28<32:28,  3.87it/s] 41%|████      | 5233/12776 [55:28<33:48,  3.72it/s]                                                     41%|████      | 5233/12776 [55:28<33:48,  3.72it/s] 41%|████      | 5234/12776 [55:29<31:54,  3.94it/s]                                                     41%|████      | 5234/12776 [55:29<31:54,  3.94it/s] 41%|████      | 5235/12776 [55:29<30:23,  4.14it/s]                                                     41%|████      | 5235/12776 [55:29<30:23,  4.14it/s] 41%|████      | 5236/12776 [55:29<29:06,  4.32it/s]                                                     41%|████      | 5236/12776 [55:29<29:06,  4.32it/s] 41%|████      | 5237/12776 [55:29<28:03,  4.48it/s]                                                     41%|████      | 5237/12776 [55:29<28:03,  4.48it/s] 41%|████      | 5238/12776 [55:29<29:45,  4.22it/s]                                                     41%|████      | 5238/12776 [55:29<29:45,  4.22it/s] 41%|████      | 5239/12776 [55:30<28:04,  4.47it/s]                                                     41%|████      | 5239/12776 [55:30<28:04,  4.47it/s] 41%|████      | 5240/12776 [55:30<26:44,  4.70it/s]                                                     41%|████      | 5240/12776 [55:30<26:44,  4.70it/s] 41%|████      | 5241/12776 [55:30<25:46,  4.87it/s]                                                     41%|████      | 5241/12776 [55:30<25:46,  4.87it/s] 41%|████      | 5242/12776 [55:30<24:57,  5.03it/s]                                                     41%|████      | 5242/12776 [55:30<24:57,  5.03it/s] 41%|████      | 5243/12776 [55:30<24:15,  5.18it/s]                                                     41%|████      | 5243/12776 [55:30<24:15,  5.18it/s] 41%|████      | 5244/12776 [55:31<28:30,  4.40it/s]                                                     41%|████      | 5244/12776 [55:31<28:30,  4.40it/s] 41%|████      | 5245/12776 [55:31<26:30,  4.74it/s]                                                     41%|████      | 5245/12776 [55:31<26:30,  4.74it/s] 41%|████      | 5246/12776 [55:31<24:59,  5.02it/s]                                                     41%|████      | 5246/12776 [55:31<24:59,  5.02it/s] 41%|████      | 5247/12776 [55:31<23:53,  5.25it/s]                                                     41%|████      | 5247/12776 [55:31<23:53,  5.25it/s] 41%|████      | 5248/12776 [55:31<22:50,  5.49it/s]                                                     41%|████      | 5248/12776 [55:31<22:50,  5.49it/s] 41%|████      | 5249/12776 [55:32<22:02,  5.69it/s]                                                     41%|████      | 5249/12776 [55:32<22:02,  5.69it/s] 41%|████      | 5250/12776 [55:32<39:22,  3.19it/s]                                                     41%|████      | 5250/12776 [55:32<39:22,  3.19it/s] 41%|████      | 5251/12776 [55:33<1:15:17,  1.67it/s]                                                       41%|████      | 5251/12776 [55:33<1:15:17,  1.67it/s] 41%|████      | 5252/12776 [55:34<1:29:05,  1.41it/s]                                                       41%|████      | 5252/12776 [55:34<1:29:05,  1.41it/s] 41%|████      | 5253/12776 [55:35<1:33:50,  1.34it/s]                                                       41%|████      | 5253/12776 [55:35<1:33:50,  1.34it/s] 41%|████      | 5254/12776 [55:36<1:33:37,  1.34it/s]                                                       41%|████      | 5254/12776 [55:36<1:33:37,  1.34it/s] 41%|████      | 5255/12776 [55:37<1:35:00,  1.32it/s]                                                       41%|████      | 5255/12776 [55:37<1:35:00,  1.32it/s] 41%|████      | 5256/12776 [55:38<1:34:30,  1.33it/s]                                                       41%|████      | 5256/12776 [55:38<1:34:30,  1.33it/s] 41%|████      | 5257/12776 [55:38<1:31:02,  1.38it/s]                                                       41%|████      | 5257/12776 [55:38<1:31:02,  1.38it/s] 41%|████      | 5258/12776 [55:39<1:32:20,  1.36it/s]                                                       41%|████      | 5258/12776 [55:39<1:32:20,  1.36it/s] 41%|████      | 5259/12776 [55:40<1:26:46,  1.44it/s]                                                       41%|████      | 5259/12776 [55:40<1:26:46,  1.44it/s] 41%|████      | 5260/12776 [55:40<1:24:09,  1.49it/s]                                                       41%|████      | 5260/12776 [55:40<1:24:09,  1.49it/s] 41%|████      | 5261/12776 [55:41<1:19:23,  1.58it/s]                                                       41%|████      | 5261/12776 [55:41<1:19:23,  1.58it/s] 41%|████      | 5262/12776 [55:41<1:15:17,  1.66it/s]                                                       41%|████      | 5262/12776 [55:41<1:15:17,  1.66it/s] 41%|████      | 5263/12776 [55:42<1:10:06,  1.79it/s]                                                       41%|████      | 5263/12776 [55:42<1:10:06,  1.79it/s] 41%|████      | 5264/12776 [55:42<1:10:16,  1.78it/s]                                                       41%|████      | 5264/12776 [55:42<1:10:16,  1.78it/s] 41%|████      | 5265/12776 [55:43<1:04:19,  1.95it/s]                                                       41%|████      | 5265/12776 [55:43<1:04:19,  1.95it/s] 41%|████      | 5266/12776 [55:43<59:21,  2.11it/s]                                                       41%|████      | 5266/12776 [55:43<59:21,  2.11it/s] 41%|████      | 5267/12776 [55:44<58:02,  2.16it/s]                                                     41%|██���█      | 5267/12776 [55:44<58:02,  2.16it/s] 41%|████      | 5268/12776 [55:44<53:44,  2.33it/s]                                                     41%|████      | 5268/12776 [55:44<53:44,  2.33it/s] 41%|████      | 5269/12776 [55:44<50:07,  2.50it/s]                                                     41%|████      | 5269/12776 [55:44<50:07,  2.50it/s] 41%|████      | 5270/12776 [55:45<49:25,  2.53it/s]                                                     41%|████      | 5270/12776 [55:45<49:25,  2.53it/s] 41%|████▏     | 5271/12776 [55:45<47:21,  2.64it/s]                                                     41%|████▏     | 5271/12776 [55:45<47:21,  2.64it/s] 41%|████▏     | 5272/12776 [55:45<44:28,  2.81it/s]                                                     41%|████▏     | 5272/12776 [55:45<44:28,  2.81it/s] 41%|████▏     | 5273/12776 [55:46<44:36,  2.80it/s]                                                     41%|████▏     | 5273/12776 [55:46<44:36,  2.80it/s] 41%|████▏     | 5274/12776 [55:46<41:28,  3.01it/s]                                                     41%|████▏     | 5274/12776 [55:46<41:28,  3.01it/s] 41%|████▏     | 5275/12776 [55:46<39:14,  3.19it/s]                                                     41%|████▏     | 5275/12776 [55:46<39:14,  3.19it/s] 41%|████▏     | 5276/12776 [55:46<37:20,  3.35it/s]                                                     41%|████▏     | 5276/12776 [55:46<37:20,  3.35it/s] 41%|████▏     | 5277/12776 [55:47<39:11,  3.19it/s]                                                     41%|████▏     | 5277/12776 [55:47<39:11,  3.19it/s] 41%|████▏     | 5278/12776 [55:47<36:45,  3.40it/s]                                                     41%|████▏     | 5278/12776 [55:47<36:45,  3.40it/s] 41%|████▏     | 5279/12776 [55:47<34:51,  3.58it/s]                                                     41%|████▏     | 5279/12776 [55:47<34:51,  3.58it/s] 41%|████▏     | 5280/12776 [55:47<33:06,  3.77it/s]                                                     41%|████▏     | 5280/12776 [55:47<33:06,  3.77it/s] 41%|████▏     | 5281/12776 [55:48<31:40,  3.94it/s]                                                     41%|████▏     | 5281/12776 [55:48<31:40,  3.94it/s] 41%|████▏     | 5282/12776 [55:48<33:31,  3.72it/s]                                                     41%|████▏     | 5282/12776 [55:48<33:31,  3.72it/s] 41%|████▏     | 5283/12776 [55:48<31:41,  3.94it/s]                                                     41%|████▏     | 5283/12776 [55:48<31:41,  3.94it/s] 41%|████▏     | 5284/12776 [55:48<30:56,  4.04it/s]                                                     41%|████▏     | 5284/12776 [55:48<30:56,  4.04it/s] 41%|████▏     | 5285/12776 [55:49<30:21,  4.11it/s]                                                     41%|████▏     | 5285/12776 [55:49<30:21,  4.11it/s] 41%|████▏     | 5286/12776 [55:49<32:54,  3.79it/s]                                                     41%|████▏     | 5286/12776 [55:49<32:54,  3.79it/s] 41%|████▏     | 5287/12776 [55:49<31:35,  3.95it/s]                                                     41%|████▏     | 5287/12776 [55:49<31:35,  3.95it/s] 41%|████▏     | 5288/12776 [55:49<30:36,  4.08it/s]                                                     41%|████▏     | 5288/12776 [55:49<30:36,  4.08it/s] 41%|████▏     | 5289/12776 [55:50<30:27,  4.10it/s]                                                     41%|████▏     | 5289/12776 [55:50<30:27,  4.10it/s] 41%|████▏     | 5290/12776 [55:50<29:41,  4.20it/s]                                                     41%|████▏     | 5290/12776 [55:50<29:41,  4.20it/s] 41%|████▏     | 5291/12776 [55:50<32:08,  3.88it/s]                                                     41%|████▏     | 5291/12776 [55:50<32:08,  3.88it/s] 41%|████▏     | 5292/12776 [55:50<30:34,  4.08it/s]                                                     41%|████▏     | 5292/12776 [55:50<30:34,  4.08it/s] 41%|████▏     | 5293/12776 [55:51<28:56,  4.31it/s]                                                     41%|████▏     | 5293/12776 [55:51<28:56,  4.31it/s] 41%|████▏     | 5294/12776 [55:51<27:39,  4.51it/s]                                                     41%|████▏     | 5294/12776 [55:51<27:39,  4.51it/s] 41%|████▏     | 5295/12776 [55:51<27:10,  4.59it/s]                                                     41%|████▏     | 5295/12776 [55:51<27:10,  4.59it/s] 41%|████▏     | 5296/12776 [55:51<30:54,  4.03it/s]                                                     41%|████▏     | 5296/12776 [55:51<30:54,  4.03it/s] 41%|████▏     | 5297/12776 [55:52<28:50,  4.32it/s]                                                     41%|████▏     | 5297/12776 [55:52<28:50,  4.32it/s] 41%|████▏     | 5298/12776 [55:52<27:11,  4.58it/s]                                                     41%|████▏     | 5298/12776 [55:52<27:11,  4.58it/s] 41%|████▏     | 5299/12776 [55:52<25:55,  4.81it/s]                                                     41%|████▏     | 5299/12776 [55:52<25:55,  4.81it/s] 41%|████▏     | 5300/12776 [55:53<45:33,  2.74it/s]                                                     41%|████▏     | 5300/12776 [55:53<45:33,  2.74it/s] 41%|████▏     | 5301/12776 [55:54<1:23:08,  1.50it/s]                                                       41%|████▏     | 5301/12776 [55:54<1:23:08,  1.50it/s] 41%|████▏     | 5302/12776 [55:55<1:37:20,  1.28it/s]                                                       41%|████▏     | 5302/12776 [55:55<1:37:20,  1.28it/s] 42%|████▏     | 5303/12776 [55:56<1:40:36,  1.24it/s]                                                       42%|████▏     | 5303/12776 [55:56<1:40:36,  1.24it/s] 42%|████▏     | 5304/12776 [55:57<1:40:07,  1.24it/s]                                                       42%|████▏     | 5304/12776 [55:57<1:40:07,  1.24it/s] 42%|████▏     | 5305/12776 [55:57<1:38:03,  1.27it/s]                                                       42%|████▏     | 5305/12776 [55:57<1:38:03,  1.27it/s] 42%|████▏     | 5306/12776 [55:58<1:33:54,  1.33it/s]                                                       42%|████▏     | 5306/12776 [55:58<1:33:54,  1.33it/s] 42%|████▏     | 5307/12776 [55:59<1:32:13,  1.35it/s]                                                       42%|████▏     | 5307/12776 [55:59<1:32:13,  1.35it/s] 42%|████▏     | 5308/12776 [56:00<1:29:53,  1.38it/s]                                                       42%|████▏     | 5308/12776 [56:00<1:29:53,  1.38it/s] 42%|████▏     | 5309/12776 [56:00<1:25:16,  1.46it/s]                                                      {'loss': 0.7222, 'grad_norm': 2.2786121368408203, 'learning_rate': 0.0001846285434995112, 'epoch': 0.82}
+{'loss': 0.7086, 'grad_norm': 1.5390690565109253, 'learning_rate': 0.00018460410557184748, 'epoch': 0.82}
+{'loss': 0.9285, 'grad_norm': 1.4006104469299316, 'learning_rate': 0.00018457966764418376, 'epoch': 0.82}
+{'loss': 0.7367, 'grad_norm': 1.5036606788635254, 'learning_rate': 0.00018455522971652001, 'epoch': 0.82}
+{'loss': 0.7435, 'grad_norm': 1.538991928100586, 'learning_rate': 0.0001845307917888563, 'epoch': 0.82}
+{'loss': 0.7751, 'grad_norm': 1.5713046789169312, 'learning_rate': 0.00018450635386119257, 'epoch': 0.82}
+{'loss': 1.1718, 'grad_norm': 2.501213788986206, 'learning_rate': 0.00018448191593352882, 'epoch': 0.82}
+{'loss': 1.362, 'grad_norm': 2.6545050144195557, 'learning_rate': 0.0001844574780058651, 'epoch': 0.82}
+{'loss': 1.4286, 'grad_norm': 2.392634153366089, 'learning_rate': 0.00018443304007820138, 'epoch': 0.82}
+{'loss': 1.2579, 'grad_norm': 1.6317548751831055, 'learning_rate': 0.0001844086021505376, 'epoch': 0.82}
+{'loss': 1.1427, 'grad_norm': 8.814602851867676, 'learning_rate': 0.00018438416422287388, 'epoch': 0.82}
+{'loss': 1.6293, 'grad_norm': 3.1028459072113037, 'learning_rate': 0.00018435972629521016, 'epoch': 0.82}
+{'loss': 1.5323, 'grad_norm': 2.7763304710388184, 'learning_rate': 0.0001843352883675464, 'epoch': 0.82}
+{'loss': 2.2438, 'grad_norm': 3.5093159675598145, 'learning_rate': 0.0001843108504398827, 'epoch': 0.82}
+{'loss': 1.2866, 'grad_norm': 2.848104953765869, 'learning_rate': 0.00018428641251221897, 'epoch': 0.82}
+{'loss': 0.6522, 'grad_norm': 1.3957146406173706, 'learning_rate': 0.00018426197458455522, 'epoch': 0.82}
+{'loss': 0.663, 'grad_norm': 4.863888740539551, 'learning_rate': 0.00018423753665689147, 'epoch': 0.82}
+{'loss': 1.2547, 'grad_norm': 2.6501758098602295, 'learning_rate': 0.00018421309872922775, 'epoch': 0.82}
+{'loss': 1.0807, 'grad_norm': 1.6998775005340576, 'learning_rate': 0.000184188660801564, 'epoch': 0.82}
+{'loss': 0.3135, 'grad_norm': 0.7901161909103394, 'learning_rate': 0.00018416422287390028, 'epoch': 0.82}
+{'loss': 0.4211, 'grad_norm': 0.7049028277397156, 'learning_rate': 0.00018413978494623656, 'epoch': 0.82}
+{'loss': 0.2857, 'grad_norm': 0.6811177134513855, 'learning_rate': 0.0001841153470185728, 'epoch': 0.82}
+{'loss': 0.35, 'grad_norm': 0.7868577241897583, 'learning_rate': 0.00018409090909090909, 'epoch': 0.82}
+{'loss': 0.4374, 'grad_norm': 0.7949536442756653, 'learning_rate': 0.00018406647116324536, 'epoch': 0.82}
+{'loss': 0.427, 'grad_norm': 0.6903027296066284, 'learning_rate': 0.0001840420332355816, 'epoch': 0.82}
+{'loss': 0.5036, 'grad_norm': 0.6659723520278931, 'learning_rate': 0.00018401759530791787, 'epoch': 0.82}
+{'loss': 0.3561, 'grad_norm': 0.5568656325340271, 'learning_rate': 0.00018399315738025415, 'epoch': 0.82}
+{'loss': 0.3908, 'grad_norm': 0.8612244129180908, 'learning_rate': 0.0001839687194525904, 'epoch': 0.82}
+{'loss': 0.5201, 'grad_norm': 0.7384498119354248, 'learning_rate': 0.00018394428152492667, 'epoch': 0.82}
+{'loss': 0.413, 'grad_norm': 0.9855079054832458, 'learning_rate': 0.00018391984359726295, 'epoch': 0.82}
+{'loss': 0.4253, 'grad_norm': 0.768484890460968, 'learning_rate': 0.0001838954056695992, 'epoch': 0.82}
+{'loss': 0.2572, 'grad_norm': 0.6024149060249329, 'learning_rate': 0.00018387096774193548, 'epoch': 0.82}
+{'loss': 0.2692, 'grad_norm': 0.5036283135414124, 'learning_rate': 0.00018384652981427176, 'epoch': 0.82}
+{'loss': 0.4799, 'grad_norm': 1.0418510437011719, 'learning_rate': 0.00018382209188660799, 'epoch': 0.82}
+{'loss': 0.3185, 'grad_norm': 0.663507342338562, 'learning_rate': 0.00018379765395894426, 'epoch': 0.82}
+{'loss': 0.337, 'grad_norm': 0.8213881254196167, 'learning_rate': 0.00018377321603128054, 'epoch': 0.82}
+{'loss': 0.4151, 'grad_norm': 1.1728672981262207, 'learning_rate': 0.0001837487781036168, 'epoch': 0.82}
+{'loss': 0.6478, 'grad_norm': 1.2628549337387085, 'learning_rate': 0.00018372434017595307, 'epoch': 0.82}
+{'loss': 0.4379, 'grad_norm': 1.121236801147461, 'learning_rate': 0.00018369990224828935, 'epoch': 0.82}
+{'loss': 0.461, 'grad_norm': 1.1426113843917847, 'learning_rate': 0.00018367546432062557, 'epoch': 0.83}
+{'loss': 0.493, 'grad_norm': 1.555210828781128, 'learning_rate': 0.00018365102639296185, 'epoch': 0.83}
+{'loss': 0.5785, 'grad_norm': 1.8274154663085938, 'learning_rate': 0.00018362658846529813, 'epoch': 0.83}
+{'loss': 0.71, 'grad_norm': 1.771406888961792, 'learning_rate': 0.00018360215053763438, 'epoch': 0.83}
+{'loss': 0.8284, 'grad_norm': 3.0321593284606934, 'learning_rate': 0.00018357771260997066, 'epoch': 0.83}
+{'loss': 1.0345, 'grad_norm': 4.053110599517822, 'learning_rate': 0.00018355327468230694, 'epoch': 0.83}
+{'loss': 0.8215, 'grad_norm': 1.7022113800048828, 'learning_rate': 0.0001835288367546432, 'epoch': 0.83}
+{'loss': 1.186, 'grad_norm': 2.7974987030029297, 'learning_rate': 0.00018350439882697947, 'epoch': 0.83}
+{'loss': 0.7569, 'grad_norm': 2.123547077178955, 'learning_rate': 0.00018347996089931575, 'epoch': 0.83}
+{'loss': 1.1509, 'grad_norm': 2.535295009613037, 'learning_rate': 0.00018345552297165197, 'epoch': 0.83}
+{'loss': 0.6985, 'grad_norm': 2.005593776702881, 'learning_rate': 0.00018343108504398825, 'epoch': 0.83}
+{'loss': 1.401, 'grad_norm': 2.139389753341675, 'learning_rate': 0.00018340664711632453, 'epoch': 0.83}
+{'loss': 0.829, 'grad_norm': 1.5043542385101318, 'learning_rate': 0.00018338220918866078, 'epoch': 0.83}
+{'loss': 0.8851, 'grad_norm': 1.6007672548294067, 'learning_rate': 0.00018335777126099706, 'epoch': 0.83}
+{'loss': 1.3312, 'grad_norm': 2.1892812252044678, 'learning_rate': 0.00018333333333333334, 'epoch': 0.83}
+{'loss': 0.6038, 'grad_norm': 1.2320849895477295, 'learning_rate': 0.0001833088954056696, 'epoch': 0.83}
+{'loss': 0.669, 'grad_norm': 4.166670322418213, 'learning_rate': 0.00018328445747800586, 'epoch': 0.83}
+{'loss': 1.1425, 'grad_norm': 2.1780524253845215, 'learning_rate': 0.00018326001955034214, 'epoch': 0.83}
+{'loss': 1.1492, 'grad_norm': 2.1687676906585693, 'learning_rate': 0.00018323558162267837, 'epoch': 0.83}
+{'loss': 1.0186, 'grad_norm': 1.6996123790740967, 'learning_rate': 0.00018321114369501465, 'epoch': 0.83}
+{'loss': 1.1482, 'grad_norm': 3.4165749549865723, 'learning_rate': 0.00018318670576735092, 'epoch': 0.83}
+{'loss': 1.3908, 'grad_norm': 2.1775505542755127, 'learning_rate': 0.00018316226783968718, 'epoch': 0.83}
+{'loss': 1.1503, 'grad_norm': 2.812617063522339, 'learning_rate': 0.00018313782991202345, 'epoch': 0.83}
+{'loss': 1.3862, 'grad_norm': 1.5233439207077026, 'learning_rate': 0.00018311339198435973, 'epoch': 0.83}
+{'loss': 1.5704, 'grad_norm': 1.8722327947616577, 'learning_rate': 0.00018308895405669596, 'epoch': 0.83}
+{'loss': 1.0679, 'grad_norm': 2.6162047386169434, 'learning_rate': 0.00018306451612903223, 'epoch': 0.83}
+{'loss': 0.5608, 'grad_norm': 1.2465320825576782, 'learning_rate': 0.0001830400782013685, 'epoch': 0.83}
+{'loss': 0.8224, 'grad_norm': 1.294814109802246, 'learning_rate': 0.00018301564027370476, 'epoch': 0.83}
+{'loss': 1.3591, 'grad_norm': 2.261251926422119, 'learning_rate': 0.00018299120234604104, 'epoch': 0.83}
+{'loss': 1.4874, 'grad_norm': 1.4905081987380981, 'learning_rate': 0.00018296676441837732, 'epoch': 0.83}
+{'loss': 0.2785, 'grad_norm': 0.5138626098632812, 'learning_rate': 0.00018294232649071357, 'epoch': 0.83}
+{'loss': 0.3083, 'grad_norm': 0.5530992150306702, 'learning_rate': 0.00018291788856304985, 'epoch': 0.83}
+{'loss': 0.3063, 'grad_norm': 0.40813249349594116, 'learning_rate': 0.00018289345063538613, 'epoch': 0.83}
+{'loss': 0.3811, 'grad_norm': 0.5321076512336731, 'learning_rate': 0.00018286901270772235, 'epoch': 0.83}
+{'loss': 0.3492, 'grad_norm': 0.5852525234222412, 'learning_rate': 0.00018284457478005863, 'epoch': 0.83}
+{'loss': 0.4506, 'grad_norm': 0.7372064590454102, 'learning_rate': 0.0001828201368523949, 'epoch': 0.83}
+{'loss': 0.3694, 'grad_norm': 0.6820610761642456, 'learning_rate': 0.00018279569892473116, 'epoch': 0.83}
+{'loss': 0.4984, 'grad_norm': 1.2214698791503906, 'learning_rate': 0.00018277126099706744, 'epoch': 0.83}
+ 42%|████▏     | 5309/12776 [56:00<1:25:16,  1.46it/s] 42%|████▏     | 5310/12776 [56:01<1:23:31,  1.49it/s]                                                       42%|████▏     | 5310/12776 [56:01<1:23:31,  1.49it/s] 42%|████▏     | 5311/12776 [56:01<1:18:55,  1.58it/s]                                                       42%|████▏     | 5311/12776 [56:01<1:18:55,  1.58it/s] 42%|████▏     | 5312/12776 [56:02<1:17:18,  1.61it/s]                                                       42%|████▏     | 5312/12776 [56:02<1:17:18,  1.61it/s] 42%|████▏     | 5313/12776 [56:02<1:12:24,  1.72it/s]                                                       42%|████▏     | 5313/12776 [56:02<1:12:24,  1.72it/s] 42%|████▏     | 5314/12776 [56:03<1:10:21,  1.77it/s]                                                       42%|████▏     | 5314/12776 [56:03<1:10:21,  1.77it/s] 42%|████▏     | 5315/12776 [56:03<1:05:46,  1.89it/s]                                                       42%|████▏     | 5315/12776 [56:03<1:05:46,  1.89it/s] 42%|████▏     | 5316/12776 [56:04<1:04:28,  1.93it/s]                                                       42%|████▏     | 5316/12776 [56:04<1:04:28,  1.93it/s] 42%|████▏     | 5317/12776 [56:04<1:00:16,  2.06it/s]                                                       42%|████▏     | 5317/12776 [56:04<1:00:16,  2.06it/s] 42%|████▏     | 5318/12776 [56:05<56:46,  2.19it/s]                                                       42%|████▏     | 5318/12776 [56:05<56:46,  2.19it/s] 42%|████▏     | 5319/12776 [56:05<59:20,  2.09it/s]                                                     42%|████▏     | 5319/12776 [56:05<59:20,  2.09it/s] 42%|████▏     | 5320/12776 [56:06<55:25,  2.24it/s]                                                     42%|████▏     | 5320/12776 [56:06<55:25,  2.24it/s] 42%|████▏     | 5321/12776 [56:06<51:51,  2.40it/s]                                                     42%|████▏     | 5321/12776 [56:06<51:51,  2.40it/s] 42%|████▏     | 5322/12776 [56:06<52:23,  2.37it/s]                                                     42%|████▏     | 5322/12776 [56:06<52:23,  2.37it/s] 42%|████▏     | 5323/12776 [56:07<49:03,  2.53it/s]                                                     42%|████▏     | 5323/12776 [56:07<49:03,  2.53it/s] 42%|████▏     | 5324/12776 [56:07<46:48,  2.65it/s]                                                     42%|████▏     | 5324/12776 [56:07<46:48,  2.65it/s] 42%|████▏     | 5325/12776 [56:07<46:43,  2.66it/s]                                                     42%|████▏     | 5325/12776 [56:07<46:43,  2.66it/s] 42%|████▏     | 5326/12776 [56:08<43:46,  2.84it/s]                                                     42%|████▏     | 5326/12776 [56:08<43:46,  2.84it/s] 42%|████▏     | 5327/12776 [56:08<41:24,  3.00it/s]                                                     42%|████▏     | 5327/12776 [56:08<41:24,  3.00it/s] 42%|████▏     | 5328/12776 [56:08<43:22,  2.86it/s]                                                     42%|████▏     | 5328/12776 [56:08<43:22,  2.86it/s] 42%|████▏     | 5329/12776 [56:09<40:28,  3.07it/s]                                                     42%|████▏     | 5329/12776 [56:09<40:28,  3.07it/s] 42%|████▏     | 5330/12776 [56:09<38:17,  3.24it/s]                                                     42%|████▏     | 5330/12776 [56:09<38:17,  3.24it/s] 42%|████▏     | 5331/12776 [56:09<36:17,  3.42it/s]                                                     42%|████▏     | 5331/12776 [56:09<36:17,  3.42it/s] 42%|████▏     | 5332/12776 [56:09<37:48,  3.28it/s]                                                     42%|████▏     | 5332/12776 [56:09<37:48,  3.28it/s] 42%|████▏     | 5333/12776 [56:10<35:33,  3.49it/s]                                                     42%|████▏     | 5333/12776 [56:10<35:33,  3.49it/s] 42%|████▏     | 5334/12776 [56:10<33:52,  3.66it/s]                                                     42%|████▏     | 5334/12776 [56:10<33:52,  3.66it/s] 42%|████▏     | 5335/12776 [56:10<32:23,  3.83it/s]                                                     42%|████▏     | 5335/12776 [56:10<32:23,  3.83it/s] 42%|████▏     | 5336/12776 [56:10<31:39,  3.92it/s]                                                     42%|████▏     | 5336/12776 [56:10<31:39,  3.92it/s] 42%|████▏     | 5337/12776 [56:11<32:18,  3.84it/s]                                                     42%|████▏     | 5337/12776 [56:11<32:18,  3.84it/s] 42%|████▏     | 5338/12776 [56:11<30:34,  4.06it/s]                                                     42%|████▏     | 5338/12776 [56:11<30:34,  4.06it/s] 42%|████▏     | 5339/12776 [56:11<29:07,  4.25it/s]                                                     42%|████▏     | 5339/12776 [56:11<29:07,  4.25it/s] 42%|████▏     | 5340/12776 [56:11<28:14,  4.39it/s]                                                     42%|████▏     | 5340/12776 [56:11<28:14,  4.39it/s] 42%|████▏     | 5341/12776 [56:12<27:25,  4.52it/s]                                                     42%|████▏     | 5341/12776 [56:12<27:25,  4.52it/s] 42%|████▏     | 5342/12776 [56:12<30:21,  4.08it/s]                                                     42%|████▏     | 5342/12776 [56:12<30:21,  4.08it/s] 42%|████▏     | 5343/12776 [56:12<28:41,  4.32it/s]                                                     42%|████▏     | 5343/12776 [56:12<28:41,  4.32it/s] 42%|████▏     | 5344/12776 [56:12<27:24,  4.52it/s]                                                     42%|████▏     | 5344/12776 [56:12<27:24,  4.52it/s] 42%|████▏     | 5345/12776 [56:12<26:32,  4.67it/s]                                                     42%|████▏     | 5345/12776 [56:12<26:32,  4.67it/s] 42%|████▏     | 5346/12776 [56:13<25:45,  4.81it/s]                                                     42%|████▏     | 5346/12776 [56:13<25:45,  4.81it/s] 42%|████▏     | 5347/12776 [56:13<27:12,  4.55it/s]                                                     42%|████▏     | 5347/12776 [56:13<27:12,  4.55it/s] 42%|████▏     | 5348/12776 [56:13<25:54,  4.78it/s]                                                     42%|████▏     | 5348/12776 [56:13<25:54,  4.78it/s] 42%|████▏     | 5349/12776 [56:13<24:57,  4.96it/s]                                                     42%|████▏     | 5349/12776 [56:13<24:57,  4.96it/s] 42%|████▏     | 5350/12776 [56:14<50:26,  2.45it/s]                                                     42%|████▏     | 5350/12776 [56:14<50:26,  2.45it/s] 42%|████▏     | 5351/12776 [56:16<1:33:25,  1.32it/s]                                                       42%|████▏     | 5351/12776 [56:16<1:33:25,  1.32it/s] 42%|████▏     | 5352/12776 [56:17<1:45:27,  1.17it/s]                                                       42%|████▏     | 5352/12776 [56:17<1:45:27,  1.17it/s] 42%|████▏     | 5353/12776 [56:18<1:45:57,  1.17it/s]                                                       42%|████▏     | 5353/12776 [56:18<1:45:57,  1.17it/s] 42%|████▏     | 5354/12776 [56:18<1:44:05,  1.19it/s]                                                       42%|████▏     | 5354/12776 [56:18<1:44:05,  1.19it/s] 42%|████▏     | 5355/12776 [56:19<1:41:05,  1.22it/s]                                                       42%|████▏     | 5355/12776 [56:19<1:41:05,  1.22it/s] 42%|████▏     | 5356/12776 [56:20<1:38:24,  1.26it/s]                                                       42%|████▏     | 5356/12776 [56:20<1:38:24,  1.26it/s] 42%|████▏     | 5357/12776 [56:21<1:33:58,  1.32it/s]                                                       42%|████▏     | 5357/12776 [56:21<1:33:58,  1.32it/s] 42%|████▏     | 5358/12776 [56:21<1:32:33,  1.34it/s]                                                       42%|████▏     | 5358/12776 [56:21<1:32:33,  1.34it/s] 42%|████▏     | 5359/12776 [56:22<1:27:56,  1.41it/s]                                                       42%|████▏     | 5359/12776 [56:22<1:27:56,  1.41it/s] 42%|████▏     | 5360/12776 [56:23<1:23:51,  1.47it/s]                                                       42%|████▏     | 5360/12776 [56:23<1:23:51,  1.47it/s] 42%|████▏     | 5361/12776 [56:23<1:18:42,  1.57it/s]                                                       42%|████▏     | 5361/12776 [56:23<1:18:42,  1.57it/s] 42%|████▏     | 5362/12776 [56:24<1:15:07,  1.64it/s]                                                       42%|████▏     | 5362/12776 [56:24<1:15:07,  1.64it/s] 42%|████▏     | 5363/12776 [56:24<1:10:57,  1.74it/s]                                                       42%|████▏     | 5363/12776 [56:24<1:10:57,  1.74it/s] 42%|████▏     | 5364/12776 [56:25<1:08:42,  1.80it/s]                                                       42%|████▏     | 5364/12776 [56:25<1:08:42,  1.80it/s] 42%|████▏     | 5365/12776 [56:25<1:05:02,  1.90it/s]                                                       42%|████▏     | 5365/12776 [56:25<1:05:02,  1.90it/s] 42%|████▏     | 5366/12776 [56:26<1:04:18,  1.92it/s]                                                       42%|████▏     | 5366/12776 [56:26<1:04:18,  1.92it/s] 42%|████▏     | 5367/12776 [56:26<1:00:33,  2.04it/s]                                                       42%|████▏     | 5367/12776 [56:26<1:00:33,  2.04it/s] 42%|████▏     | 5368/12776 [56:27<57:41,  2.14it/s]                                                       42%|████▏     | 5368/12776 [56:27<57:41,  2.14it/s] 42%|████▏     | 5369/12776 [56:27<57:23,  2.15it/s]                                                     42%|████▏     | 5369/12776 [56:27<57:23,  2.15it/s] 42%|████▏     | 5370/12776 [56:27<53:58,  2.29it/s]                                                     42%|████▏     | 5370/12776 [56:27<53:58,  2.29it/s] 42%|████▏     | 5371/12776 [56:28<50:52,  2.43it/s]                                                     42%|████▏     | 5371/12776 [56:28<50:52,  2.43it/s] 42%|████▏     | 5372/12776 [56:28<51:20,  2.40it/s]                                                     42%|████▏     | 5372/12776 [56:28<51:20,  2.40it/s] 42%|████▏     | 5373/12776 [56:28<48:51,  2.53it/s]                                                     42%|████▏     | 5373/12776 [56:28<48:51,  2.53it/s] 42%|████▏     | 5374/12776 [56:29<46:29,  2.65it/s]                                                     42%|████▏     | 5374/12776 [56:29<46:29,  2.65it/s] 42%|████▏     | 5375/12776 [56:29<47:45,  2.58it/s]                                                     42%|████▏     | 5375/12776 [56:29<47:45,  2.58it/s] 42%|████▏     | 5376/12776 [56:30<44:43,  2.76it/s]                                                     42%|████▏     | 5376/12776 [56:30<44:43,  2.76it/s] 42%|████▏     | 5377/12776 [56:30<42:10,  2.92it/s]                                                     42%|████▏     | 5377/12776 [56:30<42:10,  2.92it/s] 42%|████▏     | 5378/12776 [56:30<42:41,  2.89it/s]                                                     42%|████▏     | 5378/12776 [56:30<42:41,  2.89it/s] 42%|████▏     | 5379/12776 [56:30<39:55,  3.09it/s]                                                     42%|████▏     | 5379/12776 [56:30<39:55,  3.09it/s] 42%|████▏     | 5380/12776 [56:31<37:35,  3.28it/s]                                                     42%|████▏     | 5380/12776 [56:31<37:35,  3.28it/s] 42%|████▏     | 5381/12776 [56:31<35:33,  3.47it/s]                                                     42%|████▏     | 5381/12776 [56:31<35:33,  3.47it/s] 42%|████▏     | 5382/12776 [56:31<37:24,  3.29it/s]                                                     42%|████▏     | 5382/12776 [56:31<37:24,  3.29it/s] 42%|████▏     | 5383/12776 [56:32<34:56,  3.53it/s]                                                     42%|████▏     | 5383/12776 [56:32<34:56,  3.53it/s] 42%|████▏     | 5384/12776 [56:32<32:55,  3.74it/s]                                                     42%|████▏     | 5384/12776 [56:32<32:55,  3.74it/s] 42%|████▏     | 5385/12776 [56:32<31:17,  3.94it/s]                                                     42%|████▏     | 5385/12776 [56:32<31:17,  3.94it/s] 42%|████▏     | 5386/12776 [56:32<29:55,  4.12it/s]                                                    {'loss': 0.6551, 'grad_norm': 1.288681983947754, 'learning_rate': 0.00018274682306940372, 'epoch': 0.83}
+{'loss': 0.3613, 'grad_norm': 1.0131791830062866, 'learning_rate': 0.00018272238514173997, 'epoch': 0.83}
+{'loss': 0.5802, 'grad_norm': 2.0954971313476562, 'learning_rate': 0.00018269794721407625, 'epoch': 0.83}
+{'loss': 0.4414, 'grad_norm': 0.8063324093818665, 'learning_rate': 0.0001826735092864125, 'epoch': 0.83}
+{'loss': 0.485, 'grad_norm': 1.2960774898529053, 'learning_rate': 0.00018264907135874875, 'epoch': 0.83}
+{'loss': 0.5702, 'grad_norm': 0.9982731342315674, 'learning_rate': 0.00018262463343108503, 'epoch': 0.83}
+{'loss': 0.5258, 'grad_norm': 2.1627659797668457, 'learning_rate': 0.0001826001955034213, 'epoch': 0.83}
+{'loss': 0.6247, 'grad_norm': 1.2846755981445312, 'learning_rate': 0.00018257575757575756, 'epoch': 0.83}
+{'loss': 0.5156, 'grad_norm': 1.6041959524154663, 'learning_rate': 0.00018255131964809384, 'epoch': 0.83}
+{'loss': 0.3831, 'grad_norm': 0.9407545328140259, 'learning_rate': 0.00018252688172043011, 'epoch': 0.83}
+{'loss': 0.9489, 'grad_norm': 1.5402854681015015, 'learning_rate': 0.00018250244379276634, 'epoch': 0.83}
+{'loss': 0.5643, 'grad_norm': 0.9646233916282654, 'learning_rate': 0.00018247800586510262, 'epoch': 0.83}
+{'loss': 0.5555, 'grad_norm': 1.2722687721252441, 'learning_rate': 0.0001824535679374389, 'epoch': 0.83}
+{'loss': 0.904, 'grad_norm': 2.5088601112365723, 'learning_rate': 0.00018242913000977515, 'epoch': 0.83}
+{'loss': 0.7769, 'grad_norm': 1.9677549600601196, 'learning_rate': 0.00018240469208211142, 'epoch': 0.83}
+{'loss': 0.8852, 'grad_norm': 2.1917471885681152, 'learning_rate': 0.0001823802541544477, 'epoch': 0.83}
+{'loss': 0.8834, 'grad_norm': 2.5052521228790283, 'learning_rate': 0.00018235581622678395, 'epoch': 0.83}
+{'loss': 0.8438, 'grad_norm': 1.5879862308502197, 'learning_rate': 0.00018233137829912023, 'epoch': 0.83}
+{'loss': 0.5548, 'grad_norm': 2.1139187812805176, 'learning_rate': 0.0001823069403714565, 'epoch': 0.83}
+{'loss': 0.9604, 'grad_norm': 1.9790080785751343, 'learning_rate': 0.00018228250244379274, 'epoch': 0.83}
+{'loss': 0.7413, 'grad_norm': 2.8384530544281006, 'learning_rate': 0.00018225806451612901, 'epoch': 0.83}
+{'loss': 0.5664, 'grad_norm': 2.2223446369171143, 'learning_rate': 0.0001822336265884653, 'epoch': 0.83}
+{'loss': 0.6546, 'grad_norm': 2.0191867351531982, 'learning_rate': 0.00018220918866080154, 'epoch': 0.83}
+{'loss': 0.7337, 'grad_norm': 2.5596179962158203, 'learning_rate': 0.00018218475073313782, 'epoch': 0.83}
+{'loss': 0.6803, 'grad_norm': 1.8105648756027222, 'learning_rate': 0.0001821603128054741, 'epoch': 0.83}
+{'loss': 0.8126, 'grad_norm': 1.6906538009643555, 'learning_rate': 0.00018213587487781035, 'epoch': 0.84}
+{'loss': 0.7731, 'grad_norm': 1.9950919151306152, 'learning_rate': 0.00018211143695014663, 'epoch': 0.84}
+{'loss': 0.7505, 'grad_norm': 1.5140913724899292, 'learning_rate': 0.00018208699902248288, 'epoch': 0.84}
+{'loss': 1.1235, 'grad_norm': 4.230485439300537, 'learning_rate': 0.00018206256109481913, 'epoch': 0.84}
+{'loss': 1.1191, 'grad_norm': 2.215449094772339, 'learning_rate': 0.0001820381231671554, 'epoch': 0.84}
+{'loss': 0.9407, 'grad_norm': 2.6537933349609375, 'learning_rate': 0.0001820136852394917, 'epoch': 0.84}
+{'loss': 0.6755, 'grad_norm': 1.676081895828247, 'learning_rate': 0.00018198924731182794, 'epoch': 0.84}
+{'loss': 0.9189, 'grad_norm': 3.5831100940704346, 'learning_rate': 0.00018196480938416422, 'epoch': 0.84}
+{'loss': 0.8016, 'grad_norm': 2.116856575012207, 'learning_rate': 0.0001819403714565005, 'epoch': 0.84}
+{'loss': 1.1484, 'grad_norm': 4.248983383178711, 'learning_rate': 0.00018191593352883672, 'epoch': 0.84}
+{'loss': 1.1032, 'grad_norm': 2.514638900756836, 'learning_rate': 0.000181891495601173, 'epoch': 0.84}
+{'loss': 0.8824, 'grad_norm': 2.494845390319824, 'learning_rate': 0.00018186705767350928, 'epoch': 0.84}
+{'loss': 0.596, 'grad_norm': 1.30543053150177, 'learning_rate': 0.00018184261974584553, 'epoch': 0.84}
+{'loss': 1.4802, 'grad_norm': 2.425416946411133, 'learning_rate': 0.0001818181818181818, 'epoch': 0.84}
+{'loss': 0.4148, 'grad_norm': 1.7893593311309814, 'learning_rate': 0.00018179374389051809, 'epoch': 0.84}
+{'loss': 0.7683, 'grad_norm': 3.8236687183380127, 'learning_rate': 0.00018176930596285434, 'epoch': 0.84}
+{'loss': 0.886, 'grad_norm': 1.503875494003296, 'learning_rate': 0.00018174486803519062, 'epoch': 0.84}
+{'loss': 0.2861, 'grad_norm': 0.4786723256111145, 'learning_rate': 0.0001817204301075269, 'epoch': 0.84}
+{'loss': 0.4324, 'grad_norm': 0.8675291538238525, 'learning_rate': 0.00018169599217986312, 'epoch': 0.84}
+{'loss': 0.3441, 'grad_norm': 0.8914163708686829, 'learning_rate': 0.0001816715542521994, 'epoch': 0.84}
+{'loss': 0.3934, 'grad_norm': 0.8245097994804382, 'learning_rate': 0.00018164711632453567, 'epoch': 0.84}
+{'loss': 0.2701, 'grad_norm': 0.8394196033477783, 'learning_rate': 0.00018162267839687193, 'epoch': 0.84}
+{'loss': 0.4637, 'grad_norm': 1.792810320854187, 'learning_rate': 0.0001815982404692082, 'epoch': 0.84}
+{'loss': 0.2882, 'grad_norm': 0.979446291923523, 'learning_rate': 0.00018157380254154448, 'epoch': 0.84}
+{'loss': 0.3115, 'grad_norm': 0.9786679148674011, 'learning_rate': 0.00018154936461388073, 'epoch': 0.84}
+{'loss': 0.3731, 'grad_norm': 0.618453860282898, 'learning_rate': 0.00018152492668621698, 'epoch': 0.84}
+{'loss': 0.3479, 'grad_norm': 0.9989930391311646, 'learning_rate': 0.00018150048875855326, 'epoch': 0.84}
+{'loss': 0.3715, 'grad_norm': 0.7567367553710938, 'learning_rate': 0.00018147605083088951, 'epoch': 0.84}
+{'loss': 0.4816, 'grad_norm': 1.233569622039795, 'learning_rate': 0.0001814516129032258, 'epoch': 0.84}
+{'loss': 0.6499, 'grad_norm': 1.9397759437561035, 'learning_rate': 0.00018142717497556207, 'epoch': 0.84}
+{'loss': 0.3907, 'grad_norm': 0.8263518810272217, 'learning_rate': 0.00018140273704789832, 'epoch': 0.84}
+{'loss': 0.3535, 'grad_norm': 0.7921581864356995, 'learning_rate': 0.0001813782991202346, 'epoch': 0.84}
+{'loss': 0.5176, 'grad_norm': 1.2115155458450317, 'learning_rate': 0.00018135386119257088, 'epoch': 0.84}
+{'loss': 0.6016, 'grad_norm': 1.769970417022705, 'learning_rate': 0.0001813294232649071, 'epoch': 0.84}
+{'loss': 0.4667, 'grad_norm': 0.8580262660980225, 'learning_rate': 0.00018130498533724338, 'epoch': 0.84}
+{'loss': 0.4062, 'grad_norm': 0.9332250356674194, 'learning_rate': 0.00018128054740957966, 'epoch': 0.84}
+{'loss': 0.5315, 'grad_norm': 2.4739444255828857, 'learning_rate': 0.0001812561094819159, 'epoch': 0.84}
+{'loss': 0.4582, 'grad_norm': 1.3356553316116333, 'learning_rate': 0.0001812316715542522, 'epoch': 0.84}
+{'loss': 0.7177, 'grad_norm': 1.6975020170211792, 'learning_rate': 0.00018120723362658844, 'epoch': 0.84}
+{'loss': 0.6598, 'grad_norm': 1.566121220588684, 'learning_rate': 0.00018118279569892472, 'epoch': 0.84}
+{'loss': 1.0747, 'grad_norm': 1.5858402252197266, 'learning_rate': 0.000181158357771261, 'epoch': 0.84}
+{'loss': 0.4249, 'grad_norm': 0.9164276719093323, 'learning_rate': 0.00018113391984359722, 'epoch': 0.84}
+{'loss': 0.4706, 'grad_norm': 1.1517704725265503, 'learning_rate': 0.0001811094819159335, 'epoch': 0.84}
+{'loss': 0.6945, 'grad_norm': 1.679603099822998, 'learning_rate': 0.00018108504398826978, 'epoch': 0.84}
+{'loss': 0.8951, 'grad_norm': 1.76631760597229, 'learning_rate': 0.00018106060606060603, 'epoch': 0.84}
+{'loss': 0.4552, 'grad_norm': 1.7927947044372559, 'learning_rate': 0.0001810361681329423, 'epoch': 0.84}
+{'loss': 0.6823, 'grad_norm': 2.7323668003082275, 'learning_rate': 0.00018101173020527859, 'epoch': 0.84}
+{'loss': 0.9579, 'grad_norm': 1.3663641214370728, 'learning_rate': 0.00018098729227761484, 'epoch': 0.84}
+{'loss': 0.8778, 'grad_norm': 1.6697648763656616, 'learning_rate': 0.00018096285434995112, 'epoch': 0.84}
+{'loss': 0.7617, 'grad_norm': 1.3440797328948975, 'learning_rate': 0.00018093841642228737, 'epoch': 0.84}
+{'loss': 0.6647, 'grad_norm': 2.0381476879119873, 'learning_rate': 0.00018091397849462362, 'epoch': 0.84}
+{'loss': 0.8858, 'grad_norm': 2.4340994358062744, 'learning_rate': 0.0001808895405669599, 'epoch': 0.84}
+ 42%|████▏     | 5386/12776 [56:32<29:55,  4.12it/s] 42%|████▏     | 5387/12776 [56:33<33:33,  3.67it/s]                                                     42%|████▏     | 5387/12776 [56:33<33:33,  3.67it/s] 42%|████▏     | 5388/12776 [56:33<31:11,  3.95it/s]                                                     42%|████▏     | 5388/12776 [56:33<31:11,  3.95it/s] 42%|████▏     | 5389/12776 [56:33<29:38,  4.15it/s]                                                     42%|████▏     | 5389/12776 [56:33<29:38,  4.15it/s] 42%|████▏     | 5390/12776 [56:33<28:18,  4.35it/s]                                                     42%|████▏     | 5390/12776 [56:33<28:18,  4.35it/s] 42%|████▏     | 5391/12776 [56:33<27:17,  4.51it/s]                                                     42%|████▏     | 5391/12776 [56:33<27:17,  4.51it/s] 42%|████▏     | 5392/12776 [56:34<30:48,  3.99it/s]                                                     42%|████▏     | 5392/12776 [56:34<30:48,  3.99it/s] 42%|████▏     | 5393/12776 [56:34<28:52,  4.26it/s]                                                     42%|████▏     | 5393/12776 [56:34<28:52,  4.26it/s] 42%|████▏     | 5394/12776 [56:34<27:26,  4.48it/s]                                                     42%|████▏     | 5394/12776 [56:34<27:26,  4.48it/s] 42%|████▏     | 5395/12776 [56:34<26:23,  4.66it/s]                                                     42%|████▏     | 5395/12776 [56:34<26:23,  4.66it/s] 42%|████▏     | 5396/12776 [56:34<25:31,  4.82it/s]                                                     42%|████▏     | 5396/12776 [56:34<25:31,  4.82it/s] 42%|████▏     | 5397/12776 [56:35<27:23,  4.49it/s]                                                     42%|████▏     | 5397/12776 [56:35<27:23,  4.49it/s] 42%|████▏     | 5398/12776 [56:35<25:59,  4.73it/s]                                                     42%|████▏     | 5398/12776 [56:35<25:59,  4.73it/s] 42%|████▏     | 5399/12776 [56:35<24:57,  4.93it/s]                                                     42%|████▏     | 5399/12776 [56:35<24:57,  4.93it/s] 42%|████▏     | 5400/12776 [56:36<50:21,  2.44it/s]                                                     42%|████▏     | 5400/12776 [56:36<50:21,  2.44it/s] 42%|████▏     | 5401/12776 [56:38<1:41:06,  1.22it/s]                                                       42%|████▏     | 5401/12776 [56:38<1:41:06,  1.22it/s] 42%|████▏     | 5402/12776 [56:39<1:48:23,  1.13it/s]                                                       42%|████▏     | 5402/12776 [56:39<1:48:23,  1.13it/s] 42%|████▏     | 5403/12776 [56:40<1:50:14,  1.11it/s]                                                       42%|████▏     | 5403/12776 [56:40<1:50:14,  1.11it/s] 42%|████▏     | 5404/12776 [56:40<1:46:10,  1.16it/s]                                                       42%|████▏     | 5404/12776 [56:41<1:46:10,  1.16it/s] 42%|████▏     | 5405/12776 [56:41<1:41:53,  1.21it/s]                                                       42%|████▏     | 5405/12776 [56:41<1:41:53,  1.21it/s] 42%|████▏     | 5406/12776 [56:42<1:39:39,  1.23it/s]                                                       42%|████▏     | 5406/12776 [56:42<1:39:39,  1.23it/s] 42%|████▏     | 5407/12776 [56:43<1:37:27,  1.26it/s]                                                       42%|████▏     | 5407/12776 [56:43<1:37:27,  1.26it/s] 42%|████▏     | 5408/12776 [56:43<1:31:57,  1.34it/s]                                                       42%|████▏     | 5408/12776 [56:43<1:31:57,  1.34it/s] 42%|████▏     | 5409/12776 [56:44<1:33:45,  1.31it/s]                                                       42%|████▏     | 5409/12776 [56:44<1:33:45,  1.31it/s] 42%|████▏     | 5410/12776 [56:45<1:27:09,  1.41it/s]                                                       42%|████▏     | 5410/12776 [56:45<1:27:09,  1.41it/s] 42%|████▏     | 5411/12776 [56:45<1:24:43,  1.45it/s]                                                       42%|████▏     | 5411/12776 [56:45<1:24:43,  1.45it/s] 42%|████▏     | 5412/12776 [56:46<1:19:20,  1.55it/s]                                                       42%|████▏     | 5412/12776 [56:46<1:19:20,  1.55it/s] 42%|████▏     | 5413/12776 [56:47<1:18:03,  1.57it/s]                                                       42%|████▏     | 5413/12776 [56:47<1:18:03,  1.57it/s] 42%|████▏     | 5414/12776 [56:47<1:12:54,  1.68it/s]                                                       42%|████▏     | 5414/12776 [56:47<1:12:54,  1.68it/s] 42%|████▏     | 5415/12776 [56:48<1:09:34,  1.76it/s]                                                       42%|████▏     | 5415/12776 [56:48<1:09:34,  1.76it/s] 42%|████▏     | 5416/12776 [56:48<1:04:55,  1.89it/s]                                                       42%|████▏     | 5416/12776 [56:48<1:04:55,  1.89it/s] 42%|████▏     | 5417/12776 [56:49<1:03:43,  1.92it/s]                                                       42%|████▏     | 5417/12776 [56:49<1:03:43,  1.92it/s] 42%|████▏     | 5418/12776 [56:49<59:42,  2.05it/s]                                                       42%|████▏     | 5418/12776 [56:49<59:42,  2.05it/s] 42%|████▏     | 5419/12776 [56:49<56:35,  2.17it/s]                                                     42%|████▏     | 5419/12776 [56:49<56:35,  2.17it/s] 42%|████▏     | 5420/12776 [56:50<57:15,  2.14it/s]                                                     42%|████▏     | 5420/12776 [56:50<57:15,  2.14it/s] 42%|████▏     | 5421/12776 [56:50<53:51,  2.28it/s]                                                     42%|████▏     | 5421/12776 [56:50<53:51,  2.28it/s] 42%|████▏     | 5422/12776 [56:51<50:40,  2.42it/s]                                                     42%|████▏     | 5422/12776 [56:51<50:40,  2.42it/s] 42%|████▏     | 5423/12776 [56:51<51:03,  2.40it/s]                                                     42%|████▏     | 5423/12776 [56:51<51:03,  2.40it/s] 42%|████▏     | 5424/12776 [56:51<47:58,  2.55it/s]                                                     42%|████▏     | 5424/12776 [56:51<47:58,  2.55it/s] 42%|████▏     | 5425/12776 [56:52<45:37,  2.69it/s]                                                     42%|████▏     | 5425/12776 [56:52<45:37,  2.69it/s] 42%|████▏     | 5426/12776 [56:52<44:53,  2.73it/s]                                                     42%|████▏     | 5426/12776 [56:52<44:53,  2.73it/s] 42%|████▏     | 5427/12776 [56:52<42:40,  2.87it/s]                                                     42%|████▏     | 5427/12776 [56:52<42:40,  2.87it/s] 42%|████▏     | 5428/12776 [56:53<40:31,  3.02it/s]                                                     42%|████▏     | 5428/12776 [56:53<40:31,  3.02it/s] 42%|████▏     | 5429/12776 [56:53<38:33,  3.18it/s]                                                     42%|████▏     | 5429/12776 [56:53<38:33,  3.18it/s] 43%|████▎     | 5430/12776 [56:53<40:58,  2.99it/s]                                                     43%|████▎     | 5430/12776 [56:53<40:58,  2.99it/s] 43%|███���▎     | 5431/12776 [56:54<38:14,  3.20it/s]                                                     43%|████▎     | 5431/12776 [56:54<38:14,  3.20it/s] 43%|████▎     | 5432/12776 [56:54<36:02,  3.40it/s]                                                     43%|████▎     | 5432/12776 [56:54<36:02,  3.40it/s] 43%|████▎     | 5433/12776 [56:54<34:19,  3.57it/s]                                                     43%|████▎     | 5433/12776 [56:54<34:19,  3.57it/s] 43%|████▎     | 5434/12776 [56:54<37:55,  3.23it/s]                                                     43%|████▎     | 5434/12776 [56:54<37:55,  3.23it/s] 43%|████▎     | 5435/12776 [56:55<35:11,  3.48it/s]                                                     43%|████▎     | 5435/12776 [56:55<35:11,  3.48it/s] 43%|████▎     | 5436/12776 [56:55<33:04,  3.70it/s]                                                     43%|████▎     | 5436/12776 [56:55<33:04,  3.70it/s] 43%|████▎     | 5437/12776 [56:55<31:21,  3.90it/s]                                                     43%|████▎     | 5437/12776 [56:55<31:21,  3.90it/s] 43%|████▎     | 5438/12776 [56:55<33:55,  3.61it/s]                                                     43%|████▎     | 5438/12776 [56:55<33:55,  3.61it/s] 43%|████▎     | 5439/12776 [56:56<31:36,  3.87it/s]                                                     43%|████▎     | 5439/12776 [56:56<31:36,  3.87it/s] 43%|████▎     | 5440/12776 [56:56<29:53,  4.09it/s]                                                     43%|████▎     | 5440/12776 [56:56<29:53,  4.09it/s] 43%|████▎     | 5441/12776 [56:56<28:41,  4.26it/s]                                                     43%|████▎     | 5441/12776 [56:56<28:41,  4.26it/s] 43%|████▎     | 5442/12776 [56:56<27:39,  4.42it/s]                                                     43%|████▎     | 5442/12776 [56:56<27:39,  4.42it/s] 43%|████▎     | 5443/12776 [56:57<30:31,  4.00it/s]                                                     43%|████▎     | 5443/12776 [56:57<30:31,  4.00it/s] 43%|████▎     | 5444/12776 [56:57<28:43,  4.25it/s]                                                     43%|████▎     | 5444/12776 [56:57<28:43,  4.25it/s] 43%|████▎     | 5445/12776 [56:57<27:24,  4.46it/s]                                                     43%|████▎     | 5445/12776 [56:57<27:24,  4.46it/s] 43%|████▎     | 5446/12776 [56:57<26:20,  4.64it/s]                                                     43%|████▎     | 5446/12776 [56:57<26:20,  4.64it/s] 43%|████▎     | 5447/12776 [56:57<25:31,  4.79it/s]                                                     43%|████▎     | 5447/12776 [56:57<25:31,  4.79it/s] 43%|████▎     | 5448/12776 [56:58<24:39,  4.95it/s]                                                     43%|████▎     | 5448/12776 [56:58<24:39,  4.95it/s] 43%|████▎     | 5449/12776 [56:58<26:58,  4.53it/s]                                                     43%|████▎     | 5449/12776 [56:58<26:58,  4.53it/s] 43%|████▎     | 5450/12776 [56:59<47:02,  2.60it/s]                                                     43%|████▎     | 5450/12776 [56:59<47:02,  2.60it/s] 43%|████▎     | 5451/12776 [57:00<1:30:39,  1.35it/s]                                                       43%|████▎     | 5451/12776 [57:00<1:30:39,  1.35it/s] 43%|████▎     | 5452/12776 [57:01<1:40:38,  1.21it/s]                                                       43%|████▎     | 5452/12776 [57:01<1:40:38,  1.21it/s] 43%|████▎     | 5453/12776 [57:02<1:42:35,  1.19it/s]                                                       43%|████▎     | 5453/12776 [57:02<1:42:35,  1.19it/s] 43%|████▎     | 5454/12776 [57:03<1:42:15,  1.19it/s]                                                       43%|████▎     | 5454/12776 [57:03<1:42:15,  1.19it/s] 43%|████▎     | 5455/12776 [57:04<1:45:33,  1.16it/s]                                                       43%|████▎     | 5455/12776 [57:04<1:45:33,  1.16it/s] 43%|████▎     | 5456/12776 [57:05<1:44:15,  1.17it/s]                                                       43%|████▎     | 5456/12776 [57:05<1:44:15,  1.17it/s] 43%|████▎     | 5457/12776 [57:05<1:37:24,  1.25it/s]                                                       43%|████▎     | 5457/12776 [57:05<1:37:24,  1.25it/s] 43%|████▎     | 5458/12776 [57:06<1:35:41,  1.27it/s]                                                       43%|████▎     | 5458/12776 [57:06<1:35:41,  1.27it/s] 43%|████▎     | 5459/12776 [57:07<1:29:21,  1.36it/s]                                                       43%|████▎     | 5459/12776 [57:07<1:29:21,  1.36it/s] 43%|████▎     | 5460/12776 [57:07<1:24:57,  1.44it/s]                                                       43%|████▎     | 5460/12776 [57:07<1:24:57,  1.44it/s] 43%|████▎     | 5461/12776 [57:08<1:19:34,  1.53it/s]                                                       43%|████▎     | 5461/12776 [57:08<1:19:34,  1.53it/s] 43%|████▎     | 5462/12776 [57:08<1:18:09,  1.56it/s]                                                       43%|████▎     | 5462/12776 [57:08<1:18:09,  1.56it/s] 43%|████▎     | 5463/12776 [57:09<1:12:40,  1.68it/s]                                                      {'loss': 1.9136, 'grad_norm': 3.9241204261779785, 'learning_rate': 0.00018086510263929618, 'epoch': 0.84}
+{'loss': 1.0138, 'grad_norm': 1.6787124872207642, 'learning_rate': 0.00018084066471163243, 'epoch': 0.84}
+{'loss': 1.1926, 'grad_norm': 2.1270811557769775, 'learning_rate': 0.0001808162267839687, 'epoch': 0.84}
+{'loss': 0.8631, 'grad_norm': 2.4386167526245117, 'learning_rate': 0.00018079178885630498, 'epoch': 0.84}
+{'loss': 1.6441, 'grad_norm': 2.620981216430664, 'learning_rate': 0.0001807673509286412, 'epoch': 0.84}
+{'loss': 1.1166, 'grad_norm': 2.2242493629455566, 'learning_rate': 0.00018074291300097749, 'epoch': 0.84}
+{'loss': 1.421, 'grad_norm': 2.703012466430664, 'learning_rate': 0.00018071847507331376, 'epoch': 0.84}
+{'loss': 1.2213, 'grad_norm': 2.006281852722168, 'learning_rate': 0.00018069403714565002, 'epoch': 0.84}
+{'loss': 0.8721, 'grad_norm': 2.2452259063720703, 'learning_rate': 0.0001806695992179863, 'epoch': 0.84}
+{'loss': 0.7445, 'grad_norm': 2.2399497032165527, 'learning_rate': 0.00018064516129032257, 'epoch': 0.84}
+{'loss': 0.4617, 'grad_norm': 1.1072652339935303, 'learning_rate': 0.00018062072336265882, 'epoch': 0.84}
+{'loss': 1.5535, 'grad_norm': 4.509149074554443, 'learning_rate': 0.0001805962854349951, 'epoch': 0.84}
+{'loss': 0.9141, 'grad_norm': 4.23785924911499, 'learning_rate': 0.00018057184750733138, 'epoch': 0.85}
+{'loss': 1.5931, 'grad_norm': 2.642498254776001, 'learning_rate': 0.0001805474095796676, 'epoch': 0.85}
+{'loss': 0.9882, 'grad_norm': 2.1803855895996094, 'learning_rate': 0.00018052297165200388, 'epoch': 0.85}
+{'loss': 0.2871, 'grad_norm': 0.598574161529541, 'learning_rate': 0.00018049853372434016, 'epoch': 0.85}
+{'loss': 0.2676, 'grad_norm': 0.48999595642089844, 'learning_rate': 0.0001804740957966764, 'epoch': 0.85}
+{'loss': 0.2758, 'grad_norm': 0.6198676228523254, 'learning_rate': 0.0001804496578690127, 'epoch': 0.85}
+{'loss': 0.3911, 'grad_norm': 1.454820990562439, 'learning_rate': 0.00018042521994134897, 'epoch': 0.85}
+{'loss': 0.2041, 'grad_norm': 0.7595102787017822, 'learning_rate': 0.00018040078201368522, 'epoch': 0.85}
+{'loss': 0.3308, 'grad_norm': 0.6085957288742065, 'learning_rate': 0.0001803763440860215, 'epoch': 0.85}
+{'loss': 0.2946, 'grad_norm': 0.7562193274497986, 'learning_rate': 0.00018035190615835775, 'epoch': 0.85}
+{'loss': 0.2354, 'grad_norm': 0.44967907667160034, 'learning_rate': 0.000180327468230694, 'epoch': 0.85}
+{'loss': 0.5657, 'grad_norm': 0.9210385680198669, 'learning_rate': 0.00018030303030303028, 'epoch': 0.85}
+{'loss': 0.3728, 'grad_norm': 0.6817724704742432, 'learning_rate': 0.00018027859237536656, 'epoch': 0.85}
+{'loss': 0.322, 'grad_norm': 0.7915209531784058, 'learning_rate': 0.0001802541544477028, 'epoch': 0.85}
+{'loss': 0.3134, 'grad_norm': 1.467758297920227, 'learning_rate': 0.0001802297165200391, 'epoch': 0.85}
+{'loss': 0.4713, 'grad_norm': 1.2615087032318115, 'learning_rate': 0.00018020527859237537, 'epoch': 0.85}
+{'loss': 0.3469, 'grad_norm': 0.9758779406547546, 'learning_rate': 0.0001801808406647116, 'epoch': 0.85}
+{'loss': 0.3852, 'grad_norm': 1.8404748439788818, 'learning_rate': 0.00018015640273704787, 'epoch': 0.85}
+{'loss': 0.9143, 'grad_norm': 1.1955705881118774, 'learning_rate': 0.00018013196480938415, 'epoch': 0.85}
+{'loss': 0.6122, 'grad_norm': 1.832085132598877, 'learning_rate': 0.0001801075268817204, 'epoch': 0.85}
+{'loss': 0.6208, 'grad_norm': 1.7228481769561768, 'learning_rate': 0.00018008308895405668, 'epoch': 0.85}
+{'loss': 0.8098, 'grad_norm': 1.7654926776885986, 'learning_rate': 0.00018005865102639295, 'epoch': 0.85}
+{'loss': 0.4749, 'grad_norm': 0.9912778735160828, 'learning_rate': 0.0001800342130987292, 'epoch': 0.85}
+{'loss': 0.456, 'grad_norm': 0.9740948677062988, 'learning_rate': 0.00018000977517106548, 'epoch': 0.85}
+{'loss': 0.6931, 'grad_norm': 1.4612542390823364, 'learning_rate': 0.00017998533724340176, 'epoch': 0.85}
+{'loss': 0.4385, 'grad_norm': 1.1655532121658325, 'learning_rate': 0.00017996089931573799, 'epoch': 0.85}
+{'loss': 0.7765, 'grad_norm': 1.642591953277588, 'learning_rate': 0.00017993646138807426, 'epoch': 0.85}
+{'loss': 0.8041, 'grad_norm': 4.441413879394531, 'learning_rate': 0.00017991202346041054, 'epoch': 0.85}
+{'loss': 1.3471, 'grad_norm': 4.761545181274414, 'learning_rate': 0.0001798875855327468, 'epoch': 0.85}
+{'loss': 0.9073, 'grad_norm': 1.85892915725708, 'learning_rate': 0.00017986314760508307, 'epoch': 0.85}
+{'loss': 0.5756, 'grad_norm': 1.2642240524291992, 'learning_rate': 0.00017983870967741935, 'epoch': 0.85}
+{'loss': 0.622, 'grad_norm': 1.6355515718460083, 'learning_rate': 0.0001798142717497556, 'epoch': 0.85}
+{'loss': 0.9791, 'grad_norm': 2.2661969661712646, 'learning_rate': 0.00017978983382209185, 'epoch': 0.85}
+{'loss': 0.9954, 'grad_norm': 2.119992256164551, 'learning_rate': 0.00017976539589442813, 'epoch': 0.85}
+{'loss': 0.7139, 'grad_norm': 1.942277431488037, 'learning_rate': 0.00017974095796676438, 'epoch': 0.85}
+{'loss': 1.0877, 'grad_norm': 2.0111358165740967, 'learning_rate': 0.00017971652003910066, 'epoch': 0.85}
+{'loss': 1.0568, 'grad_norm': 20.57202911376953, 'learning_rate': 0.00017969208211143694, 'epoch': 0.85}
+{'loss': 1.0449, 'grad_norm': 6.058287143707275, 'learning_rate': 0.0001796676441837732, 'epoch': 0.85}
+{'loss': 0.7661, 'grad_norm': 2.6762993335723877, 'learning_rate': 0.00017964320625610947, 'epoch': 0.85}
+{'loss': 0.7657, 'grad_norm': 3.0614171028137207, 'learning_rate': 0.00017961876832844575, 'epoch': 0.85}
+{'loss': 1.1165, 'grad_norm': 2.681861162185669, 'learning_rate': 0.00017959433040078197, 'epoch': 0.85}
+{'loss': 1.3606, 'grad_norm': 3.7926366329193115, 'learning_rate': 0.00017956989247311825, 'epoch': 0.85}
+{'loss': 1.2335, 'grad_norm': 2.6977744102478027, 'learning_rate': 0.00017954545454545453, 'epoch': 0.85}
+{'loss': 0.9316, 'grad_norm': 1.4805278778076172, 'learning_rate': 0.00017952101661779078, 'epoch': 0.85}
+{'loss': 1.1716, 'grad_norm': 2.5783205032348633, 'learning_rate': 0.00017949657869012706, 'epoch': 0.85}
+{'loss': 1.5495, 'grad_norm': 2.976900100708008, 'learning_rate': 0.00017947214076246334, 'epoch': 0.85}
+{'loss': 1.321, 'grad_norm': 3.093869924545288, 'learning_rate': 0.0001794477028347996, 'epoch': 0.85}
+{'loss': 0.7588, 'grad_norm': 1.297680139541626, 'learning_rate': 0.00017942326490713587, 'epoch': 0.85}
+{'loss': 1.2392, 'grad_norm': 3.9038898944854736, 'learning_rate': 0.00017939882697947214, 'epoch': 0.85}
+{'loss': 0.3811, 'grad_norm': 1.2751731872558594, 'learning_rate': 0.00017937438905180837, 'epoch': 0.85}
+{'loss': 0.382, 'grad_norm': 1.0307965278625488, 'learning_rate': 0.00017934995112414465, 'epoch': 0.85}
+{'loss': 0.8235, 'grad_norm': 2.1279537677764893, 'learning_rate': 0.00017932551319648093, 'epoch': 0.85}
+{'loss': 1.0698, 'grad_norm': 2.3851094245910645, 'learning_rate': 0.00017930107526881718, 'epoch': 0.85}
+{'loss': 0.2337, 'grad_norm': 0.3719303011894226, 'learning_rate': 0.00017927663734115345, 'epoch': 0.85}
+{'loss': 0.4041, 'grad_norm': 0.48134690523147583, 'learning_rate': 0.00017925219941348973, 'epoch': 0.85}
+{'loss': 0.2524, 'grad_norm': 0.40173661708831787, 'learning_rate': 0.00017922776148582598, 'epoch': 0.85}
+{'loss': 0.2702, 'grad_norm': 0.5400435328483582, 'learning_rate': 0.00017920332355816224, 'epoch': 0.85}
+{'loss': 0.4001, 'grad_norm': 0.8261874318122864, 'learning_rate': 0.00017917888563049851, 'epoch': 0.85}
+{'loss': 0.5371, 'grad_norm': 0.846582293510437, 'learning_rate': 0.00017915444770283477, 'epoch': 0.85}
+{'loss': 0.5499, 'grad_norm': 0.961149275302887, 'learning_rate': 0.00017913000977517104, 'epoch': 0.85}
+{'loss': 0.4273, 'grad_norm': 0.5829175710678101, 'learning_rate': 0.00017910557184750732, 'epoch': 0.85}
+{'loss': 0.2947, 'grad_norm': 0.8650907874107361, 'learning_rate': 0.00017908113391984357, 'epoch': 0.85}
+{'loss': 0.4431, 'grad_norm': 0.7858892679214478, 'learning_rate': 0.00017905669599217985, 'epoch': 0.85}
+{'loss': 0.4506, 'grad_norm': 0.6087132692337036, 'learning_rate': 0.00017903225806451613, 'epoch': 0.85}
+{'loss': 0.4651, 'grad_norm': 0.840796709060669, 'learning_rate': 0.00017900782013685235, 'epoch': 0.86}
+ 43%|████▎     | 5463/12776 [57:09<1:12:40,  1.68it/s] 43%|████▎     | 5464/12776 [57:09<1:09:34,  1.75it/s]                                                       43%|████▎     | 5464/12776 [57:09<1:09:34,  1.75it/s] 43%|████▎     | 5465/12776 [57:10<1:04:48,  1.88it/s]                                                       43%|████▎     | 5465/12776 [57:10<1:04:48,  1.88it/s] 43%|████▎     | 5466/12776 [57:10<1:05:42,  1.85it/s]                                                       43%|████▎     | 5466/12776 [57:10<1:05:42,  1.85it/s] 43%|████▎     | 5467/12776 [57:11<1:01:05,  1.99it/s]                                                       43%|████▎     | 5467/12776 [57:11<1:01:05,  1.99it/s] 43%|████▎     | 5468/12776 [57:11<57:06,  2.13it/s]                                                       43%|████▎     | 5468/12776 [57:11<57:06,  2.13it/s] 43%|████▎     | 5469/12776 [57:12<58:06,  2.10it/s]                                                     43%|████▎     | 5469/12776 [57:12<58:06,  2.10it/s] 43%|████▎     | 5470/12776 [57:12<53:40,  2.27it/s]                                                     43%|████▎     | 5470/12776 [57:12<53:40,  2.27it/s] 43%|████▎     | 5471/12776 [57:12<50:10,  2.43it/s]                                                     43%|████▎     | 5471/12776 [57:12<50:10,  2.43it/s] 43%|████▎     | 5472/12776 [57:13<50:43,  2.40it/s]                                                     43%|████▎     | 5472/12776 [57:13<50:43,  2.40it/s] 43%|████▎     | 5473/12776 [57:13<47:31,  2.56it/s]                                                     43%|████▎     | 5473/12776 [57:13<47:31,  2.56it/s] 43%|████▎     | 5474/12776 [57:14<44:52,  2.71it/s]                                                     43%|████▎     | 5474/12776 [57:14<44:52,  2.71it/s] 43%|████▎     | 5475/12776 [57:14<43:36,  2.79it/s]                                                     43%|████▎     | 5475/12776 [57:14<43:36,  2.79it/s] 43%|████▎     | 5476/12776 [57:14<41:16,  2.95it/s]                                                     43%|████▎     | 5476/12776 [57:14<41:16,  2.95it/s] 43%|████▎     | 5477/12776 [57:14<39:08,  3.11it/s]                                                     43%|████▎     | 5477/12776 [57:14<39:08,  3.11it/s] 43%|████▎     | 5478/12776 [57:15<37:24,  3.25it/s]                                                     43%|████▎     | 5478/12776 [57:15<37:24,  3.25it/s] 43%|████▎     | 5479/12776 [57:15<36:52,  3.30it/s]                                                     43%|████▎     | 5479/12776 [57:15<36:52,  3.30it/s] 43%|████▎     | 5480/12776 [57:15<34:57,  3.48it/s]                                                     43%|████▎     | 5480/12776 [57:15<34:57,  3.48it/s] 43%|████▎     | 5481/12776 [57:15<33:36,  3.62it/s]                                                     43%|████▎     | 5481/12776 [57:15<33:36,  3.62it/s] 43%|████▎     | 5482/12776 [57:16<32:31,  3.74it/s]                                                     43%|████▎     | 5482/12776 [57:16<32:31,  3.74it/s] 43%|████▎     | 5483/12776 [57:16<31:36,  3.85it/s]                                                     43%|████▎     | 5483/12776 [57:16<31:36,  3.85it/s] 43%|████▎     | 5484/12776 [57:16<32:38,  3.72it/s]                                                     43%|████▎     | 5484/12776 [57:16<32:38,  3.72it/s] 43%|████▎     | 5485/12776 [57:16<31:15,  3.89it/s]                                                     43%|████▎     | 5485/12776 [57:16<31:15,  3.89it/s] 43%|████▎     | 5486/12776 [57:17<30:04,  4.04it/s]                                                     43%|████▎     | 5486/12776 [57:17<30:04,  4.04it/s] 43%|████▎     | 5487/12776 [57:17<29:05,  4.18it/s]                                                     43%|████▎     | 5487/12776 [57:17<29:05,  4.18it/s] 43%|████▎     | 5488/12776 [57:17<32:24,  3.75it/s]                                                     43%|████▎     | 5488/12776 [57:17<32:24,  3.75it/s] 43%|████▎     | 5489/12776 [57:17<30:36,  3.97it/s]                                                     43%|████▎     | 5489/12776 [57:17<30:36,  3.97it/s] 43%|████▎     | 5490/12776 [57:18<29:08,  4.17it/s]                                                     43%|████▎     | 5490/12776 [57:18<29:08,  4.17it/s] 43%|████▎     | 5491/12776 [57:18<28:02,  4.33it/s]                                                     43%|████▎     | 5491/12776 [57:18<28:02,  4.33it/s] 43%|████▎     | 5492/12776 [57:18<27:02,  4.49it/s]                                                     43%|████▎     | 5492/12776 [57:18<27:02,  4.49it/s] 43%|████▎     | 5493/12776 [57:18<28:47,  4.22it/s]                                                     43%|████▎     | 5493/12776 [57:18<28:47,  4.22it/s] 43%|████▎     | 5494/12776 [57:19<27:24,  4.43it/s]                                                     43%|████▎     | 5494/12776 [57:19<27:24,  4.43it/s] 43%|████▎     | 5495/12776 [57:19<26:24,  4.59it/s]                                                     43%|████▎     | 5495/12776 [57:19<26:24,  4.59it/s] 43%|████▎     | 5496/12776 [57:19<25:37,  4.74it/s]                                                     43%|████▎     | 5496/12776 [57:19<25:37,  4.74it/s] 43%|████▎     | 5497/12776 [57:19<24:59,  4.85it/s]                                                     43%|████▎     | 5497/12776 [57:19<24:59,  4.85it/s] 43%|████▎     | 5498/12776 [57:19<24:17,  5.00it/s]                                                     43%|████▎     | 5498/12776 [57:19<24:17,  5.00it/s] 43%|████▎     | 5499/12776 [57:20<26:22,  4.60it/s]                                                     43%|████▎     | 5499/12776 [57:20<26:22,  4.60it/s] 43%|████▎     | 5500/12776 [57:20<45:07,  2.69it/s]                                                     43%|████▎     | 5500/12776 [57:20<45:07,  2.69it/s] 43%|████▎     | 5501/12776 [57:22<1:29:08,  1.36it/s]                                                       43%|████▎     | 5501/12776 [57:22<1:29:08,  1.36it/s] 43%|████▎     | 5502/12776 [57:23<1:39:07,  1.22it/s]                                                       43%|████▎     | 5502/12776 [57:23<1:39:07,  1.22it/s] 43%|████▎     | 5503/12776 [57:24<1:41:08,  1.20it/s]                                                       43%|████▎     | 5503/12776 [57:24<1:41:08,  1.20it/s] 43%|████▎     | 5504/12776 [57:25<1:40:40,  1.20it/s]                                                       43%|████▎     | 5504/12776 [57:25<1:40:40,  1.20it/s] 43%|████▎     | 5505/12776 [57:26<1:43:51,  1.17it/s]                                                       43%|████▎     | 5505/12776 [57:26<1:43:51,  1.17it/s] 43%|████▎     | 5506/12776 [57:26<1:37:39,  1.24it/s]                                                       43%|████▎     | 5506/12776 [57:26<1:37:39,  1.24it/s] 43%|████▎     | 5507/12776 [57:27<1:32:37,  1.31it/s]                                                       43%|████▎     | 5507/12776 [57:27<1:32:37,  1.31it/s] 43%|████▎     | 5508/12776 [57:28<1:31:58,  1.32it/s]                                                       43%|████▎     | 5508/12776 [57:28<1:31:58,  1.32it/s] 43%|████▎     | 5509/12776 [57:28<1:25:56,  1.41it/s]                                                       43%|████▎     | 5509/12776 [57:28<1:25:56,  1.41it/s] 43%|████▎     | 5510/12776 [57:29<1:21:30,  1.49it/s]                                                       43%|████▎     | 5510/12776 [57:29<1:21:30,  1.49it/s] 43%|████▎     | 5511/12776 [57:29<1:16:54,  1.57it/s]                                                       43%|████▎     | 5511/12776 [57:29<1:16:54,  1.57it/s] 43%|████▎     | 5512/12776 [57:30<1:15:34,  1.60it/s]                                                       43%|████▎     | 5512/12776 [57:30<1:15:34,  1.60it/s] 43%|████▎     | 5513/12776 [57:30<1:11:12,  1.70it/s]                                                       43%|████▎     | 5513/12776 [57:30<1:11:12,  1.70it/s] 43%|████▎     | 5514/12776 [57:31<1:07:02,  1.81it/s]                                                       43%|████▎     | 5514/12776 [57:31<1:07:02,  1.81it/s] 43%|████▎     | 5515/12776 [57:31<1:03:15,  1.91it/s]                                                       43%|████▎     | 5515/12776 [57:31<1:03:15,  1.91it/s] 43%|████▎     | 5516/12776 [57:32<1:00:06,  2.01it/s]                                                       43%|████▎     | 5516/12776 [57:32<1:00:06,  2.01it/s] 43%|████▎     | 5517/12776 [57:32<58:47,  2.06it/s]                                                       43%|████▎     | 5517/12776 [57:32<58:47,  2.06it/s] 43%|████▎     | 5518/12776 [57:33<55:53,  2.16it/s]                                                     43%|████▎     | 5518/12776 [57:33<55:53,  2.16it/s] 43%|████▎     | 5519/12776 [57:33<53:22,  2.27it/s]                                                     43%|████▎     | 5519/12776 [57:33<53:22,  2.27it/s] 43%|████▎     | 5520/12776 [57:34<55:23,  2.18it/s]                                                     43%|████▎     | 5520/12776 [57:34<55:23,  2.18it/s] 43%|████▎     | 5521/12776 [57:34<51:43,  2.34it/s]                                                     43%|████▎     | 5521/12776 [57:34<51:43,  2.34it/s] 43%|████▎     | 5522/12776 [57:34<48:48,  2.48it/s]                                                     43%|████▎     | 5522/12776 [57:34<48:48,  2.48it/s] 43%|████▎     | 5523/12776 [57:35<49:41,  2.43it/s]                                                     43%|████▎     | 5523/12776 [57:35<49:41,  2.43it/s] 43%|████▎     | 5524/12776 [57:35<46:50,  2.58it/s]                                                     43%|████▎     | 5524/12776 [57:35<46:50,  2.58it/s] 43%|████▎     | 5525/12776 [57:35<44:30,  2.72it/s]                                                     43%|████▎     | 5525/12776 [57:35<44:30,  2.72it/s] 43%|████▎     | 5526/12776 [57:36<43:41,  2.77it/s]                                                     43%|████▎     | 5526/12776 [57:36<43:41,  2.77it/s] 43%|████▎     | 5527/12776 [57:36<41:23,  2.92it/s]                                                     43%|████▎     | 5527/12776 [57:36<41:23,  2.92it/s] 43%|████▎     | 5528/12776 [57:36<39:28,  3.06it/s]                                                     43%|████▎     | 5528/12776 [57:36<39:28,  3.06it/s] 43%|████▎     | 5529/12776 [57:37<37:45,  3.20it/s]                                                     43%|████▎     | 5529/12776 [57:37<37:45,  3.20it/s] 43%|████▎     | 5530/12776 [57:37<40:54,  2.95it/s]                                                     43%|████▎     | 5530/12776 [57:37<40:54,  2.95it/s] 43%|████▎     | 5531/12776 [57:37<38:17,  3.15it/s]                                                     43%|████▎     | 5531/12776 [57:37<38:17,  3.15it/s] 43%|████▎     | 5532/12776 [57:38<35:56,  3.36it/s]                                                     43%|████▎     | 5532/12776 [57:38<35:56,  3.36it/s] 43%|████▎     | 5533/12776 [57:38<34:14,  3.53it/s]                                                     43%|████▎     | 5533/12776 [57:38<34:14,  3.53it/s] 43%|████▎     | 5534/12776 [57:38<36:00,  3.35it/s]                                                     43%|████▎     | 5534/12776 [57:38<36:00,  3.35it/s] 43%|████▎     | 5535/12776 [57:38<33:46,  3.57it/s]                                                     43%|████▎     | 5535/12776 [57:38<33:46,  3.57it/s] 43%|████▎     | 5536/12776 [57:39<32:01,  3.77it/s]                                                     43%|████▎     | 5536/12776 [57:39<32:01,  3.77it/s] 43%|████▎     | 5537/12776 [57:39<30:34,  3.95it/s]                                                     43%|████▎     | 5537/12776 [57:39<30:34,  3.95it/s] 43%|████▎     | 5538/12776 [57:39<32:34,  3.70it/s]                                                     43%|████▎     | 5538/12776 [57:39<32:34,  3.70it/s] 43%|████▎     | 5539/12776 [57:39<30:32,  3.95it/s]                                                     43%|████▎     | 5539/12776 [57:39<30:32,  3.95it/s] 43%|████▎     | 5540/12776 [57:40<28:59,  4.16it/s]                                                    {'loss': 0.5114, 'grad_norm': 2.8522396087646484, 'learning_rate': 0.00017898338220918863, 'epoch': 0.86}
+{'loss': 0.7611, 'grad_norm': 1.8388530015945435, 'learning_rate': 0.0001789589442815249, 'epoch': 0.86}
+{'loss': 0.4343, 'grad_norm': 1.0927659273147583, 'learning_rate': 0.00017893450635386116, 'epoch': 0.86}
+{'loss': 0.591, 'grad_norm': 1.3488267660140991, 'learning_rate': 0.00017891006842619744, 'epoch': 0.86}
+{'loss': 0.5916, 'grad_norm': 2.569788932800293, 'learning_rate': 0.00017888563049853372, 'epoch': 0.86}
+{'loss': 0.7243, 'grad_norm': 1.4221092462539673, 'learning_rate': 0.00017886119257086997, 'epoch': 0.86}
+{'loss': 0.3696, 'grad_norm': 0.8458523750305176, 'learning_rate': 0.00017883675464320625, 'epoch': 0.86}
+{'loss': 0.6373, 'grad_norm': 1.2293727397918701, 'learning_rate': 0.00017881231671554253, 'epoch': 0.86}
+{'loss': 0.6213, 'grad_norm': 1.3807203769683838, 'learning_rate': 0.00017878787878787875, 'epoch': 0.86}
+{'loss': 1.1287, 'grad_norm': 3.1636455059051514, 'learning_rate': 0.00017876344086021503, 'epoch': 0.86}
+{'loss': 0.6642, 'grad_norm': 1.5414478778839111, 'learning_rate': 0.0001787390029325513, 'epoch': 0.86}
+{'loss': 0.5733, 'grad_norm': 1.7993489503860474, 'learning_rate': 0.00017871456500488756, 'epoch': 0.86}
+{'loss': 0.976, 'grad_norm': 2.5660245418548584, 'learning_rate': 0.00017869012707722384, 'epoch': 0.86}
+{'loss': 1.0276, 'grad_norm': 1.549904465675354, 'learning_rate': 0.00017866568914956012, 'epoch': 0.86}
+{'loss': 0.6649, 'grad_norm': 1.6813697814941406, 'learning_rate': 0.00017864125122189634, 'epoch': 0.86}
+{'loss': 0.7519, 'grad_norm': 1.3475943803787231, 'learning_rate': 0.00017861681329423262, 'epoch': 0.86}
+{'loss': 0.4629, 'grad_norm': 2.2717173099517822, 'learning_rate': 0.0001785923753665689, 'epoch': 0.86}
+{'loss': 0.8551, 'grad_norm': 1.826530933380127, 'learning_rate': 0.00017856793743890515, 'epoch': 0.86}
+{'loss': 0.8756, 'grad_norm': 2.976663112640381, 'learning_rate': 0.00017854349951124143, 'epoch': 0.86}
+{'loss': 0.8143, 'grad_norm': 1.6091177463531494, 'learning_rate': 0.0001785190615835777, 'epoch': 0.86}
+{'loss': 1.8195, 'grad_norm': 6.574666500091553, 'learning_rate': 0.00017849462365591396, 'epoch': 0.86}
+{'loss': 1.5207, 'grad_norm': 4.467514514923096, 'learning_rate': 0.00017847018572825023, 'epoch': 0.86}
+{'loss': 0.6295, 'grad_norm': 2.5957164764404297, 'learning_rate': 0.0001784457478005865, 'epoch': 0.86}
+{'loss': 1.1869, 'grad_norm': 1.4115712642669678, 'learning_rate': 0.00017842130987292274, 'epoch': 0.86}
+{'loss': 1.3938, 'grad_norm': 3.7182257175445557, 'learning_rate': 0.00017839687194525901, 'epoch': 0.86}
+{'loss': 1.2754, 'grad_norm': 2.587322235107422, 'learning_rate': 0.0001783724340175953, 'epoch': 0.86}
+{'loss': 1.1982, 'grad_norm': 2.736600160598755, 'learning_rate': 0.00017834799608993154, 'epoch': 0.86}
+{'loss': 1.8069, 'grad_norm': 5.915489673614502, 'learning_rate': 0.00017832355816226782, 'epoch': 0.86}
+{'loss': 1.2607, 'grad_norm': 4.513766765594482, 'learning_rate': 0.0001782991202346041, 'epoch': 0.86}
+{'loss': 0.6809, 'grad_norm': 2.3064467906951904, 'learning_rate': 0.00017827468230694035, 'epoch': 0.86}
+{'loss': 0.4819, 'grad_norm': 1.9814224243164062, 'learning_rate': 0.00017825024437927663, 'epoch': 0.86}
+{'loss': 1.5882, 'grad_norm': 2.532914400100708, 'learning_rate': 0.0001782258064516129, 'epoch': 0.86}
+{'loss': 0.8876, 'grad_norm': 1.8483891487121582, 'learning_rate': 0.00017820136852394913, 'epoch': 0.86}
+{'loss': 1.2741, 'grad_norm': 3.369330644607544, 'learning_rate': 0.0001781769305962854, 'epoch': 0.86}
+{'loss': 0.5309, 'grad_norm': 2.431166648864746, 'learning_rate': 0.0001781524926686217, 'epoch': 0.86}
+{'loss': 0.8135, 'grad_norm': 2.442338466644287, 'learning_rate': 0.00017812805474095794, 'epoch': 0.86}
+{'loss': 0.6185, 'grad_norm': 2.9823551177978516, 'learning_rate': 0.00017810361681329422, 'epoch': 0.86}
+{'loss': 1.6136, 'grad_norm': 3.414311170578003, 'learning_rate': 0.0001780791788856305, 'epoch': 0.86}
+{'loss': 0.3127, 'grad_norm': 0.6202065348625183, 'learning_rate': 0.00017805474095796672, 'epoch': 0.86}
+{'loss': 0.2606, 'grad_norm': 0.6588072776794434, 'learning_rate': 0.000178030303030303, 'epoch': 0.86}
+{'loss': 0.267, 'grad_norm': 0.9525992274284363, 'learning_rate': 0.00017800586510263928, 'epoch': 0.86}
+{'loss': 0.501, 'grad_norm': 0.8494582176208496, 'learning_rate': 0.00017798142717497553, 'epoch': 0.86}
+{'loss': 0.4369, 'grad_norm': 0.632400393486023, 'learning_rate': 0.0001779569892473118, 'epoch': 0.86}
+{'loss': 0.3133, 'grad_norm': 0.5401912331581116, 'learning_rate': 0.0001779325513196481, 'epoch': 0.86}
+{'loss': 0.3823, 'grad_norm': 0.6302811503410339, 'learning_rate': 0.00017790811339198434, 'epoch': 0.86}
+{'loss': 0.2837, 'grad_norm': 0.8073775768280029, 'learning_rate': 0.00017788367546432062, 'epoch': 0.86}
+{'loss': 0.2525, 'grad_norm': 0.7132818698883057, 'learning_rate': 0.0001778592375366569, 'epoch': 0.86}
+{'loss': 0.3663, 'grad_norm': 1.0062077045440674, 'learning_rate': 0.00017783479960899312, 'epoch': 0.86}
+{'loss': 0.351, 'grad_norm': 0.8359341621398926, 'learning_rate': 0.0001778103616813294, 'epoch': 0.86}
+{'loss': 0.3803, 'grad_norm': 0.8639108538627625, 'learning_rate': 0.00017778592375366568, 'epoch': 0.86}
+{'loss': 0.3996, 'grad_norm': 0.7320135831832886, 'learning_rate': 0.00017776148582600193, 'epoch': 0.86}
+{'loss': 0.3478, 'grad_norm': 0.9680222868919373, 'learning_rate': 0.0001777370478983382, 'epoch': 0.86}
+{'loss': 0.4855, 'grad_norm': 0.8415281176567078, 'learning_rate': 0.00017771260997067448, 'epoch': 0.86}
+{'loss': 0.6531, 'grad_norm': 1.6651570796966553, 'learning_rate': 0.00017768817204301073, 'epoch': 0.86}
+{'loss': 0.7548, 'grad_norm': 1.7974225282669067, 'learning_rate': 0.000177663734115347, 'epoch': 0.86}
+{'loss': 0.6231, 'grad_norm': 1.254573106765747, 'learning_rate': 0.00017763929618768326, 'epoch': 0.86}
+{'loss': 0.4347, 'grad_norm': 0.7575573921203613, 'learning_rate': 0.00017761485826001952, 'epoch': 0.86}
+{'loss': 0.6743, 'grad_norm': 1.291746973991394, 'learning_rate': 0.0001775904203323558, 'epoch': 0.86}
+{'loss': 0.5223, 'grad_norm': 1.3523181676864624, 'learning_rate': 0.00017756598240469207, 'epoch': 0.86}
+{'loss': 0.6335, 'grad_norm': 1.1703637838363647, 'learning_rate': 0.00017754154447702832, 'epoch': 0.86}
+{'loss': 0.5819, 'grad_norm': 2.5771422386169434, 'learning_rate': 0.0001775171065493646, 'epoch': 0.86}
+{'loss': 0.7858, 'grad_norm': 1.401025414466858, 'learning_rate': 0.00017749266862170088, 'epoch': 0.86}
+{'loss': 0.55, 'grad_norm': 1.5256321430206299, 'learning_rate': 0.0001774682306940371, 'epoch': 0.86}
+{'loss': 0.4361, 'grad_norm': 1.255598545074463, 'learning_rate': 0.00017744379276637338, 'epoch': 0.87}
+{'loss': 1.0666, 'grad_norm': 2.342947483062744, 'learning_rate': 0.00017741935483870966, 'epoch': 0.87}
+{'loss': 1.0796, 'grad_norm': 2.2198994159698486, 'learning_rate': 0.0001773949169110459, 'epoch': 0.87}
+{'loss': 1.0962, 'grad_norm': 2.544025182723999, 'learning_rate': 0.0001773704789833822, 'epoch': 0.87}
+{'loss': 0.7408, 'grad_norm': 3.0264079570770264, 'learning_rate': 0.00017734604105571847, 'epoch': 0.87}
+{'loss': 0.618, 'grad_norm': 1.2119104862213135, 'learning_rate': 0.00017732160312805472, 'epoch': 0.87}
+{'loss': 0.631, 'grad_norm': 1.5257172584533691, 'learning_rate': 0.000177297165200391, 'epoch': 0.87}
+{'loss': 0.7618, 'grad_norm': 2.0830912590026855, 'learning_rate': 0.00017727272727272728, 'epoch': 0.87}
+{'loss': 0.8881, 'grad_norm': 2.5756418704986572, 'learning_rate': 0.0001772482893450635, 'epoch': 0.87}
+{'loss': 0.7217, 'grad_norm': 2.78877329826355, 'learning_rate': 0.00017722385141739978, 'epoch': 0.87}
+{'loss': 0.9808, 'grad_norm': 2.5432016849517822, 'learning_rate': 0.00017719941348973606, 'epoch': 0.87}
+{'loss': 0.8685, 'grad_norm': 1.7844754457473755, 'learning_rate': 0.0001771749755620723, 'epoch': 0.87}
+{'loss': 1.4534, 'grad_norm': 2.5685477256774902, 'learning_rate': 0.0001771505376344086, 'epoch': 0.87}
+{'loss': 1.3919, 'grad_norm': 2.9484968185424805, 'learning_rate': 0.00017712609970674487, 'epoch': 0.87}
+ 43%|████▎     | 5540/12776 [57:40<28:59,  4.16it/s] 43%|████▎     | 5541/12776 [57:40<27:50,  4.33it/s]                                                     43%|████▎     | 5541/12776 [57:40<27:50,  4.33it/s] 43%|████▎     | 5542/12776 [57:40<26:56,  4.47it/s]                                                     43%|████▎     | 5542/12776 [57:40<26:56,  4.47it/s] 43%|████▎     | 5543/12776 [57:40<28:45,  4.19it/s]                                                     43%|████▎     | 5543/12776 [57:40<28:45,  4.19it/s] 43%|████▎     | 5544/12776 [57:40<27:23,  4.40it/s]                                                     43%|████▎     | 5544/12776 [57:40<27:23,  4.40it/s] 43%|████▎     | 5545/12776 [57:41<26:25,  4.56it/s]                                                     43%|████▎     | 5545/12776 [57:41<26:25,  4.56it/s] 43%|████▎     | 5546/12776 [57:41<25:34,  4.71it/s]                                                     43%|████▎     | 5546/12776 [57:41<25:34,  4.71it/s] 43%|████▎     | 5547/12776 [57:41<24:43,  4.87it/s]                                                     43%|████▎     | 5547/12776 [57:41<24:43,  4.87it/s] 43%|████▎     | 5548/12776 [57:41<24:00,  5.02it/s]                                                     43%|████▎     | 5548/12776 [57:41<24:00,  5.02it/s] 43%|████▎     | 5549/12776 [57:41<25:45,  4.68it/s]                                                     43%|████▎     | 5549/12776 [57:41<25:45,  4.68it/s] 43%|████▎     | 5550/12776 [57:42<43:08,  2.79it/s]                                                     43%|████▎     | 5550/12776 [57:42<43:08,  2.79it/s] 43%|████▎     | 5551/12776 [57:44<1:22:10,  1.47it/s]                                                       43%|████▎     | 5551/12776 [57:44<1:22:10,  1.47it/s] 43%|████▎     | 5552/12776 [57:45<1:31:25,  1.32it/s]                                                       43%|████▎     | 5552/12776 [57:45<1:31:25,  1.32it/s] 43%|████▎     | 5553/12776 [57:45<1:33:52,  1.28it/s]                                                       43%|████▎     | 5553/12776 [57:45<1:33:52,  1.28it/s] 43%|████▎     | 5554/12776 [57:46<1:36:15,  1.25it/s]                                                       43%|████▎     | 5554/12776 [57:46<1:36:15,  1.25it/s] 43%|████▎     | 5555/12776 [57:47<1:38:04,  1.23it/s]                                                       43%|████▎     | 5555/12776 [57:47<1:38:04,  1.23it/s] 43%|████▎     | 5556/12776 [57:48<1:32:51,  1.30it/s]                                                       43%|████▎     | 5556/12776 [57:48<1:32:51,  1.30it/s] 43%|████▎     | 5557/12776 [57:48<1:32:00,  1.31it/s]                                                       43%|████▎     | 5557/12776 [57:48<1:32:00,  1.31it/s] 44%|████▎     | 5558/12776 [57:49<1:26:28,  1.39it/s]                                                       44%|████▎     | 5558/12776 [57:49<1:26:28,  1.39it/s] 44%|████▎     | 5559/12776 [57:50<1:22:21,  1.46it/s]                                                       44%|████▎     | 5559/12776 [57:50<1:22:21,  1.46it/s] 44%|████▎     | 5560/12776 [57:50<1:17:52,  1.54it/s]                                                       44%|████▎     | 5560/12776 [57:50<1:17:52,  1.54it/s] 44%|████▎     | 5561/12776 [57:51<1:15:36,  1.59it/s]                                                       44%|████▎     | 5561/12776 [57:51<1:15:36,  1.59it/s] 44%|████▎     | 5562/12776 [57:51<1:11:26,  1.68it/s]                                                       44%|████▎     | 5562/12776 [57:51<1:11:26,  1.68it/s] 44%|████▎     | 5563/12776 [57:52<1:11:02,  1.69it/s]                                                       44%|████▎     | 5563/12776 [57:52<1:11:02,  1.69it/s] 44%|████▎     | 5564/12776 [57:52<1:06:13,  1.82it/s]                                                       44%|████▎     | 5564/12776 [57:52<1:06:13,  1.82it/s] 44%|████▎     | 5565/12776 [57:53<1:06:37,  1.80it/s]                                                       44%|████▎     | 5565/12776 [57:53<1:06:37,  1.80it/s] 44%|███��▎     | 5566/12776 [57:53<1:03:09,  1.90it/s]                                                       44%|████▎     | 5566/12776 [57:53<1:03:09,  1.90it/s] 44%|████▎     | 5567/12776 [57:54<1:02:53,  1.91it/s]                                                       44%|████▎     | 5567/12776 [57:54<1:02:53,  1.91it/s] 44%|████▎     | 5568/12776 [57:54<59:55,  2.00it/s]                                                       44%|████▎     | 5568/12776 [57:54<59:55,  2.00it/s] 44%|████▎     | 5569/12776 [57:55<56:28,  2.13it/s]                                                     44%|████▎     | 5569/12776 [57:55<56:28,  2.13it/s] 44%|████▎     | 5570/12776 [57:55<58:28,  2.05it/s]                                                     44%|████▎     | 5570/12776 [57:55<58:28,  2.05it/s] 44%|████▎     | 5571/12776 [57:56<54:44,  2.19it/s]                                                     44%|████▎     | 5571/12776 [57:56<54:44,  2.19it/s] 44%|████▎     | 5572/12776 [57:56<55:39,  2.16it/s]                                                     44%|████▎     | 5572/12776 [57:56<55:39,  2.16it/s] 44%|████▎     | 5573/12776 [57:57<52:35,  2.28it/s]                                                     44%|████▎     | 5573/12776 [57:57<52:35,  2.28it/s] 44%|████▎     | 5574/12776 [57:57<50:28,  2.38it/s]                                                     44%|████▎     | 5574/12776 [57:57<50:28,  2.38it/s] 44%|████▎     | 5575/12776 [57:57<48:45,  2.46it/s]                                                     44%|████▎     | 5575/12776 [57:57<48:45,  2.46it/s] 44%|████▎     | 5576/12776 [57:58<44:51,  2.68it/s]                                                     44%|████▎     | 5576/12776 [57:58<44:51,  2.68it/s] 44%|████▎     | 5577/12776 [57:58<41:56,  2.86it/s]                                                     44%|████▎     | 5577/12776 [57:58<41:56,  2.86it/s] 44%|████▎     | 5578/12776 [57:58<39:55,  3.01it/s]                                                     44%|████▎     | 5578/12776 [57:58<39:55,  3.01it/s] 44%|████▎     | 5579/12776 [57:59<41:48,  2.87it/s]                                                     44%|████▎     | 5579/12776 [57:59<41:48,  2.87it/s] 44%|████▎     | 5580/12776 [57:59<39:24,  3.04it/s]                                                     44%|████▎     | 5580/12776 [57:59<39:24,  3.04it/s] 44%|████▎     | 5581/12776 [57:59<36:35,  3.28it/s]                                                     44%|████▎     | 5581/12776 [57:59<36:35,  3.28it/s] 44%|████▎     | 5582/12776 [57:59<34:47,  3.45it/s]                                                     44%|████▎     | 5582/12776 [57:59<34:47,  3.45it/s] 44%|████▎     | 5583/12776 [58:00<35:51,  3.34it/s]                                                     44%|████▎     | 5583/12776 [58:00<35:51,  3.34it/s] 44%|████▎     | 5584/12776 [58:00<33:51,  3.54it/s]                                                     44%|████▎     | 5584/12776 [58:00<33:51,  3.54it/s] 44%|████▎     | 5585/12776 [58:00<32:15,  3.71it/s]                                                     44%|████▎     | 5585/12776 [58:00<32:15,  3.71it/s] 44%|████▎     | 5586/12776 [58:00<31:38,  3.79it/s]                                                     44%|████▎     | 5586/12776 [58:00<31:38,  3.79it/s] 44%|████▎     | 5587/12776 [58:01<31:39,  3.79it/s]                                                     44%|████▎     | 5587/12776 [58:01<31:39,  3.79it/s] 44%|████▎     | 5588/12776 [58:01<30:00,  3.99it/s]                                                     44%|████▎     | 5588/12776 [58:01<30:00,  3.99it/s] 44%|████▎     | 5589/12776 [58:01<29:14,  4.10it/s]                                                     44%|████▎     | 5589/12776 [58:01<29:14,  4.10it/s] 44%|████▍     | 5590/12776 [58:01<27:58,  4.28it/s]                                                     44%|████▍     | 5590/12776 [58:01<27:58,  4.28it/s] 44%|████▍     | 5591/12776 [58:02<27:08,  4.41it/s]                                                     44%|████▍     | 5591/12776 [58:02<27:08,  4.41it/s] 44%|████▍     | 5592/12776 [58:02<28:02,  4.27it/s]                                                     44%|████▍     | 5592/12776 [58:02<28:02,  4.27it/s] 44%|████▍     | 5593/12776 [58:02<26:52,  4.45it/s]                                                     44%|████▍     | 5593/12776 [58:02<26:52,  4.45it/s] 44%|████▍     | 5594/12776 [58:02<26:01,  4.60it/s]                                                     44%|████▍     | 5594/12776 [58:02<26:01,  4.60it/s] 44%|████▍     | 5595/12776 [58:02<25:21,  4.72it/s]                                                     44%|████▍     | 5595/12776 [58:02<25:21,  4.72it/s] 44%|████▍     | 5596/12776 [58:03<24:45,  4.83it/s]                                                     44%|████▍     | 5596/12776 [58:03<24:45,  4.83it/s] 44%|████▍     | 5597/12776 [58:03<24:13,  4.94it/s]                                                     44%|████▍     | 5597/12776 [58:03<24:13,  4.94it/s] 44%|████▍     | 5598/12776 [58:03<25:50,  4.63it/s]                                                     44%|████▍     | 5598/12776 [58:03<25:50,  4.63it/s] 44%|████▍     | 5599/12776 [58:03<24:43,  4.84it/s]                                                     44%|████▍     | 5599/12776 [58:03<24:43,  4.84it/s] 44%|████▍     | 5600/12776 [58:04<40:58,  2.92it/s]                                                     44%|████▍     | 5600/12776 [58:04<40:58,  2.92it/s]Saving model checkpoint to ./checkpoint-5600
+Configuration saved in ./checkpoint-5600/config.json
+Model weights saved in ./checkpoint-5600/model.safetensors
+Feature extractor saved in ./checkpoint-5600/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-5600/tokenizer_config.json
+Special tokens file saved in ./checkpoint-5600/special_tokens_map.json
+added tokens file saved in ./checkpoint-5600/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-4400] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 44%|████▍     | 5601/12776 [58:10<4:23:37,  2.20s/it]                                                       44%|████▍     | 5601/12776 [58:10<4:23:37,  2.20s/it] 44%|████▍     | 5602/12776 [58:11<3:38:22,  1.83s/it]                                                       44%|████▍     | 5602/12776 [58:11<3:38:22,  1.83s/it] 44%|████▍     | 5603/12776 [58:12<3:06:35,  1.56s/it]                                                       44%|████▍     | 5603/12776 [58:12<3:06:35,  1.56s/it] 44%|████▍     | 5604/12776 [58:13<2:39:49,  1.34s/it]                                                       44%|████▍     | 5604/12776 [58:13<2:39:49,  1.34s/it] 44%|████▍     | 5605/12776 [58:14<2:18:23,  1.16s/it]                                                       44%|████▍     | 5605/12776 [58:14<2:18:23,  1.16s/it] 44%|████▍     | 5606/12776 [58:15<2:00:52,  1.01s/it]                                                       44%|████▍     | 5606/12776 [58:15<2:00:52,  1.01s/it] 44%|████▍     | 5607/12776 [58:15<1:47:28,  1.11it/s]                                                       44%|████▍     | 5607/12776 [58:15<1:47:28,  1.11it/s] 44%|████▍     | 5608/12776 [58:16<1:35:19,  1.25it/s]                                                       44%|████▍     | 5608/12776 [58:16<1:35:19,  1.25it/s] 44%|████▍     | 5609/12776 [58:16<1:25:54,  1.39it/s]                                                       44%|████▍     | 5609/12776 [58:16<1:25:54,  1.39it/s] 44%|████▍     | 5610/12776 [58:17<1:21:00,  1.47it/s]                                                       44%|████▍     | 5610/12776 [58:17<1:21:00,  1.47it/s] 44%|████▍     | 5611/12776 [58:17<1:14:14,  1.61it/s]                                                       44%|████▍     | 5611/12776 [58:17<1:14:14,  1.61it/s] 44%|████▍     | 5612/12776 [58:18<1:11:13,  1.68it/s]                                                       44%|████▍     | 5612/12776 [58:18<1:11:13,  1.68it/s] 44%|████▍     | 5613/12776 [58:18<1:05:12,  1.83it/s]                                                       44%|████▍     | 5613/12776 [58:18<1:05:12,  1.83it/s] 44%|████▍     | 5614/12776 [58:19<1:03:33,  1.88it/s]                                                       44%|████▍     | 5614/12776 [58:19<1:03:33,  1.88it/s] 44%|████▍     | 5615/12776 [58:19<58:50,  2.03it/s]                                                       44%|████▍     | 5615/12776 [58:19<58:50,  2.03it/s] 44%|████▍     | 5616/12776 [58:20<55:00,  2.17it/s]                                                     44%|████▍     | 5616/12776 [58:20<55:00,  2.17it/s] 44%|████▍     | 5617/12776 [58:20<57:54,  2.06it/s]                                                    {'loss': 0.8989, 'grad_norm': 3.510503053665161, 'learning_rate': 0.00017710166177908112, 'epoch': 0.87}
+{'loss': 1.7287, 'grad_norm': 3.30690860748291, 'learning_rate': 0.0001770772238514174, 'epoch': 0.87}
+{'loss': 1.2677, 'grad_norm': 4.295806407928467, 'learning_rate': 0.00017705278592375365, 'epoch': 0.87}
+{'loss': 1.0414, 'grad_norm': 1.898603081703186, 'learning_rate': 0.0001770283479960899, 'epoch': 0.87}
+{'loss': 1.0474, 'grad_norm': 1.397929310798645, 'learning_rate': 0.00017700391006842618, 'epoch': 0.87}
+{'loss': 1.4236, 'grad_norm': 3.208662986755371, 'learning_rate': 0.00017697947214076245, 'epoch': 0.87}
+{'loss': 1.4281, 'grad_norm': 4.296112537384033, 'learning_rate': 0.0001769550342130987, 'epoch': 0.87}
+{'loss': 0.9672, 'grad_norm': 5.432612895965576, 'learning_rate': 0.00017693059628543498, 'epoch': 0.87}
+{'loss': 0.863, 'grad_norm': 2.7205517292022705, 'learning_rate': 0.00017690615835777126, 'epoch': 0.87}
+{'loss': 0.9332, 'grad_norm': 2.6513023376464844, 'learning_rate': 0.0001768817204301075, 'epoch': 0.87}
+{'loss': 0.7272, 'grad_norm': 1.974501132965088, 'learning_rate': 0.00017685728250244376, 'epoch': 0.87}
+{'loss': 0.3761, 'grad_norm': 0.6594939827919006, 'learning_rate': 0.00017683284457478004, 'epoch': 0.87}
+{'loss': 0.4447, 'grad_norm': 1.849006175994873, 'learning_rate': 0.0001768084066471163, 'epoch': 0.87}
+{'loss': 0.2471, 'grad_norm': 0.555202841758728, 'learning_rate': 0.00017678396871945257, 'epoch': 0.87}
+{'loss': 0.3585, 'grad_norm': 0.5816833972930908, 'learning_rate': 0.00017675953079178885, 'epoch': 0.87}
+{'loss': 0.363, 'grad_norm': 0.6719066500663757, 'learning_rate': 0.0001767350928641251, 'epoch': 0.87}
+{'loss': 0.3488, 'grad_norm': 0.6604121327400208, 'learning_rate': 0.00017671065493646138, 'epoch': 0.87}
+{'loss': 0.2469, 'grad_norm': 0.6314492225646973, 'learning_rate': 0.00017668621700879766, 'epoch': 0.87}
+{'loss': 0.3692, 'grad_norm': 0.6827051639556885, 'learning_rate': 0.00017666177908113388, 'epoch': 0.87}
+{'loss': 0.2522, 'grad_norm': 0.6794304251670837, 'learning_rate': 0.00017663734115347016, 'epoch': 0.87}
+{'loss': 0.4427, 'grad_norm': 0.6768459677696228, 'learning_rate': 0.00017661290322580644, 'epoch': 0.87}
+{'loss': 0.2897, 'grad_norm': 0.9314690232276917, 'learning_rate': 0.0001765884652981427, 'epoch': 0.87}
+{'loss': 0.2953, 'grad_norm': 0.586393415927887, 'learning_rate': 0.00017656402737047897, 'epoch': 0.87}
+{'loss': 0.4265, 'grad_norm': 1.349017858505249, 'learning_rate': 0.00017653958944281525, 'epoch': 0.87}
+{'loss': 0.4198, 'grad_norm': 0.8151659965515137, 'learning_rate': 0.0001765151515151515, 'epoch': 0.87}
+{'loss': 0.6098, 'grad_norm': 0.9918164014816284, 'learning_rate': 0.00017649071358748778, 'epoch': 0.87}
+{'loss': 0.4211, 'grad_norm': 0.7317390441894531, 'learning_rate': 0.00017646627565982403, 'epoch': 0.87}
+{'loss': 0.5503, 'grad_norm': 1.2176004648208618, 'learning_rate': 0.00017644183773216028, 'epoch': 0.87}
+{'loss': 0.8489, 'grad_norm': 1.647218108177185, 'learning_rate': 0.00017641739980449656, 'epoch': 0.87}
+{'loss': 0.4466, 'grad_norm': 0.9232958555221558, 'learning_rate': 0.00017639296187683284, 'epoch': 0.87}
+{'loss': 0.511, 'grad_norm': 0.800988495349884, 'learning_rate': 0.0001763685239491691, 'epoch': 0.87}
+{'loss': 0.6131, 'grad_norm': 1.9028470516204834, 'learning_rate': 0.00017634408602150537, 'epoch': 0.87}
+{'loss': 0.5967, 'grad_norm': 1.8957113027572632, 'learning_rate': 0.00017631964809384164, 'epoch': 0.87}
+{'loss': 0.7269, 'grad_norm': 2.135235071182251, 'learning_rate': 0.00017629521016617787, 'epoch': 0.87}
+{'loss': 0.482, 'grad_norm': 3.0502326488494873, 'learning_rate': 0.00017627077223851415, 'epoch': 0.87}
+{'loss': 0.7517, 'grad_norm': 1.7193087339401245, 'learning_rate': 0.00017624633431085043, 'epoch': 0.87}
+{'loss': 0.5461, 'grad_norm': 2.7776575088500977, 'learning_rate': 0.00017622189638318668, 'epoch': 0.87}
+{'loss': 0.4677, 'grad_norm': 1.1250964403152466, 'learning_rate': 0.00017619745845552296, 'epoch': 0.87}
+{'loss': 0.7262, 'grad_norm': 2.022542953491211, 'learning_rate': 0.00017617302052785923, 'epoch': 0.87}
+{'loss': 1.2791, 'grad_norm': 2.4576961994171143, 'learning_rate': 0.00017614858260019548, 'epoch': 0.87}
+{'loss': 0.6484, 'grad_norm': 2.275444984436035, 'learning_rate': 0.00017612414467253176, 'epoch': 0.87}
+{'loss': 0.6692, 'grad_norm': 2.2250964641571045, 'learning_rate': 0.00017609970674486804, 'epoch': 0.87}
+{'loss': 1.1964, 'grad_norm': 3.6571662425994873, 'learning_rate': 0.00017607526881720427, 'epoch': 0.87}
+{'loss': 0.9287, 'grad_norm': 2.21502947807312, 'learning_rate': 0.00017605083088954054, 'epoch': 0.87}
+{'loss': 1.0618, 'grad_norm': 1.8925524950027466, 'learning_rate': 0.00017602639296187682, 'epoch': 0.87}
+{'loss': 0.9194, 'grad_norm': 1.7226258516311646, 'learning_rate': 0.00017600195503421307, 'epoch': 0.87}
+{'loss': 0.6395, 'grad_norm': 2.176206111907959, 'learning_rate': 0.00017597751710654935, 'epoch': 0.87}
+{'loss': 1.4345, 'grad_norm': 2.134561538696289, 'learning_rate': 0.00017595307917888563, 'epoch': 0.87}
+{'loss': 0.8548, 'grad_norm': 3.149474859237671, 'learning_rate': 0.00017592864125122188, 'epoch': 0.87}
+{'loss': 0.9873, 'grad_norm': 1.696314811706543, 'learning_rate': 0.00017590420332355813, 'epoch': 0.87}
+{'loss': 1.0674, 'grad_norm': 2.3394696712493896, 'learning_rate': 0.0001758797653958944, 'epoch': 0.88}
+{'loss': 1.3712, 'grad_norm': 1.807666301727295, 'learning_rate': 0.00017585532746823066, 'epoch': 0.88}
+{'loss': 1.8955, 'grad_norm': 4.91036319732666, 'learning_rate': 0.00017583088954056694, 'epoch': 0.88}
+{'loss': 1.4988, 'grad_norm': 3.2543954849243164, 'learning_rate': 0.00017580645161290322, 'epoch': 0.88}
+{'loss': 0.6692, 'grad_norm': 1.3064311742782593, 'learning_rate': 0.00017578201368523947, 'epoch': 0.88}
+{'loss': 1.0202, 'grad_norm': 3.3625075817108154, 'learning_rate': 0.00017575757575757575, 'epoch': 0.88}
+{'loss': 1.3143, 'grad_norm': 1.9790335893630981, 'learning_rate': 0.00017573313782991203, 'epoch': 0.88}
+{'loss': 0.8278, 'grad_norm': 3.206354856491089, 'learning_rate': 0.00017570869990224825, 'epoch': 0.88}
+{'loss': 0.5511, 'grad_norm': 3.371476650238037, 'learning_rate': 0.00017568426197458453, 'epoch': 0.88}
+{'loss': 1.5164, 'grad_norm': 3.493743419647217, 'learning_rate': 0.0001756598240469208, 'epoch': 0.88}
+{'loss': 1.186, 'grad_norm': 2.634359359741211, 'learning_rate': 0.00017563538611925706, 'epoch': 0.88}
+{'loss': 0.3571, 'grad_norm': 0.7865440845489502, 'learning_rate': 0.00017561094819159334, 'epoch': 0.88}
+{'loss': 0.3469, 'grad_norm': 0.7957772612571716, 'learning_rate': 0.00017558651026392962, 'epoch': 0.88}
+{'loss': 0.3274, 'grad_norm': 0.6495577692985535, 'learning_rate': 0.00017556207233626587, 'epoch': 0.88}
+{'loss': 0.2298, 'grad_norm': 0.5228533744812012, 'learning_rate': 0.00017553763440860215, 'epoch': 0.88}
+{'loss': 0.5251, 'grad_norm': 1.175572156906128, 'learning_rate': 0.00017551319648093842, 'epoch': 0.88}
+{'loss': 0.3617, 'grad_norm': 0.658734142780304, 'learning_rate': 0.00017548875855327465, 'epoch': 0.88}
+{'loss': 0.4297, 'grad_norm': 0.7342488765716553, 'learning_rate': 0.00017546432062561093, 'epoch': 0.88}
+{'loss': 0.4839, 'grad_norm': 1.1647714376449585, 'learning_rate': 0.0001754398826979472, 'epoch': 0.88}
+{'loss': 0.4779, 'grad_norm': 1.1591604948043823, 'learning_rate': 0.00017541544477028346, 'epoch': 0.88}
+{'loss': 0.2418, 'grad_norm': 0.5184983015060425, 'learning_rate': 0.00017539100684261973, 'epoch': 0.88}
+{'loss': 0.4329, 'grad_norm': 0.9003580808639526, 'learning_rate': 0.000175366568914956, 'epoch': 0.88}
+{'loss': 0.337, 'grad_norm': 0.6690883040428162, 'learning_rate': 0.00017534213098729226, 'epoch': 0.88}
+{'loss': 0.3636, 'grad_norm': 1.1468678712844849, 'learning_rate': 0.00017531769305962852, 'epoch': 0.88}
+{'loss': 0.3264, 'grad_norm': 0.7906423807144165, 'learning_rate': 0.0001752932551319648, 'epoch': 0.88}
+{'loss': 0.4039, 'grad_norm': 0.82657390832901, 'learning_rate': 0.00017526881720430104, 'epoch': 0.88}
+{'loss': 0.411, 'grad_norm': 3.476700782775879, 'learning_rate': 0.00017524437927663732, 'epoch': 0.88}
+ 44%|████▍     | 5617/12776 [58:20<57:54,  2.06it/s] 44%|████▍     | 5618/12776 [58:20<53:24,  2.23it/s]                                                     44%|████▍     | 5618/12776 [58:20<53:24,  2.23it/s] 44%|████▍     | 5619/12776 [58:21<49:37,  2.40it/s]                                                     44%|████▍     | 5619/12776 [58:21<49:37,  2.40it/s] 44%|████▍     | 5620/12776 [58:21<49:00,  2.43it/s]                                                     44%|████▍     | 5620/12776 [58:21<49:00,  2.43it/s] 44%|████▍     | 5621/12776 [58:22<45:41,  2.61it/s]                                                     44%|████▍     | 5621/12776 [58:22<45:41,  2.61it/s] 44%|████▍     | 5622/12776 [58:22<43:10,  2.76it/s]                                                     44%|████▍     | 5622/12776 [58:22<43:10,  2.76it/s] 44%|████▍     | 5623/12776 [58:22<45:35,  2.61it/s]                                                     44%|████▍     | 5623/12776 [58:22<45:35,  2.61it/s] 44%|████▍     | 5624/12776 [58:23<42:19,  2.82it/s]                                                     44%|████▍     | 5624/12776 [58:23<42:19,  2.82it/s] 44%|████▍     | 5625/12776 [58:23<39:31,  3.02it/s]                                                     44%|████▍     | 5625/12776 [58:23<39:31,  3.02it/s] 44%|████▍     | 5626/12776 [58:23<37:21,  3.19it/s]                                                     44%|████▍     | 5626/12776 [58:23<37:21,  3.19it/s] 44%|████▍     | 5627/12776 [58:23<36:58,  3.22it/s]                                                     44%|████▍     | 5627/12776 [58:23<36:58,  3.22it/s] 44%|████▍     | 5628/12776 [58:24<35:02,  3.40it/s]                                                     44%|████▍     | 5628/12776 [58:24<35:02,  3.40it/s] 44%|████▍     | 5629/12776 [58:24<33:30,  3.56it/s]                                                     44%|████▍     | 5629/12776 [58:24<33:30,  3.56it/s] 44%|████▍     | 5630/12776 [58:24<32:07,  3.71it/s]                                                     44%|████▍     | 5630/12776 [58:24<32:07,  3.71it/s] 44%|████▍     | 5631/12776 [58:25<36:56,  3.22it/s]                                                     44%|████▍     | 5631/12776 [58:25<36:56,  3.22it/s] 44%|████▍     | 5632/12776 [58:25<33:55,  3.51it/s]                                                     44%|████▍     | 5632/12776 [58:25<33:55,  3.51it/s] 44%|████▍     | 5633/12776 [58:25<31:39,  3.76it/s]                                                     44%|████▍     | 5633/12776 [58:25<31:39,  3.76it/s] 44%|████▍     | 5634/12776 [58:25<29:50,  3.99it/s]                                                     44%|████▍     | 5634/12776 [58:25<29:50,  3.99it/s] 44%|████▍     | 5635/12776 [58:25<28:22,  4.20it/s]                                                     44%|████▍     | 5635/12776 [58:25<28:22,  4.20it/s] 44%|████▍     | 5636/12776 [58:26<29:34,  4.02it/s]                                                     44%|████▍     | 5636/12776 [58:26<29:34,  4.02it/s] 44%|████▍     | 5637/12776 [58:26<27:51,  4.27it/s]                                                     44%|████▍     | 5637/12776 [58:26<27:51,  4.27it/s] 44%|████▍     | 5638/12776 [58:26<26:33,  4.48it/s]                                                     44%|████▍     | 5638/12776 [58:26<26:33,  4.48it/s] 44%|████▍     | 5639/12776 [58:26<25:27,  4.67it/s]                                                     44%|████▍     | 5639/12776 [58:26<25:27,  4.67it/s] 44%|████▍     | 5640/12776 [58:27<24:30,  4.85it/s]                                                     44%|████▍     | 5640/12776 [58:27<24:30,  4.85it/s] 44%|████▍     | 5641/12776 [58:27<26:20,  4.52it/s]                                                     44%|████▍     | 5641/12776 [58:27<26:20,  4.52it/s] 44%|████▍     | 5642/12776 [58:27<24:57,  4.77it/s]                                                     44%|████▍     | 5642/12776 [58:27<24:57,  4.77it/s] 44%|████▍     | 5643/12776 [58:27<24:00,  4.95it/s]                                                     44%|████▍     | 5643/12776 [58:27<24:00,  4.95it/s] 44%|████▍     | 5644/12776 [58:27<23:07,  5.14it/s]                                                     44%|████▍     | 5644/12776 [58:27<23:07,  5.14it/s] 44%|████▍     | 5645/12776 [58:27<22:25,  5.30it/s]                                                     44%|████▍     | 5645/12776 [58:27<22:25,  5.30it/s] 44%|████▍     | 5646/12776 [58:28<21:53,  5.43it/s]                                                     44%|████▍     | 5646/12776 [58:28<21:53,  5.43it/s] 44%|████▍     | 5647/12776 [58:28<23:41,  5.01it/s]                                                     44%|████▍     | 5647/12776 [58:28<23:41,  5.01it/s] 44%|████▍     | 5648/12776 [58:28<22:31,  5.27it/s]                                                     44%|████▍     | 5648/12776 [58:28<22:31,  5.27it/s] 44%|████▍     | 5649/12776 [58:28<21:40,  5.48it/s]                                                     44%|████▍     | 5649/12776 [58:28<21:40,  5.48it/s] 44%|████▍     | 5650/12776 [58:29<46:48,  2.54it/s]                                                     44%|████▍     | 5650/12776 [58:29<46:48,  2.54it/s] 44%|████▍     | 5651/12776 [58:31<1:28:35,  1.34it/s]                                                       44%|████▍     | 5651/12776 [58:31<1:28:35,  1.34it/s] 44%|████▍     | 5652/12776 [58:32<1:35:20,  1.25it/s]                                                       44%|████▍     | 5652/12776 [58:32<1:35:20,  1.25it/s] 44%|████▍     | 5653/12776 [58:32<1:36:07,  1.24it/s]                                                       44%|████▍     | 5653/12776 [58:32<1:36:07,  1.24it/s] 44%|████▍     | 5654/12776 [58:33<1:33:36,  1.27it/s]                                                       44%|████▍     | 5654/12776 [58:33<1:33:36,  1.27it/s] 44%|████▍     | 5655/12776 [58:34<1:35:54,  1.24it/s]                                                       44%|████▍     | 5655/12776 [58:34<1:35:54,  1.24it/s] 44%|████▍     | 5656/12776 [58:35<1:30:30,  1.31it/s]                                                       44%|████▍     | 5656/12776 [58:35<1:30:30,  1.31it/s] 44%|████▍     | 5657/12776 [58:35<1:25:45,  1.38it/s]                                                       44%|████▍     | 5657/12776 [58:35<1:25:45,  1.38it/s] 44%|████▍     | 5658/12776 [58:36<1:25:56,  1.38it/s]                                                       44%|████▍     | 5658/12776 [58:36<1:25:56,  1.38it/s] 44%|████▍     | 5659/12776 [58:37<1:21:50,  1.45it/s]                                                       44%|████▍     | 5659/12776 [58:37<1:21:50,  1.45it/s] 44%|████▍     | 5660/12776 [58:37<1:18:11,  1.52it/s]                                                       44%|████▍     | 5660/12776 [58:37<1:18:11,  1.52it/s] 44%|████▍     | 5661/12776 [58:38<1:13:24,  1.62it/s]                                                       44%|████▍     | 5661/12776 [58:38<1:13:24,  1.62it/s] 44%|████▍     | 5662/12776 [58:38<1:14:19,  1.60it/s]                                                       44%|████▍     | 5662/12776 [58:38<1:14:19,  1.60it/s] 44%|████▍     | 5663/12776 [58:39<1:09:16,  1.71it/s]                                                       44%|████▍     | 5663/12776 [58:39<1:09:16,  1.71it/s] 44%|████▍     | 5664/12776 [58:39<1:03:36,  1.86it/s]                                                       44%|████▍     | 5664/12776 [58:39<1:03:36,  1.86it/s] 44%|████▍     | 5665/12776 [58:40<1:00:30,  1.96it/s]                                                       44%|████▍     | 5665/12776 [58:40<1:00:30,  1.96it/s] 44%|████▍     | 5666/12776 [58:40<56:16,  2.11it/s]                                                       44%|████▍     | 5666/12776 [58:40<56:16,  2.11it/s] 44%|████▍     | 5667/12776 [58:41<56:46,  2.09it/s]                                                     44%|████▍     | 5667/12776 [58:41<56:46,  2.09it/s] 44%|████▍     | 5668/12776 [58:41<52:29,  2.26it/s]                                                     44%|████▍     | 5668/12776 [58:41<52:29,  2.26it/s] 44%|████▍     | 5669/12776 [58:41<49:10,  2.41it/s]                                                     44%|████▍     | 5669/12776 [58:41<49:10,  2.41it/s] 44%|████▍     | 5670/12776 [58:42<50:37,  2.34it/s]                                                     44%|████▍     | 5670/12776 [58:42<50:37,  2.34it/s] 44%|████▍     | 5671/12776 [58:42<46:43,  2.53it/s]                                                     44%|████▍     | 5671/12776 [58:42<46:43,  2.53it/s] 44%|████▍     | 5672/12776 [58:42<43:36,  2.72it/s]                                                     44%|████▍     | 5672/12776 [58:42<43:36,  2.72it/s] 44%|████▍     | 5673/12776 [58:43<41:12,  2.87it/s]                                                     44%|████▍     | 5673/12776 [58:43<41:12,  2.87it/s] 44%|████▍     | 5674/12776 [58:43<41:58,  2.82it/s]                                                     44%|████▍     | 5674/12776 [58:43<41:58,  2.82it/s] 44%|████▍     | 5675/12776 [58:43<38:59,  3.03it/s]                                                     44%|████▍     | 5675/12776 [58:43<38:59,  3.03it/s] 44%|████▍     | 5676/12776 [58:44<36:49,  3.21it/s]                                                     44%|████▍     | 5676/12776 [58:44<36:49,  3.21it/s] 44%|████▍     | 5677/12776 [58:44<34:52,  3.39it/s]                                                     44%|████▍     | 5677/12776 [58:44<34:52,  3.39it/s] 44%|████▍     | 5678/12776 [58:44<36:37,  3.23it/s]                                                     44%|████▍     | 5678/12776 [58:44<36:37,  3.23it/s] 44%|████▍     | 5679/12776 [58:45<34:17,  3.45it/s]                                                     44%|████▍     | 5679/12776 [58:45<34:17,  3.45it/s] 44%|████▍     | 5680/12776 [58:45<32:28,  3.64it/s]                                                     44%|████▍     | 5680/12776 [58:45<32:28,  3.64it/s] 44%|████▍     | 5681/12776 [58:45<30:45,  3.84it/s]                                                     44%|████▍     | 5681/12776 [58:45<30:45,  3.84it/s] 44%|████▍     | 5682/12776 [58:45<32:22,  3.65it/s]                                                     44%|████▍     | 5682/12776 [58:45<32:22,  3.65it/s] 44%|████▍     | 5683/12776 [58:46<31:14,  3.78it/s]                                                     44%|████▍     | 5683/12776 [58:46<31:14,  3.78it/s] 44%|████▍     | 5684/12776 [58:46<30:04,  3.93it/s]                                                     44%|████▍     | 5684/12776 [58:46<30:04,  3.93it/s] 44%|████▍     | 5685/12776 [58:46<29:00,  4.07it/s]                                                     44%|████▍     | 5685/12776 [58:46<29:00,  4.07it/s] 45%|████▍     | 5686/12776 [58:46<28:00,  4.22it/s]                                                     45%|████▍     | 5686/12776 [58:46<28:00,  4.22it/s] 45%|████▍     | 5687/12776 [58:47<31:29,  3.75it/s]                                                     45%|████▍     | 5687/12776 [58:47<31:29,  3.75it/s] 45%|████▍     | 5688/12776 [58:47<29:30,  4.00it/s]                                                     45%|████▍     | 5688/12776 [58:47<29:30,  4.00it/s] 45%|████▍     | 5689/12776 [58:47<28:01,  4.21it/s]                                                     45%|████▍     | 5689/12776 [58:47<28:01,  4.21it/s] 45%|████▍     | 5690/12776 [58:47<26:56,  4.38it/s]                                                     45%|████▍     | 5690/12776 [58:47<26:56,  4.38it/s] 45%|████▍     | 5691/12776 [58:47<26:08,  4.52it/s]                                                     45%|████▍     | 5691/12776 [58:47<26:08,  4.52it/s] 45%|████▍     | 5692/12776 [58:48<29:23,  4.02it/s]                                                     45%|████▍     | 5692/12776 [58:48<29:23,  4.02it/s] 45%|████▍     | 5693/12776 [58:48<27:48,  4.24it/s]                                                     45%|████▍     | 5693/12776 [58:48<27:48,  4.24it/s] 45%|████▍     | 5694/12776 [58:48<26:25,  4.47it/s]                                                     45%|████▍     | 5694/12776 [58:48<26:25,  4.47it/s] 45%|████▍     | 5695/12776 [58:48<25:22,  4.65it/s]                                                    {'loss': 0.6646, 'grad_norm': 1.0398757457733154, 'learning_rate': 0.0001752199413489736, 'epoch': 0.88}
+{'loss': 0.7714, 'grad_norm': 1.3956527709960938, 'learning_rate': 0.00017519550342130985, 'epoch': 0.88}
+{'loss': 0.5916, 'grad_norm': 1.2822397947311401, 'learning_rate': 0.00017517106549364613, 'epoch': 0.88}
+{'loss': 0.4558, 'grad_norm': 1.0409700870513916, 'learning_rate': 0.0001751466275659824, 'epoch': 0.88}
+{'loss': 0.4444, 'grad_norm': 1.218854308128357, 'learning_rate': 0.00017512218963831863, 'epoch': 0.88}
+{'loss': 0.336, 'grad_norm': 2.0905308723449707, 'learning_rate': 0.0001750977517106549, 'epoch': 0.88}
+{'loss': 0.5637, 'grad_norm': 1.4038474559783936, 'learning_rate': 0.0001750733137829912, 'epoch': 0.88}
+{'loss': 0.6929, 'grad_norm': 1.5574206113815308, 'learning_rate': 0.00017504887585532744, 'epoch': 0.88}
+{'loss': 0.572, 'grad_norm': 2.9178993701934814, 'learning_rate': 0.00017502443792766372, 'epoch': 0.88}
+{'loss': 0.538, 'grad_norm': 1.9534040689468384, 'learning_rate': 0.000175, 'epoch': 0.88}
+{'loss': 0.7121, 'grad_norm': 2.420355796813965, 'learning_rate': 0.00017497556207233625, 'epoch': 0.88}
+{'loss': 0.8366, 'grad_norm': 2.449112892150879, 'learning_rate': 0.00017495112414467253, 'epoch': 0.88}
+{'loss': 0.3996, 'grad_norm': 1.790299415588379, 'learning_rate': 0.0001749266862170088, 'epoch': 0.88}
+{'loss': 0.7785, 'grad_norm': 2.5560543537139893, 'learning_rate': 0.00017490224828934503, 'epoch': 0.88}
+{'loss': 0.6992, 'grad_norm': 1.6959267854690552, 'learning_rate': 0.0001748778103616813, 'epoch': 0.88}
+{'loss': 1.0714, 'grad_norm': 2.7933099269866943, 'learning_rate': 0.0001748533724340176, 'epoch': 0.88}
+{'loss': 0.8542, 'grad_norm': 3.0920727252960205, 'learning_rate': 0.00017482893450635384, 'epoch': 0.88}
+{'loss': 0.8824, 'grad_norm': 1.7544466257095337, 'learning_rate': 0.00017480449657869012, 'epoch': 0.88}
+{'loss': 0.7097, 'grad_norm': 3.3472537994384766, 'learning_rate': 0.0001747800586510264, 'epoch': 0.88}
+{'loss': 1.0795, 'grad_norm': 2.1331112384796143, 'learning_rate': 0.00017475562072336262, 'epoch': 0.88}
+{'loss': 0.8523, 'grad_norm': 3.316182851791382, 'learning_rate': 0.0001747311827956989, 'epoch': 0.88}
+{'loss': 0.9487, 'grad_norm': 3.732465982437134, 'learning_rate': 0.00017470674486803518, 'epoch': 0.88}
+{'loss': 0.7443, 'grad_norm': 2.216749668121338, 'learning_rate': 0.00017468230694037143, 'epoch': 0.88}
+{'loss': 1.0188, 'grad_norm': 2.096297025680542, 'learning_rate': 0.0001746578690127077, 'epoch': 0.88}
+{'loss': 1.7533, 'grad_norm': 1.9539072513580322, 'learning_rate': 0.00017463343108504398, 'epoch': 0.88}
+{'loss': 1.3721, 'grad_norm': 1.9574966430664062, 'learning_rate': 0.00017460899315738024, 'epoch': 0.88}
+{'loss': 1.3333, 'grad_norm': 2.3340044021606445, 'learning_rate': 0.0001745845552297165, 'epoch': 0.88}
+{'loss': 1.2622, 'grad_norm': 2.5659821033477783, 'learning_rate': 0.0001745601173020528, 'epoch': 0.88}
+{'loss': 1.5452, 'grad_norm': 2.5627105236053467, 'learning_rate': 0.00017453567937438902, 'epoch': 0.88}
+{'loss': 0.7612, 'grad_norm': 2.1331045627593994, 'learning_rate': 0.0001745112414467253, 'epoch': 0.88}
+{'loss': 0.5008, 'grad_norm': 1.764351725578308, 'learning_rate': 0.00017448680351906157, 'epoch': 0.88}
+{'loss': 0.4137, 'grad_norm': 1.369174838066101, 'learning_rate': 0.00017446236559139782, 'epoch': 0.88}
+{'loss': 0.7086, 'grad_norm': 2.9028494358062744, 'learning_rate': 0.0001744379276637341, 'epoch': 0.88}
+{'loss': 0.9732, 'grad_norm': 3.7896909713745117, 'learning_rate': 0.00017441348973607038, 'epoch': 0.88}
+{'loss': 0.2748, 'grad_norm': 1.121634840965271, 'learning_rate': 0.00017438905180840663, 'epoch': 0.88}
+{'loss': 0.2495, 'grad_norm': 0.823630154132843, 'learning_rate': 0.0001743646138807429, 'epoch': 0.88}
+{'loss': 0.2669, 'grad_norm': 0.7519258260726929, 'learning_rate': 0.0001743401759530792, 'epoch': 0.88}
+{'loss': 0.3282, 'grad_norm': 0.6291525363922119, 'learning_rate': 0.0001743157380254154, 'epoch': 0.89}
+{'loss': 0.5075, 'grad_norm': 0.8792591691017151, 'learning_rate': 0.0001742913000977517, 'epoch': 0.89}
+{'loss': 0.4798, 'grad_norm': 0.6888756155967712, 'learning_rate': 0.00017426686217008797, 'epoch': 0.89}
+{'loss': 0.2821, 'grad_norm': 0.6959285140037537, 'learning_rate': 0.00017424242424242422, 'epoch': 0.89}
+{'loss': 0.2513, 'grad_norm': 0.9070468544960022, 'learning_rate': 0.0001742179863147605, 'epoch': 0.89}
+{'loss': 0.4034, 'grad_norm': 1.2020875215530396, 'learning_rate': 0.00017419354838709678, 'epoch': 0.89}
+{'loss': 0.2479, 'grad_norm': 0.8414081931114197, 'learning_rate': 0.000174169110459433, 'epoch': 0.89}
+{'loss': 0.3122, 'grad_norm': 0.5406526327133179, 'learning_rate': 0.00017414467253176928, 'epoch': 0.89}
+{'loss': 0.3185, 'grad_norm': 0.6893888711929321, 'learning_rate': 0.00017412023460410556, 'epoch': 0.89}
+{'loss': 0.4614, 'grad_norm': 0.9592669606208801, 'learning_rate': 0.0001740957966764418, 'epoch': 0.89}
+{'loss': 0.5796, 'grad_norm': 1.71309232711792, 'learning_rate': 0.0001740713587487781, 'epoch': 0.89}
+{'loss': 0.5306, 'grad_norm': 1.557362675666809, 'learning_rate': 0.00017404692082111437, 'epoch': 0.89}
+{'loss': 0.532, 'grad_norm': 1.2218095064163208, 'learning_rate': 0.00017402248289345062, 'epoch': 0.89}
+{'loss': 0.5144, 'grad_norm': 1.2000401020050049, 'learning_rate': 0.0001739980449657869, 'epoch': 0.89}
+{'loss': 0.5698, 'grad_norm': 1.0709697008132935, 'learning_rate': 0.00017397360703812317, 'epoch': 0.89}
+{'loss': 0.4509, 'grad_norm': 0.7280417680740356, 'learning_rate': 0.0001739491691104594, 'epoch': 0.89}
+{'loss': 0.6166, 'grad_norm': 1.5254422426223755, 'learning_rate': 0.00017392473118279568, 'epoch': 0.89}
+{'loss': 0.4885, 'grad_norm': 1.4855180978775024, 'learning_rate': 0.00017390029325513195, 'epoch': 0.89}
+{'loss': 0.7829, 'grad_norm': 1.855657935142517, 'learning_rate': 0.0001738758553274682, 'epoch': 0.89}
+{'loss': 1.2026, 'grad_norm': 1.7714046239852905, 'learning_rate': 0.00017385141739980448, 'epoch': 0.89}
+{'loss': 0.8146, 'grad_norm': 2.0689687728881836, 'learning_rate': 0.00017382697947214076, 'epoch': 0.89}
+{'loss': 0.8094, 'grad_norm': 2.669292688369751, 'learning_rate': 0.00017380254154447701, 'epoch': 0.89}
+{'loss': 0.486, 'grad_norm': 1.2255752086639404, 'learning_rate': 0.0001737781036168133, 'epoch': 0.89}
+{'loss': 0.5691, 'grad_norm': 0.9265427589416504, 'learning_rate': 0.00017375366568914954, 'epoch': 0.89}
+{'loss': 0.6208, 'grad_norm': 1.7718168497085571, 'learning_rate': 0.0001737292277614858, 'epoch': 0.89}
+{'loss': 1.0041, 'grad_norm': 3.1587021350860596, 'learning_rate': 0.00017370478983382207, 'epoch': 0.89}
+{'loss': 0.4903, 'grad_norm': 1.6209098100662231, 'learning_rate': 0.00017368035190615835, 'epoch': 0.89}
+{'loss': 0.6506, 'grad_norm': 1.9299261569976807, 'learning_rate': 0.0001736559139784946, 'epoch': 0.89}
+{'loss': 1.0693, 'grad_norm': 1.872786521911621, 'learning_rate': 0.00017363147605083088, 'epoch': 0.89}
+{'loss': 0.9645, 'grad_norm': 2.118117094039917, 'learning_rate': 0.00017360703812316716, 'epoch': 0.89}
+{'loss': 0.895, 'grad_norm': 2.271125078201294, 'learning_rate': 0.00017358260019550338, 'epoch': 0.89}
+{'loss': 1.2066, 'grad_norm': 2.594882011413574, 'learning_rate': 0.00017355816226783966, 'epoch': 0.89}
+{'loss': 0.6995, 'grad_norm': 1.8685270547866821, 'learning_rate': 0.00017353372434017594, 'epoch': 0.89}
+{'loss': 1.3999, 'grad_norm': 3.1582865715026855, 'learning_rate': 0.0001735092864125122, 'epoch': 0.89}
+{'loss': 1.6178, 'grad_norm': 8.269997596740723, 'learning_rate': 0.00017348484848484847, 'epoch': 0.89}
+{'loss': 1.1089, 'grad_norm': 4.720138072967529, 'learning_rate': 0.00017346041055718475, 'epoch': 0.89}
+{'loss': 1.2031, 'grad_norm': 1.907230257987976, 'learning_rate': 0.000173435972629521, 'epoch': 0.89}
+{'loss': 1.6695, 'grad_norm': 2.7072951793670654, 'learning_rate': 0.00017341153470185728, 'epoch': 0.89}
+{'loss': 1.0969, 'grad_norm': 4.086122989654541, 'learning_rate': 0.00017338709677419356, 'epoch': 0.89}
+{'loss': 0.7329, 'grad_norm': 1.9636390209197998, 'learning_rate': 0.00017336265884652978, 'epoch': 0.89}
+{'loss': 0.7581, 'grad_norm': 2.4966297149658203, 'learning_rate': 0.00017333822091886606, 'epoch': 0.89}
+ 45%|████▍     | 5695/12776 [58:48<25:22,  4.65it/s] 45%|████▍     | 5696/12776 [58:48<24:34,  4.80it/s]                                                     45%|████▍     | 5696/12776 [58:48<24:34,  4.80it/s] 45%|████▍     | 5697/12776 [58:49<25:21,  4.65it/s]                                                     45%|████▍     | 5697/12776 [58:49<25:21,  4.65it/s] 45%|████▍     | 5698/12776 [58:49<24:23,  4.84it/s]                                                     45%|████▍     | 5698/12776 [58:49<24:23,  4.84it/s] 45%|████▍     | 5699/12776 [58:49<23:39,  4.99it/s]                                                     45%|████▍     | 5699/12776 [58:49<23:39,  4.99it/s] 45%|████▍     | 5700/12776 [58:50<43:32,  2.71it/s]                                                     45%|████▍     | 5700/12776 [58:50<43:32,  2.71it/s] 45%|████▍     | 5701/12776 [58:51<1:27:07,  1.35it/s]                                                       45%|████▍     | 5701/12776 [58:51<1:27:07,  1.35it/s] 45%|████▍     | 5702/12776 [58:53<1:39:51,  1.18it/s]                                                       45%|████▍     | 5702/12776 [58:53<1:39:51,  1.18it/s] 45%|████▍     | 5703/12776 [58:53<1:39:20,  1.19it/s]                                                       45%|████▍     | 5703/12776 [58:53<1:39:20,  1.19it/s] 45%|████▍     | 5704/12776 [58:54<1:37:13,  1.21it/s]                                                       45%|████▍     | 5704/12776 [58:54<1:37:13,  1.21it/s] 45%|████▍     | 5705/12776 [58:55<1:33:58,  1.25it/s]                                                       45%|████▍     | 5705/12776 [58:55<1:33:58,  1.25it/s] 45%|████▍     | 5706/12776 [58:56<1:30:47,  1.30it/s]                                                       45%|████▍     | 5706/12776 [58:56<1:30:47,  1.30it/s] 45%|████▍     | 5707/12776 [58:56<1:26:17,  1.37it/s]                                                       45%|████▍     | 5707/12776 [58:56<1:26:17,  1.37it/s] 45%|████▍     | 5708/12776 [58:57<1:27:27,  1.35it/s]                                                       45%|████▍     | 5708/12776 [58:57<1:27:27,  1.35it/s] 45%|████▍     | 5709/12776 [58:58<1:21:24,  1.45it/s]                                                       45%|████▍     | 5709/12776 [58:58<1:21:24,  1.45it/s] 45%|████▍     | 5710/12776 [58:58<1:17:25,  1.52it/s]                                                       45%|████▍     | 5710/12776 [58:58<1:17:25,  1.52it/s] 45%|████▍     | 5711/12776 [58:59<1:13:45,  1.60it/s]                                                       45%|████▍     | 5711/12776 [58:59<1:13:45,  1.60it/s] 45%|████▍     | 5712/12776 [58:59<1:12:29,  1.62it/s]                                                       45%|████▍     | 5712/12776 [58:59<1:12:29,  1.62it/s] 45%|████▍     | 5713/12776 [59:00<1:07:35,  1.74it/s]                                                       45%|████▍     | 5713/12776 [59:00<1:07:35,  1.74it/s] 45%|████▍     | 5714/12776 [59:00<1:03:18,  1.86it/s]                                                       45%|████▍     | 5714/12776 [59:00<1:03:18,  1.86it/s] 45%|████▍     | 5715/12776 [59:01<59:47,  1.97it/s]                                                       45%|████▍     | 5715/12776 [59:01<59:47,  1.97it/s] 45%|████▍     | 5716/12776 [59:01<56:29,  2.08it/s]                                                     45%|████▍     | 5716/12776 [59:01<56:29,  2.08it/s] 45%|████▍     | 5717/12776 [59:02<55:16,  2.13it/s]                                                     45%|████▍     | 5717/12776 [59:02<55:16,  2.13it/s] 45%|████▍     | 5718/12776 [59:02<52:04,  2.26it/s]                                                     45%|████▍     | 5718/12776 [59:02<52:04,  2.26it/s] 45%|████▍     | 5719/12776 [59:02<49:02,  2.40it/s]                                                     45%|████▍     | 5719/12776 [59:02<49:02,  2.40it/s] 45%|████▍     | 5720/12776 [59:03<50:06,  2.35it/s]                                                     45%|████▍     | 5720/12776 [59:03<50:06,  2.35it/s] 45%|████▍     | 5721/12776 [59:03<47:09,  2.49it/s]                                                     45%|████▍     | 5721/12776 [59:03<47:09,  2.49it/s] 45%|████▍     | 5722/12776 [59:03<44:43,  2.63it/s]                                                     45%|████▍     | 5722/12776 [59:03<44:43,  2.63it/s] 45%|████▍     | 5723/12776 [59:04<44:50,  2.62it/s]                                                     45%|████▍     | 5723/12776 [59:04<44:50,  2.62it/s] 45%|████▍     | 5724/12776 [59:04<42:30,  2.77it/s]                                                     45%|████▍     | 5724/12776 [59:04<42:30,  2.77it/s] 45%|████▍     | 5725/12776 [59:04<40:18,  2.91it/s]                                                     45%|████▍     | 5725/12776 [59:04<40:18,  2.91it/s] 45%|████▍     | 5726/12776 [59:05<40:48,  2.88it/s]                                                     45%|████▍     | 5726/12776 [59:05<40:48,  2.88it/s] 45%|████▍     | 5727/12776 [59:05<38:23,  3.06it/s]                                                     45%|████▍     | 5727/12776 [59:05<38:23,  3.06it/s] 45%|████▍     | 5728/12776 [59:05<36:28,  3.22it/s]                                                     45%|████▍     | 5728/12776 [59:05<36:28,  3.22it/s] 45%|████▍     | 5729/12776 [59:06<34:57,  3.36it/s]                                                     45%|████▍     | 5729/12776 [59:06<34:57,  3.36it/s] 45%|████▍     | 5730/12776 [59:06<36:02,  3.26it/s]                                                     45%|████▍     | 5730/12776 [59:06<36:02,  3.26it/s] 45%|████▍     | 5731/12776 [59:06<34:09,  3.44it/s]                                                     45%|████▍     | 5731/12776 [59:06<34:09,  3.44it/s] 45%|████▍     | 5732/12776 [59:06<32:36,  3.60it/s]                                                     45%|████▍     | 5732/12776 [59:06<32:36,  3.60it/s] 45%|████▍     | 5733/12776 [59:07<31:29,  3.73it/s]                                                     45%|████▍     | 5733/12776 [59:07<31:29,  3.73it/s] 45%|████▍     | 5734/12776 [59:07<30:27,  3.85it/s]                                                     45%|████▍     | 5734/12776 [59:07<30:27,  3.85it/s] 45%|████▍     | 5735/12776 [59:07<31:48,  3.69it/s]                                                     45%|████▍     | 5735/12776 [59:07<31:48,  3.69it/s] 45%|████▍     | 5736/12776 [59:07<30:20,  3.87it/s]                                                     45%|████▍     | 5736/12776 [59:07<30:20,  3.87it/s] 45%|████▍     | 5737/12776 [59:08<29:00,  4.04it/s]                                                     45%|████▍     | 5737/12776 [59:08<29:00,  4.04it/s] 45%|████▍     | 5738/12776 [59:08<27:50,  4.21it/s]                                                     45%|████▍     | 5738/12776 [59:08<27:50,  4.21it/s] 45%|████▍     | 5739/12776 [59:08<31:28,  3.73it/s]                                                     45%|████▍     | 5739/12776 [59:08<31:28,  3.73it/s] 45%|████▍     | 5740/12776 [59:08<29:30,  3.97it/s]                                                     45%|████▍     | 5740/12776 [59:08<29:30,  3.97it/s] 45%|████▍     | 5741/12776 [59:09<27:55,  4.20it/s]                                                     45%|████▍     | 5741/12776 [59:09<27:55,  4.20it/s] 45%|████▍     | 5742/12776 [59:09<26:52,  4.36it/s]                                                     45%|████▍     | 5742/12776 [59:09<26:52,  4.36it/s] 45%|████▍     | 5743/12776 [59:09<25:51,  4.53it/s]                                                     45%|████▍     | 5743/12776 [59:09<25:51,  4.53it/s] 45%|████▍     | 5744/12776 [59:09<28:56,  4.05it/s]                                                     45%|████▍     | 5744/12776 [59:09<28:56,  4.05it/s] 45%|████▍     | 5745/12776 [59:09<27:08,  4.32it/s]                                                     45%|████▍     | 5745/12776 [59:09<27:08,  4.32it/s] 45%|████▍     | 5746/12776 [59:10<25:46,  4.55it/s]                                                     45%|████▍     | 5746/12776 [59:10<25:46,  4.55it/s] 45%|████▍     | 5747/12776 [59:10<24:50,  4.72it/s]                                                     45%|████▍     | 5747/12776 [59:10<24:50,  4.72it/s] 45%|████▍     | 5748/12776 [59:10<23:54,  4.90it/s]                                                     45%|████▍     | 5748/12776 [59:10<23:54,  4.90it/s] 45%|████▍     | 5749/12776 [59:10<23:21,  5.02it/s]                                                     45%|████▍     | 5749/12776 [59:10<23:21,  5.02it/s] 45%|████▌     | 5750/12776 [59:11<41:42,  2.81it/s]                                                     45%|████▌     | 5750/12776 [59:11<41:42,  2.81it/s] 45%|████▌     | 5751/12776 [59:13<1:26:49,  1.35it/s]                                                       45%|████▌     | 5751/12776 [59:13<1:26:49,  1.35it/s] 45%|████▌     | 5752/12776 [59:14<1:34:23,  1.24it/s]                                                       45%|████▌     | 5752/12776 [59:14<1:34:23,  1.24it/s] 45%|████▌     | 5753/12776 [59:14<1:36:11,  1.22it/s]                                                       45%|████▌     | 5753/12776 [59:14<1:36:11,  1.22it/s] 45%|████▌     | 5754/12776 [59:15<1:39:48,  1.17it/s]                                                       45%|████▌     | 5754/12776 [59:15<1:39:48,  1.17it/s] 45%|████▌     | 5755/12776 [59:16<1:38:46,  1.18it/s]                                                       45%|████▌     | 5755/12776 [59:16<1:38:46,  1.18it/s] 45%|████▌     | 5756/12776 [59:17<1:32:48,  1.26it/s]                                                       45%|████▌     | 5756/12776 [59:17<1:32:48,  1.26it/s] 45%|████▌     | 5757/12776 [59:18<1:30:50,  1.29it/s]                                                       45%|████▌     | 5757/12776 [59:18<1:30:50,  1.29it/s] 45%|████▌     | 5758/12776 [59:18<1:25:07,  1.37it/s]                                                       45%|████▌     | 5758/12776 [59:18<1:25:07,  1.37it/s] 45%|████▌     | 5759/12776 [59:19<1:20:48,  1.45it/s]                                                       45%|████▌     | 5759/12776 [59:19<1:20:48,  1.45it/s] 45%|████▌     | 5760/12776 [59:19<1:16:00,  1.54it/s]                                                       45%|████▌     | 5760/12776 [59:19<1:16:00,  1.54it/s] 45%|████▌     | 5761/12776 [59:20<1:13:15,  1.60it/s]                                                       45%|████▌     | 5761/12776 [59:20<1:13:15,  1.60it/s] 45%|████▌     | 5762/12776 [59:20<1:08:54,  1.70it/s]                                                       45%|████▌     | 5762/12776 [59:20<1:08:54,  1.70it/s] 45%|████▌     | 5763/12776 [59:21<1:08:22,  1.71it/s]                                                       45%|████▌     | 5763/12776 [59:21<1:08:22,  1.71it/s] 45%|████▌     | 5764/12776 [59:21<1:03:46,  1.83it/s]                                                       45%|████▌     | 5764/12776 [59:21<1:03:46,  1.83it/s] 45%|████▌     | 5765/12776 [59:22<1:04:43,  1.81it/s]                                                       45%|████▌     | 5765/12776 [59:22<1:04:43,  1.81it/s] 45%|████▌     | 5766/12776 [59:22<1:00:06,  1.94it/s]                                                       45%|████▌     | 5766/12776 [59:22<1:00:06,  1.94it/s] 45%|████▌     | 5767/12776 [59:23<1:00:09,  1.94it/s]                                                       45%|████▌     | 5767/12776 [59:23<1:00:09,  1.94it/s] 45%|████▌     | 5768/12776 [59:23<55:27,  2.11it/s]                                                       45%|████▌     | 5768/12776 [59:23<55:27,  2.11it/s] 45%|████▌     | 5769/12776 [59:24<51:33,  2.27it/s]                                                     45%|████▌     | 5769/12776 [59:24<51:33,  2.27it/s] 45%|████▌     | 5770/12776 [59:24<49:58,  2.34it/s]                                                     45%|████▌     | 5770/12776 [59:24<49:58,  2.34it/s] 45%|████▌     | 5771/12776 [59:24<46:46,  2.50it/s]                                                     45%|████▌     | 5771/12776 [59:24<46:46,  2.50it/s] 45%|████▌     | 5772/12776 [59:25<44:17,  2.64it/s]                                                     45%|████▌     | 5772/12776 [59:25<44:17,  2.64it/s] 45%|████▌     | 5773/12776 [59:25<42:05,  2.77it/s]                                                    {'loss': 0.4329, 'grad_norm': 1.2184866666793823, 'learning_rate': 0.00017331378299120234, 'epoch': 0.89}
+{'loss': 0.9643, 'grad_norm': 3.0518410205841064, 'learning_rate': 0.0001732893450635386, 'epoch': 0.89}
+{'loss': 1.076, 'grad_norm': 2.205357074737549, 'learning_rate': 0.00017326490713587487, 'epoch': 0.89}
+{'loss': 0.2366, 'grad_norm': 1.6067478656768799, 'learning_rate': 0.00017324046920821115, 'epoch': 0.89}
+{'loss': 0.9772, 'grad_norm': 1.8827284574508667, 'learning_rate': 0.0001732160312805474, 'epoch': 0.89}
+{'loss': 1.1912, 'grad_norm': 3.9518826007843018, 'learning_rate': 0.00017319159335288367, 'epoch': 0.89}
+{'loss': 0.2854, 'grad_norm': 0.5631269812583923, 'learning_rate': 0.00017316715542521993, 'epoch': 0.89}
+{'loss': 0.4003, 'grad_norm': 0.7066730856895447, 'learning_rate': 0.00017314271749755618, 'epoch': 0.89}
+{'loss': 0.2347, 'grad_norm': 0.7470924854278564, 'learning_rate': 0.00017311827956989246, 'epoch': 0.89}
+{'loss': 0.4271, 'grad_norm': 0.9745728969573975, 'learning_rate': 0.00017309384164222873, 'epoch': 0.89}
+{'loss': 0.3031, 'grad_norm': 0.6492823362350464, 'learning_rate': 0.00017306940371456499, 'epoch': 0.89}
+{'loss': 0.2764, 'grad_norm': 0.7017116546630859, 'learning_rate': 0.00017304496578690126, 'epoch': 0.89}
+{'loss': 0.3132, 'grad_norm': 0.9077380895614624, 'learning_rate': 0.00017302052785923754, 'epoch': 0.89}
+{'loss': 0.4168, 'grad_norm': 0.9337749481201172, 'learning_rate': 0.00017299608993157377, 'epoch': 0.89}
+{'loss': 0.2644, 'grad_norm': 1.3660385608673096, 'learning_rate': 0.00017297165200391004, 'epoch': 0.89}
+{'loss': 0.4336, 'grad_norm': 2.1170074939727783, 'learning_rate': 0.00017294721407624632, 'epoch': 0.89}
+{'loss': 0.5736, 'grad_norm': 0.9623836278915405, 'learning_rate': 0.00017292277614858257, 'epoch': 0.89}
+{'loss': 0.4649, 'grad_norm': 1.502590298652649, 'learning_rate': 0.00017289833822091885, 'epoch': 0.89}
+{'loss': 0.3547, 'grad_norm': 1.0726498365402222, 'learning_rate': 0.00017287390029325513, 'epoch': 0.89}
+{'loss': 0.4567, 'grad_norm': 1.2182965278625488, 'learning_rate': 0.00017284946236559138, 'epoch': 0.89}
+{'loss': 0.5393, 'grad_norm': 1.0065919160842896, 'learning_rate': 0.00017282502443792766, 'epoch': 0.89}
+{'loss': 0.3155, 'grad_norm': 0.693427562713623, 'learning_rate': 0.00017280058651026394, 'epoch': 0.89}
+{'loss': 0.5297, 'grad_norm': 1.1155869960784912, 'learning_rate': 0.00017277614858260016, 'epoch': 0.89}
+{'loss': 1.0583, 'grad_norm': 4.776678562164307, 'learning_rate': 0.00017275171065493644, 'epoch': 0.9}
+{'loss': 0.7256, 'grad_norm': 3.4916136264801025, 'learning_rate': 0.00017272727272727272, 'epoch': 0.9}
+{'loss': 0.5441, 'grad_norm': 1.9992774724960327, 'learning_rate': 0.00017270283479960897, 'epoch': 0.9}
+{'loss': 0.8787, 'grad_norm': 1.5791343450546265, 'learning_rate': 0.00017267839687194525, 'epoch': 0.9}
+{'loss': 0.4736, 'grad_norm': 1.3216490745544434, 'learning_rate': 0.00017265395894428153, 'epoch': 0.9}
+{'loss': 0.4765, 'grad_norm': 1.6248879432678223, 'learning_rate': 0.00017262952101661778, 'epoch': 0.9}
+{'loss': 0.8183, 'grad_norm': 1.6858633756637573, 'learning_rate': 0.00017260508308895406, 'epoch': 0.9}
+{'loss': 0.622, 'grad_norm': 1.9334176778793335, 'learning_rate': 0.0001725806451612903, 'epoch': 0.9}
+{'loss': 0.5704, 'grad_norm': 1.8685994148254395, 'learning_rate': 0.00017255620723362656, 'epoch': 0.9}
+{'loss': 0.8175, 'grad_norm': 2.087904453277588, 'learning_rate': 0.00017253176930596284, 'epoch': 0.9}
+{'loss': 0.509, 'grad_norm': 1.4752204418182373, 'learning_rate': 0.00017250733137829912, 'epoch': 0.9}
+{'loss': 0.9958, 'grad_norm': 1.6020748615264893, 'learning_rate': 0.00017248289345063537, 'epoch': 0.9}
+{'loss': 0.8474, 'grad_norm': 2.548856258392334, 'learning_rate': 0.00017245845552297165, 'epoch': 0.9}
+{'loss': 0.7504, 'grad_norm': 1.2991943359375, 'learning_rate': 0.00017243401759530792, 'epoch': 0.9}
+{'loss': 0.9691, 'grad_norm': 3.308283805847168, 'learning_rate': 0.00017240957966764415, 'epoch': 0.9}
+{'loss': 1.2162, 'grad_norm': 2.927663803100586, 'learning_rate': 0.00017238514173998043, 'epoch': 0.9}
+{'loss': 0.6097, 'grad_norm': 1.6149358749389648, 'learning_rate': 0.0001723607038123167, 'epoch': 0.9}
+{'loss': 0.9498, 'grad_norm': 2.015613555908203, 'learning_rate': 0.00017233626588465296, 'epoch': 0.9}
+{'loss': 1.2403, 'grad_norm': 3.3470332622528076, 'learning_rate': 0.00017231182795698923, 'epoch': 0.9}
+{'loss': 0.6295, 'grad_norm': 3.9462878704071045, 'learning_rate': 0.0001722873900293255, 'epoch': 0.9}
+{'loss': 0.6039, 'grad_norm': 1.829472303390503, 'learning_rate': 0.00017226295210166176, 'epoch': 0.9}
+{'loss': 1.0498, 'grad_norm': 1.9444429874420166, 'learning_rate': 0.00017223851417399804, 'epoch': 0.9}
+{'loss': 1.4808, 'grad_norm': 2.516277551651001, 'learning_rate': 0.00017221407624633432, 'epoch': 0.9}
+{'loss': 1.1102, 'grad_norm': 2.0812556743621826, 'learning_rate': 0.00017218963831867055, 'epoch': 0.9}
+{'loss': 1.3883, 'grad_norm': 3.566479206085205, 'learning_rate': 0.00017216520039100682, 'epoch': 0.9}
+{'loss': 1.3953, 'grad_norm': 4.617666244506836, 'learning_rate': 0.0001721407624633431, 'epoch': 0.9}
+{'loss': 1.2931, 'grad_norm': 2.720006227493286, 'learning_rate': 0.00017211632453567935, 'epoch': 0.9}
+{'loss': 1.467, 'grad_norm': 3.672116756439209, 'learning_rate': 0.00017209188660801563, 'epoch': 0.9}
+{'loss': 0.8112, 'grad_norm': 5.463393211364746, 'learning_rate': 0.0001720674486803519, 'epoch': 0.9}
+{'loss': 0.3882, 'grad_norm': 3.226026773452759, 'learning_rate': 0.00017204301075268816, 'epoch': 0.9}
+{'loss': 0.3974, 'grad_norm': 1.810815453529358, 'learning_rate': 0.0001720185728250244, 'epoch': 0.9}
+{'loss': 1.265, 'grad_norm': 4.585860729217529, 'learning_rate': 0.0001719941348973607, 'epoch': 0.9}
+{'loss': 0.8456, 'grad_norm': 1.762209415435791, 'learning_rate': 0.00017196969696969694, 'epoch': 0.9}
+{'loss': 0.3196, 'grad_norm': 0.5263707041740417, 'learning_rate': 0.00017194525904203322, 'epoch': 0.9}
+{'loss': 0.1791, 'grad_norm': 0.41388189792633057, 'learning_rate': 0.0001719208211143695, 'epoch': 0.9}
+{'loss': 0.2162, 'grad_norm': 1.194480061531067, 'learning_rate': 0.00017189638318670575, 'epoch': 0.9}
+{'loss': 0.3029, 'grad_norm': 0.5579673647880554, 'learning_rate': 0.00017187194525904203, 'epoch': 0.9}
+{'loss': 0.3157, 'grad_norm': 0.6641569137573242, 'learning_rate': 0.0001718475073313783, 'epoch': 0.9}
+{'loss': 0.3372, 'grad_norm': 0.6336182355880737, 'learning_rate': 0.00017182306940371453, 'epoch': 0.9}
+{'loss': 0.4165, 'grad_norm': 0.9077296853065491, 'learning_rate': 0.0001717986314760508, 'epoch': 0.9}
+{'loss': 0.3048, 'grad_norm': 1.123456597328186, 'learning_rate': 0.0001717741935483871, 'epoch': 0.9}
+{'loss': 0.4485, 'grad_norm': 0.6466776132583618, 'learning_rate': 0.00017174975562072334, 'epoch': 0.9}
+{'loss': 0.3242, 'grad_norm': 0.5841704607009888, 'learning_rate': 0.00017172531769305962, 'epoch': 0.9}
+{'loss': 0.4687, 'grad_norm': 0.8340890407562256, 'learning_rate': 0.0001717008797653959, 'epoch': 0.9}
+{'loss': 0.4741, 'grad_norm': 1.2464579343795776, 'learning_rate': 0.00017167644183773215, 'epoch': 0.9}
+{'loss': 0.4531, 'grad_norm': 0.7806826233863831, 'learning_rate': 0.00017165200391006842, 'epoch': 0.9}
+{'loss': 0.4653, 'grad_norm': 1.0296310186386108, 'learning_rate': 0.0001716275659824047, 'epoch': 0.9}
+{'loss': 0.5479, 'grad_norm': 0.7613905072212219, 'learning_rate': 0.00017160312805474093, 'epoch': 0.9}
+{'loss': 0.7166, 'grad_norm': 1.586656093597412, 'learning_rate': 0.0001715786901270772, 'epoch': 0.9}
+{'loss': 0.4473, 'grad_norm': 1.1600213050842285, 'learning_rate': 0.00017155425219941348, 'epoch': 0.9}
+{'loss': 0.5886, 'grad_norm': 1.4801439046859741, 'learning_rate': 0.00017152981427174974, 'epoch': 0.9}
+{'loss': 0.5857, 'grad_norm': 1.5481951236724854, 'learning_rate': 0.00017150537634408601, 'epoch': 0.9}
+{'loss': 0.8743, 'grad_norm': 1.2954734563827515, 'learning_rate': 0.0001714809384164223, 'epoch': 0.9}
+{'loss': 0.6394, 'grad_norm': 2.4921019077301025, 'learning_rate': 0.00017145650048875854, 'epoch': 0.9}
+{'loss': 0.6264, 'grad_norm': 1.4190541505813599, 'learning_rate': 0.0001714320625610948, 'epoch': 0.9}
+ 45%|████▌     | 5773/12776 [59:25<42:05,  2.77it/s] 45%|████▌     | 5774/12776 [59:25<40:33,  2.88it/s]                                                     45%|████▌     | 5774/12776 [59:25<40:33,  2.88it/s] 45%|████▌     | 5775/12776 [59:26<38:38,  3.02it/s]                                                     45%|████▌     | 5775/12776 [59:26<38:38,  3.02it/s] 45%|████▌     | 5776/12776 [59:26<36:50,  3.17it/s]                                                     45%|████▌     | 5776/12776 [59:26<36:50,  3.17it/s] 45%|████▌     | 5777/12776 [59:26<35:27,  3.29it/s]                                                     45%|████▌     | 5777/12776 [59:26<35:27,  3.29it/s] 45%|████▌     | 5778/12776 [59:27<35:10,  3.32it/s]                                                     45%|████▌     | 5778/12776 [59:27<35:10,  3.32it/s] 45%|████▌     | 5779/12776 [59:27<33:57,  3.43it/s]                                                     45%|████▌     | 5779/12776 [59:27<33:57,  3.43it/s] 45%|████▌     | 5780/12776 [59:27<32:40,  3.57it/s]                                                     45%|████▌     | 5780/12776 [59:27<32:40,  3.57it/s] 45%|████▌     | 5781/12776 [59:27<31:36,  3.69it/s]                                                     45%|████▌     | 5781/12776 [59:27<31:36,  3.69it/s] 45%|████▌     | 5782/12776 [59:28<34:24,  3.39it/s]                                                     45%|████▌     | 5782/12776 [59:28<34:24,  3.39it/s] 45%|████▌     | 5783/12776 [59:28<32:33,  3.58it/s]                                                     45%|████▌     | 5783/12776 [59:28<32:33,  3.58it/s] 45%|████▌     | 5784/12776 [59:28<30:59,  3.76it/s]                                                     45%|████▌     | 5784/12776 [59:28<30:59,  3.76it/s] 45%|████▌     | 5785/12776 [59:28<29:46,  3.91it/s]                                                     45%|████▌     | 5785/12776 [59:28<29:46,  3.91it/s] 45%|████▌     | 5786/12776 [59:29<30:55,  3.77it/s]                                                     45%|████▌     | 5786/12776 [59:29<30:55,  3.77it/s] 45%|████▌     | 5787/12776 [59:29<29:22,  3.96it/s]                                                     45%|████▌     | 5787/12776 [59:29<29:22,  3.96it/s] 45%|████▌     | 5788/12776 [59:29<28:19,  4.11it/s]                                                     45%|████▌     | 5788/12776 [59:29<28:19,  4.11it/s] 45%|████▌     | 5789/12776 [59:29<27:07,  4.29it/s]                                                     45%|████▌     | 5789/12776 [59:29<27:07,  4.29it/s] 45%|████▌     | 5790/12776 [59:30<26:19,  4.42it/s]                                                     45%|████▌     | 5790/12776 [59:30<26:19,  4.42it/s] 45%|████▌     | 5791/12776 [59:30<27:46,  4.19it/s]                                                     45%|████▌     | 5791/12776 [59:30<27:46,  4.19it/s] 45%|████▌     | 5792/12776 [59:30<26:31,  4.39it/s]                                                     45%|████▌     | 5792/12776 [59:30<26:31,  4.39it/s] 45%|████▌     | 5793/12776 [59:30<25:33,  4.55it/s]                                                     45%|████▌     | 5793/12776 [59:30<25:33,  4.55it/s] 45%|████▌     | 5794/12776 [59:30<24:52,  4.68it/s]                                                     45%|████▌     | 5794/12776 [59:30<24:52,  4.68it/s] 45%|████▌     | 5795/12776 [59:31<24:15,  4.80it/s]                                                     45%|████▌     | 5795/12776 [59:31<24:15,  4.80it/s] 45%|████▌     | 5796/12776 [59:31<23:45,  4.90it/s]                                                     45%|████▌     | 5796/12776 [59:31<23:45,  4.90it/s] 45%|████▌     | 5797/12776 [59:31<25:50,  4.50it/s]                                                     45%|████▌     | 5797/12776 [59:31<25:50,  4.50it/s] 45%|████▌     | 5798/12776 [59:31<24:47,  4.69it/s]                                                     45%|████▌     | 5798/12776 [59:31<24:47,  4.69it/s] 45%|████▌     | 5799/12776 [59:31<23:51,  4.87it/s]                                                     45%|████▌     | 5799/12776 [59:31<23:51,  4.87it/s] 45%|████▌     | 5800/12776 [59:32<42:02,  2.77it/s]                                                     45%|████▌     | 5800/12776 [59:32<42:02,  2.77it/s] 45%|████▌     | 5801/12776 [59:34<1:22:52,  1.40it/s]                                                       45%|████▌     | 5801/12776 [59:34<1:22:52,  1.40it/s] 45%|████▌     | 5802/12776 [59:35<1:34:33,  1.23it/s]                                                       45%|████▌     | 5802/12776 [59:35<1:34:33,  1.23it/s] 45%|████▌     | 5803/12776 [59:36<1:35:42,  1.21it/s]                                                       45%|████▌     | 5803/12776 [59:36<1:35:42,  1.21it/s] 45%|████▌     | 5804/12776 [59:36<1:34:30,  1.23it/s]                                                       45%|████▌     | 5804/12776 [59:36<1:34:30,  1.23it/s] 45%|████▌     | 5805/12776 [59:37<1:32:08,  1.26it/s]                                                       45%|████▌     | 5805/12776 [59:37<1:32:08,  1.26it/s] 45%|████▌     | 5806/12776 [59:38<1:30:43,  1.28it/s]                                                       45%|████▌     | 5806/12776 [59:38<1:30:43,  1.28it/s] 45%|████▌     | 5807/12776 [59:39<1:26:04,  1.35it/s]                                                       45%|████▌     | 5807/12776 [59:39<1:26:04,  1.35it/s] 45%|████▌     | 5808/12776 [59:39<1:26:50,  1.34it/s]                                                       45%|████▌     | 5808/12776 [59:39<1:26:50,  1.34it/s] 45%|████▌     | 5809/12776 [59:40<1:20:27,  1.44it/s]                                                       45%|████▌     | 5809/12776 [59:40<1:20:27,  1.44it/s] 45%|████▌     | 5810/12776 [59:41<1:18:16,  1.48it/s]                                                       45%|████▌     | 5810/12776 [59:41<1:18:16,  1.48it/s] 45%|████▌     | 5811/12776 [59:41<1:12:36,  1.60it/s]                                                       45%|████▌     | 5811/12776 [59:41<1:12:36,  1.60it/s] 45%|████▌     | 5812/12776 [59:42<1:11:39,  1.62it/s]                                                       45%|████▌     | 5812/12776 [59:42<1:11:39,  1.62it/s] 45%|████▌     | 5813/12776 [59:42<1:06:13,  1.75it/s]                                                       45%|████▌     | 5813/12776 [59:42<1:06:13,  1.75it/s] 46%|████▌     | 5814/12776 [59:43<1:05:45,  1.76it/s]                                                       46%|████▌     | 5814/12776 [59:43<1:05:45,  1.76it/s] 46%|████▌     | 5815/12776 [59:43<1:01:06,  1.90it/s]                                                       46%|████▌     | 5815/12776 [59:43<1:01:06,  1.90it/s] 46%|████▌     | 5816/12776 [59:44<1:00:24,  1.92it/s]                                                       46%|████▌     | 5816/12776 [59:44<1:00:24,  1.92it/s] 46%|████▌     | 5817/12776 [59:44<56:28,  2.05it/s]                                                       46%|████▌     | 5817/12776 [59:44<56:28,  2.05it/s] 46%|████▌     | 5818/12776 [59:44<53:19,  2.17it/s]                                                     46%|████▌     | 5818/12776 [59:44<53:19,  2.17it/s] 46%|████▌     | 5819/12776 [59:45<54:26,  2.13it/s]                                                     46%|████▌     | 5819/12776 [59:45<54:26,  2.13it/s] 46%|████▌     | 5820/12776 [59:45<50:42,  2.29it/s]                                                     46%|████▌     | 5820/12776 [59:45<50:42,  2.29it/s] 46%|████▌     | 5821/12776 [59:46<47:27,  2.44it/s]                                                     46%|████▌     | 5821/12776 [59:46<47:27,  2.44it/s] 46%|████▌     | 5822/12776 [59:46<47:35,  2.43it/s]                                                     46%|████▌     | 5822/12776 [59:46<47:35,  2.43it/s] 46%|████▌     | 5823/12776 [59:46<44:47,  2.59it/s]                                                     46%|████▌     | 5823/12776 [59:46<44:47,  2.59it/s] 46%|████▌     | 5824/12776 [59:47<42:22,  2.73it/s]                                                     46%|████▌     | 5824/12776 [59:47<42:22,  2.73it/s] 46%|████▌     | 5825/12776 [59:47<41:16,  2.81it/s]                                                     46%|████▌     | 5825/12776 [59:47<41:16,  2.81it/s] 46%|████▌     | 5826/12776 [59:47<39:12,  2.95it/s]                                                     46%|████▌     | 5826/12776 [59:47<39:12,  2.95it/s] 46%|████▌     | 5827/12776 [59:48<37:33,  3.08it/s]                                                     46%|████▌     | 5827/12776 [59:48<37:33,  3.08it/s] 46%|████▌     | 5828/12776 [59:48<36:26,  3.18it/s]                                                     46%|████▌     | 5828/12776 [59:48<36:26,  3.18it/s] 46%|████▌     | 5829/12776 [59:48<39:07,  2.96it/s]                                                     46%|████▌     | 5829/12776 [59:48<39:07,  2.96it/s] 46%|████▌     | 5830/12776 [59:49<36:41,  3.16it/s]                                                     46%|████▌     | 5830/12776 [59:49<36:41,  3.16it/s] 46%|████▌     | 5831/12776 [59:49<34:48,  3.33it/s]                                                     46%|████▌     | 5831/12776 [59:49<34:48,  3.33it/s] 46%|████▌     | 5832/12776 [59:49<33:21,  3.47it/s]                                                     46%|████▌     | 5832/12776 [59:49<33:21,  3.47it/s] 46%|████▌     | 5833/12776 [59:49<35:24,  3.27it/s]                                                     46%|████▌     | 5833/12776 [59:49<35:24,  3.27it/s] 46%|████▌     | 5834/12776 [59:50<33:57,  3.41it/s]                                                     46%|████▌     | 5834/12776 [59:50<33:57,  3.41it/s] 46%|████▌     | 5835/12776 [59:50<32:48,  3.53it/s]                                                     46%|████▌     | 5835/12776 [59:50<32:48,  3.53it/s] 46%|████▌     | 5836/12776 [59:50<32:37,  3.55it/s]                                                     46%|████▌     | 5836/12776 [59:50<32:37,  3.55it/s] 46%|████▌     | 5837/12776 [59:51<35:06,  3.29it/s]                                                     46%|████▌     | 5837/12776 [59:51<35:06,  3.29it/s] 46%|████▌     | 5838/12776 [59:51<32:25,  3.57it/s]                                                     46%|████▌     | 5838/12776 [59:51<32:25,  3.57it/s] 46%|████▌     | 5839/12776 [59:51<31:12,  3.70it/s]                                                     46%|████▌     | 5839/12776 [59:51<31:12,  3.70it/s] 46%|████▌     | 5840/12776 [59:51<29:29,  3.92it/s]                                                     46%|████▌     | 5840/12776 [59:51<29:29,  3.92it/s] 46%|████▌     | 5841/12776 [59:52<30:36,  3.78it/s]                                                     46%|████▌     | 5841/12776 [59:52<30:36,  3.78it/s] 46%|████▌     | 5842/12776 [59:52<28:45,  4.02it/s]                                                     46%|████▌     | 5842/12776 [59:52<28:45,  4.02it/s] 46%|████▌     | 5843/12776 [59:52<28:05,  4.11it/s]                                                     46%|████▌     | 5843/12776 [59:52<28:05,  4.11it/s] 46%|████▌     | 5844/12776 [59:52<26:56,  4.29it/s]                                                     46%|████▌     | 5844/12776 [59:52<26:56,  4.29it/s] 46%|████▌     | 5845/12776 [59:52<25:41,  4.50it/s]                                                     46%|████▌     | 5845/12776 [59:52<25:41,  4.50it/s] 46%|████▌     | 5846/12776 [59:53<27:44,  4.16it/s]                                                     46%|████▌     | 5846/12776 [59:53<27:44,  4.16it/s] 46%|████▌     | 5847/12776 [59:53<26:25,  4.37it/s]                                                     46%|████▌     | 5847/12776 [59:53<26:25,  4.37it/s] 46%|████▌     | 5848/12776 [59:53<25:26,  4.54it/s]                                                     46%|████▌     | 5848/12776 [59:53<25:26,  4.54it/s] 46%|████▌     | 5849/12776 [59:53<24:17,  4.75it/s]                                                     46%|████▌     | 5849/12776 [59:53<24:17,  4.75it/s] 46%|████▌     | 5850/12776 [59:54<42:01,  2.75it/s]                                                    {'loss': 0.4692, 'grad_norm': 0.9357963800430298, 'learning_rate': 0.00017140762463343107, 'epoch': 0.9}
+{'loss': 0.6029, 'grad_norm': 3.0093657970428467, 'learning_rate': 0.00017138318670576732, 'epoch': 0.9}
+{'loss': 0.5638, 'grad_norm': 2.1367123126983643, 'learning_rate': 0.0001713587487781036, 'epoch': 0.9}
+{'loss': 0.9128, 'grad_norm': 3.115868091583252, 'learning_rate': 0.00017133431085043988, 'epoch': 0.9}
+{'loss': 0.893, 'grad_norm': 1.7553256750106812, 'learning_rate': 0.00017130987292277613, 'epoch': 0.9}
+{'loss': 0.8314, 'grad_norm': 1.6324225664138794, 'learning_rate': 0.0001712854349951124, 'epoch': 0.9}
+{'loss': 0.4471, 'grad_norm': 2.084900379180908, 'learning_rate': 0.0001712609970674487, 'epoch': 0.9}
+{'loss': 0.8395, 'grad_norm': 1.5276941061019897, 'learning_rate': 0.0001712365591397849, 'epoch': 0.9}
+{'loss': 0.8451, 'grad_norm': 1.490938663482666, 'learning_rate': 0.0001712121212121212, 'epoch': 0.9}
+{'loss': 1.2903, 'grad_norm': 2.2541961669921875, 'learning_rate': 0.00017118768328445747, 'epoch': 0.91}
+{'loss': 1.0065, 'grad_norm': 2.1898505687713623, 'learning_rate': 0.00017116324535679372, 'epoch': 0.91}
+{'loss': 0.8832, 'grad_norm': 1.505751609802246, 'learning_rate': 0.00017113880742913, 'epoch': 0.91}
+{'loss': 0.6737, 'grad_norm': 2.1820976734161377, 'learning_rate': 0.00017111436950146628, 'epoch': 0.91}
+{'loss': 0.9483, 'grad_norm': 2.6969125270843506, 'learning_rate': 0.00017108993157380253, 'epoch': 0.91}
+{'loss': 1.0961, 'grad_norm': 4.440044403076172, 'learning_rate': 0.0001710654936461388, 'epoch': 0.91}
+{'loss': 1.0894, 'grad_norm': 2.3744637966156006, 'learning_rate': 0.00017104105571847509, 'epoch': 0.91}
+{'loss': 0.8032, 'grad_norm': 2.1773595809936523, 'learning_rate': 0.0001710166177908113, 'epoch': 0.91}
+{'loss': 1.3428, 'grad_norm': 2.6673295497894287, 'learning_rate': 0.0001709921798631476, 'epoch': 0.91}
+{'loss': 1.0168, 'grad_norm': 2.424062967300415, 'learning_rate': 0.00017096774193548387, 'epoch': 0.91}
+{'loss': 0.843, 'grad_norm': 1.6261483430862427, 'learning_rate': 0.00017094330400782012, 'epoch': 0.91}
+{'loss': 1.4431, 'grad_norm': 2.254560947418213, 'learning_rate': 0.0001709188660801564, 'epoch': 0.91}
+{'loss': 0.788, 'grad_norm': 1.4824719429016113, 'learning_rate': 0.00017089442815249267, 'epoch': 0.91}
+{'loss': 0.9419, 'grad_norm': 1.547086477279663, 'learning_rate': 0.0001708699902248289, 'epoch': 0.91}
+{'loss': 0.899, 'grad_norm': 2.4336373805999756, 'learning_rate': 0.00017084555229716518, 'epoch': 0.91}
+{'loss': 0.3773, 'grad_norm': 1.5645617246627808, 'learning_rate': 0.00017082111436950146, 'epoch': 0.91}
+{'loss': 1.37, 'grad_norm': 3.626685380935669, 'learning_rate': 0.0001707966764418377, 'epoch': 0.91}
+{'loss': 0.4822, 'grad_norm': 2.090294122695923, 'learning_rate': 0.00017077223851417398, 'epoch': 0.91}
+{'loss': 1.7751, 'grad_norm': 3.3169608116149902, 'learning_rate': 0.00017074780058651026, 'epoch': 0.91}
+{'loss': 0.4204, 'grad_norm': 0.6831493377685547, 'learning_rate': 0.00017072336265884651, 'epoch': 0.91}
+{'loss': 0.2246, 'grad_norm': 0.5136266350746155, 'learning_rate': 0.0001706989247311828, 'epoch': 0.91}
+{'loss': 0.2548, 'grad_norm': 0.6040349006652832, 'learning_rate': 0.00017067448680351907, 'epoch': 0.91}
+{'loss': 0.3375, 'grad_norm': 0.5405697226524353, 'learning_rate': 0.0001706500488758553, 'epoch': 0.91}
+{'loss': 0.3044, 'grad_norm': 0.4443010985851288, 'learning_rate': 0.00017062561094819157, 'epoch': 0.91}
+{'loss': 0.5221, 'grad_norm': 0.9515902400016785, 'learning_rate': 0.00017060117302052785, 'epoch': 0.91}
+{'loss': 0.3327, 'grad_norm': 0.5636407732963562, 'learning_rate': 0.0001705767350928641, 'epoch': 0.91}
+{'loss': 0.3531, 'grad_norm': 1.611080288887024, 'learning_rate': 0.00017055229716520038, 'epoch': 0.91}
+{'loss': 0.3976, 'grad_norm': 0.7491126656532288, 'learning_rate': 0.00017052785923753666, 'epoch': 0.91}
+{'loss': 0.2023, 'grad_norm': 0.7487491965293884, 'learning_rate': 0.0001705034213098729, 'epoch': 0.91}
+{'loss': 0.4256, 'grad_norm': 0.7374340295791626, 'learning_rate': 0.0001704789833822092, 'epoch': 0.91}
+{'loss': 0.3024, 'grad_norm': 0.7710371017456055, 'learning_rate': 0.00017045454545454547, 'epoch': 0.91}
+{'loss': 0.4539, 'grad_norm': 0.7972769737243652, 'learning_rate': 0.0001704301075268817, 'epoch': 0.91}
+{'loss': 0.4388, 'grad_norm': 2.1603457927703857, 'learning_rate': 0.00017040566959921797, 'epoch': 0.91}
+{'loss': 0.5987, 'grad_norm': 1.0259324312210083, 'learning_rate': 0.00017038123167155425, 'epoch': 0.91}
+{'loss': 0.6287, 'grad_norm': 1.267693281173706, 'learning_rate': 0.0001703567937438905, 'epoch': 0.91}
+{'loss': 0.3516, 'grad_norm': 0.7310298085212708, 'learning_rate': 0.00017033235581622678, 'epoch': 0.91}
+{'loss': 0.6303, 'grad_norm': 0.9882366061210632, 'learning_rate': 0.00017030791788856306, 'epoch': 0.91}
+{'loss': 0.508, 'grad_norm': 1.0292433500289917, 'learning_rate': 0.00017028347996089928, 'epoch': 0.91}
+{'loss': 0.986, 'grad_norm': 2.08266019821167, 'learning_rate': 0.00017025904203323556, 'epoch': 0.91}
+{'loss': 0.558, 'grad_norm': 1.249509572982788, 'learning_rate': 0.00017023460410557184, 'epoch': 0.91}
+{'loss': 0.5545, 'grad_norm': 1.4662624597549438, 'learning_rate': 0.0001702101661779081, 'epoch': 0.91}
+{'loss': 0.5318, 'grad_norm': 1.311246395111084, 'learning_rate': 0.00017018572825024437, 'epoch': 0.91}
+{'loss': 0.8669, 'grad_norm': 1.9841166734695435, 'learning_rate': 0.00017016129032258065, 'epoch': 0.91}
+{'loss': 0.3847, 'grad_norm': 1.4273552894592285, 'learning_rate': 0.0001701368523949169, 'epoch': 0.91}
+{'loss': 0.5714, 'grad_norm': 1.7364933490753174, 'learning_rate': 0.00017011241446725318, 'epoch': 0.91}
+{'loss': 0.7064, 'grad_norm': 1.4556832313537598, 'learning_rate': 0.00017008797653958945, 'epoch': 0.91}
+{'loss': 0.6752, 'grad_norm': 1.4327338933944702, 'learning_rate': 0.00017006353861192568, 'epoch': 0.91}
+{'loss': 0.6105, 'grad_norm': 1.3399438858032227, 'learning_rate': 0.00017003910068426196, 'epoch': 0.91}
+{'loss': 0.954, 'grad_norm': 1.6405707597732544, 'learning_rate': 0.00017001466275659823, 'epoch': 0.91}
+{'loss': 0.7187, 'grad_norm': 2.615856170654297, 'learning_rate': 0.00016999022482893449, 'epoch': 0.91}
+{'loss': 1.097, 'grad_norm': 2.1250553131103516, 'learning_rate': 0.00016996578690127076, 'epoch': 0.91}
+{'loss': 0.6268, 'grad_norm': 1.6402782201766968, 'learning_rate': 0.00016994134897360704, 'epoch': 0.91}
+{'loss': 0.8648, 'grad_norm': 2.8018393516540527, 'learning_rate': 0.0001699169110459433, 'epoch': 0.91}
+{'loss': 0.9006, 'grad_norm': 2.0957236289978027, 'learning_rate': 0.00016989247311827957, 'epoch': 0.91}
+{'loss': 1.3675, 'grad_norm': 4.053585529327393, 'learning_rate': 0.00016986803519061582, 'epoch': 0.91}
+{'loss': 0.8759, 'grad_norm': 1.946283221244812, 'learning_rate': 0.00016984359726295207, 'epoch': 0.91}
+{'loss': 1.0044, 'grad_norm': 1.3934121131896973, 'learning_rate': 0.00016981915933528835, 'epoch': 0.91}
+{'loss': 0.9486, 'grad_norm': 4.167563438415527, 'learning_rate': 0.00016979472140762463, 'epoch': 0.91}
+{'loss': 0.9403, 'grad_norm': 3.6018738746643066, 'learning_rate': 0.00016977028347996088, 'epoch': 0.91}
+{'loss': 1.152, 'grad_norm': 2.0300357341766357, 'learning_rate': 0.00016974584555229716, 'epoch': 0.91}
+{'loss': 0.8827, 'grad_norm': 2.6860861778259277, 'learning_rate': 0.00016972140762463344, 'epoch': 0.91}
+{'loss': 1.57, 'grad_norm': 1.8871678113937378, 'learning_rate': 0.00016969696969696966, 'epoch': 0.91}
+{'loss': 1.4632, 'grad_norm': 2.2716727256774902, 'learning_rate': 0.00016967253176930594, 'epoch': 0.91}
+{'loss': 1.4531, 'grad_norm': 3.0800204277038574, 'learning_rate': 0.00016964809384164222, 'epoch': 0.91}
+{'loss': 0.8666, 'grad_norm': 4.430184841156006, 'learning_rate': 0.00016962365591397847, 'epoch': 0.92}
+{'loss': 0.6021, 'grad_norm': 2.232022285461426, 'learning_rate': 0.00016959921798631475, 'epoch': 0.92}
+{'loss': 0.9608, 'grad_norm': 2.0096168518066406, 'learning_rate': 0.00016957478005865103, 'epoch': 0.92}
+{'loss': 0.7093, 'grad_norm': 3.4531753063201904, 'learning_rate': 0.00016955034213098728, 'epoch': 0.92}
+ 46%|████▌     | 5850/12776 [59:54<42:01,  2.75it/s] 46%|████▌     | 5851/12776 [59:56<1:26:07,  1.34it/s]                                                       46%|████▌     | 5851/12776 [59:56<1:26:07,  1.34it/s] 46%|████▌     | 5852/12776 [59:57<1:35:56,  1.20it/s]                                                       46%|████▌     | 5852/12776 [59:57<1:35:56,  1.20it/s] 46%|████▌     | 5853/12776 [59:58<1:40:00,  1.15it/s]                                                       46%|████▌     | 5853/12776 [59:58<1:40:00,  1.15it/s] 46%|████▌     | 5854/12776 [59:58<1:38:32,  1.17it/s]                                                       46%|████▌     | 5854/12776 [59:58<1:38:32,  1.17it/s] 46%|████▌     | 5855/12776 [59:59<1:34:43,  1.22it/s]                                                       46%|████▌     | 5855/12776 [59:59<1:34:43,  1.22it/s] 46%|████▌     | 5856/12776 [1:00:00<1:34:02,  1.23it/s]                                                         46%|████▌     | 5856/12776 [1:00:00<1:34:02,  1.23it/s] 46%|████▌     | 5857/12776 [1:00:01<1:31:14,  1.26it/s]                                                         46%|████▌     | 5857/12776 [1:00:01<1:31:14,  1.26it/s] 46%|████▌     | 5858/12776 [1:00:01<1:25:35,  1.35it/s]                                                         46%|████▌     | 5858/12776 [1:00:01<1:25:35,  1.35it/s] 46%|████▌     | 5859/12776 [1:00:02<1:20:05,  1.44it/s]                                                         46%|████▌     | 5859/12776 [1:00:02<1:20:05,  1.44it/s] 46%|████▌     | 5860/12776 [1:00:02<1:15:11,  1.53it/s]                                                         46%|████▌     | 5860/12776 [1:00:02<1:15:11,  1.53it/s] 46%|████▌     | 5861/12776 [1:00:03<1:12:06,  1.60it/s]                                                         46%|████▌     | 5861/12776 [1:00:03<1:12:06,  1.60it/s] 46%|████▌     | 5862/12776 [1:00:04<1:07:58,  1.70it/s]                                                         46%|████▌     | 5862/12776 [1:00:04<1:07:58,  1.70it/s] 46%|████▌     | 5863/12776 [1:00:04<1:03:59,  1.80it/s]                                                         46%|████▌     | 5863/12776 [1:00:04<1:03:59,  1.80it/s] 46%|████▌     | 5864/12776 [1:00:04<1:00:34,  1.90it/s]                                                         46%|████▌     | 5864/12776 [1:00:04<1:00:34,  1.90it/s] 46%|████▌     | 5865/12776 [1:00:05<57:24,  2.01it/s]                                                         46%|████▌     | 5865/12776 [1:00:05<57:24,  2.01it/s] 46%|████▌     | 5866/12776 [1:00:05<55:49,  2.06it/s]                                                       46%|████▌     | 5866/12776 [1:00:05<55:49,  2.06it/s] 46%|████▌     | 5867/12776 [1:00:06<52:54,  2.18it/s]                                                       46%|████▌     | 5867/12776 [1:00:06<52:54,  2.18it/s] 46%|████▌     | 5868/12776 [1:00:06<50:26,  2.28it/s]                                                       46%|████▌     | 5868/12776 [1:00:06<50:26,  2.28it/s] 46%|████▌     | 5869/12776 [1:00:07<52:45,  2.18it/s]                                                       46%|████▌     | 5869/12776 [1:00:07<52:45,  2.18it/s] 46%|████▌     | 5870/12776 [1:00:07<49:04,  2.35it/s]                                                       46%|████▌     | 5870/12776 [1:00:07<49:04,  2.35it/s] 46%|████▌     | 5871/12776 [1:00:07<46:15,  2.49it/s]                                                       46%|████▌     | 5871/12776 [1:00:07<46:15,  2.49it/s] 46%|████▌     | 5872/12776 [1:00:08<47:18,  2.43it/s]                                                       46%|████▌     | 5872/12776 [1:00:08<47:18,  2.43it/s] 46%|████▌     | 5873/12776 [1:00:08<44:28,  2.59it/s]                                                       46%|████▌     | 5873/12776 [1:00:08<44:28,  2.59it/s] 46%|████▌     | 5874/12776 [1:00:08<42:09,  2.73it/s]                                                       46%|████▌     | 5874/12776 [1:00:08<42:09,  2.73it/s] 46%|████▌     | 5875/12776 [1:00:09<41:31,  2.77it/s]                                                       46%|████▌     | 5875/12776 [1:00:09<41:31,  2.77it/s] 46%|████▌     | 5876/12776 [1:00:09<39:26,  2.92it/s]                                                       46%|████▌     | 5876/12776 [1:00:09<39:26,  2.92it/s] 46%|████▌     | 5877/12776 [1:00:09<37:39,  3.05it/s]                                                       46%|████▌     | 5877/12776 [1:00:09<37:39,  3.05it/s] 46%|████▌     | 5878/12776 [1:00:10<36:06,  3.18it/s]                                                       46%|████▌     | 5878/12776 [1:00:10<36:06,  3.18it/s] 46%|████▌     | 5879/12776 [1:00:10<38:36,  2.98it/s]                                                       46%|████▌     | 5879/12776 [1:00:10<38:36,  2.98it/s] 46%|████▌     | 5880/12776 [1:00:10<36:19,  3.16it/s]                                                       46%|████▌     | 5880/12776 [1:00:10<36:19,  3.16it/s] 46%|████▌     | 5881/12776 [1:00:11<34:30,  3.33it/s]                                                       46%|████▌     | 5881/12776 [1:00:11<34:30,  3.33it/s] 46%|████▌     | 5882/12776 [1:00:11<32:53,  3.49it/s]                                                       46%|████▌     | 5882/12776 [1:00:11<32:53,  3.49it/s] 46%|████▌     | 5883/12776 [1:00:11<34:45,  3.31it/s]                                                       46%|████▌     | 5883/12776 [1:00:11<34:45,  3.31it/s] 46%|████▌     | 5884/12776 [1:00:11<32:36,  3.52it/s]                                                       46%|████▌     | 5884/12776 [1:00:11<32:36,  3.52it/s] 46%|████▌     | 5885/12776 [1:00:12<30:46,  3.73it/s]                                                       46%|████▌     | 5885/12776 [1:00:12<30:46,  3.73it/s] 46%|████▌     | 5886/12776 [1:00:12<29:19,  3.92it/s]                                                       46%|████▌     | 5886/12776 [1:00:12<29:19,  3.92it/s] 46%|████▌     | 5887/12776 [1:00:12<31:11,  3.68it/s]                                                       46%|████▌     | 5887/12776 [1:00:12<31:11,  3.68it/s] 46%|████▌     | 5888/12776 [1:00:12<29:08,  3.94it/s]                                                       46%|████▌     | 5888/12776 [1:00:12<29:08,  3.94it/s] 46%|████▌     | 5889/12776 [1:00:13<27:32,  4.17it/s]                                                       46%|████▌     | 5889/12776 [1:00:13<27:32,  4.17it/s] 46%|████▌     | 5890/12776 [1:00:13<26:33,  4.32it/s]                                                       46%|████▌     | 5890/12776 [1:00:13<26:33,  4.32it/s] 46%|████▌     | 5891/12776 [1:00:13<25:42,  4.46it/s]                                                       46%|████▌     | 5891/12776 [1:00:13<25:42,  4.46it/s] 46%|████▌     | 5892/12776 [1:00:13<27:46,  4.13it/s]                                                       46%|████▌     | 5892/12776 [1:00:13<27:46,  4.13it/s] 46%|████▌     | 5893/12776 [1:00:13<26:22,  4.35it/s]                                                       46%|████▌     | 5893/12776 [1:00:13<26:22,  4.35it/s] 46%|████▌     | 5894/12776 [1:00:14<25:16,  4.54it/s]                                                       46%|████▌     | 5894/12776 [1:00:14<25:16,  4.54it/s] 46%|████▌     | 5895/12776 [1:00:14<24:28,  4.69it/s]                                                       46%|████▌     | 5895/12776 [1:00:14<24:28,  4.69it/s] 46%|████▌     | 5896/12776 [1:00:14<23:46,  4.82it/s]                                                       46%|████▌     | 5896/12776 [1:00:14<23:46,  4.82it/s] 46%|████▌     | 5897/12776 [1:00:14<23:15,  4.93it/s]                                                       46%|████▌     | 5897/12776 [1:00:14<23:15,  4.93it/s] 46%|████▌     | 5898/12776 [1:00:15<25:30,  4.50it/s]                                                       46%|████▌     | 5898/12776 [1:00:15<25:30,  4.50it/s] 46%|████▌     | 5899/12776 [1:00:15<24:08,  4.75it/s]                                                       46%|████▌     | 5899/12776 [1:00:15<24:08,  4.75it/s] 46%|████▌     | 5900/12776 [1:00:16<45:50,  2.50it/s]                                                       46%|████▌     | 5900/12776 [1:00:16<45:50,  2.50it/s] 46%|████▌     | 5901/12776 [1:00:17<1:23:31,  1.37it/s]                                                         46%|████▌     | 5901/12776 [1:00:17<1:23:31,  1.37it/s] 46%|████▌     | 5902/12776 [1:00:18<1:29:54,  1.27it/s]                                                         46%|████▌     | 5902/12776 [1:00:18<1:29:54,  1.27it/s] 46%|████▌     | 5903/12776 [1:00:19<1:32:02,  1.24it/s]                                                         46%|████▌     | 5903/12776 [1:00:19<1:32:02,  1.24it/s] 46%|████▌     | 5904/12776 [1:00:20<1:30:53,  1.26it/s]                                                         46%|████▌     | 5904/12776 [1:00:20<1:30:53,  1.26it/s] 46%|████▌     | 5905/12776 [1:00:20<1:34:37,  1.21it/s]                                                         46%|████▌     | 5905/12776 [1:00:20<1:34:37,  1.21it/s] 46%|████▌     | 5906/12776 [1:00:21<1:29:43,  1.28it/s]                                                         46%|████▌     | 5906/12776 [1:00:21<1:29:43,  1.28it/s] 46%|████▌     | 5907/12776 [1:00:22<1:25:17,  1.34it/s]                                                         46%|████▌     | 5907/12776 [1:00:22<1:25:17,  1.34it/s] 46%|████▌     | 5908/12776 [1:00:23<1:23:38,  1.37it/s]                                                         46%|████▌     | 5908/12776 [1:00:23<1:23:38,  1.37it/s] 46%|████▋     | 5909/12776 [1:00:23<1:18:24,  1.46it/s]                                                         46%|████▋     | 5909/12776 [1:00:23<1:18:24,  1.46it/s] 46%|████▋     | 5910/12776 [1:00:24<1:16:12,  1.50it/s]                                                         46%|████▋     | 5910/12776 [1:00:24<1:16:12,  1.50it/s] 46%|████▋     | 5911/12776 [1:00:24<1:11:40,  1.60it/s]                                                         46%|████▋     | 5911/12776 [1:00:24<1:11:40,  1.60it/s] 46%|████▋     | 5912/12776 [1:00:25<1:11:34,  1.60it/s]                                                         46%|████▋     | 5912/12776 [1:00:25<1:11:34,  1.60it/s] 46%|████▋     | 5913/12776 [1:00:25<1:06:48,  1.71it/s]                                                         46%|████▋     | 5913/12776 [1:00:25<1:06:48,  1.71it/s] 46%|████▋     | 5914/12776 [1:00:26<1:02:20,  1.83it/s]                                                         46%|████▋     | 5914/12776 [1:00:26<1:02:20,  1.83it/s] 46%|████▋     | 5915/12776 [1:00:26<1:00:15,  1.90it/s]                                                         46%|████▋     | 5915/12776 [1:00:26<1:00:15,  1.90it/s] 46%|████▋     | 5916/12776 [1:00:27<57:07,  2.00it/s]                                                         46%|████▋     | 5916/12776 [1:00:27<57:07,  2.00it/s] 46%|████▋     | 5917/12776 [1:00:27<55:31,  2.06it/s]                                                       46%|████▋     | 5917/12776 [1:00:27<55:31,  2.06it/s] 46%|████▋     | 5918/12776 [1:00:28<52:51,  2.16it/s]                                                       46%|████▋     | 5918/12776 [1:00:28<52:51,  2.16it/s] 46%|████▋     | 5919/12776 [1:00:28<50:30,  2.26it/s]                                                       46%|████▋     | 5919/12776 [1:00:28<50:30,  2.26it/s] 46%|████▋     | 5920/12776 [1:00:29<52:43,  2.17it/s]                                                       46%|████▋     | 5920/12776 [1:00:29<52:43,  2.17it/s] 46%|████▋     | 5921/12776 [1:00:29<49:11,  2.32it/s]                                                       46%|████▋     | 5921/12776 [1:00:29<49:11,  2.32it/s] 46%|████▋     | 5922/12776 [1:00:29<46:18,  2.47it/s]                                                       46%|████▋     | 5922/12776 [1:00:29<46:18,  2.47it/s] 46%|████▋     | 5923/12776 [1:00:30<46:29,  2.46it/s]                                                       46%|████▋     | 5923/12776 [1:00:30<46:29,  2.46it/s] 46%|████▋     | 5924/12776 [1:00:30<43:42,  2.61it/s]                                                       46%|████▋     | 5924/12776 [1:00:30<43:42,  2.61it/s] 46%|████▋     | 5925/12776 [1:00:30<41:23,  2.76it/s]                                                       46%|████▋     | 5925/12776 [1:00:30<41:23,  2.76it/s] 46%|████▋     | 5926/12776 [1:00:31<41:05,  2.78it/s]                                                       46%|████▋     | 5926/12776 [1:00:31<41:05,  2.78it/s] 46%|████▋     | 5927/12776 [1:00:31<38:45,  2.94it/s]                                                      {'loss': 0.777, 'grad_norm': 5.444746971130371, 'learning_rate': 0.00016952590420332356, 'epoch': 0.92}
+{'loss': 0.3685, 'grad_norm': 0.5307621359825134, 'learning_rate': 0.00016950146627565984, 'epoch': 0.92}
+{'loss': 0.3451, 'grad_norm': 0.5665816068649292, 'learning_rate': 0.00016947702834799606, 'epoch': 0.92}
+{'loss': 0.5412, 'grad_norm': 1.0723793506622314, 'learning_rate': 0.00016945259042033234, 'epoch': 0.92}
+{'loss': 0.3838, 'grad_norm': 0.6314266324043274, 'learning_rate': 0.00016942815249266862, 'epoch': 0.92}
+{'loss': 0.4242, 'grad_norm': 0.65800541639328, 'learning_rate': 0.00016940371456500487, 'epoch': 0.92}
+{'loss': 0.3196, 'grad_norm': 0.8351601958274841, 'learning_rate': 0.00016937927663734115, 'epoch': 0.92}
+{'loss': 0.3413, 'grad_norm': 0.6536497473716736, 'learning_rate': 0.00016935483870967742, 'epoch': 0.92}
+{'loss': 0.4184, 'grad_norm': 0.8502795696258545, 'learning_rate': 0.00016933040078201368, 'epoch': 0.92}
+{'loss': 0.3015, 'grad_norm': 0.9803659319877625, 'learning_rate': 0.00016930596285434995, 'epoch': 0.92}
+{'loss': 0.3153, 'grad_norm': 1.2949978113174438, 'learning_rate': 0.0001692815249266862, 'epoch': 0.92}
+{'loss': 0.3463, 'grad_norm': 0.9877235889434814, 'learning_rate': 0.00016925708699902246, 'epoch': 0.92}
+{'loss': 0.5856, 'grad_norm': 1.161659598350525, 'learning_rate': 0.00016923264907135874, 'epoch': 0.92}
+{'loss': 0.3127, 'grad_norm': 0.7833372950553894, 'learning_rate': 0.000169208211143695, 'epoch': 0.92}
+{'loss': 0.4847, 'grad_norm': 1.466927170753479, 'learning_rate': 0.00016918377321603126, 'epoch': 0.92}
+{'loss': 0.5227, 'grad_norm': 1.4383978843688965, 'learning_rate': 0.00016915933528836754, 'epoch': 0.92}
+{'loss': 0.3953, 'grad_norm': 1.0085314512252808, 'learning_rate': 0.00016913489736070382, 'epoch': 0.92}
+{'loss': 0.6083, 'grad_norm': 1.5321907997131348, 'learning_rate': 0.00016911045943304005, 'epoch': 0.92}
+{'loss': 0.7558, 'grad_norm': 4.02199649810791, 'learning_rate': 0.00016908602150537632, 'epoch': 0.92}
+{'loss': 0.3596, 'grad_norm': 1.296752691268921, 'learning_rate': 0.0001690615835777126, 'epoch': 0.92}
+{'loss': 0.6962, 'grad_norm': 2.0699713230133057, 'learning_rate': 0.00016903714565004885, 'epoch': 0.92}
+{'loss': 0.3088, 'grad_norm': 1.0956624746322632, 'learning_rate': 0.00016901270772238513, 'epoch': 0.92}
+{'loss': 0.5232, 'grad_norm': 1.2984445095062256, 'learning_rate': 0.0001689882697947214, 'epoch': 0.92}
+{'loss': 0.9313, 'grad_norm': 1.7365597486495972, 'learning_rate': 0.00016896383186705766, 'epoch': 0.92}
+{'loss': 0.5194, 'grad_norm': 1.6211953163146973, 'learning_rate': 0.00016893939393939394, 'epoch': 0.92}
+{'loss': 0.4233, 'grad_norm': 1.4481335878372192, 'learning_rate': 0.00016891495601173022, 'epoch': 0.92}
+{'loss': 0.5753, 'grad_norm': 2.125072956085205, 'learning_rate': 0.00016889051808406644, 'epoch': 0.92}
+{'loss': 0.6559, 'grad_norm': 4.071976184844971, 'learning_rate': 0.00016886608015640272, 'epoch': 0.92}
+{'loss': 1.0354, 'grad_norm': 2.427960157394409, 'learning_rate': 0.000168841642228739, 'epoch': 0.92}
+{'loss': 0.813, 'grad_norm': 1.5165985822677612, 'learning_rate': 0.00016881720430107525, 'epoch': 0.92}
+{'loss': 1.2233, 'grad_norm': 2.0573408603668213, 'learning_rate': 0.00016879276637341153, 'epoch': 0.92}
+{'loss': 0.6152, 'grad_norm': 2.256502866744995, 'learning_rate': 0.0001687683284457478, 'epoch': 0.92}
+{'loss': 0.7555, 'grad_norm': 2.5055789947509766, 'learning_rate': 0.00016874389051808406, 'epoch': 0.92}
+{'loss': 1.0083, 'grad_norm': 1.9413427114486694, 'learning_rate': 0.00016871945259042034, 'epoch': 0.92}
+{'loss': 1.0279, 'grad_norm': 1.8647050857543945, 'learning_rate': 0.00016869501466275656, 'epoch': 0.92}
+{'loss': 0.8053, 'grad_norm': 2.933109998703003, 'learning_rate': 0.00016867057673509284, 'epoch': 0.92}
+{'loss': 0.9038, 'grad_norm': 2.2728209495544434, 'learning_rate': 0.00016864613880742912, 'epoch': 0.92}
+{'loss': 0.9049, 'grad_norm': 2.305570125579834, 'learning_rate': 0.00016862170087976537, 'epoch': 0.92}
+{'loss': 1.1697, 'grad_norm': 3.410182476043701, 'learning_rate': 0.00016859726295210165, 'epoch': 0.92}
+{'loss': 1.5584, 'grad_norm': 3.4939680099487305, 'learning_rate': 0.00016857282502443793, 'epoch': 0.92}
+{'loss': 1.1331, 'grad_norm': 1.5268841981887817, 'learning_rate': 0.00016854838709677415, 'epoch': 0.92}
+{'loss': 1.0516, 'grad_norm': 1.2586132287979126, 'learning_rate': 0.00016852394916911043, 'epoch': 0.92}
+{'loss': 1.0746, 'grad_norm': 2.3908731937408447, 'learning_rate': 0.0001684995112414467, 'epoch': 0.92}
+{'loss': 1.5005, 'grad_norm': 2.6460394859313965, 'learning_rate': 0.00016847507331378296, 'epoch': 0.92}
+{'loss': 0.7368, 'grad_norm': 1.6901088953018188, 'learning_rate': 0.00016845063538611924, 'epoch': 0.92}
+{'loss': 0.6092, 'grad_norm': 1.7683452367782593, 'learning_rate': 0.00016842619745845551, 'epoch': 0.92}
+{'loss': 0.4762, 'grad_norm': 3.026430606842041, 'learning_rate': 0.00016840175953079177, 'epoch': 0.92}
+{'loss': 1.0575, 'grad_norm': 2.5244085788726807, 'learning_rate': 0.00016837732160312804, 'epoch': 0.92}
+{'loss': 0.9384, 'grad_norm': 1.5700312852859497, 'learning_rate': 0.00016835288367546432, 'epoch': 0.92}
+{'loss': 0.3906, 'grad_norm': 1.4944273233413696, 'learning_rate': 0.00016832844574780055, 'epoch': 0.92}
+{'loss': 0.9438, 'grad_norm': 4.99465799331665, 'learning_rate': 0.00016830400782013682, 'epoch': 0.92}
+{'loss': 0.3414, 'grad_norm': 0.4812981188297272, 'learning_rate': 0.0001682795698924731, 'epoch': 0.92}
+{'loss': 0.3613, 'grad_norm': 0.5931546092033386, 'learning_rate': 0.00016825513196480935, 'epoch': 0.92}
+{'loss': 0.2214, 'grad_norm': 0.6139108538627625, 'learning_rate': 0.00016823069403714563, 'epoch': 0.92}
+{'loss': 0.3274, 'grad_norm': 0.5874446630477905, 'learning_rate': 0.0001682062561094819, 'epoch': 0.92}
+{'loss': 0.2925, 'grad_norm': 0.4940461218357086, 'learning_rate': 0.00016818181818181816, 'epoch': 0.92}
+{'loss': 0.2816, 'grad_norm': 0.5036855340003967, 'learning_rate': 0.00016815738025415444, 'epoch': 0.92}
+{'loss': 0.2061, 'grad_norm': 0.6490256190299988, 'learning_rate': 0.0001681329423264907, 'epoch': 0.92}
+{'loss': 0.5117, 'grad_norm': 1.324135184288025, 'learning_rate': 0.00016810850439882694, 'epoch': 0.92}
+{'loss': 0.3738, 'grad_norm': 0.9895930290222168, 'learning_rate': 0.00016808406647116322, 'epoch': 0.93}
+{'loss': 0.3548, 'grad_norm': 0.761587917804718, 'learning_rate': 0.0001680596285434995, 'epoch': 0.93}
+{'loss': 0.3428, 'grad_norm': 0.905089259147644, 'learning_rate': 0.00016803519061583575, 'epoch': 0.93}
+{'loss': 0.3537, 'grad_norm': 1.7721465826034546, 'learning_rate': 0.00016801075268817203, 'epoch': 0.93}
+{'loss': 0.7074, 'grad_norm': 0.9065170288085938, 'learning_rate': 0.0001679863147605083, 'epoch': 0.93}
+{'loss': 0.4449, 'grad_norm': 1.3769489526748657, 'learning_rate': 0.00016796187683284453, 'epoch': 0.93}
+{'loss': 0.4508, 'grad_norm': 0.8921677470207214, 'learning_rate': 0.0001679374389051808, 'epoch': 0.93}
+{'loss': 0.6338, 'grad_norm': 0.827818751335144, 'learning_rate': 0.0001679130009775171, 'epoch': 0.93}
+{'loss': 0.6249, 'grad_norm': 1.2946507930755615, 'learning_rate': 0.00016788856304985334, 'epoch': 0.93}
+{'loss': 0.2552, 'grad_norm': 0.9976163506507874, 'learning_rate': 0.00016786412512218962, 'epoch': 0.93}
+{'loss': 0.3495, 'grad_norm': 0.9220026731491089, 'learning_rate': 0.0001678396871945259, 'epoch': 0.93}
+{'loss': 0.5967, 'grad_norm': 1.1707324981689453, 'learning_rate': 0.00016781524926686215, 'epoch': 0.93}
+{'loss': 0.512, 'grad_norm': 1.8587265014648438, 'learning_rate': 0.00016779081133919843, 'epoch': 0.93}
+{'loss': 0.4552, 'grad_norm': 1.1185016632080078, 'learning_rate': 0.0001677663734115347, 'epoch': 0.93}
+{'loss': 0.5696, 'grad_norm': 7.0849127769470215, 'learning_rate': 0.00016774193548387093, 'epoch': 0.93}
+{'loss': 0.5536, 'grad_norm': 1.0725510120391846, 'learning_rate': 0.0001677174975562072, 'epoch': 0.93}
+{'loss': 0.4286, 'grad_norm': 1.6228716373443604, 'learning_rate': 0.00016769305962854349, 'epoch': 0.93}
+{'loss': 0.6547, 'grad_norm': 1.596377968788147, 'learning_rate': 0.00016766862170087974, 'epoch': 0.93}
+ 46%|████▋     | 5927/12776 [1:00:31<38:45,  2.94it/s] 46%|████▋     | 5928/12776 [1:00:31<36:46,  3.10it/s]                                                       46%|████▋     | 5928/12776 [1:00:31<36:46,  3.10it/s] 46%|████▋     | 5929/12776 [1:00:31<35:08,  3.25it/s]                                                       46%|████▋     | 5929/12776 [1:00:31<35:08,  3.25it/s] 46%|████▋     | 5930/12776 [1:00:32<34:52,  3.27it/s]                                                       46%|████▋     | 5930/12776 [1:00:32<34:52,  3.27it/s] 46%|████▋     | 5931/12776 [1:00:32<33:27,  3.41it/s]                                                       46%|████▋     | 5931/12776 [1:00:32<33:27,  3.41it/s] 46%|████▋     | 5932/12776 [1:00:32<31:59,  3.57it/s]                                                       46%|████▋     | 5932/12776 [1:00:32<31:59,  3.57it/s] 46%|████▋     | 5933/12776 [1:00:33<30:54,  3.69it/s]                                                       46%|████▋     | 5933/12776 [1:00:33<30:54,  3.69it/s] 46%|████▋     | 5934/12776 [1:00:33<33:41,  3.39it/s]                                                       46%|████▋     | 5934/12776 [1:00:33<33:41,  3.39it/s] 46%|████▋     | 5935/12776 [1:00:33<31:48,  3.58it/s]                                                       46%|████▋     | 5935/12776 [1:00:33<31:48,  3.58it/s] 46%|████▋     | 5936/12776 [1:00:33<30:16,  3.77it/s]                                                       46%|████▋     | 5936/12776 [1:00:33<30:16,  3.77it/s] 46%|████▋     | 5937/12776 [1:00:34<29:05,  3.92it/s]                                                       46%|████▋     | 5937/12776 [1:00:34<29:05,  3.92it/s] 46%|████▋     | 5938/12776 [1:00:34<28:02,  4.06it/s]                                                       46%|████▋     | 5938/12776 [1:00:34<28:02,  4.06it/s] 46%|████▋     | 5939/12776 [1:00:34<30:13,  3.77it/s]                                                       46%|████▋     | 5939/12776 [1:00:34<30:13,  3.77it/s] 46%|████▋     | 5940/12776 [1:00:34<28:23,  4.01it/s]                                                       46%|████▋     | 5940/12776 [1:00:34<28:23,  4.01it/s] 47%|████▋     | 5941/12776 [1:00:35<27:00,  4.22it/s]                                                       47%|████▋     | 5941/12776 [1:00:35<27:00,  4.22it/s] 47%|████▋     | 5942/12776 [1:00:35<26:03,  4.37it/s]                                                       47%|████▋     | 5942/12776 [1:00:35<26:03,  4.37it/s] 47%|████▋     | 5943/12776 [1:00:35<25:18,  4.50it/s]                                                       47%|████▋     | 5943/12776 [1:00:35<25:18,  4.50it/s] 47%|████▋     | 5944/12776 [1:00:35<27:56,  4.08it/s]                                                       47%|████▋     | 5944/12776 [1:00:35<27:56,  4.08it/s] 47%|████▋     | 5945/12776 [1:00:35<26:21,  4.32it/s]                                                       47%|████▋     | 5945/12776 [1:00:35<26:21,  4.32it/s] 47%|████▋     | 5946/12776 [1:00:36<25:16,  4.50it/s]                                                       47%|████▋     | 5946/12776 [1:00:36<25:16,  4.50it/s] 47%|████▋     | 5947/12776 [1:00:36<24:22,  4.67it/s]                                                       47%|████▋     | 5947/12776 [1:00:36<24:22,  4.67it/s] 47%|████▋     | 5948/12776 [1:00:36<23:36,  4.82it/s]                                                       47%|████▋     | 5948/12776 [1:00:36<23:36,  4.82it/s] 47%|████▋     | 5949/12776 [1:00:36<25:19,  4.49it/s]                                                       47%|████▋     | 5949/12776 [1:00:36<25:19,  4.49it/s] 47%|████▋     | 5950/12776 [1:00:37<41:51,  2.72it/s]                                                       47%|████▋     | 5950/12776 [1:00:37<41:51,  2.72it/s] 47%|████▋     | 5951/12776 [1:00:39<1:20:24,  1.41it/s]                                                         47%|████▋     | 5951/12776 [1:00:39<1:20:24,  1.41it/s] 47%|████▋     | 5952/12776 [1:00:39<1:28:46,  1.28it/s]                                                         47%|████▋     | 5952/12776 [1:00:39<1:28:46,  1.28it/s] 47%|████▋     | 5953/12776 [1:00:40<1:31:12,  1.25it/s]                                                         47%|████▋     | 5953/12776 [1:00:40<1:31:12,  1.25it/s] 47%|████▋     | 5954/12776 [1:00:41<1:35:53,  1.19it/s]                                                         47%|████▋     | 5954/12776 [1:00:41<1:35:53,  1.19it/s] 47%|████▋     | 5955/12776 [1:00:42<1:35:28,  1.19it/s]                                                         47%|████▋     | 5955/12776 [1:00:42<1:35:28,  1.19it/s] 47%|████▋     | 5956/12776 [1:00:43<1:31:38,  1.24it/s]                                                         47%|████▋     | 5956/12776 [1:00:43<1:31:38,  1.24it/s] 47%|████▋     | 5957/12776 [1:00:44<1:28:23,  1.29it/s]                                                         47%|████▋     | 5957/12776 [1:00:44<1:28:23,  1.29it/s] 47%|████▋     | 5958/12776 [1:00:44<1:24:08,  1.35it/s]                                                         47%|████▋     | 5958/12776 [1:00:44<1:24:08,  1.35it/s] 47%|████▋     | 5959/12776 [1:00:45<1:20:00,  1.42it/s]                                                         47%|████▋     | 5959/12776 [1:00:45<1:20:00,  1.42it/s] 47%|████▋     | 5960/12776 [1:00:45<1:15:30,  1.50it/s]                                                         47%|████▋     | 5960/12776 [1:00:45<1:15:30,  1.50it/s] 47%|████▋     | 5961/12776 [1:00:46<1:11:12,  1.60it/s]                                                         47%|████▋     | 5961/12776 [1:00:46<1:11:12,  1.60it/s] 47%|████▋     | 5962/12776 [1:00:46<1:07:17,  1.69it/s]                                                         47%|████▋     | 5962/12776 [1:00:46<1:07:17,  1.69it/s] 47%|████▋     | 5963/12776 [1:00:47<1:07:10,  1.69it/s]                                                         47%|████▋     | 5963/12776 [1:00:47<1:07:10,  1.69it/s] 47%|████▋     | 5964/12776 [1:00:47<1:02:38,  1.81it/s]                                                         47%|████▋     | 5964/12776 [1:00:47<1:02:38,  1.81it/s] 47%|████▋     | 5965/12776 [1:00:48<1:03:05,  1.80it/s]                                                         47%|████▋     | 5965/12776 [1:00:48<1:03:05,  1.80it/s] 47%|████▋     | 5966/12776 [1:00:48<58:56,  1.93it/s]                                                         47%|████▋     | 5966/12776 [1:00:48<58:56,  1.93it/s] 47%|████▋     | 5967/12776 [1:00:49<58:06,  1.95it/s]                                                       47%|████▋     | 5967/12776 [1:00:49<58:06,  1.95it/s] 47%|████▋     | 5968/12776 [1:00:49<54:16,  2.09it/s]                                                       47%|████▋     | 5968/12776 [1:00:49<54:16,  2.09it/s] 47%|████▋     | 5969/12776 [1:00:50<50:54,  2.23it/s]                                                       47%|████▋     | 5969/12776 [1:00:50<50:54,  2.23it/s] 47%|████▋     | 5970/12776 [1:00:50<49:04,  2.31it/s]                                                       47%|████▋     | 5970/12776 [1:00:50<49:04,  2.31it/s] 47%|████▋     | 5971/12776 [1:00:50<46:19,  2.45it/s]                                                       47%|████▋     | 5971/12776 [1:00:50<46:19,  2.45it/s] 47%|████▋     | 5972/12776 [1:00:51<44:12,  2.57it/s]                                                       47%|████▋     | 5972/12776 [1:00:51<44:12,  2.57it/s] 47%|████▋     | 5973/12776 [1:00:51<45:51,  2.47it/s]                                                       47%|████▋     | 5973/12776 [1:00:51<45:51,  2.47it/s] 47%|████▋     | 5974/12776 [1:00:52<43:07,  2.63it/s]                                                       47%|████▋     | 5974/12776 [1:00:52<43:07,  2.63it/s] 47%|████▋     | 5975/12776 [1:00:52<40:41,  2.79it/s]                                                       47%|████▋     | 5975/12776 [1:00:52<40:41,  2.79it/s] 47%|████▋     | 5976/12776 [1:00:52<38:38,  2.93it/s]                                                       47%|████▋     | 5976/12776 [1:00:52<38:38,  2.93it/s] 47%|████▋     | 5977/12776 [1:00:53<39:21,  2.88it/s]                                                       47%|████▋     | 5977/12776 [1:00:53<39:21,  2.88it/s] 47%|████▋     | 5978/12776 [1:00:53<37:02,  3.06it/s]                                                       47%|████▋     | 5978/12776 [1:00:53<37:02,  3.06it/s] 47%|████▋     | 5979/12776 [1:00:53<35:07,  3.23it/s]                                                       47%|████▋     | 5979/12776 [1:00:53<35:07,  3.23it/s] 47%|████▋     | 5980/12776 [1:00:53<33:34,  3.37it/s]                                                       47%|████▋     | 5980/12776 [1:00:53<33:34,  3.37it/s] 47%|████▋     | 5981/12776 [1:00:54<34:30,  3.28it/s]                                                       47%|████▋     | 5981/12776 [1:00:54<34:30,  3.28it/s] 47%|████▋     | 5982/12776 [1:00:54<32:48,  3.45it/s]                                                       47%|████▋     | 5982/12776 [1:00:54<32:48,  3.45it/s] 47%|████▋     | 5983/12776 [1:00:54<31:22,  3.61it/s]                                                       47%|████▋     | 5983/12776 [1:00:54<31:22,  3.61it/s] 47%|████▋     | 5984/12776 [1:00:54<30:07,  3.76it/s]                                                       47%|████▋     | 5984/12776 [1:00:54<30:07,  3.76it/s] 47%|████▋     | 5985/12776 [1:00:55<32:52,  3.44it/s]                                                       47%|████▋     | 5985/12776 [1:00:55<32:52,  3.44it/s] 47%|████▋     | 5986/12776 [1:00:55<30:57,  3.65it/s]                                                       47%|████▋     | 5986/12776 [1:00:55<30:57,  3.65it/s] 47%|████▋     | 5987/12776 [1:00:55<29:29,  3.84it/s]                                                       47%|████▋     | 5987/12776 [1:00:55<29:29,  3.84it/s] 47%|████▋     | 5988/12776 [1:00:55<28:10,  4.01it/s]                                                       47%|████▋     | 5988/12776 [1:00:55<28:10,  4.01it/s] 47%|████▋     | 5989/12776 [1:00:56<30:31,  3.71it/s]                                                       47%|████▋     | 5989/12776 [1:00:56<30:31,  3.71it/s] 47%|████▋     | 5990/12776 [1:00:56<28:27,  3.97it/s]                                                       47%|████▋     | 5990/12776 [1:00:56<28:27,  3.97it/s] 47%|████▋     | 5991/12776 [1:00:56<27:01,  4.18it/s]                                                       47%|████▋     | 5991/12776 [1:00:56<27:01,  4.18it/s] 47%|████▋     | 5992/12776 [1:00:56<25:51,  4.37it/s]                                                       47%|████▋     | 5992/12776 [1:00:56<25:51,  4.37it/s] 47%|████▋     | 5993/12776 [1:00:57<24:57,  4.53it/s]                                                       47%|████▋     | 5993/12776 [1:00:57<24:57,  4.53it/s] 47%|████▋     | 5994/12776 [1:00:57<27:38,  4.09it/s]                                                       47%|████▋     | 5994/12776 [1:00:57<27:38,  4.09it/s] 47%|████▋     | 5995/12776 [1:00:57<25:56,  4.36it/s]                                                       47%|████▋     | 5995/12776 [1:00:57<25:56,  4.36it/s] 47%|████▋     | 5996/12776 [1:00:57<24:40,  4.58it/s]                                                       47%|████▋     | 5996/12776 [1:00:57<24:40,  4.58it/s] 47%|████▋     | 5997/12776 [1:00:58<23:44,  4.76it/s]                                                       47%|████▋     | 5997/12776 [1:00:58<23:44,  4.76it/s] 47%|████▋     | 5998/12776 [1:00:58<22:52,  4.94it/s]                                                       47%|████▋     | 5998/12776 [1:00:58<22:52,  4.94it/s] 47%|████▋     | 5999/12776 [1:00:58<22:15,  5.08it/s]                                                       47%|████▋     | 5999/12776 [1:00:58<22:15,  5.08it/s] 47%|████▋     | 6000/12776 [1:00:59<40:46,  2.77it/s]                                                       47%|████▋     | 6000/12776 [1:00:59<40:46,  2.77it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.8426, 'grad_norm': 1.7780789136886597, 'learning_rate': 0.00016764418377321601, 'epoch': 0.93}
+{'loss': 0.6604, 'grad_norm': 1.8917478322982788, 'learning_rate': 0.0001676197458455523, 'epoch': 0.93}
+{'loss': 0.5667, 'grad_norm': 1.4411269426345825, 'learning_rate': 0.00016759530791788854, 'epoch': 0.93}
+{'loss': 0.4491, 'grad_norm': 2.5890650749206543, 'learning_rate': 0.00016757086999022482, 'epoch': 0.93}
+{'loss': 0.7193, 'grad_norm': 1.4896984100341797, 'learning_rate': 0.00016754643206256107, 'epoch': 0.93}
+{'loss': 0.7311, 'grad_norm': 2.9723289012908936, 'learning_rate': 0.00016752199413489733, 'epoch': 0.93}
+{'loss': 0.9025, 'grad_norm': 1.3109147548675537, 'learning_rate': 0.0001674975562072336, 'epoch': 0.93}
+{'loss': 0.8071, 'grad_norm': 2.1195662021636963, 'learning_rate': 0.00016747311827956988, 'epoch': 0.93}
+{'loss': 0.6614, 'grad_norm': 2.0129497051239014, 'learning_rate': 0.00016744868035190613, 'epoch': 0.93}
+{'loss': 1.3507, 'grad_norm': 3.3154754638671875, 'learning_rate': 0.0001674242424242424, 'epoch': 0.93}
+{'loss': 1.2579, 'grad_norm': 2.2003567218780518, 'learning_rate': 0.0001673998044965787, 'epoch': 0.93}
+{'loss': 0.9919, 'grad_norm': 2.3545401096343994, 'learning_rate': 0.00016737536656891491, 'epoch': 0.93}
+{'loss': 1.6852, 'grad_norm': 3.617964744567871, 'learning_rate': 0.0001673509286412512, 'epoch': 0.93}
+{'loss': 0.9846, 'grad_norm': 2.4754271507263184, 'learning_rate': 0.00016732649071358747, 'epoch': 0.93}
+{'loss': 0.7251, 'grad_norm': 2.2454872131347656, 'learning_rate': 0.00016730205278592372, 'epoch': 0.93}
+{'loss': 0.998, 'grad_norm': 2.1622064113616943, 'learning_rate': 0.00016727761485826, 'epoch': 0.93}
+{'loss': 1.5063, 'grad_norm': 3.354268789291382, 'learning_rate': 0.00016725317693059628, 'epoch': 0.93}
+{'loss': 1.1028, 'grad_norm': 1.39380943775177, 'learning_rate': 0.00016722873900293253, 'epoch': 0.93}
+{'loss': 1.2247, 'grad_norm': 2.941927194595337, 'learning_rate': 0.0001672043010752688, 'epoch': 0.93}
+{'loss': 1.3599, 'grad_norm': 6.12858772277832, 'learning_rate': 0.0001671798631476051, 'epoch': 0.93}
+{'loss': 1.2749, 'grad_norm': 1.9864230155944824, 'learning_rate': 0.0001671554252199413, 'epoch': 0.93}
+{'loss': 0.5183, 'grad_norm': 1.012682318687439, 'learning_rate': 0.0001671309872922776, 'epoch': 0.93}
+{'loss': 0.7957, 'grad_norm': 3.2158727645874023, 'learning_rate': 0.00016710654936461387, 'epoch': 0.93}
+{'loss': 0.8408, 'grad_norm': 2.2417471408843994, 'learning_rate': 0.00016708211143695012, 'epoch': 0.93}
+{'loss': 0.4067, 'grad_norm': 0.5746961236000061, 'learning_rate': 0.0001670576735092864, 'epoch': 0.93}
+{'loss': 0.2959, 'grad_norm': 0.5741134881973267, 'learning_rate': 0.00016703323558162268, 'epoch': 0.93}
+{'loss': 0.232, 'grad_norm': 0.4729560911655426, 'learning_rate': 0.00016700879765395893, 'epoch': 0.93}
+{'loss': 0.3221, 'grad_norm': 0.6859723329544067, 'learning_rate': 0.00016698435972629518, 'epoch': 0.93}
+{'loss': 0.4589, 'grad_norm': 0.8597524166107178, 'learning_rate': 0.00016695992179863146, 'epoch': 0.93}
+{'loss': 0.3401, 'grad_norm': 0.4495220184326172, 'learning_rate': 0.0001669354838709677, 'epoch': 0.93}
+{'loss': 1.0121, 'grad_norm': 3.076122999191284, 'learning_rate': 0.00016691104594330399, 'epoch': 0.93}
+{'loss': 0.2799, 'grad_norm': 0.5101608633995056, 'learning_rate': 0.00016688660801564026, 'epoch': 0.93}
+{'loss': 0.2636, 'grad_norm': 0.4766397476196289, 'learning_rate': 0.00016686217008797652, 'epoch': 0.93}
+{'loss': 0.4569, 'grad_norm': 1.034203052520752, 'learning_rate': 0.0001668377321603128, 'epoch': 0.93}
+{'loss': 0.3696, 'grad_norm': 0.8217408061027527, 'learning_rate': 0.00016681329423264907, 'epoch': 0.93}
+{'loss': 0.5895, 'grad_norm': 1.199652075767517, 'learning_rate': 0.0001667888563049853, 'epoch': 0.93}
+{'loss': 0.3166, 'grad_norm': 0.9428247809410095, 'learning_rate': 0.00016676441837732157, 'epoch': 0.93}
+{'loss': 0.663, 'grad_norm': 1.7076395750045776, 'learning_rate': 0.00016673998044965785, 'epoch': 0.93}
+{'loss': 1.2153, 'grad_norm': 3.6879310607910156, 'learning_rate': 0.0001667155425219941, 'epoch': 0.93}
+{'loss': 0.2916, 'grad_norm': 0.7371928691864014, 'learning_rate': 0.00016669110459433038, 'epoch': 0.93}
+{'loss': 0.5749, 'grad_norm': 0.8231880068778992, 'learning_rate': 0.00016666666666666666, 'epoch': 0.93}
+{'loss': 0.3539, 'grad_norm': 0.9370611310005188, 'learning_rate': 0.0001666422287390029, 'epoch': 0.93}
+{'loss': 0.5032, 'grad_norm': 1.1584241390228271, 'learning_rate': 0.0001666177908113392, 'epoch': 0.93}
+{'loss': 0.4978, 'grad_norm': 1.1886309385299683, 'learning_rate': 0.00016659335288367547, 'epoch': 0.93}
+{'loss': 0.5764, 'grad_norm': 1.277079463005066, 'learning_rate': 0.0001665689149560117, 'epoch': 0.93}
+{'loss': 0.478, 'grad_norm': 1.9561655521392822, 'learning_rate': 0.00016654447702834797, 'epoch': 0.93}
+{'loss': 0.6815, 'grad_norm': 2.3838415145874023, 'learning_rate': 0.00016652003910068425, 'epoch': 0.94}
+{'loss': 0.6301, 'grad_norm': 1.1179418563842773, 'learning_rate': 0.0001664956011730205, 'epoch': 0.94}
+{'loss': 0.6859, 'grad_norm': 1.8033316135406494, 'learning_rate': 0.00016647116324535678, 'epoch': 0.94}
+{'loss': 0.9525, 'grad_norm': 1.8072056770324707, 'learning_rate': 0.00016644672531769306, 'epoch': 0.94}
+{'loss': 0.645, 'grad_norm': 1.693436861038208, 'learning_rate': 0.0001664222873900293, 'epoch': 0.94}
+{'loss': 0.8476, 'grad_norm': 2.3530519008636475, 'learning_rate': 0.00016639784946236556, 'epoch': 0.94}
+{'loss': 0.6442, 'grad_norm': 1.2091078758239746, 'learning_rate': 0.00016637341153470184, 'epoch': 0.94}
+{'loss': 0.937, 'grad_norm': 2.406554937362671, 'learning_rate': 0.0001663489736070381, 'epoch': 0.94}
+{'loss': 0.8456, 'grad_norm': 1.2704511880874634, 'learning_rate': 0.00016632453567937437, 'epoch': 0.94}
+{'loss': 0.4806, 'grad_norm': 1.3322428464889526, 'learning_rate': 0.00016630009775171065, 'epoch': 0.94}
+{'loss': 1.2242, 'grad_norm': 1.9257659912109375, 'learning_rate': 0.0001662756598240469, 'epoch': 0.94}
+{'loss': 0.8894, 'grad_norm': 5.305508136749268, 'learning_rate': 0.00016625122189638318, 'epoch': 0.94}
+{'loss': 1.1084, 'grad_norm': 2.0064687728881836, 'learning_rate': 0.00016622678396871945, 'epoch': 0.94}
+{'loss': 0.3418, 'grad_norm': 0.8927270174026489, 'learning_rate': 0.00016620234604105568, 'epoch': 0.94}
+{'loss': 1.2848, 'grad_norm': 2.652418851852417, 'learning_rate': 0.00016617790811339196, 'epoch': 0.94}
+{'loss': 0.8169, 'grad_norm': 1.884332537651062, 'learning_rate': 0.00016615347018572824, 'epoch': 0.94}
+{'loss': 1.0861, 'grad_norm': 2.335824728012085, 'learning_rate': 0.0001661290322580645, 'epoch': 0.94}
+{'loss': 0.7576, 'grad_norm': 2.0490598678588867, 'learning_rate': 0.00016610459433040077, 'epoch': 0.94}
+{'loss': 1.1315, 'grad_norm': 1.8792589902877808, 'learning_rate': 0.00016608015640273704, 'epoch': 0.94}
+{'loss': 0.4345, 'grad_norm': 2.0781798362731934, 'learning_rate': 0.0001660557184750733, 'epoch': 0.94}
+{'loss': 0.8727, 'grad_norm': 1.9264535903930664, 'learning_rate': 0.00016603128054740957, 'epoch': 0.94}
+{'loss': 0.7562, 'grad_norm': 2.6917431354522705, 'learning_rate': 0.00016600684261974585, 'epoch': 0.94}
+{'loss': 0.9416, 'grad_norm': 2.0042667388916016, 'learning_rate': 0.00016598240469208208, 'epoch': 0.94}
+{'loss': 0.9873, 'grad_norm': 3.084071397781372, 'learning_rate': 0.00016595796676441835, 'epoch': 0.94}
+{'loss': 0.7261, 'grad_norm': 2.0100440979003906, 'learning_rate': 0.00016593352883675463, 'epoch': 0.94}
+{'loss': 0.9508, 'grad_norm': 2.626487970352173, 'learning_rate': 0.00016590909090909088, 'epoch': 0.94}
+{'loss': 1.564, 'grad_norm': 4.288865089416504, 'learning_rate': 0.00016588465298142716, 'epoch': 0.94}
+{'loss': 1.4826, 'grad_norm': 6.611445903778076, 'learning_rate': 0.00016586021505376344, 'epoch': 0.94}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:24,  5.34it/s][A
+  0%|          | 3/774 [00:00<02:59,  4.29it/s][A
+  1%|          | 4/774 [00:00<03:23,  3.79it/s][A
+  1%|          | 5/774 [00:01<03:20,  3.84it/s][A
+  1%|          | 6/774 [00:01<03:31,  3.63it/s][A
+  1%|          | 7/774 [00:01<03:28,  3.68it/s][A
+  1%|          | 8/774 [00:02<03:29,  3.65it/s][A
+  1%|          | 9/774 [00:02<03:17,  3.87it/s][A
+  1%|▏         | 10/774 [00:02<03:16,  3.89it/s][A
+  1%|▏         | 11/774 [00:02<03:31,  3.60it/s][A
+  2%|▏         | 12/774 [00:03<03:18,  3.84it/s][A
+  2%|▏         | 13/774 [00:03<03:09,  4.01it/s][A
+  2%|▏         | 14/774 [00:03<03:20,  3.80it/s][A
+  2%|▏         | 15/774 [00:03<03:39,  3.46it/s][A
+  2%|▏         | 16/774 [00:04<03:37,  3.48it/s][A
+  2%|▏         | 17/774 [00:04<03:15,  3.88it/s][A
+  2%|▏         | 18/774 [00:04<03:07,  4.02it/s][A
+  2%|▏         | 19/774 [00:04<03:18,  3.81it/s][A
+  3%|▎         | 20/774 [00:05<03:14,  3.87it/s][A
+  3%|▎         | 21/774 [00:05<03:18,  3.79it/s][A
+  3%|▎         | 22/774 [00:05<03:22,  3.70it/s][A
+  3%|▎         | 23/774 [00:06<03:33,  3.51it/s][A
+  3%|▎         | 24/774 [00:06<03:31,  3.54it/s][A
+  3%|▎         | 25/774 [00:06<03:38,  3.43it/s][A
+  3%|▎         | 26/774 [00:06<03:36,  3.46it/s][A
+  3%|▎         | 27/774 [00:07<03:34,  3.49it/s][A
+  4%|▎         | 28/774 [00:07<03:41,  3.38it/s][A
+  4%|▎         | 29/774 [00:07<03:45,  3.31it/s][A
+  4%|▍         | 30/774 [00:08<03:32,  3.49it/s][A
+  4%|▍         | 31/774 [00:08<03:32,  3.50it/s][A
+  4%|▍         | 32/774 [00:08<04:00,  3.08it/s][A
+  4%|▍         | 33/774 [00:09<03:50,  3.22it/s][A
+  4%|▍         | 34/774 [00:09<03:38,  3.39it/s][A
+  5%|▍         | 35/774 [00:09<03:46,  3.27it/s][A
+  5%|▍         | 36/774 [00:10<03:47,  3.25it/s][A
+  5%|▍         | 37/774 [00:10<03:48,  3.23it/s][A
+  5%|▍         | 38/774 [00:10<03:37,  3.38it/s][A
+  5%|▌         | 39/774 [00:10<03:23,  3.62it/s][A
+  5%|▌         | 40/774 [00:11<03:27,  3.54it/s][A
+  5%|▌         | 41/774 [00:11<03:24,  3.58it/s][A
+  5%|▌         | 42/774 [00:11<03:13,  3.79it/s][A
+  6%|▌         | 43/774 [00:11<03:24,  3.58it/s][A
+  6%|▌         | 44/774 [00:12<03:28,  3.50it/s][A
+  6%|▌         | 45/774 [00:12<03:17,  3.69it/s][A
+  6%|▌         | 46/774 [00:12<03:02,  3.99it/s][A
+  6%|▌         | 47/774 [00:12<02:50,  4.25it/s][A
+  6%|▌         | 48/774 [00:13<02:51,  4.24it/s][A
+  6%|▋         | 49/774 [00:13<02:53,  4.19it/s][A
+  6%|▋         | 50/774 [00:13<02:56,  4.11it/s][A
+  7%|▋         | 51/774 [00:13<02:56,  4.09it/s][A
+  7%|▋         | 52/774 [00:14<02:55,  4.12it/s][A
+  7%|▋         | 53/774 [00:14<03:05,  3.89it/s][A
+  7%|▋         | 54/774 [00:14<03:09,  3.79it/s][A
+  7%|▋         | 55/774 [00:14<03:18,  3.62it/s][A
+  7%|▋         | 56/774 [00:15<03:18,  3.62it/s][A
+  7%|▋         | 57/774 [00:15<03:23,  3.53it/s][A
+  7%|▋         | 58/774 [00:15<03:22,  3.53it/s][A
+  8%|▊         | 59/774 [00:16<03:07,  3.82it/s][A
+  8%|▊         | 60/774 [00:16<02:53,  4.11it/s][A
+  8%|▊         | 61/774 [00:16<02:31,  4.70it/s][A
+  8%|▊         | 62/774 [00:16<02:29,  4.75it/s][A
+  8%|▊         | 63/774 [00:16<02:54,  4.07it/s][A
+  8%|▊         | 64/774 [00:17<02:46,  4.27it/s][A
+  8%|▊         | 65/774 [00:17<02:49,  4.18it/s][A
+  9%|▊         | 66/774 [00:17<02:46,  4.24it/s][A
+  9%|▊         | 67/774 [00:17<02:39,  4.43it/s][A
+  9%|▉         | 68/774 [00:18<02:36,  4.51it/s][A
+  9%|▉         | 69/774 [00:18<02:28,  4.76it/s][A
+  9%|▉         | 70/774 [00:18<02:35,  4.52it/s][A
+  9%|▉         | 71/774 [00:18<02:30,  4.67it/s][A
+  9%|▉         | 72/774 [00:18<02:41,  4.34it/s][A
+  9%|▉         | 73/774 [00:19<02:51,  4.09it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.92it/s][A
+ 10%|▉         | 75/774 [00:19<03:05,  3.78it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.86it/s][A
+ 10%|▉         | 77/774 [00:20<03:13,  3.61it/s][A
+ 10%|█         | 78/774 [00:20<02:53,  4.00it/s][A
+ 10%|█         | 79/774 [00:20<02:41,  4.29it/s][A
+ 10%|█         | 80/774 [00:20<02:39,  4.35it/s][A
+ 10%|█         | 81/774 [00:21<02:18,  5.02it/s][A
+ 11%|█         | 82/774 [00:21<02:18,  5.01it/s][A
+ 11%|█         | 83/774 [00:21<02:22,  4.86it/s][A
+ 11%|█         | 84/774 [00:21<02:28,  4.65it/s][A
+ 11%|█         | 85/774 [00:21<02:36,  4.39it/s][A
+ 11%|█         | 86/774 [00:22<02:43,  4.20it/s][A
+ 11%|█         | 87/774 [00:22<02:43,  4.21it/s][A
+ 11%|█▏        | 88/774 [00:22<02:32,  4.49it/s][A
+ 11%|█▏        | 89/774 [00:22<02:27,  4.66it/s][A
+ 12%|█▏        | 90/774 [00:23<02:35,  4.41it/s][A
+ 12%|█▏        | 91/774 [00:23<02:49,  4.03it/s][A
+ 12%|█▏        | 92/774 [00:23<03:01,  3.76it/s][A
+ 12%|█▏        | 93/774 [00:23<02:57,  3.83it/s][A
+ 12%|█▏        | 94/774 [00:24<03:02,  3.72it/s][A
+ 12%|█▏        | 95/774 [00:24<03:01,  3.75it/s][A
+ 12%|█▏        | 96/774 [00:24<02:55,  3.86it/s][A
+ 13%|█▎        | 97/774 [00:24<02:40,  4.21it/s][A
+ 13%|█▎        | 98/774 [00:25<02:33,  4.40it/s][A
+ 13%|█▎        | 99/774 [00:25<02:46,  4.06it/s][A
+ 13%|█▎        | 100/774 [00:25<02:58,  3.79it/s][A
+ 13%|█▎        | 101/774 [00:26<03:01,  3.70it/s][A
+ 13%|█▎        | 102/774 [00:26<03:14,  3.45it/s][A
+ 13%|█▎        | 103/774 [00:26<03:17,  3.40it/s][A
+ 13%|█▎        | 104/774 [00:26<03:15,  3.42it/s][A
+ 14%|█▎        | 105/774 [00:27<03:14,  3.44it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.11it/s][A
+ 14%|█▍        | 107/774 [00:28<03:47,  2.94it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.06it/s][A
+ 14%|█▍        | 109/774 [00:28<03:35,  3.09it/s][A
+ 14%|█▍        | 110/774 [00:28<03:25,  3.23it/s][A
+ 14%|█▍        | 111/774 [00:29<03:24,  3.24it/s][A
+ 14%|█▍        | 112/774 [00:29<03:13,  3.42it/s][A
+ 15%|█▍        | 113/774 [00:29<03:19,  3.32it/s][A
+ 15%|█▍        | 114/774 [00:30<03:23,  3.24it/s][A
+ 15%|█▍        | 115/774 [00:30<03:17,  3.34it/s][A
+ 15%|█▍        | 116/774 [00:30<03:01,  3.63it/s][A
+ 15%|█▌        | 117/774 [00:30<03:08,  3.48it/s][A
+ 15%|█▌        | 118/774 [00:31<03:07,  3.50it/s][A
+ 15%|█▌        | 119/774 [00:31<02:59,  3.64it/s][A
+ 16%|█▌        | 120/774 [00:31<03:09,  3.45it/s][A
+ 16%|█▌        | 121/774 [00:32<03:04,  3.54it/s][A
+ 16%|█▌        | 122/774 [00:32<03:07,  3.48it/s][A
+ 16%|█▌        | 123/774 [00:32<02:59,  3.63it/s][A
+ 16%|█▌        | 124/774 [00:32<03:00,  3.59it/s][A
+ 16%|█▌        | 125/774 [00:33<03:02,  3.55it/s][A
+ 16%|█▋        | 126/774 [00:33<03:10,  3.40it/s][A
+ 16%|█▋        | 127/774 [00:33<03:19,  3.25it/s][A
+ 17%|█▋        | 128/774 [00:34<03:09,  3.42it/s][A
+ 17%|█▋        | 129/774 [00:34<03:11,  3.37it/s][A
+ 17%|█▋        | 130/774 [00:34<03:19,  3.24it/s][A
+ 17%|█▋        | 131/774 [00:35<03:09,  3.40it/s][A
+ 17%|█▋        | 132/774 [00:35<03:07,  3.42it/s][A
+ 17%|█▋        | 133/774 [00:35<03:04,  3.47it/s][A
+ 17%|█▋        | 134/774 [00:35<03:03,  3.48it/s][A
+ 17%|█▋        | 135/774 [00:36<03:20,  3.19it/s][A
+ 18%|█▊        | 136/774 [00:36<03:27,  3.08it/s][A
+ 18%|█▊        | 137/774 [00:36<03:26,  3.08it/s][A
+ 18%|█▊        | 138/774 [00:37<03:22,  3.13it/s][A
+ 18%|█▊        | 139/774 [00:37<03:23,  3.12it/s][A
+ 18%|█▊        | 140/774 [00:37<03:19,  3.19it/s][A
+ 18%|█▊        | 141/774 [00:38<03:11,  3.31it/s][A
+ 18%|█▊        | 142/774 [00:38<03:21,  3.14it/s][A
+ 18%|█▊        | 143/774 [00:38<03:18,  3.19it/s][A
+ 19%|█▊        | 144/774 [00:39<03:07,  3.36it/s][A
+ 19%|█▊        | 145/774 [00:39<03:00,  3.49it/s][A
+ 19%|█▉        | 146/774 [00:39<02:49,  3.70it/s][A
+ 19%|█▉        | 147/774 [00:39<02:40,  3.90it/s][A
+ 19%|█▉        | 148/774 [00:40<02:49,  3.70it/s][A
+ 19%|█▉        | 149/774 [00:40<03:03,  3.41it/s][A
+ 19%|█▉        | 150/774 [00:40<03:06,  3.35it/s][A
+ 20%|█▉        | 151/774 [00:40<02:56,  3.54it/s][A
+ 20%|█▉        | 152/774 [00:41<02:45,  3.75it/s][A
+ 20%|█▉        | 153/774 [00:41<02:53,  3.57it/s][A
+ 20%|█▉        | 154/774 [00:41<02:49,  3.65it/s][A
+ 20%|██        | 155/774 [00:42<02:46,  3.71it/s][A
+ 20%|██        | 156/774 [00:42<02:41,  3.82it/s][A
+ 20%|██        | 157/774 [00:42<02:35,  3.97it/s][A
+ 20%|██        | 158/774 [00:42<02:39,  3.85it/s][A
+ 21%|██        | 159/774 [00:43<02:41,  3.80it/s][A
+ 21%|██        | 160/774 [00:43<02:33,  4.01it/s][A
+ 21%|██        | 161/774 [00:43<02:41,  3.80it/s][A
+ 21%|██        | 162/774 [00:43<02:47,  3.65it/s][A
+ 21%|██        | 163/774 [00:44<02:47,  3.65it/s][A
+ 21%|██        | 164/774 [00:44<02:41,  3.77it/s][A
+ 21%|██▏       | 165/774 [00:44<02:38,  3.84it/s][A
+ 21%|██▏       | 166/774 [00:44<02:42,  3.74it/s][A
+ 22%|██▏       | 167/774 [00:45<02:45,  3.67it/s][A
+ 22%|██▏       | 168/774 [00:45<02:36,  3.88it/s][A
+ 22%|██▏       | 169/774 [00:45<02:28,  4.06it/s][A
+ 22%|██▏       | 170/774 [00:45<02:37,  3.83it/s][A
+ 22%|██▏       | 171/774 [00:46<02:48,  3.58it/s][A
+ 22%|██▏       | 172/774 [00:46<02:55,  3.43it/s][A
+ 22%|██▏       | 173/774 [00:46<02:51,  3.51it/s][A
+ 22%|██▏       | 174/774 [00:47<02:44,  3.66it/s][A
+ 23%|██▎       | 175/774 [00:47<02:44,  3.63it/s][A
+ 23%|██▎       | 176/774 [00:47<02:38,  3.78it/s][A
+ 23%|██▎       | 177/774 [00:47<02:51,  3.47it/s][A
+ 23%|██▎       | 178/774 [00:48<02:34,  3.86it/s][A
+ 23%|██▎       | 179/774 [00:48<02:21,  4.19it/s][A
+ 23%|██▎       | 180/774 [00:48<02:17,  4.33it/s][A
+ 23%|██▎       | 181/774 [00:48<02:20,  4.22it/s][A
+ 24%|██▎       | 182/774 [00:49<02:23,  4.12it/s][A
+ 24%|██▎       | 183/774 [00:49<02:23,  4.12it/s][A
+ 24%|██▍       | 184/774 [00:49<02:34,  3.82it/s][A
+ 24%|██▍       | 185/774 [00:49<02:43,  3.61it/s][A
+ 24%|██▍       | 186/774 [00:50<02:42,  3.63it/s][A
+ 24%|██▍       | 187/774 [00:50<02:35,  3.78it/s][A
+ 24%|██▍       | 188/774 [00:50<02:34,  3.79it/s][A
+ 24%|██▍       | 189/774 [00:50<02:33,  3.81it/s][A
+ 25%|██▍       | 190/774 [00:51<02:29,  3.91it/s][A
+ 25%|██▍       | 191/774 [00:51<02:34,  3.78it/s][A
+ 25%|██▍       | 192/774 [00:51<02:37,  3.70it/s][A
+ 25%|██▍       | 193/774 [00:52<02:40,  3.63it/s][A
+ 25%|██▌       | 194/774 [00:52<02:49,  3.43it/s][A
+ 25%|██▌       | 195/774 [00:52<02:57,  3.26it/s][A
+ 25%|██▌       | 196/774 [00:53<02:57,  3.26it/s][A
+ 25%|██▌       | 197/774 [00:53<02:55,  3.29it/s][A
+ 26%|██▌       | 198/774 [00:53<02:45,  3.48it/s][A
+ 26%|██▌       | 199/774 [00:53<02:46,  3.46it/s][A
+ 26%|██▌       | 200/774 [00:54<02:39,  3.60it/s][A
+ 26%|██▌       | 201/774 [00:54<02:36,  3.66it/s][A
+ 26%|██▌       | 202/774 [00:54<02:34,  3.71it/s][A
+ 26%|██▌       | 203/774 [00:54<02:27,  3.88it/s][A
+ 26%|██▋       | 204/774 [00:55<02:31,  3.77it/s][A
+ 26%|██▋       | 205/774 [00:55<02:40,  3.53it/s][A
+ 27%|██▋       | 206/774 [00:55<02:36,  3.64it/s][A
+ 27%|██▋       | 207/774 [00:55<02:33,  3.70it/s][A
+ 27%|██▋       | 208/774 [00:56<02:33,  3.68it/s][A
+ 27%|██▋       | 209/774 [00:56<02:32,  3.71it/s][A
+ 27%|██▋       | 210/774 [00:56<02:31,  3.72it/s][A
+ 27%|██▋       | 211/774 [00:57<02:28,  3.80it/s][A
+ 27%|██▋       | 212/774 [00:57<02:17,  4.10it/s][A
+ 28%|██▊       | 213/774 [00:57<02:02,  4.60it/s][A
+ 28%|██▊       | 214/774 [00:57<02:03,  4.55it/s][A
+ 28%|██▊       | 215/774 [00:57<02:02,  4.55it/s][A
+ 28%|██▊       | 216/774 [00:58<02:01,  4.58it/s][A
+ 28%|██▊       | 217/774 [00:58<02:05,  4.42it/s][A
+ 28%|██▊       | 218/774 [00:58<02:12,  4.21it/s][A
+ 28%|██▊       | 219/774 [00:58<02:20,  3.94it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.97it/s][A
+ 29%|██▊       | 221/774 [00:59<02:25,  3.81it/s][A
+ 29%|██▊       | 222/774 [00:59<02:34,  3.58it/s][A
+ 29%|██▉       | 223/774 [01:00<02:51,  3.22it/s][A
+ 29%|██▉       | 224/774 [01:00<03:00,  3.05it/s][A
+ 29%|██▉       | 225/774 [01:00<03:11,  2.86it/s][A
+ 29%|██▉       | 226/774 [01:01<03:16,  2.79it/s][A
+ 29%|██▉       | 227/774 [01:01<03:13,  2.83it/s][A
+ 29%|██▉       | 228/774 [01:01<03:05,  2.94it/s][A
+ 30%|██▉       | 229/774 [01:02<03:19,  2.73it/s][A
+ 30%|██▉       | 230/774 [01:02<03:05,  2.93it/s][A
+ 30%|██▉       | 231/774 [01:02<03:02,  2.97it/s][A
+ 30%|██▉       | 232/774 [01:03<02:53,  3.12it/s][A
+ 30%|███       | 233/774 [01:03<03:09,  2.85it/s][A
+ 30%|███       | 234/774 [01:03<03:12,  2.80it/s][A
+ 30%|███       | 235/774 [01:04<03:11,  2.81it/s][A
+ 30%|███       | 236/774 [01:04<03:13,  2.78it/s][A
+ 31%|███       | 237/774 [01:05<03:10,  2.81it/s][A
+ 31%|███       | 238/774 [01:05<03:01,  2.96it/s][A
+ 31%|███       | 239/774 [01:05<02:59,  2.97it/s][A
+ 31%|███       | 240/774 [01:06<02:59,  2.97it/s][A
+ 31%|███       | 241/774 [01:06<03:02,  2.92it/s][A
+ 31%|███▏      | 242/774 [01:06<03:12,  2.76it/s][A
+ 31%|███▏      | 243/774 [01:07<03:22,  2.62it/s][A
+ 32%|███▏      | 244/774 [01:07<03:16,  2.69it/s][A
+ 32%|███▏      | 245/774 [01:07<03:08,  2.81it/s][A
+ 32%|███▏      | 246/774 [01:08<03:06,  2.83it/s][A
+ 32%|███▏      | 247/774 [01:08<03:45,  2.34it/s][A
+ 32%|███▏      | 248/774 [01:09<03:50,  2.28it/s][A
+ 32%|███▏      | 249/774 [01:09<03:27,  2.53it/s][A
+ 32%|███▏      | 250/774 [01:09<03:20,  2.61it/s][A
+ 32%|███▏      | 251/774 [01:10<03:19,  2.62it/s][A
+ 33%|███▎      | 252/774 [01:10<03:15,  2.67it/s][A
+ 33%|███▎      | 253/774 [01:11<03:14,  2.68it/s][A
+ 33%|███▎      | 254/774 [01:11<03:08,  2.75it/s][A
+ 33%|███▎      | 255/774 [01:11<03:03,  2.83it/s][A
+ 33%|███▎      | 256/774 [01:12<02:59,  2.89it/s][A
+ 33%|███▎      | 257/774 [01:12<02:57,  2.92it/s][A
+ 33%|███▎      | 258/774 [01:12<02:42,  3.17it/s][A
+ 33%|███▎      | 259/774 [01:12<02:24,  3.57it/s][A
+ 34%|███▎      | 260/774 [01:13<02:24,  3.56it/s][A
+ 34%|███▎      | 261/774 [01:13<02:29,  3.44it/s][A
+ 34%|███▍      | 262/774 [01:13<02:14,  3.82it/s][A
+ 34%|███▍      | 263/774 [01:13<02:06,  4.04it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.78it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.93it/s][A
+ 34%|███▍      | 266/774 [01:14<02:03,  4.13it/s][A
+ 34%|███▍      | 267/774 [01:14<02:01,  4.16it/s][A
+ 35%|███▍      | 268/774 [01:15<02:09,  3.92it/s][A
+ 35%|███▍      | 269/774 [01:15<02:14,  3.75it/s][A
+ 35%|███▍      | 270/774 [01:15<02:20,  3.59it/s][A
+ 35%|███▌      | 271/774 [01:15<02:16,  3.68it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  4.00it/s][A
+ 35%|███▌      | 273/774 [01:16<02:00,  4.16it/s][A
+ 35%|███▌      | 274/774 [01:16<02:05,  3.98it/s][A
+ 36%|███▌      | 275/774 [01:16<01:59,  4.18it/s][A
+ 36%|███▌      | 276/774 [01:17<01:53,  4.40it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.25it/s][A
+ 36%|███▌      | 278/774 [01:17<01:58,  4.20it/s][A
+ 36%|███▌      | 279/774 [01:17<01:52,  4.39it/s][A
+ 36%|███▌      | 280/774 [01:18<01:54,  4.30it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.93it/s][A
+ 36%|███▋      | 282/774 [01:18<02:16,  3.60it/s][A
+ 37%|███▋      | 283/774 [01:18<02:15,  3.62it/s][A
+ 37%|███▋      | 284/774 [01:19<02:15,  3.62it/s][A
+ 37%|███▋      | 285/774 [01:19<02:07,  3.84it/s][A
+ 37%|███▋      | 286/774 [01:19<02:01,  4.00it/s][A
+ 37%|███▋      | 287/774 [01:19<02:11,  3.69it/s][A
+ 37%|███▋      | 288/774 [01:20<02:16,  3.57it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.62it/s][A
+ 37%|███▋      | 290/774 [01:20<02:09,  3.73it/s][A
+ 38%|███▊      | 291/774 [01:21<02:08,  3.77it/s][A
+ 38%|███▊      | 292/774 [01:21<02:05,  3.85it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.18it/s][A
+ 38%|███▊      | 294/774 [01:21<01:51,  4.31it/s][A
+ 38%|███▊      | 295/774 [01:21<01:49,  4.36it/s][A
+ 38%|███▊      | 296/774 [01:22<01:44,  4.57it/s][A
+ 38%|███▊      | 297/774 [01:22<01:40,  4.74it/s][A
+ 39%|███▊      | 298/774 [01:22<01:45,  4.53it/s][A
+ 39%|███▊      | 299/774 [01:22<01:48,  4.36it/s][A
+ 39%|███▉      | 300/774 [01:23<01:55,  4.11it/s][A
+ 39%|███▉      | 301/774 [01:23<01:47,  4.39it/s][A
+ 39%|███▉      | 302/774 [01:23<01:41,  4.67it/s][A
+ 39%|███▉      | 303/774 [01:23<01:38,  4.79it/s][A
+ 39%|███▉      | 304/774 [01:23<01:27,  5.39it/s][A
+ 39%|███▉      | 305/774 [01:23<01:26,  5.41it/s][A
+ 40%|███▉      | 306/774 [01:24<01:39,  4.71it/s][A
+ 40%|███▉      | 307/774 [01:24<01:44,  4.47it/s][A
+ 40%|███▉      | 308/774 [01:24<01:38,  4.74it/s][A
+ 40%|███▉      | 309/774 [01:24<01:39,  4.68it/s][A
+ 40%|████      | 310/774 [01:25<01:44,  4.43it/s][A
+ 40%|████      | 311/774 [01:25<01:43,  4.49it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.63it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.64it/s][A
+ 41%|████      | 314/774 [01:26<01:40,  4.56it/s][A
+ 41%|████      | 315/774 [01:26<01:49,  4.19it/s][A
+ 41%|████      | 316/774 [01:26<01:40,  4.56it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.90it/s][A
+ 41%|████      | 318/774 [01:26<01:37,  4.69it/s][A
+ 41%|████      | 319/774 [01:27<01:38,  4.61it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.56it/s][A
+ 41%|████▏     | 321/774 [01:27<01:31,  4.94it/s][A
+ 42%|████▏     | 322/774 [01:27<01:26,  5.25it/s][A
+ 42%|████▏     | 323/774 [01:27<01:17,  5.82it/s][A
+ 42%|████▏     | 324/774 [01:28<01:24,  5.34it/s][A
+ 42%|████▏     | 325/774 [01:28<01:27,  5.13it/s][A
+ 42%|████▏     | 326/774 [01:28<01:24,  5.28it/s][A
+ 42%|████▏     | 327/774 [01:28<01:28,  5.06it/s][A
+ 42%|████▏     | 328/774 [01:28<01:26,  5.17it/s][A
+ 43%|████▎     | 329/774 [01:29<01:34,  4.70it/s][A
+ 43%|████▎     | 330/774 [01:29<01:30,  4.91it/s][A
+ 43%|████▎     | 331/774 [01:29<01:20,  5.48it/s][A
+ 43%|████▎     | 332/774 [01:29<01:19,  5.59it/s][A
+ 43%|████▎     | 333/774 [01:29<01:22,  5.35it/s][A
+ 43%|████▎     | 334/774 [01:29<01:26,  5.08it/s][A
+ 43%|████▎     | 335/774 [01:30<01:27,  5.04it/s][A
+ 43%|████▎     | 336/774 [01:30<01:26,  5.07it/s][A
+ 44%|████▎     | 337/774 [01:30<01:19,  5.49it/s][A
+ 44%|████▎     | 338/774 [01:30<01:14,  5.83it/s][A
+ 44%|████▍     | 339/774 [01:30<01:10,  6.17it/s][A
+ 44%|████▍     | 340/774 [01:30<01:10,  6.14it/s][A
+ 44%|████▍     | 341/774 [01:31<01:28,  4.89it/s][A
+ 44%|████▍     | 342/774 [01:31<01:37,  4.42it/s][A
+ 44%|████▍     | 343/774 [01:31<01:37,  4.43it/s][A
+ 44%|████▍     | 344/774 [01:32<01:41,  4.24it/s][A
+ 45%|████▍     | 345/774 [01:32<01:44,  4.10it/s][A
+ 45%|████▍     | 346/774 [01:32<01:46,  4.01it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.11it/s][A
+ 45%|████▍     | 348/774 [01:32<01:39,  4.28it/s][A
+ 45%|████▌     | 349/774 [01:33<01:35,  4.46it/s][A
+ 45%|████▌     | 350/774 [01:33<01:39,  4.27it/s][A
+ 45%|████▌     | 351/774 [01:33<01:38,  4.27it/s][A
+ 45%|████▌     | 352/774 [01:33<01:34,  4.45it/s][A
+ 46%|████▌     | 353/774 [01:34<01:33,  4.51it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.49it/s][A
+ 46%|████▌     | 355/774 [01:34<01:38,  4.24it/s][A
+ 46%|████▌     | 356/774 [01:34<01:48,  3.85it/s][A
+ 46%|████▌     | 357/774 [01:35<02:04,  3.36it/s][A
+ 46%|████▋     | 358/774 [01:35<02:08,  3.23it/s][A
+ 46%|████▋     | 359/774 [01:35<02:07,  3.25it/s][A
+ 47%|████▋     | 360/774 [01:36<02:07,  3.24it/s][A
+ 47%|████▋     | 361/774 [01:36<02:01,  3.41it/s][A
+ 47%|████▋     | 362/774 [01:36<02:07,  3.23it/s][A
+ 47%|████▋     | 363/774 [01:37<02:06,  3.24it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.19it/s][A
+ 47%|████▋     | 365/774 [01:37<02:04,  3.27it/s][A
+ 47%|████▋     | 366/774 [01:37<01:55,  3.53it/s][A
+ 47%|████▋     | 367/774 [01:38<01:50,  3.69it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.77it/s][A
+ 48%|████▊     | 369/774 [01:38<01:54,  3.54it/s][A
+ 48%|████▊     | 370/774 [01:39<02:09,  3.13it/s][A
+ 48%|████▊     | 371/774 [01:39<02:00,  3.34it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.33it/s][A
+ 48%|████▊     | 373/774 [01:40<01:59,  3.36it/s][A
+ 48%|████▊     | 374/774 [01:40<01:56,  3.45it/s][A
+ 48%|████▊     | 375/774 [01:40<01:56,  3.43it/s][A
+ 49%|████▊     | 376/774 [01:40<02:00,  3.29it/s][A
+ 49%|████▊     | 377/774 [01:41<02:12,  2.99it/s][A
+ 49%|████▉     | 378/774 [01:41<02:13,  2.96it/s][A
+ 49%|████▉     | 379/774 [01:41<02:04,  3.18it/s][A
+ 49%|████▉     | 380/774 [01:42<01:54,  3.46it/s][A
+ 49%|████▉     | 381/774 [01:42<01:44,  3.75it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.85it/s][A
+ 49%|████▉     | 383/774 [01:42<01:40,  3.90it/s][A
+ 50%|████▉     | 384/774 [01:43<01:47,  3.61it/s][A
+ 50%|████▉     | 385/774 [01:43<01:55,  3.37it/s][A
+ 50%|████▉     | 386/774 [01:43<01:48,  3.57it/s][A
+ 50%|█████     | 387/774 [01:44<01:42,  3.79it/s][A
+ 50%|█████     | 388/774 [01:44<01:47,  3.59it/s][A
+ 50%|█████     | 389/774 [01:44<01:44,  3.70it/s][A
+ 50%|█████     | 390/774 [01:44<01:57,  3.27it/s][A
+ 51%|█████     | 391/774 [01:45<01:58,  3.23it/s][A
+ 51%|█████     | 392/774 [01:45<01:49,  3.50it/s][A
+ 51%|█████     | 393/774 [01:45<01:40,  3.79it/s][A
+ 51%|█████     | 394/774 [01:46<01:41,  3.76it/s][A
+ 51%|█████     | 395/774 [01:46<01:48,  3.49it/s][A
+ 51%|█████     | 396/774 [01:46<01:45,  3.58it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:48,  3.46it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:43,  3.62it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:42,  3.67it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:34,  3.94it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:31,  4.08it/s][A
+ 52%|█████▏    | 402/774 [01:48<01:30,  4.10it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:34,  3.92it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:41,  3.66it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:37,  3.79it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:39,  3.68it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:45,  3.46it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:42,  3.58it/s][A
+ 53%|█████▎    | 409/774 [01:50<01:38,  3.69it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.64it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:40,  3.62it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:41,  3.57it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:39,  3.63it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:36,  3.72it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:24,  4.26it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:25,  4.20it/s][A
+ 54%|█████▍    | 417/774 [01:52<01:23,  4.26it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.58it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:32,  3.83it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:36,  3.68it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:36,  3.66it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:36,  3.65it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:37,  3.60it/s][A
+ 55%|█████▍    | 424/774 [01:54<01:34,  3.70it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:24,  4.15it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:18,  4.45it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:14,  4.65it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:16,  4.51it/s][A
+ 55%|█████▌    | 429/774 [01:55<01:19,  4.37it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:23,  4.12it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:36,  3.54it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:36,  3.55it/s][A
+ 56%|█████▌    | 433/774 [01:56<01:28,  3.83it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:23,  4.10it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:22,  4.13it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:23,  4.06it/s][A
+ 56%|█████▋    | 437/774 [01:57<01:19,  4.23it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:15,  4.44it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:17,  4.32it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:21,  4.09it/s][A
+ 57%|█████▋    | 441/774 [01:58<01:25,  3.88it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:27,  3.79it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:25,  3.88it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:24,  3.92it/s][A
+ 57%|█████▋    | 445/774 [01:59<01:24,  3.89it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:22,  3.99it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:20,  4.06it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:12,  4.47it/s][A
+ 58%|█████▊    | 449/774 [02:00<01:13,  4.39it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:16,  4.21it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:14,  4.32it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:11,  4.52it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:10,  4.56it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:17,  4.15it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:21,  3.90it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:25,  3.71it/s][A
+ 59%|█████▉    | 457/774 [02:02<01:19,  3.97it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:18,  4.00it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:17,  4.07it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:22,  3.80it/s][A
+ 60%|█████▉    | 461/774 [02:03<01:29,  3.49it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:25,  3.63it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:23,  3.73it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:23,  3.72it/s][A
+ 60%|██████    | 465/774 [02:04<01:15,  4.11it/s][A
+ 60%|██████    | 466/774 [02:04<01:12,  4.25it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.51it/s][A
+ 60%|██████    | 468/774 [02:04<01:08,  4.46it/s][A
+ 61%|██████    | 469/774 [02:04<01:03,  4.84it/s][A
+ 61%|██████    | 470/774 [02:05<01:00,  5.03it/s][A
+ 61%|██████    | 471/774 [02:05<01:03,  4.81it/s][A
+ 61%|██████    | 472/774 [02:05<01:07,  4.45it/s][A
+ 61%|██████    | 473/774 [02:05<01:10,  4.26it/s][A
+ 61%|██████    | 474/774 [02:06<01:09,  4.34it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:10,  4.25it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:18,  3.79it/s][A
+ 62%|██████▏   | 477/774 [02:07<01:32,  3.21it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:33,  3.16it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:31,  3.23it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:28,  3.33it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:29,  3.29it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:27,  3.34it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:25,  3.42it/s][A
+ 63%|██████▎   | 484/774 [02:09<01:26,  3.36it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:28,  3.27it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:24,  3.39it/s][A
+ 63%|██████▎   | 487/774 [02:10<01:26,  3.33it/s][A
+ 63%|██████▎   | 488/774 [02:10<01:23,  3.42it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:18,  3.63it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:18,  3.62it/s][A
+ 63%|██████▎   | 491/774 [02:11<01:17,  3.67it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.57it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:19,  3.54it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:18,  3.58it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:18,  3.57it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:27,  3.17it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:27,  3.16it/s][A
+ 64%|██████▍   | 498/774 [02:13<01:25,  3.23it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:24,  3.26it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:22,  3.34it/s][A
+ 65%|██████▍   | 501/774 [02:14<01:18,  3.50it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:17,  3.53it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:22,  3.28it/s][A
+ 65%|██████▌   | 504/774 [02:15<01:23,  3.24it/s][A
+ 65%|██████▌   | 505/774 [02:15<01:20,  3.36it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:19,  3.37it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:24,  3.16it/s][A
+ 66%|██████▌   | 508/774 [02:16<01:21,  3.26it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:20,  3.30it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.39it/s][A
+ 66%|██████▌   | 511/774 [02:17<01:13,  3.56it/s][A
+ 66%|██████▌   | 512/774 [02:17<01:11,  3.66it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.50it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:17,  3.37it/s][A
+ 67%|██████▋   | 515/774 [02:18<01:23,  3.11it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:17,  3.33it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:11,  3.61it/s][A
+ 67%|██████▋   | 518/774 [02:19<01:08,  3.73it/s][A
+ 67%|██████▋   | 519/774 [02:19<01:11,  3.58it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:10,  3.62it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.72it/s][A
+ 67%|██████▋   | 522/774 [02:20<01:04,  3.93it/s][A
+ 68%|██████▊   | 523/774 [02:20<01:02,  4.02it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:06,  3.76it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:06,  3.73it/s][A
+ 68%|██████▊   | 526/774 [02:21<01:09,  3.55it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:11,  3.46it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.49it/s][A
+ 68%|██████▊   | 529/774 [02:22<01:06,  3.70it/s][A
+ 68%|██████▊   | 530/774 [02:22<01:05,  3.74it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:04,  3.75it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:02,  3.85it/s][A
+ 69%|██████▉   | 533/774 [02:23<00:59,  4.05it/s][A
+ 69%|██████▉   | 534/774 [02:23<00:55,  4.29it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:58,  4.07it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.90it/s][A
+ 69%|████���█▉   | 537/774 [02:24<01:01,  3.83it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.60it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:04,  3.62it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.63it/s][A
+ 70%|██████▉   | 541/774 [02:25<01:02,  3.75it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.73it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.64it/s][A
+ 70%|███████   | 544/774 [02:26<01:03,  3.63it/s][A
+ 70%|███████   | 545/774 [02:26<01:01,  3.74it/s][A
+ 71%|███████   | 546/774 [02:26<00:57,  3.94it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.11it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.14it/s][A
+ 71%|███████   | 549/774 [02:27<00:55,  4.05it/s][A
+ 71%|███████   | 550/774 [02:27<00:59,  3.79it/s][A
+ 71%|███████   | 551/774 [02:27<01:01,  3.62it/s][A
+ 71%|███████▏  | 552/774 [02:28<01:04,  3.42it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:08,  3.21it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:07,  3.25it/s][A
+ 72%|███████▏  | 555/774 [02:29<01:06,  3.28it/s][A
+ 72%|███████▏  | 556/774 [02:29<01:03,  3.44it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:07,  3.24it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:01,  3.53it/s][A
+ 72%|███████▏  | 559/774 [02:30<00:56,  3.82it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.52it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.71it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.03it/s][A
+ 73%|███████▎  | 563/774 [02:31<00:50,  4.17it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:51,  4.05it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:54,  3.87it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:50,  4.16it/s][A
+ 73%|███████▎  | 567/774 [02:32<00:46,  4.50it/s][A
+ 73%|███████▎  | 568/774 [02:32<00:47,  4.31it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.24it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.23it/s][A
+ 74%|███████▍  | 571/774 [02:33<00:52,  3.89it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.72it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.74it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.78it/s][A
+ 74%|███████▍  | 575/774 [02:34<00:52,  3.76it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:58,  3.40it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:56,  3.46it/s][A
+ 75%|███████▍  | 578/774 [02:35<00:55,  3.55it/s][A
+ 75%|███████▍  | 579/774 [02:35<00:57,  3.41it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:56,  3.44it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:55,  3.49it/s][A
+ 75%|███████▌  | 582/774 [02:36<00:53,  3.59it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:51,  3.73it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.76it/s][A
+ 76%|███████▌  | 585/774 [02:37<00:52,  3.59it/s][A
+ 76%|███████▌  | 586/774 [02:37<00:52,  3.57it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.64it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.70it/s][A
+ 76%|███████▌  | 589/774 [02:38<00:49,  3.77it/s][A
+ 76%|███████▌  | 590/774 [02:38<00:45,  4.02it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:47,  3.89it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.64it/s][A
+ 77%|███████▋  | 593/774 [02:39<00:50,  3.59it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.57it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:54,  3.31it/s][A
+ 77%|███████▋  | 596/774 [02:40<00:56,  3.15it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.12it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.06it/s][A
+ 77%|███████▋  | 599/774 [02:41<00:58,  3.01it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:57,  3.01it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:58,  2.97it/s][A
+ 78%|███████▊  | 602/774 [02:42<00:58,  2.95it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:57,  2.99it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.94it/s][A
+ 78%|███████▊  | 605/774 [02:43<00:56,  3.02it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:57,  2.92it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:57,  2.90it/s][A
+ 79%|███████▊  | 608/774 [02:44<00:57,  2.88it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:55,  2.98it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:56,  2.90it/s][A
+ 79%|███████▉  | 611/774 [02:45<01:00,  2.68it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:02,  2.59it/s][A
+ 79%|███████▉  | 613/774 [02:46<00:57,  2.80it/s][A
+ 79%|███████▉  | 614/774 [02:46<00:55,  2.87it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.04it/s][A
+ 80%|███████▉  | 616/774 [02:47<00:51,  3.08it/s][A
+ 80%|███████▉  | 617/774 [02:47<00:50,  3.09it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:48,  3.24it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.43it/s][A
+ 80%|████████  | 620/774 [02:48<00:44,  3.44it/s][A
+ 80%|████████  | 621/774 [02:48<00:41,  3.72it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.97it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.93it/s][A
+ 81%|████████  | 624/774 [02:49<00:41,  3.61it/s][A
+ 81%|████████  | 625/774 [02:49<00:41,  3.55it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.30it/s][A
+ 81%|████████  | 627/774 [02:50<00:45,  3.23it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.22it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.33it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.55it/s][A
+ 82%|████████▏ | 631/774 [02:51<00:38,  3.74it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:38,  3.74it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.55it/s][A
+ 82%|████████▏ | 634/774 [02:52<00:40,  3.45it/s][A
+ 82%|████████▏ | 635/774 [02:52<00:39,  3.52it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:40,  3.45it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:39,  3.50it/s][A
+ 82%|████████▏ | 638/774 [02:53<00:39,  3.47it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.10it/s][A
+ 83%|████████▎ | 640/774 [02:54<00:49,  2.68it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.70it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.90it/s][A
+ 83%|████████▎ | 643/774 [02:55<00:45,  2.91it/s][A
+ 83%|████████▎ | 644/774 [02:55<00:41,  3.13it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:37,  3.41it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.64it/s][A
+ 84%|████████▎ | 647/774 [02:56<00:32,  3.94it/s][A
+ 84%|████████▎ | 648/774 [02:56<00:30,  4.07it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.09it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.28it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:29,  4.23it/s][A
+ 84%|████████▍ | 652/774 [02:57<00:29,  4.16it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.88it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.11it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:27,  4.39it/s][A
+ 85%|████████▍ | 656/774 [02:58<00:27,  4.22it/s][A
+ 85%|████████▍ | 657/774 [02:58<00:26,  4.48it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.25it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.89it/s][A
+ 85%|████████▌ | 660/774 [02:59<00:30,  3.75it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.71it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.88it/s][A
+ 86%|████████▌ | 663/774 [03:00<00:30,  3.65it/s][A
+ 86%|████████▌ | 664/774 [03:00<00:30,  3.62it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:27,  3.90it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:24,  4.36it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.62it/s][A
+ 86%|████████▋ | 668/774 [03:01<00:23,  4.45it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.18it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.31it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.95it/s][A
+ 87%|████████▋ | 672/774 [03:02<00:25,  4.02it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:25,  4.02it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:25,  3.98it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.20it/s][A
+ 87%|████████▋ | 676/774 [03:03<00:22,  4.45it/s][A
+ 87%|████████▋ | 677/774 [03:03<00:21,  4.43it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.47it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.22it/s][A
+ 88%|████████▊ | 680/774 [03:04<00:22,  4.18it/s][A
+ 88%|████████▊ | 681/774 [03:04<00:20,  4.49it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.53it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:21,  4.15it/s][A
+ 88%|████████▊ | 684/774 [03:05<00:23,  3.89it/s][A
+ 89%|████████▊ | 685/774 [03:05<00:24,  3.71it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:22,  3.83it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.04it/s][A
+ 89%|████████▉ | 688/774 [03:06<00:21,  4.04it/s][A
+ 89%|████████▉ | 689/774 [03:06<00:20,  4.18it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.28it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.44it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.47it/s][A
+ 90%|████████▉ | 693/774 [03:07<00:18,  4.45it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.17it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.80it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.93it/s][A
+ 90%|█████████ | 697/774 [03:08<00:19,  3.90it/s][A
+ 90%|█████████ | 698/774 [03:08<00:17,  4.29it/s][A
+ 90%|█████████ | 699/774 [03:08<00:16,  4.64it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.28it/s][A
+ 91%|█████████ | 701/774 [03:09<00:16,  4.41it/s][A
+ 91%|█████████ | 702/774 [03:09<00:16,  4.38it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.37it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.23it/s][A
+ 91%|█████████ | 705/774 [03:09<00:15,  4.57it/s][A
+ 91%|█████████ | 706/774 [03:10<00:14,  4.74it/s][A
+ 91%|█████████▏| 707/774 [03:10<00:14,  4.65it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  4.90it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.74it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.65it/s][A
+ 92%|█████████▏| 711/774 [03:11<00:12,  4.85it/s][A
+ 92%|█████████▏| 712/774 [03:11<00:12,  5.11it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.94it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.65it/s][A
+ 92%|█████████▏| 715/774 [03:12<00:12,  4.74it/s][A
+ 93%|█████████▎| 716/774 [03:12<00:11,  5.22it/s][A
+ 93%|█████████▎| 717/774 [03:12<00:10,  5.30it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:11,  4.80it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.64it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.95it/s][A
+ 93%|█████████▎| 721/774 [03:13<00:10,  5.22it/s][A
+ 93%|█████████▎| 722/774 [03:13<00:09,  5.63it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.43it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.39it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.51it/s][A
+ 94%|█████████▍| 726/774 [03:14<00:08,  5.56it/s][A
+ 94%|█████████▍| 727/774 [03:14<00:08,  5.34it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.79it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.09it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.40it/s][A
+ 94%|█████████▍| 731/774 [03:15<00:07,  5.39it/s][A
+ 95%|█████████▍| 732/774 [03:15<00:07,  5.54it/s][A
+ 95%|█████████▍| 733/774 [03:15<00:07,  5.54it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.60it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.71it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.79it/s][A
+ 95%|█████████▌| 737/774 [03:16<00:06,  5.78it/s][A
+ 95%|█████████▌| 738/774 [03:16<00:06,  5.57it/s][A
+ 95%|█████████▌| 739/774 [03:16<00:06,  5.50it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.38it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.07it/s][A
+ 96%|█████████▌| 742/774 [03:17<00:06,  5.26it/s][A
+ 96%|█████████▌| 743/774 [03:17<00:05,  5.58it/s][A
+ 96%|█████████▌| 744/774 [03:17<00:05,  5.37it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.45it/s][A
+ 96%|█████████▋| 746/774 [03:18<00:07,  3.86it/s][A
+ 97%|█████████▋| 747/774 [03:18<00:06,  4.06it/s][A
+ 97%|█████████▋| 748/774 [03:18<00:06,  4.33it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.60it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.28it/s][A
+ 97%|█████████▋| 751/774 [03:19<00:05,  4.48it/s][A
+ 97%|█████████▋| 752/774 [03:19<00:04,  4.41it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.70it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.43it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.65it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.49it/s][A
+ 98%|█████████▊| 757/774 [03:20<00:03,  5.30it/s][A
+ 98%|█████████▊| 758/774 [03:20<00:03,  5.20it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.44it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.45it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.89it/s][A
+ 98%|█████████▊| 762/774 [03:21<00:02,  6.00it/s][A
+ 99%|█████████▊| 763/774 [03:21<00:01,  6.18it/s][A
+ 99%|█████████▊| 764/774 [03:21<00:01,  6.31it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.24it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.33it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.54it/s][A
+ 99%|█████████▉| 768/774 [03:22<00:01,  5.50it/s][A
+ 99%|█████████▉| 769/774 [03:22<00:00,  5.21it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.06it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.34it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  5.05it/s][A
+100%|█████████▉| 773/774 [03:23<00:00,  4.93it/s][A                                                      
+                                                 [A 47%|████▋     | 6000/12776 [1:04:25<40:46,  2.77it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.93it/s][A
+                                                 [ASaving model checkpoint to ./checkpoint-6000
+Configuration saved in ./checkpoint-6000/config.json
+Model weights saved in ./checkpoint-6000/model.safetensors
+Feature extractor saved in ./checkpoint-6000/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-6000/tokenizer_config.json
+Special tokens file saved in ./checkpoint-6000/special_tokens_map.json
+added tokens file saved in ./checkpoint-6000/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-4800] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 47%|████▋     | 6001/12776 [1:04:31<120:36:46, 64.09s/it]                                                           47%|████▋     | 6001/12776 [1:04:31<120:36:46, 64.09s/it] 47%|████▋     | 6002/12776 [1:04:32<85:00:39, 45.18s/it]                                                           47%|████▋     | 6002/12776 [1:04:32<85:00:39, 45.18s/it] 47%|████▋     | 6003/12776 [1:04:33<59:58:31, 31.88s/it]                                                          47%|████▋     | 6003/12776 [1:04:33<59:58:31, 31.88s/it] 47%|████▋     | 6004/12776 [1:04:34<42:24:31, 22.54s/it]                                                          47%|████▋     | 6004/12776 [1:04:34<42:24:31, 22.54s/it] 47%|████▋     | 6005/12776 [1:04:35<30:05:00, 15.99s/it]                                                          47%|████▋     | 6005/12776 [1:04:35<30:05:00, 15.99s/it] 47%|████▋     | 6006/12776 [1:04:35<21:26:47, 11.40s/it]                                                          47%|████▋     | 6006/12776 [1:04:35<21:26:47, 11.40s/it] 47%|████▋     | 6007/12776 [1:04:36<15:24:17,  8.19s/it]                                                          47%|████▋     | 6007/12776 [1:04:36<15:24:17,  8.19s/it] 47%|████▋     | 6008/12776 [1:04:37<11:07:21,  5.92s/it]                                                          47%|████▋     | 6008/12776 [1:04:37<11:07:21,  5.92s/it] 47%|████▋     | 6009/12776 [1:04:37<8:06:43,  4.32s/it]                                                          47%|████▋     | 6009/12776 [1:04:37<8:06:43,  4.32s/it] 47%|████▋     | 6010/12776 [1:04:38<5:58:39,  3.18s/it]                                                         47%|████▋     | 6010/12776 [1:04:38<5:58:39,  3.18s/it] 47%|████▋     | 6011/12776 [1:04:38<4:29:27,  2.39s/it]                                                         47%|████▋     | 6011/12776 [1:04:38<4:29:27,  2.39s/it] 47%|████▋     | 6012/12776 [1:04:39<3:24:49,  1.82s/it]                                                         47%|████▋     | 6012/12776 [1:04:39<3:24:49,  1.82s/it] 47%|████▋     | 6013/12776 [1:04:39<2:41:23,  1.43s/it]                                                         47%|████▋     | 6013/12776 [1:04:39<2:41:23,  1.43s/it] 47%|████▋     | 6014/12776 [1:04:40<2:07:11,  1.13s/it]                                                         47%|████▋     | 6014/12776 [1:04:40<2:07:11,  1.13s/it] 47%|████▋     | 6015/12776 [1:04:40<1:48:13,  1.04it/s]                                                         47%|████▋     | 6015/12776 [1:04:40<1:48:13,  1.04it/s] 47%|████▋     | 6016/12776 [1:04:41<1:28:37,  1.27it/s]                                                         47%|████▋     | 6016/12776 [1:04:41<1:28:37,  1.27it/s] 47%|████▋     | 6017/12776 [1:04:41<1:14:25,  1.51it/s]                                                         47%|████▋     | 6017/12776 [1:04:41<1:14:25,  1.51it/s] 47%|████▋     | 6018/12776 [1:04:42<1:05:58,  1.71it/s]                                                         47%|████▋     | 6018/12776 [1:04:42<1:05:58,  1.71it/s] 47%|████▋     | 6019/12776 [1:04:42<57:24,  1.96it/s]                                                         47%|████▋     | 6019/12776 [1:04:42<57:24,  1.96it/s] 47%|████▋     | 6020/12776 [1:04:42<51:14,  2.20it/s]                                                       47%|████▋     | 6020/12776 [1:04:42<51:14,  2.20it/s] 47%|████▋     | 6021/12776 [1:04:43<46:41,  2.41it/s]                                                       47%|████▋     | 6021/12776 [1:04:43<46:41,  2.41it/s] 47%|████▋     | 6022/12776 [1:04:43<45:04,  2.50it/s]                                                       47%|████▋     | 6022/12776 [1:04:43<45:04,  2.50it/s] 47%|████▋     | 6023/12776 [1:04:43<41:38,  2.70it/s]                                                       47%|████▋     | 6023/12776 [1:04:43<41:38,  2.70it/s] 47%|████▋     | 6024/12776 [1:04:44<38:35,  2.92it/s]                                                       47%|████▋     | 6024/12776 [1:04:44<38:35,  2.92it/s] 47%|████▋     | 6025/12776 [1:04:44<39:08,  2.87it/s]                                                       47%|████▋     | 6025/12776 [1:04:44<39:08,  2.87it/s] 47%|████▋     | 6026/12776 [1:04:44<36:24,  3.09it/s]                                                       47%|████▋     | 6026/12776 [1:04:44<36:24,  3.09it/s] 47%|████▋     | 6027/12776 [1:04:44<34:17,  3.28it/s]                                                       47%|████▋     | 6027/12776 [1:04:44<34:17,  3.28it/s] 47%|████▋     | 6028/12776 [1:04:45<32:35,  3.45it/s]                                                       47%|████▋     | 6028/12776 [1:04:45<32:35,  3.45it/s] 47%|████▋     | 6029/12776 [1:04:45<34:34,  3.25it/s]                                                       47%|████▋     | 6029/12776 [1:04:45<34:34,  3.25it/s] 47%|████▋     | 6030/12776 [1:04:45<32:17,  3.48it/s]                                                       47%|████▋     | 6030/12776 [1:04:45<32:17,  3.48it/s] 47%|████▋     | 6031/12776 [1:04:45<30:27,  3.69it/s]                                                       47%|████▋     | 6031/12776 [1:04:45<30:27,  3.69it/s] 47%|████▋     | 6032/12776 [1:04:46<29:01,  3.87it/s]                                                       47%|████▋     | 6032/12776 [1:04:46<29:01,  3.87it/s] 47%|████▋     | 6033/12776 [1:04:46<27:54,  4.03it/s]                                                       47%|████▋     | 6033/12776 [1:04:46<27:54,  4.03it/s] 47%|████▋     | 6034/12776 [1:04:46<30:52,  3.64it/s]                                                       47%|████▋     | 6034/12776 [1:04:46<30:52,  3.64it/s] 47%|████▋     | 6035/12776 [1:04:47<29:05,  3.86it/s]                                                       47%|████▋     | 6035/12776 [1:04:47<29:05,  3.86it/s] 47%|████▋     | 6036/12776 [1:04:47<27:37,  4.07it/s]                                                       47%|████▋     | 6036/12776 [1:04:47<27:37,  4.07it/s] 47%|████▋     | 6037/12776 [1:04:47<26:24,  4.25it/s]                                                       47%|████▋     | 6037/12776 [1:04:47<26:24,  4.25it/s] 47%|████▋     | 6038/12776 [1:04:47<25:19,  4.43it/s]                                                       47%|████▋     | 6038/12776 [1:04:47<25:19,  4.43it/s] 47%|████▋     | 6039/12776 [1:04:47<27:25,  4.09it/s]                                                       47%|████▋     | 6039/12776 [1:04:47<27:25,  4.09it/s] 47%|████▋     | 6040/12776 [1:04:48<25:55,  4.33it/s]                                                       47%|████▋     | 6040/12776 [1:04:48<25:55,  4.33it/s] 47%|████▋     | 6041/12776 [1:04:48<24:49,  4.52it/s]                                                       47%|████▋     | 6041/12776 [1:04:48<24:49,  4.52it/s] 47%|████▋     | 6042/12776 [1:04:48<23:51,  4.70it/s]                                                       47%|████▋     | 6042/12776 [1:04:48<23:51,  4.70it/s] 47%|████▋     | 6043/12776 [1:04:48<23:12,  4.84it/s]                                                       47%|████▋     | 6043/12776 [1:04:48<23:12,  4.84it/s] 47%|████▋     | 6044/12776 [1:04:49<26:32,  4.23it/s]                                                       47%|████▋     | 6044/12776 [1:04:49<26:32,  4.23it/s] 47%|████▋     | 6045/12776 [1:04:49<24:30,  4.58it/s]                                                       47%|████▋     | 6045/12776 [1:04:49<24:30,  4.58it/s] 47%|████▋     | 6046/12776 [1:04:49<22:53,  4.90it/s]                                                       47%|████▋     | 6046/12776 [1:04:49<22:53,  4.90it/s] 47%|████▋     | 6047/12776 [1:04:49<21:43,  5.16it/s]                                                       47%|████▋     | 6047/12776 [1:04:49<21:43,  5.16it/s] 47%|████▋     | 6048/12776 [1:04:49<20:45,  5.40it/s]                                                       47%|████▋     | 6048/12776 [1:04:49<20:45,  5.40it/s] 47%|████▋     | 6049/12776 [1:04:49<20:04,  5.58it/s]                                                       47%|████▋     | 6049/12776 [1:04:49<20:04,  5.58it/s] 47%|████▋     | 6050/12776 [1:04:50<37:02,  3.03it/s]                                                       47%|████▋     | 6050/12776 [1:04:50<37:02,  3.03it/s] 47%|████▋     | 6051/12776 [1:04:51<1:12:47,  1.54it/s]                                                         47%|████▋     | 6051/12776 [1:04:51<1:12:47,  1.54it/s] 47%|████▋     | 6052/12776 [1:04:52<1:22:32,  1.36it/s]                                                         47%|████▋     | 6052/12776 [1:04:52<1:22:32,  1.36it/s] 47%|████▋     | 6053/12776 [1:04:53<1:29:50,  1.25it/s]                                                         47%|████▋     | 6053/12776 [1:04:53<1:29:50,  1.25it/s] 47%|████▋     | 6054/12776 [1:04:54<1:30:14,  1.24it/s]                                                         47%|████▋     | 6054/12776 [1:04:54<1:30:14,  1.24it/s] 47%|████▋     | 6055/12776 [1:04:55<1:26:57,  1.29it/s]                                                         47%|████▋     | 6055/12776 [1:04:55<1:26:57,  1.29it/s] 47%|████▋     | 6056/12776 [1:04:56<1:25:03,  1.32it/s]                                                         47%|████▋     | 6056/12776 [1:04:56<1:25:03,  1.32it/s] 47%|████▋     | 6057/12776 [1:04:56<1:20:58,  1.38it/s]                                                         47%|████▋     | 6057/12776 [1:04:56<1:20:58,  1.38it/s] 47%|████▋     | 6058/12776 [1:04:57<1:17:10,  1.45it/s]                                                         47%|████▋     | 6058/12776 [1:04:57<1:17:10,  1.45it/s] 47%|████▋     | 6059/12776 [1:04:57<1:13:37,  1.52it/s]                                                         47%|████▋     | 6059/12776 [1:04:57<1:13:37,  1.52it/s] 47%|████▋     | 6060/12776 [1:04:58<1:10:07,  1.60it/s]                                                         47%|████▋     | 6060/12776 [1:04:58<1:10:07,  1.60it/s] 47%|████▋     | 6061/12776 [1:04:59<1:07:28,  1.66it/s]                                                         47%|████▋     | 6061/12776 [1:04:59<1:07:28,  1.66it/s] 47%|████▋     | 6062/12776 [1:04:59<1:06:17,  1.69it/s]                                                         47%|████▋     | 6062/12776 [1:04:59<1:06:17,  1.69it/s] 47%|████▋     | 6063/12776 [1:05:00<1:02:11,  1.80it/s]                                                         47%|████▋     | 6063/12776 [1:05:00<1:02:11,  1.80it/s] 47%|████▋     | 6064/12776 [1:05:00<1:00:51,  1.84it/s]                                                         47%|████▋     | 6064/12776 [1:05:00<1:00:51,  1.84it/s] 47%|████▋     | 6065/12776 [1:05:00<56:54,  1.97it/s]                                                         47%|████▋     | 6065/12776 [1:05:00<56:54,  1.97it/s] 47%|████▋     | 6066/12776 [1:05:01<57:30,  1.94it/s]                                                       47%|████▋     | 6066/12776 [1:05:01<57:30,  1.94it/s] 47%|████▋     | 6067/12776 [1:05:01<53:15,  2.10it/s]                                                       47%|████▋     | 6067/12776 [1:05:01<53:15,  2.10it/s] 47%|████▋     | 6068/12776 [1:05:02<50:26,  2.22it/s]                                                       47%|████▋     | 6068/12776 [1:05:02<50:26,  2.22it/s] 48%|████▊     | 6069/12776 [1:05:02<48:56,  2.28it/s]                                                       48%|████▊     | 6069/12776 [1:05:02<48:56,  2.28it/s] 48%|████▊     | 6070/12776 [1:05:03<45:30,  2.46it/s]                                                       48%|████▊     | 6070/12776 [1:05:03<45:30,  2.46it/s] 48%|████▊     | 6071/12776 [1:05:03<42:47,  2.61it/s]                                                       48%|████▊     | 6071/12776 [1:05:03<42:47,  2.61it/s] 48%|████▊     | 6072/12776 [1:05:03<40:34,  2.75it/s]                                                       48%|████▊     | 6072/12776 [1:05:03<40:34,  2.75it/s] 48%|████▊     | 6073/12776 [1:05:04<40:21,  2.77it/s]                                                       48%|████▊     | 6073/12776 [1:05:04<40:21,  2.77it/s] 48%|████▊     | 6074/12776 [1:05:04<38:18,  2.92it/s]                                                       48%|████▊     | 6074/12776 [1:05:04<38:18,  2.92it/s] 48%|████▊     | 6075/12776 [1:05:04<36:27,  3.06it/s]                                                       48%|████▊     | 6075/12776 [1:05:04<36:27,  3.06it/s] 48%|████▊     | 6076/12776 [1:05:05<38:52,  2.87it/s]                                                       48%|████▊     | 6076/12776 [1:05:05<38:52,  2.87it/s] 48%|████▊     | 6077/12776 [1:05:05<37:06,  3.01it/s]                                                      {'eval_loss': 0.580058753490448, 'eval_wer': 0.35448498629321884, 'eval_runtime': 206.0189, 'eval_samples_per_second': 60.106, 'eval_steps_per_second': 3.757, 'epoch': 0.94}
+{'loss': 0.2883, 'grad_norm': 0.8797839283943176, 'learning_rate': 0.0001658357771260997, 'epoch': 0.94}
+{'loss': 0.3189, 'grad_norm': 0.6887585520744324, 'learning_rate': 0.00016581133919843594, 'epoch': 0.94}
+{'loss': 0.3, 'grad_norm': 0.4749625325202942, 'learning_rate': 0.00016578690127077222, 'epoch': 0.94}
+{'loss': 0.2879, 'grad_norm': 0.7343231439590454, 'learning_rate': 0.00016576246334310847, 'epoch': 0.94}
+{'loss': 0.4164, 'grad_norm': 0.631679892539978, 'learning_rate': 0.00016573802541544475, 'epoch': 0.94}
+{'loss': 0.3442, 'grad_norm': 1.4770556688308716, 'learning_rate': 0.00016571358748778103, 'epoch': 0.94}
+{'loss': 0.2752, 'grad_norm': 0.5342082977294922, 'learning_rate': 0.00016568914956011728, 'epoch': 0.94}
+{'loss': 0.3402, 'grad_norm': 0.7217116355895996, 'learning_rate': 0.00016566471163245356, 'epoch': 0.94}
+{'loss': 0.249, 'grad_norm': 0.3949846625328064, 'learning_rate': 0.00016564027370478984, 'epoch': 0.94}
+{'loss': 0.3487, 'grad_norm': 0.6596114635467529, 'learning_rate': 0.00016561583577712606, 'epoch': 0.94}
+{'loss': 0.3152, 'grad_norm': 0.5770506858825684, 'learning_rate': 0.00016559139784946234, 'epoch': 0.94}
+{'loss': 0.3005, 'grad_norm': 0.6533299088478088, 'learning_rate': 0.00016556695992179862, 'epoch': 0.94}
+{'loss': 0.5086, 'grad_norm': 1.0162473917007446, 'learning_rate': 0.00016554252199413487, 'epoch': 0.94}
+{'loss': 0.4517, 'grad_norm': 4.207003116607666, 'learning_rate': 0.00016551808406647115, 'epoch': 0.94}
+{'loss': 0.4643, 'grad_norm': 1.691664695739746, 'learning_rate': 0.00016549364613880743, 'epoch': 0.94}
+{'loss': 0.4732, 'grad_norm': 1.8534256219863892, 'learning_rate': 0.00016546920821114368, 'epoch': 0.94}
+{'loss': 0.4391, 'grad_norm': 1.1564279794692993, 'learning_rate': 0.00016544477028347996, 'epoch': 0.94}
+{'loss': 0.7114, 'grad_norm': 1.8195252418518066, 'learning_rate': 0.00016542033235581623, 'epoch': 0.94}
+{'loss': 0.6024, 'grad_norm': 1.6135766506195068, 'learning_rate': 0.00016539589442815246, 'epoch': 0.94}
+{'loss': 0.5552, 'grad_norm': 2.4532477855682373, 'learning_rate': 0.00016537145650048874, 'epoch': 0.94}
+{'loss': 0.3148, 'grad_norm': 0.8328799605369568, 'learning_rate': 0.00016534701857282501, 'epoch': 0.94}
+{'loss': 0.4579, 'grad_norm': 1.1934763193130493, 'learning_rate': 0.00016532258064516127, 'epoch': 0.94}
+{'loss': 0.4106, 'grad_norm': 1.3018206357955933, 'learning_rate': 0.00016529814271749754, 'epoch': 0.94}
+{'loss': 0.8004, 'grad_norm': 2.117647647857666, 'learning_rate': 0.00016527370478983382, 'epoch': 0.94}
+{'loss': 0.4452, 'grad_norm': 1.9799567461013794, 'learning_rate': 0.00016524926686217005, 'epoch': 0.94}
+{'loss': 0.7205, 'grad_norm': 1.8844330310821533, 'learning_rate': 0.00016522482893450632, 'epoch': 0.94}
+{'loss': 0.7866, 'grad_norm': 2.3716540336608887, 'learning_rate': 0.0001652003910068426, 'epoch': 0.94}
+{'loss': 0.8434, 'grad_norm': 1.5887513160705566, 'learning_rate': 0.00016517595307917885, 'epoch': 0.94}
+{'loss': 0.8462, 'grad_norm': 1.6907641887664795, 'learning_rate': 0.00016515151515151513, 'epoch': 0.94}
+{'loss': 0.732, 'grad_norm': 4.261603355407715, 'learning_rate': 0.0001651270772238514, 'epoch': 0.94}
+{'loss': 1.302, 'grad_norm': 2.2813000679016113, 'learning_rate': 0.00016510263929618766, 'epoch': 0.94}
+{'loss': 0.7006, 'grad_norm': 1.3129147291183472, 'learning_rate': 0.00016507820136852394, 'epoch': 0.94}
+{'loss': 1.5246, 'grad_norm': 3.804840564727783, 'learning_rate': 0.00016505376344086022, 'epoch': 0.94}
+{'loss': 0.651, 'grad_norm': 6.360110759735107, 'learning_rate': 0.00016502932551319644, 'epoch': 0.94}
+{'loss': 0.8854, 'grad_norm': 1.6431469917297363, 'learning_rate': 0.00016500488758553272, 'epoch': 0.94}
+{'loss': 1.4713, 'grad_norm': 2.868042469024658, 'learning_rate': 0.000164980449657869, 'epoch': 0.94}
+{'loss': 1.0, 'grad_norm': 1.8416922092437744, 'learning_rate': 0.00016495601173020525, 'epoch': 0.95}
+{'loss': 1.1594, 'grad_norm': 2.1547200679779053, 'learning_rate': 0.00016493157380254153, 'epoch': 0.95}
+{'loss': 1.5753, 'grad_norm': 3.3095543384552, 'learning_rate': 0.0001649071358748778, 'epoch': 0.95}
+{'loss': 1.079, 'grad_norm': 3.2236099243164062, 'learning_rate': 0.00016488269794721406, 'epoch': 0.95}
+{'loss': 0.7609, 'grad_norm': 1.9828205108642578, 'learning_rate': 0.00016485826001955034, 'epoch': 0.95}
+{'loss': 1.2285, 'grad_norm': 2.30930757522583, 'learning_rate': 0.00016483382209188662, 'epoch': 0.95}
+{'loss': 1.3376, 'grad_norm': 2.148881435394287, 'learning_rate': 0.00016480938416422284, 'epoch': 0.95}
+{'loss': 1.3568, 'grad_norm': 2.2145028114318848, 'learning_rate': 0.00016478494623655912, 'epoch': 0.95}
+{'loss': 1.349, 'grad_norm': 3.270275831222534, 'learning_rate': 0.0001647605083088954, 'epoch': 0.95}
+{'loss': 0.9733, 'grad_norm': 1.849869728088379, 'learning_rate': 0.00016473607038123165, 'epoch': 0.95}
+{'loss': 0.6026, 'grad_norm': 2.6626498699188232, 'learning_rate': 0.00016471163245356793, 'epoch': 0.95}
+{'loss': 0.9302, 'grad_norm': 2.1792116165161133, 'learning_rate': 0.0001646871945259042, 'epoch': 0.95}
+{'loss': 1.1798, 'grad_norm': 4.216753959655762, 'learning_rate': 0.00016466275659824043, 'epoch': 0.95}
+{'loss': 0.8717, 'grad_norm': 1.9251540899276733, 'learning_rate': 0.0001646383186705767, 'epoch': 0.95}
+{'loss': 0.4177, 'grad_norm': 0.6738594174385071, 'learning_rate': 0.00016461388074291299, 'epoch': 0.95}
+{'loss': 0.3434, 'grad_norm': 0.584973156452179, 'learning_rate': 0.00016458944281524924, 'epoch': 0.95}
+{'loss': 0.3325, 'grad_norm': 0.5951725840568542, 'learning_rate': 0.00016456500488758552, 'epoch': 0.95}
+{'loss': 0.2946, 'grad_norm': 0.6660220623016357, 'learning_rate': 0.0001645405669599218, 'epoch': 0.95}
+{'loss': 0.416, 'grad_norm': 0.7421219348907471, 'learning_rate': 0.00016451612903225804, 'epoch': 0.95}
+{'loss': 0.288, 'grad_norm': 0.5771187543869019, 'learning_rate': 0.00016449169110459432, 'epoch': 0.95}
+{'loss': 0.4272, 'grad_norm': 0.9346206784248352, 'learning_rate': 0.0001644672531769306, 'epoch': 0.95}
+{'loss': 0.3013, 'grad_norm': 0.5371845364570618, 'learning_rate': 0.00016444281524926683, 'epoch': 0.95}
+{'loss': 0.2977, 'grad_norm': 0.6031778454780579, 'learning_rate': 0.0001644183773216031, 'epoch': 0.95}
+{'loss': 0.2708, 'grad_norm': 0.7967322468757629, 'learning_rate': 0.00016439393939393938, 'epoch': 0.95}
+{'loss': 0.3233, 'grad_norm': 0.7594047784805298, 'learning_rate': 0.00016436950146627563, 'epoch': 0.95}
+{'loss': 0.4047, 'grad_norm': 0.7103455066680908, 'learning_rate': 0.0001643450635386119, 'epoch': 0.95}
+{'loss': 0.3425, 'grad_norm': 0.8710353970527649, 'learning_rate': 0.0001643206256109482, 'epoch': 0.95}
+{'loss': 0.4108, 'grad_norm': 0.7795736193656921, 'learning_rate': 0.00016429618768328444, 'epoch': 0.95}
+{'loss': 0.4573, 'grad_norm': 1.0512408018112183, 'learning_rate': 0.00016427174975562072, 'epoch': 0.95}
+{'loss': 0.5165, 'grad_norm': 2.8175127506256104, 'learning_rate': 0.00016424731182795697, 'epoch': 0.95}
+{'loss': 0.5704, 'grad_norm': 1.3486829996109009, 'learning_rate': 0.00016422287390029322, 'epoch': 0.95}
+{'loss': 0.4442, 'grad_norm': 1.54037344455719, 'learning_rate': 0.0001641984359726295, 'epoch': 0.95}
+{'loss': 0.5064, 'grad_norm': 0.9929664134979248, 'learning_rate': 0.00016417399804496578, 'epoch': 0.95}
+{'loss': 0.5722, 'grad_norm': 1.431089997291565, 'learning_rate': 0.00016414956011730203, 'epoch': 0.95}
+{'loss': 0.6155, 'grad_norm': 1.4068636894226074, 'learning_rate': 0.0001641251221896383, 'epoch': 0.95}
+{'loss': 0.5878, 'grad_norm': 1.600339651107788, 'learning_rate': 0.0001641006842619746, 'epoch': 0.95}
+{'loss': 0.6764, 'grad_norm': 1.6295877695083618, 'learning_rate': 0.0001640762463343108, 'epoch': 0.95}
+{'loss': 0.5213, 'grad_norm': 1.2888362407684326, 'learning_rate': 0.0001640518084066471, 'epoch': 0.95}
+{'loss': 0.6027, 'grad_norm': 1.4205604791641235, 'learning_rate': 0.00016402737047898337, 'epoch': 0.95}
+{'loss': 0.6991, 'grad_norm': 1.4100985527038574, 'learning_rate': 0.00016400293255131962, 'epoch': 0.95}
+ 48%|████▊     | 6077/12776 [1:05:05<37:06,  3.01it/s] 48%|████▊     | 6078/12776 [1:05:05<35:33,  3.14it/s]                                                       48%|████▊     | 6078/12776 [1:05:05<35:33,  3.14it/s] 48%|████▊     | 6079/12776 [1:05:05<34:07,  3.27it/s]                                                       48%|████▊     | 6079/12776 [1:05:05<34:07,  3.27it/s] 48%|████▊     | 6080/12776 [1:05:06<33:45,  3.31it/s]                                                       48%|████▊     | 6080/12776 [1:05:06<33:45,  3.31it/s] 48%|████▊     | 6081/12776 [1:05:06<32:28,  3.44it/s]                                                       48%|████▊     | 6081/12776 [1:05:06<32:28,  3.44it/s] 48%|████▊     | 6082/12776 [1:05:06<31:19,  3.56it/s]                                                       48%|████▊     | 6082/12776 [1:05:06<31:19,  3.56it/s] 48%|████▊     | 6083/12776 [1:05:06<30:17,  3.68it/s]                                                       48%|████▊     | 6083/12776 [1:05:06<30:17,  3.68it/s] 48%|████▊     | 6084/12776 [1:05:07<33:43,  3.31it/s]                                                       48%|████▊     | 6084/12776 [1:05:07<33:43,  3.31it/s] 48%|████▊     | 6085/12776 [1:05:07<31:33,  3.53it/s]                                                       48%|████▊     | 6085/12776 [1:05:07<31:33,  3.53it/s] 48%|████▊     | 6086/12776 [1:05:07<29:50,  3.74it/s]                                                       48%|████▊     | 6086/12776 [1:05:07<29:50,  3.74it/s] 48%|████▊     | 6087/12776 [1:05:08<28:20,  3.93it/s]                                                       48%|████▊     | 6087/12776 [1:05:08<28:20,  3.93it/s] 48%|████▊     | 6088/12776 [1:05:08<27:14,  4.09it/s]                                                       48%|████▊     | 6088/12776 [1:05:08<27:14,  4.09it/s] 48%|████▊     | 6089/12776 [1:05:08<29:40,  3.76it/s]                                                       48%|████▊     | 6089/12776 [1:05:08<29:40,  3.76it/s] 48%|████▊     | 6090/12776 [1:05:08<27:54,  3.99it/s]                                                       48%|████▊     | 6090/12776 [1:05:08<27:54,  3.99it/s] 48%|████▊     | 6091/12776 [1:05:08<26:27,  4.21it/s]                                                       48%|████▊     | 6091/12776 [1:05:08<26:27,  4.21it/s] 48%|████▊     | 6092/12776 [1:05:09<25:20,  4.39it/s]                                                       48%|████▊     | 6092/12776 [1:05:09<25:20,  4.39it/s] 48%|████▊     | 6093/12776 [1:05:09<24:26,  4.56it/s]                                                       48%|████▊     | 6093/12776 [1:05:09<24:26,  4.56it/s] 48%|████▊     | 6094/12776 [1:05:09<26:32,  4.19it/s]                                                       48%|████▊     | 6094/12776 [1:05:09<26:32,  4.19it/s] 48%|████▊     | 6095/12776 [1:05:09<25:12,  4.42it/s]                                                       48%|████▊     | 6095/12776 [1:05:09<25:12,  4.42it/s] 48%|████▊     | 6096/12776 [1:05:10<24:08,  4.61it/s]                                                       48%|████▊     | 6096/12776 [1:05:10<24:08,  4.61it/s] 48%|████▊     | 6097/12776 [1:05:10<23:21,  4.77it/s]                                                       48%|████▊     | 6097/12776 [1:05:10<23:21,  4.77it/s] 48%|████▊     | 6098/12776 [1:05:10<22:33,  4.93it/s]                                                       48%|████▊     | 6098/12776 [1:05:10<22:33,  4.93it/s] 48%|████▊     | 6099/12776 [1:05:10<23:37,  4.71it/s]                                                       48%|████▊     | 6099/12776 [1:05:10<23:37,  4.71it/s] 48%|████▊     | 6100/12776 [1:05:11<40:15,  2.76it/s]                                                       48%|████▊     | 6100/12776 [1:05:11<40:15,  2.76it/s] 48%|████▊     | 6101/12776 [1:05:12<1:13:30,  1.51it/s]                                                         48%|████▊     | 6101/12776 [1:05:12<1:13:30,  1.51it/s] 48%|████▊     | 6102/12776 [1:05:13<1:21:19,  1.37it/s]                                                         48%|████▊     | 6102/12776 [1:05:13<1:21:19,  1.37it/s] 48%|████▊     | 6103/12776 [1:05:14<1:26:39,  1.28it/s]                                                         48%|████▊     | 6103/12776 [1:05:14<1:26:39,  1.28it/s] 48%|████▊     | 6104/12776 [1:05:15<1:25:50,  1.30it/s]                                                         48%|████▊     | 6104/12776 [1:05:15<1:25:50,  1.30it/s] 48%|████▊     | 6105/12776 [1:05:16<1:24:26,  1.32it/s]                                                         48%|████▊     | 6105/12776 [1:05:16<1:24:26,  1.32it/s] 48%|████▊     | 6106/12776 [1:05:16<1:25:14,  1.30it/s]                                                         48%|████▊     | 6106/12776 [1:05:16<1:25:14,  1.30it/s] 48%|████▊     | 6107/12776 [1:05:17<1:24:56,  1.31it/s]                                                         48%|████▊     | 6107/12776 [1:05:17<1:24:56,  1.31it/s] 48%|████▊     | 6108/12776 [1:05:18<1:19:44,  1.39it/s]                                                         48%|████▊     | 6108/12776 [1:05:18<1:19:44,  1.39it/s] 48%|████▊     | 6109/12776 [1:05:18<1:14:56,  1.48it/s]                                                         48%|████▊     | 6109/12776 [1:05:18<1:14:56,  1.48it/s] 48%|████▊     | 6110/12776 [1:05:19<1:11:25,  1.56it/s]                                                         48%|████▊     | 6110/12776 [1:05:19<1:11:25,  1.56it/s] 48%|████▊     | 6111/12776 [1:05:19<1:09:50,  1.59it/s]                                                         48%|████▊     | 6111/12776 [1:05:19<1:09:50,  1.59it/s] 48%|████▊     | 6112/12776 [1:05:20<1:05:41,  1.69it/s]                                                         48%|████▊     | 6112/12776 [1:05:20<1:05:41,  1.69it/s] 48%|████▊     | 6113/12776 [1:05:21<1:06:15,  1.68it/s]                                                         48%|████▊     | 6113/12776 [1:05:21<1:06:15,  1.68it/s] 48%|████▊     | 6114/12776 [1:05:21<1:02:17,  1.78it/s]                                                         48%|████▊     | 6114/12776 [1:05:21<1:02:17,  1.78it/s] 48%|████▊     | 6115/12776 [1:05:21<58:34,  1.90it/s]                                                         48%|████▊     | 6115/12776 [1:05:21<58:34,  1.90it/s] 48%|████▊     | 6116/12776 [1:05:22<57:48,  1.92it/s]                                                       48%|████▊     | 6116/12776 [1:05:22<57:48,  1.92it/s] 48%|████▊     | 6117/12776 [1:05:22<54:29,  2.04it/s]                                                       48%|████▊     | 6117/12776 [1:05:22<54:29,  2.04it/s] 48%|████▊     | 6118/12776 [1:05:23<53:36,  2.07it/s]                                                       48%|████▊     | 6118/12776 [1:05:23<53:36,  2.07it/s] 48%|████▊     | 6119/12776 [1:05:23<50:26,  2.20it/s]                                                       48%|████▊     | 6119/12776 [1:05:23<50:26,  2.20it/s] 48%|████▊     | 6120/12776 [1:05:24<47:47,  2.32it/s]                                                       48%|████▊     | 6120/12776 [1:05:24<47:47,  2.32it/s] 48%|████▊     | 6121/12776 [1:05:24<47:03,  2.36it/s]                                                       48%|████▊     | 6121/12776 [1:05:24<47:03,  2.36it/s] 48%|████▊     | 6122/12776 [1:05:24<44:31,  2.49it/s]                                                       48%|████▊     | 6122/12776 [1:05:24<44:31,  2.49it/s] 48%|████▊     | 6123/12776 [1:05:25<42:24,  2.61it/s]                                                       48%|████▊     | 6123/12776 [1:05:25<42:24,  2.61it/s] 48%|████▊     | 6124/12776 [1:05:25<44:43,  2.48it/s]                                                       48%|████▊     | 6124/12776 [1:05:25<44:43,  2.48it/s] 48%|████▊     | 6125/12776 [1:05:25<42:08,  2.63it/s]                                                       48%|████▊     | 6125/12776 [1:05:25<42:08,  2.63it/s] 48%|████▊     | 6126/12776 [1:05:26<40:14,  2.75it/s]                                                       48%|████▊     | 6126/12776 [1:05:26<40:14,  2.75it/s] 48%|████▊     | 6127/12776 [1:05:26<38:09,  2.90it/s]                                                       48%|████▊     | 6127/12776 [1:05:26<38:09,  2.90it/s] 48%|████▊     | 6128/12776 [1:05:26<38:06,  2.91it/s]                                                       48%|████▊     | 6128/12776 [1:05:26<38:06,  2.91it/s] 48%|████▊     | 6129/12776 [1:05:27<36:16,  3.05it/s]                                                       48%|████▊     | 6129/12776 [1:05:27<36:16,  3.05it/s] 48%|████▊     | 6130/12776 [1:05:27<34:49,  3.18it/s]                                                       48%|████▊     | 6130/12776 [1:05:27<34:49,  3.18it/s] 48%|████▊     | 6131/12776 [1:05:27<33:24,  3.31it/s]                                                       48%|████▊     | 6131/12776 [1:05:27<33:24,  3.31it/s] 48%|████▊     | 6132/12776 [1:05:28<32:31,  3.40it/s]                                                       48%|████▊     | 6132/12776 [1:05:28<32:31,  3.40it/s] 48%|████▊     | 6133/12776 [1:05:28<31:14,  3.54it/s]                                                       48%|████▊     | 6133/12776 [1:05:28<31:14,  3.54it/s] 48%|████▊     | 6134/12776 [1:05:28<30:06,  3.68it/s]                                                       48%|████▊     | 6134/12776 [1:05:28<30:06,  3.68it/s] 48%|████▊     | 6135/12776 [1:05:28<29:10,  3.79it/s]                                                       48%|████▊     | 6135/12776 [1:05:28<29:10,  3.79it/s] 48%|████▊     | 6136/12776 [1:05:29<32:59,  3.35it/s]                                                       48%|████▊     | 6136/12776 [1:05:29<32:59,  3.35it/s] 48%|████▊     | 6137/12776 [1:05:29<30:53,  3.58it/s]                                                       48%|████▊     | 6137/12776 [1:05:29<30:53,  3.58it/s] 48%|████▊     | 6138/12776 [1:05:29<29:09,  3.79it/s]                                                       48%|████▊     | 6138/12776 [1:05:29<29:09,  3.79it/s] 48%|████▊     | 6139/12776 [1:05:29<27:35,  4.01it/s]                                                       48%|████▊     | 6139/12776 [1:05:29<27:35,  4.01it/s] 48%|████▊     | 6140/12776 [1:05:30<29:41,  3.72it/s]                                                       48%|████▊     | 6140/12776 [1:05:30<29:41,  3.72it/s] 48%|████▊     | 6141/12776 [1:05:30<27:42,  3.99it/s]                                                       48%|████▊     | 6141/12776 [1:05:30<27:42,  3.99it/s] 48%|████▊     | 6142/12776 [1:05:30<26:24,  4.19it/s]                                                       48%|████▊     | 6142/12776 [1:05:30<26:24,  4.19it/s] 48%|████▊     | 6143/12776 [1:05:30<25:16,  4.37it/s]                                                       48%|████▊     | 6143/12776 [1:05:30<25:16,  4.37it/s] 48%|████▊     | 6144/12776 [1:05:31<24:25,  4.52it/s]                                                       48%|████▊     | 6144/12776 [1:05:31<24:25,  4.52it/s] 48%|████▊     | 6145/12776 [1:05:31<27:28,  4.02it/s]                                                       48%|████▊     | 6145/12776 [1:05:31<27:28,  4.02it/s] 48%|████▊     | 6146/12776 [1:05:31<25:39,  4.31it/s]                                                       48%|████▊     | 6146/12776 [1:05:31<25:39,  4.31it/s] 48%|████▊     | 6147/12776 [1:05:31<24:19,  4.54it/s]                                                       48%|████▊     | 6147/12776 [1:05:31<24:19,  4.54it/s] 48%|████▊     | 6148/12776 [1:05:31<23:11,  4.76it/s]                                                       48%|████▊     | 6148/12776 [1:05:31<23:11,  4.76it/s] 48%|████▊     | 6149/12776 [1:05:32<22:20,  4.94it/s]                                                       48%|████▊     | 6149/12776 [1:05:32<22:20,  4.94it/s] 48%|████▊     | 6150/12776 [1:05:32<37:33,  2.94it/s]                                                       48%|████▊     | 6150/12776 [1:05:32<37:33,  2.94it/s] 48%|████▊     | 6151/12776 [1:05:34<1:16:24,  1.45it/s]                                                         48%|████▊     | 6151/12776 [1:05:34<1:16:24,  1.45it/s] 48%|████▊     | 6152/12776 [1:05:35<1:24:32,  1.31it/s]                                                         48%|████▊     | 6152/12776 [1:05:35<1:24:32,  1.31it/s] 48%|████▊     | 6153/12776 [1:05:36<1:27:00,  1.27it/s]                                                         48%|████▊     | 6153/12776 [1:05:36<1:27:00,  1.27it/s] 48%|████▊     | 6154/12776 [1:05:36<1:26:25,  1.28it/s]                                                        {'loss': 0.6062, 'grad_norm': 2.3864524364471436, 'learning_rate': 0.0001639784946236559, 'epoch': 0.95}
+{'loss': 0.5013, 'grad_norm': 1.8597694635391235, 'learning_rate': 0.00016395405669599218, 'epoch': 0.95}
+{'loss': 0.8579, 'grad_norm': 1.8252570629119873, 'learning_rate': 0.00016392961876832843, 'epoch': 0.95}
+{'loss': 0.6946, 'grad_norm': 1.7800219058990479, 'learning_rate': 0.0001639051808406647, 'epoch': 0.95}
+{'loss': 1.0379, 'grad_norm': 1.8645308017730713, 'learning_rate': 0.00016388074291300098, 'epoch': 0.95}
+{'loss': 0.9241, 'grad_norm': 2.3539228439331055, 'learning_rate': 0.0001638563049853372, 'epoch': 0.95}
+{'loss': 1.2359, 'grad_norm': 5.200006484985352, 'learning_rate': 0.00016383186705767349, 'epoch': 0.95}
+{'loss': 1.3175, 'grad_norm': 1.8422434329986572, 'learning_rate': 0.00016380742913000976, 'epoch': 0.95}
+{'loss': 0.979, 'grad_norm': 3.778296709060669, 'learning_rate': 0.00016378299120234602, 'epoch': 0.95}
+{'loss': 1.2131, 'grad_norm': 2.5928831100463867, 'learning_rate': 0.0001637585532746823, 'epoch': 0.95}
+{'loss': 0.6146, 'grad_norm': 0.75013267993927, 'learning_rate': 0.00016373411534701857, 'epoch': 0.95}
+{'loss': 0.7088, 'grad_norm': 3.9112038612365723, 'learning_rate': 0.00016370967741935482, 'epoch': 0.95}
+{'loss': 1.0111, 'grad_norm': 1.6564382314682007, 'learning_rate': 0.0001636852394916911, 'epoch': 0.95}
+{'loss': 1.3784, 'grad_norm': 4.030745506286621, 'learning_rate': 0.00016366080156402735, 'epoch': 0.95}
+{'loss': 1.2618, 'grad_norm': 3.7477824687957764, 'learning_rate': 0.0001636363636363636, 'epoch': 0.95}
+{'loss': 1.029, 'grad_norm': 4.694358825683594, 'learning_rate': 0.00016361192570869988, 'epoch': 0.95}
+{'loss': 1.1996, 'grad_norm': 2.626904010772705, 'learning_rate': 0.00016358748778103616, 'epoch': 0.95}
+{'loss': 0.6772, 'grad_norm': 1.2721118927001953, 'learning_rate': 0.0001635630498533724, 'epoch': 0.95}
+{'loss': 1.4234, 'grad_norm': 2.771449327468872, 'learning_rate': 0.0001635386119257087, 'epoch': 0.95}
+{'loss': 1.236, 'grad_norm': 3.96138858795166, 'learning_rate': 0.00016351417399804497, 'epoch': 0.95}
+{'loss': 0.3081, 'grad_norm': 1.7881155014038086, 'learning_rate': 0.0001634897360703812, 'epoch': 0.95}
+{'loss': 1.5357, 'grad_norm': 3.0272154808044434, 'learning_rate': 0.00016346529814271747, 'epoch': 0.95}
+{'loss': 0.485, 'grad_norm': 1.5907299518585205, 'learning_rate': 0.00016344086021505375, 'epoch': 0.95}
+{'loss': 1.5678, 'grad_norm': 2.836285352706909, 'learning_rate': 0.00016341642228739, 'epoch': 0.95}
+{'loss': 0.3355, 'grad_norm': 0.5653170347213745, 'learning_rate': 0.00016339198435972628, 'epoch': 0.96}
+{'loss': 0.3155, 'grad_norm': 0.646906316280365, 'learning_rate': 0.00016336754643206256, 'epoch': 0.96}
+{'loss': 0.3738, 'grad_norm': 0.7139442563056946, 'learning_rate': 0.0001633431085043988, 'epoch': 0.96}
+{'loss': 0.2725, 'grad_norm': 0.612605094909668, 'learning_rate': 0.0001633186705767351, 'epoch': 0.96}
+{'loss': 0.291, 'grad_norm': 0.4651452898979187, 'learning_rate': 0.00016329423264907137, 'epoch': 0.96}
+{'loss': 0.2871, 'grad_norm': 0.6952741146087646, 'learning_rate': 0.0001632697947214076, 'epoch': 0.96}
+{'loss': 0.2545, 'grad_norm': 0.5448809266090393, 'learning_rate': 0.00016324535679374387, 'epoch': 0.96}
+{'loss': 0.3625, 'grad_norm': 0.9530249834060669, 'learning_rate': 0.00016322091886608015, 'epoch': 0.96}
+{'loss': 0.3335, 'grad_norm': 0.5093350410461426, 'learning_rate': 0.0001631964809384164, 'epoch': 0.96}
+{'loss': 0.3436, 'grad_norm': 0.7554910778999329, 'learning_rate': 0.00016317204301075268, 'epoch': 0.96}
+{'loss': 0.3616, 'grad_norm': 0.7633286118507385, 'learning_rate': 0.00016314760508308895, 'epoch': 0.96}
+{'loss': 0.3472, 'grad_norm': 0.7461150288581848, 'learning_rate': 0.0001631231671554252, 'epoch': 0.96}
+{'loss': 0.4187, 'grad_norm': 1.1075865030288696, 'learning_rate': 0.00016309872922776146, 'epoch': 0.96}
+{'loss': 0.5136, 'grad_norm': 1.0741170644760132, 'learning_rate': 0.00016307429130009774, 'epoch': 0.96}
+{'loss': 0.317, 'grad_norm': 0.8717575073242188, 'learning_rate': 0.000163049853372434, 'epoch': 0.96}
+{'loss': 0.4035, 'grad_norm': 2.2658355236053467, 'learning_rate': 0.00016302541544477027, 'epoch': 0.96}
+{'loss': 0.7772, 'grad_norm': 1.5720932483673096, 'learning_rate': 0.00016300097751710654, 'epoch': 0.96}
+{'loss': 0.4958, 'grad_norm': 2.046311140060425, 'learning_rate': 0.0001629765395894428, 'epoch': 0.96}
+{'loss': 0.6284, 'grad_norm': 1.5623672008514404, 'learning_rate': 0.00016295210166177907, 'epoch': 0.96}
+{'loss': 0.9757, 'grad_norm': 2.6061389446258545, 'learning_rate': 0.00016292766373411535, 'epoch': 0.96}
+{'loss': 0.5318, 'grad_norm': 1.6166800260543823, 'learning_rate': 0.00016290322580645158, 'epoch': 0.96}
+{'loss': 0.6959, 'grad_norm': 1.0401281118392944, 'learning_rate': 0.00016287878787878785, 'epoch': 0.96}
+{'loss': 0.6678, 'grad_norm': 2.1870529651641846, 'learning_rate': 0.00016285434995112413, 'epoch': 0.96}
+{'loss': 0.3948, 'grad_norm': 0.9855921864509583, 'learning_rate': 0.00016282991202346038, 'epoch': 0.96}
+{'loss': 1.0385, 'grad_norm': 1.3813424110412598, 'learning_rate': 0.00016280547409579666, 'epoch': 0.96}
+{'loss': 0.4331, 'grad_norm': 1.2955299615859985, 'learning_rate': 0.00016278103616813294, 'epoch': 0.96}
+{'loss': 1.1984, 'grad_norm': 2.7574148178100586, 'learning_rate': 0.0001627565982404692, 'epoch': 0.96}
+{'loss': 0.7977, 'grad_norm': 2.4210774898529053, 'learning_rate': 0.00016273216031280547, 'epoch': 0.96}
+{'loss': 0.9785, 'grad_norm': 4.516207695007324, 'learning_rate': 0.00016270772238514175, 'epoch': 0.96}
+{'loss': 0.8274, 'grad_norm': 1.7034456729888916, 'learning_rate': 0.00016268328445747797, 'epoch': 0.96}
+{'loss': 0.8648, 'grad_norm': 1.6179447174072266, 'learning_rate': 0.00016265884652981425, 'epoch': 0.96}
+{'loss': 0.49, 'grad_norm': 1.179610013961792, 'learning_rate': 0.00016263440860215053, 'epoch': 0.96}
+{'loss': 0.9443, 'grad_norm': 2.1187167167663574, 'learning_rate': 0.00016260997067448678, 'epoch': 0.96}
+{'loss': 0.9851, 'grad_norm': 2.0016579627990723, 'learning_rate': 0.00016258553274682306, 'epoch': 0.96}
+{'loss': 0.9574, 'grad_norm': 2.0684750080108643, 'learning_rate': 0.00016256109481915934, 'epoch': 0.96}
+{'loss': 1.1024, 'grad_norm': 2.0303995609283447, 'learning_rate': 0.0001625366568914956, 'epoch': 0.96}
+{'loss': 0.6025, 'grad_norm': 3.4601995944976807, 'learning_rate': 0.00016251221896383184, 'epoch': 0.96}
+{'loss': 0.6272, 'grad_norm': 3.948913097381592, 'learning_rate': 0.00016248778103616812, 'epoch': 0.96}
+{'loss': 1.3707, 'grad_norm': 9.379945755004883, 'learning_rate': 0.00016246334310850437, 'epoch': 0.96}
+{'loss': 1.0695, 'grad_norm': 1.712647557258606, 'learning_rate': 0.00016243890518084065, 'epoch': 0.96}
+{'loss': 1.3814, 'grad_norm': 2.5404181480407715, 'learning_rate': 0.00016241446725317693, 'epoch': 0.96}
+{'loss': 1.2451, 'grad_norm': 1.7662317752838135, 'learning_rate': 0.00016239002932551318, 'epoch': 0.96}
+{'loss': 1.4876, 'grad_norm': 3.7080023288726807, 'learning_rate': 0.00016236559139784946, 'epoch': 0.96}
+{'loss': 1.0143, 'grad_norm': 2.5112531185150146, 'learning_rate': 0.00016234115347018573, 'epoch': 0.96}
+{'loss': 1.2868, 'grad_norm': 2.770322322845459, 'learning_rate': 0.00016231671554252196, 'epoch': 0.96}
+{'loss': 1.1641, 'grad_norm': 1.875613808631897, 'learning_rate': 0.00016229227761485824, 'epoch': 0.96}
+{'loss': 0.8963, 'grad_norm': 3.305671215057373, 'learning_rate': 0.00016226783968719451, 'epoch': 0.96}
+{'loss': 0.6262, 'grad_norm': 1.5809119939804077, 'learning_rate': 0.00016224340175953077, 'epoch': 0.96}
+{'loss': 0.4588, 'grad_norm': 2.6327903270721436, 'learning_rate': 0.00016221896383186704, 'epoch': 0.96}
+{'loss': 1.0183, 'grad_norm': 1.2948060035705566, 'learning_rate': 0.00016219452590420332, 'epoch': 0.96}
+{'loss': 0.2905, 'grad_norm': 0.6153829097747803, 'learning_rate': 0.00016217008797653957, 'epoch': 0.96}
+{'loss': 0.3681, 'grad_norm': 0.7002934813499451, 'learning_rate': 0.00016214565004887585, 'epoch': 0.96}
+{'loss': 0.2522, 'grad_norm': 0.4247320294380188, 'learning_rate': 0.00016212121212121213, 'epoch': 0.96}
+ 48%|████▊     | 6154/12776 [1:05:36<1:26:25,  1.28it/s] 48%|████▊     | 6155/12776 [1:05:37<1:24:58,  1.30it/s]                                                         48%|████▊     | 6155/12776 [1:05:37<1:24:58,  1.30it/s] 48%|████▊     | 6156/12776 [1:05:38<1:23:51,  1.32it/s]                                                         48%|████▊     | 6156/12776 [1:05:38<1:23:51,  1.32it/s] 48%|████▊     | 6157/12776 [1:05:38<1:20:00,  1.38it/s]                                                         48%|████▊     | 6157/12776 [1:05:38<1:20:00,  1.38it/s] 48%|████▊     | 6158/12776 [1:05:39<1:21:08,  1.36it/s]                                                         48%|████▊     | 6158/12776 [1:05:39<1:21:08,  1.36it/s] 48%|████▊     | 6159/12776 [1:05:40<1:15:59,  1.45it/s]                                                         48%|████▊     | 6159/12776 [1:05:40<1:15:59,  1.45it/s] 48%|████▊     | 6160/12776 [1:05:40<1:14:09,  1.49it/s]                                                         48%|████▊     | 6160/12776 [1:05:40<1:14:09,  1.49it/s] 48%|████▊     | 6161/12776 [1:05:41<1:09:15,  1.59it/s]                                                         48%|████▊     | 6161/12776 [1:05:41<1:09:15,  1.59it/s] 48%|████▊     | 6162/12776 [1:05:42<1:08:40,  1.61it/s]                                                         48%|████▊     | 6162/12776 [1:05:42<1:08:40,  1.61it/s] 48%|████▊     | 6163/12776 [1:05:42<1:03:04,  1.75it/s]                                                         48%|████▊     | 6163/12776 [1:05:42<1:03:04,  1.75it/s] 48%|████▊     | 6164/12776 [1:05:43<1:03:59,  1.72it/s]                                                         48%|████▊     | 6164/12776 [1:05:43<1:03:59,  1.72it/s] 48%|████▊     | 6165/12776 [1:05:43<58:49,  1.87it/s]                                                         48%|████▊     | 6165/12776 [1:05:43<58:49,  1.87it/s] 48%|████▊     | 6166/12776 [1:05:44<58:07,  1.90it/s]                                                       48%|████▊     | 6166/12776 [1:05:44<58:07,  1.90it/s] 48%|████▊     | 6167/12776 [1:05:44<53:43,  2.05it/s]                                                       48%|████▊     | 6167/12776 [1:05:44<53:43,  2.05it/s] 48%|████▊     | 6168/12776 [1:05:44<50:15,  2.19it/s]                                                       48%|████▊     | 6168/12776 [1:05:44<50:15,  2.19it/s] 48%|████▊     | 6169/12776 [1:05:45<48:20,  2.28it/s]                                                       48%|████▊     | 6169/12776 [1:05:45<48:20,  2.28it/s] 48%|████▊     | 6170/12776 [1:05:45<45:27,  2.42it/s]                                                       48%|████▊     | 6170/12776 [1:05:45<45:27,  2.42it/s] 48%|████▊     | 6171/12776 [1:05:45<43:07,  2.55it/s]                                                       48%|████▊     | 6171/12776 [1:05:45<43:07,  2.55it/s] 48%|████▊     | 6172/12776 [1:05:46<44:57,  2.45it/s]                                                       48%|████▊     | 6172/12776 [1:05:46<44:57,  2.45it/s] 48%|████▊     | 6173/12776 [1:05:46<42:12,  2.61it/s]                                                       48%|████▊     | 6173/12776 [1:05:46<42:12,  2.61it/s] 48%|████▊     | 6174/12776 [1:05:46<40:05,  2.74it/s]                                                       48%|████▊     | 6174/12776 [1:05:46<40:05,  2.74it/s] 48%|████▊     | 6175/12776 [1:05:47<38:08,  2.88it/s]                                                       48%|████▊     | 6175/12776 [1:05:47<38:08,  2.88it/s] 48%|████▊     | 6176/12776 [1:05:47<38:22,  2.87it/s]                                                       48%|████▊     | 6176/12776 [1:05:47<38:22,  2.87it/s] 48%|████▊     | 6177/12776 [1:05:47<36:11,  3.04it/s]                                                       48%|████▊     | 6177/12776 [1:05:47<36:11,  3.04it/s] 48%|████▊     | 6178/12776 [1:05:48<34:21,  3.20it/s]                                                       48%|████▊     | 6178/12776 [1:05:48<34:21,  3.20it/s] 48%|████▊     | 6179/12776 [1:05:48<32:57,  3.34it/s]                                                       48%|████▊     | 6179/12776 [1:05:48<32:57,  3.34it/s] 48%|████▊     | 6180/12776 [1:05:48<34:03,  3.23it/s]                                                       48%|████▊     | 6180/12776 [1:05:48<34:03,  3.23it/s] 48%|████▊     | 6181/12776 [1:05:49<32:09,  3.42it/s]                                                       48%|████▊     | 6181/12776 [1:05:49<32:09,  3.42it/s] 48%|████▊     | 6182/12776 [1:05:49<30:36,  3.59it/s]                                                       48%|████▊     | 6182/12776 [1:05:49<30:36,  3.59it/s] 48%|████▊     | 6183/12776 [1:05:49<29:29,  3.73it/s]                                                       48%|████▊     | 6183/12776 [1:05:49<29:29,  3.73it/s] 48%|████▊     | 6184/12776 [1:05:49<31:15,  3.51it/s]                                                       48%|████▊     | 6184/12776 [1:05:49<31:15,  3.51it/s] 48%|████▊     | 6185/12776 [1:05:50<29:30,  3.72it/s]                                                       48%|████▊     | 6185/12776 [1:05:50<29:30,  3.72it/s] 48%|████▊     | 6186/12776 [1:05:50<28:00,  3.92it/s]                                                       48%|████▊     | 6186/12776 [1:05:50<28:00,  3.92it/s] 48%|████▊     | 6187/12776 [1:05:50<26:51,  4.09it/s]                                                       48%|████▊     | 6187/12776 [1:05:50<26:51,  4.09it/s] 48%|████▊     | 6188/12776 [1:05:50<29:11,  3.76it/s]                                                       48%|████▊     | 6188/12776 [1:05:50<29:11,  3.76it/s] 48%|████▊     | 6189/12776 [1:05:51<27:20,  4.02it/s]                                                       48%|████▊     | 6189/12776 [1:05:51<27:20,  4.02it/s] 48%|████▊     | 6190/12776 [1:05:51<26:03,  4.21it/s]                                                       48%|████▊     | 6190/12776 [1:05:51<26:03,  4.21it/s] 48%|████▊     | 6191/12776 [1:05:51<24:59,  4.39it/s]                                                       48%|████▊     | 6191/12776 [1:05:51<24:59,  4.39it/s] 48%|████▊     | 6192/12776 [1:05:51<24:14,  4.53it/s]                                                       48%|████▊     | 6192/12776 [1:05:51<24:14,  4.53it/s] 48%|████▊     | 6193/12776 [1:05:51<27:01,  4.06it/s]                                                       48%|████▊     | 6193/12776 [1:05:51<27:01,  4.06it/s] 48%|████▊     | 6194/12776 [1:05:52<25:39,  4.28it/s]                                                       48%|████▊     | 6194/12776 [1:05:52<25:39,  4.28it/s] 48%|████▊     | 6195/12776 [1:05:52<24:26,  4.49it/s]                                                       48%|████▊     | 6195/12776 [1:05:52<24:26,  4.49it/s] 48%|████▊     | 6196/12776 [1:05:52<23:27,  4.67it/s]                                                       48%|████▊     | 6196/12776 [1:05:52<23:27,  4.67it/s] 49%|████▊     | 6197/12776 [1:05:52<22:45,  4.82it/s]                                                       49%|████▊     | 6197/12776 [1:05:52<22:45,  4.82it/s] 49%|████▊     | 6198/12776 [1:05:52<21:59,  4.99it/s]                                                       49%|████▊     | 6198/12776 [1:05:52<21:59,  4.99it/s] 49%|████▊     | 6199/12776 [1:05:53<24:55,  4.40it/s]                                                       49%|████▊     | 6199/12776 [1:05:53<24:55,  4.40it/s] 49%|████▊     | 6200/12776 [1:05:53<38:50,  2.82it/s]                                                       49%|████▊     | 6200/12776 [1:05:53<38:50,  2.82it/s] 49%|████▊     | 6201/12776 [1:05:55<1:07:53,  1.61it/s]                                                         49%|████▊     | 6201/12776 [1:05:55<1:07:53,  1.61it/s] 49%|████▊     | 6202/12776 [1:05:56<1:18:14,  1.40it/s]                                                         49%|████▊     | 6202/12776 [1:05:56<1:18:14,  1.40it/s] 49%|████▊     | 6203/12776 [1:05:57<1:25:48,  1.28it/s]                                                         49%|████▊     | 6203/12776 [1:05:57<1:25:48,  1.28it/s] 49%|████▊     | 6204/12776 [1:05:57<1:25:08,  1.29it/s]                                                         49%|████▊     | 6204/12776 [1:05:57<1:25:08,  1.29it/s] 49%|████▊     | 6205/12776 [1:05:58<1:24:03,  1.30it/s]                                                         49%|████▊     | 6205/12776 [1:05:58<1:24:03,  1.30it/s] 49%|████▊     | 6206/12776 [1:05:59<1:21:33,  1.34it/s]                                                         49%|████▊     | 6206/12776 [1:05:59<1:21:33,  1.34it/s] 49%|████▊     | 6207/12776 [1:05:59<1:21:57,  1.34it/s]                                                         49%|████▊     | 6207/12776 [1:05:59<1:21:57,  1.34it/s] 49%|████▊     | 6208/12776 [1:06:00<1:17:40,  1.41it/s]                                                         49%|████▊     | 6208/12776 [1:06:00<1:17:40,  1.41it/s] 49%|████▊     | 6209/12776 [1:06:01<1:12:53,  1.50it/s]                                                         49%|████▊     | 6209/12776 [1:06:01<1:12:53,  1.50it/s] 49%|████▊     | 6210/12776 [1:06:01<1:08:58,  1.59it/s]                                                         49%|████▊     | 6210/12776 [1:06:01<1:08:58,  1.59it/s] 49%|████▊     | 6211/12776 [1:06:02<1:08:46,  1.59it/s]                                                         49%|████▊     | 6211/12776 [1:06:02<1:08:46,  1.59it/s] 49%|████▊     | 6212/12776 [1:06:02<1:04:26,  1.70it/s]                                                         49%|████▊     | 6212/12776 [1:06:02<1:04:26,  1.70it/s] 49%|████▊     | 6213/12776 [1:06:03<1:05:16,  1.68it/s]                                                         49%|████▊     | 6213/12776 [1:06:03<1:05:16,  1.68it/s] 49%|████▊     | 6214/12776 [1:06:03<1:00:23,  1.81it/s]                                                         49%|████▊     | 6214/12776 [1:06:03<1:00:23,  1.81it/s] 49%|████▊     | 6215/12776 [1:06:04<56:34,  1.93it/s]                                                         49%|████▊     | 6215/12776 [1:06:04<56:34,  1.93it/s] 49%|████▊     | 6216/12776 [1:06:04<56:41,  1.93it/s]                                                       49%|████▊     | 6216/12776 [1:06:04<56:41,  1.93it/s] 49%|████▊     | 6217/12776 [1:06:05<52:49,  2.07it/s]                                                       49%|████▊     | 6217/12776 [1:06:05<52:49,  2.07it/s] 49%|████▊     | 6218/12776 [1:06:05<52:00,  2.10it/s]                                                       49%|████▊     | 6218/12776 [1:06:05<52:00,  2.10it/s] 49%|████▊     | 6219/12776 [1:06:06<49:02,  2.23it/s]                                                       49%|████▊     | 6219/12776 [1:06:06<49:02,  2.23it/s] 49%|████▊     | 6220/12776 [1:06:06<46:26,  2.35it/s]                                                       49%|████▊     | 6220/12776 [1:06:06<46:26,  2.35it/s] 49%|████▊     | 6221/12776 [1:06:06<45:51,  2.38it/s]                                                       49%|████▊     | 6221/12776 [1:06:06<45:51,  2.38it/s] 49%|████▊     | 6222/12776 [1:06:07<43:30,  2.51it/s]                                                       49%|████▊     | 6222/12776 [1:06:07<43:30,  2.51it/s] 49%|████▊     | 6223/12776 [1:06:07<41:26,  2.64it/s]                                                       49%|████▊     | 6223/12776 [1:06:07<41:26,  2.64it/s] 49%|████▊     | 6224/12776 [1:06:08<43:40,  2.50it/s]                                                       49%|████▊     | 6224/12776 [1:06:08<43:40,  2.50it/s] 49%|████▊     | 6225/12776 [1:06:08<41:07,  2.66it/s]                                                       49%|████▊     | 6225/12776 [1:06:08<41:07,  2.66it/s] 49%|████▊     | 6226/12776 [1:06:08<38:41,  2.82it/s]                                                       49%|████▊     | 6226/12776 [1:06:08<38:41,  2.82it/s] 49%|████▊     | 6227/12776 [1:06:08<36:44,  2.97it/s]                                                       49%|████▊     | 6227/12776 [1:06:08<36:44,  2.97it/s] 49%|████▊     | 6228/12776 [1:06:09<37:23,  2.92it/s]                                                       49%|████▊     | 6228/12776 [1:06:09<37:23,  2.92it/s] 49%|████▉     | 6229/12776 [1:06:09<35:18,  3.09it/s]                                                       49%|████▉     | 6229/12776 [1:06:09<35:18,  3.09it/s] 49%|████▉     | 6230/12776 [1:06:09<33:30,  3.26it/s]                                                       49%|████▉     | 6230/12776 [1:06:09<33:30,  3.26it/s] 49%|████▉     | 6231/12776 [1:06:10<31:58,  3.41it/s]                                                      {'loss': 0.4101, 'grad_norm': 1.023521900177002, 'learning_rate': 0.00016209677419354835, 'epoch': 0.96}
+{'loss': 0.4846, 'grad_norm': 0.9346298575401306, 'learning_rate': 0.00016207233626588463, 'epoch': 0.96}
+{'loss': 0.526, 'grad_norm': 1.02970290184021, 'learning_rate': 0.0001620478983382209, 'epoch': 0.96}
+{'loss': 0.346, 'grad_norm': 0.6798651218414307, 'learning_rate': 0.00016202346041055716, 'epoch': 0.96}
+{'loss': 0.3458, 'grad_norm': 0.7402242422103882, 'learning_rate': 0.00016199902248289344, 'epoch': 0.96}
+{'loss': 0.328, 'grad_norm': 0.9456111788749695, 'learning_rate': 0.00016197458455522972, 'epoch': 0.96}
+{'loss': 0.4049, 'grad_norm': 0.9947827458381653, 'learning_rate': 0.00016195014662756597, 'epoch': 0.96}
+{'loss': 0.2766, 'grad_norm': 0.9443894028663635, 'learning_rate': 0.00016192570869990222, 'epoch': 0.96}
+{'loss': 0.4539, 'grad_norm': 0.9167144298553467, 'learning_rate': 0.0001619012707722385, 'epoch': 0.96}
+{'loss': 0.608, 'grad_norm': 1.195816993713379, 'learning_rate': 0.00016187683284457475, 'epoch': 0.96}
+{'loss': 0.5446, 'grad_norm': 1.4440490007400513, 'learning_rate': 0.00016185239491691103, 'epoch': 0.96}
+{'loss': 0.3242, 'grad_norm': 0.7405814528465271, 'learning_rate': 0.0001618279569892473, 'epoch': 0.97}
+{'loss': 0.4028, 'grad_norm': 1.427616000175476, 'learning_rate': 0.00016180351906158356, 'epoch': 0.97}
+{'loss': 0.6549, 'grad_norm': 1.6844470500946045, 'learning_rate': 0.00016177908113391984, 'epoch': 0.97}
+{'loss': 0.4393, 'grad_norm': 2.0406758785247803, 'learning_rate': 0.00016175464320625612, 'epoch': 0.97}
+{'loss': 0.4588, 'grad_norm': 1.3769197463989258, 'learning_rate': 0.00016173020527859234, 'epoch': 0.97}
+{'loss': 0.4175, 'grad_norm': 1.0385569334030151, 'learning_rate': 0.00016170576735092862, 'epoch': 0.97}
+{'loss': 0.4402, 'grad_norm': 1.2535983324050903, 'learning_rate': 0.0001616813294232649, 'epoch': 0.97}
+{'loss': 0.3781, 'grad_norm': 1.0669677257537842, 'learning_rate': 0.00016165689149560115, 'epoch': 0.97}
+{'loss': 0.5489, 'grad_norm': 1.9178048372268677, 'learning_rate': 0.00016163245356793743, 'epoch': 0.97}
+{'loss': 0.6525, 'grad_norm': 1.401336908340454, 'learning_rate': 0.0001616080156402737, 'epoch': 0.97}
+{'loss': 0.8831, 'grad_norm': 1.5251128673553467, 'learning_rate': 0.00016158357771260996, 'epoch': 0.97}
+{'loss': 0.906, 'grad_norm': 2.3394248485565186, 'learning_rate': 0.00016155913978494623, 'epoch': 0.97}
+{'loss': 0.7642, 'grad_norm': 1.7218315601348877, 'learning_rate': 0.0001615347018572825, 'epoch': 0.97}
+{'loss': 0.6152, 'grad_norm': 1.3168402910232544, 'learning_rate': 0.00016151026392961874, 'epoch': 0.97}
+{'loss': 1.0541, 'grad_norm': 1.864134430885315, 'learning_rate': 0.00016148582600195502, 'epoch': 0.97}
+{'loss': 0.9664, 'grad_norm': 2.145655870437622, 'learning_rate': 0.0001614613880742913, 'epoch': 0.97}
+{'loss': 0.7563, 'grad_norm': 1.7807810306549072, 'learning_rate': 0.00016143695014662755, 'epoch': 0.97}
+{'loss': 0.7937, 'grad_norm': 2.374579906463623, 'learning_rate': 0.00016141251221896382, 'epoch': 0.97}
+{'loss': 1.1843, 'grad_norm': 2.7262957096099854, 'learning_rate': 0.0001613880742913001, 'epoch': 0.97}
+{'loss': 1.443, 'grad_norm': 2.082878589630127, 'learning_rate': 0.00016136363636363633, 'epoch': 0.97}
+{'loss': 0.9447, 'grad_norm': 3.2135493755340576, 'learning_rate': 0.0001613391984359726, 'epoch': 0.97}
+{'loss': 0.7122, 'grad_norm': 1.5901567935943604, 'learning_rate': 0.00016131476050830888, 'epoch': 0.97}
+{'loss': 1.1293, 'grad_norm': 1.4855965375900269, 'learning_rate': 0.00016129032258064513, 'epoch': 0.97}
+{'loss': 1.2107, 'grad_norm': 3.177372694015503, 'learning_rate': 0.0001612658846529814, 'epoch': 0.97}
+{'loss': 0.9807, 'grad_norm': 2.5584800243377686, 'learning_rate': 0.0001612414467253177, 'epoch': 0.97}
+{'loss': 0.876, 'grad_norm': 1.333678960800171, 'learning_rate': 0.00016121700879765394, 'epoch': 0.97}
+{'loss': 0.8509, 'grad_norm': 2.623502254486084, 'learning_rate': 0.00016119257086999022, 'epoch': 0.97}
+{'loss': 1.3429, 'grad_norm': 1.4758764505386353, 'learning_rate': 0.0001611681329423265, 'epoch': 0.97}
+{'loss': 1.4386, 'grad_norm': 1.8458406925201416, 'learning_rate': 0.00016114369501466272, 'epoch': 0.97}
+{'loss': 0.7314, 'grad_norm': 1.940486192703247, 'learning_rate': 0.000161119257086999, 'epoch': 0.97}
+{'loss': 1.3775, 'grad_norm': 2.4906039237976074, 'learning_rate': 0.00016109481915933528, 'epoch': 0.97}
+{'loss': 0.4396, 'grad_norm': 1.3360508680343628, 'learning_rate': 0.00016107038123167153, 'epoch': 0.97}
+{'loss': 0.7913, 'grad_norm': 3.2190940380096436, 'learning_rate': 0.0001610459433040078, 'epoch': 0.97}
+{'loss': 0.5736, 'grad_norm': 1.5043747425079346, 'learning_rate': 0.0001610215053763441, 'epoch': 0.97}
+{'loss': 0.775, 'grad_norm': 2.0739946365356445, 'learning_rate': 0.00016099706744868034, 'epoch': 0.97}
+{'loss': 1.0023, 'grad_norm': 3.4089341163635254, 'learning_rate': 0.00016097262952101662, 'epoch': 0.97}
+{'loss': 0.3349, 'grad_norm': 0.49759766459465027, 'learning_rate': 0.0001609481915933529, 'epoch': 0.97}
+{'loss': 0.2845, 'grad_norm': 0.44160133600234985, 'learning_rate': 0.00016092375366568912, 'epoch': 0.97}
+{'loss': 0.3082, 'grad_norm': 0.6713477969169617, 'learning_rate': 0.0001608993157380254, 'epoch': 0.97}
+{'loss': 0.2274, 'grad_norm': 0.48395052552223206, 'learning_rate': 0.00016087487781036168, 'epoch': 0.97}
+{'loss': 0.3094, 'grad_norm': 0.5824487805366516, 'learning_rate': 0.00016085043988269793, 'epoch': 0.97}
+{'loss': 0.2379, 'grad_norm': 0.6694998145103455, 'learning_rate': 0.0001608260019550342, 'epoch': 0.97}
+{'loss': 0.2673, 'grad_norm': 0.701755166053772, 'learning_rate': 0.00016080156402737048, 'epoch': 0.97}
+{'loss': 0.403, 'grad_norm': 0.9201617240905762, 'learning_rate': 0.0001607771260997067, 'epoch': 0.97}
+{'loss': 0.3029, 'grad_norm': 0.9466022849082947, 'learning_rate': 0.000160752688172043, 'epoch': 0.97}
+{'loss': 0.3822, 'grad_norm': 1.3907926082611084, 'learning_rate': 0.00016072825024437927, 'epoch': 0.97}
+{'loss': 0.5414, 'grad_norm': 0.9519656300544739, 'learning_rate': 0.00016070381231671552, 'epoch': 0.97}
+{'loss': 0.5634, 'grad_norm': 1.2506569623947144, 'learning_rate': 0.0001606793743890518, 'epoch': 0.97}
+{'loss': 0.4921, 'grad_norm': 0.7912557125091553, 'learning_rate': 0.00016065493646138807, 'epoch': 0.97}
+{'loss': 0.3606, 'grad_norm': 1.3480660915374756, 'learning_rate': 0.00016063049853372432, 'epoch': 0.97}
+{'loss': 0.4046, 'grad_norm': 1.24032461643219, 'learning_rate': 0.0001606060606060606, 'epoch': 0.97}
+{'loss': 0.6278, 'grad_norm': 1.4148228168487549, 'learning_rate': 0.00016058162267839688, 'epoch': 0.97}
+{'loss': 0.6703, 'grad_norm': 2.0846927165985107, 'learning_rate': 0.0001605571847507331, 'epoch': 0.97}
+{'loss': 0.6913, 'grad_norm': 2.4335579872131348, 'learning_rate': 0.00016053274682306938, 'epoch': 0.97}
+{'loss': 0.4337, 'grad_norm': 1.5219314098358154, 'learning_rate': 0.00016050830889540566, 'epoch': 0.97}
+{'loss': 0.3213, 'grad_norm': 3.503244400024414, 'learning_rate': 0.0001604838709677419, 'epoch': 0.97}
+{'loss': 0.3756, 'grad_norm': 1.0507615804672241, 'learning_rate': 0.0001604594330400782, 'epoch': 0.97}
+{'loss': 0.808, 'grad_norm': 1.3683404922485352, 'learning_rate': 0.00016043499511241447, 'epoch': 0.97}
+{'loss': 0.7998, 'grad_norm': 3.0311946868896484, 'learning_rate': 0.00016041055718475072, 'epoch': 0.97}
+{'loss': 0.473, 'grad_norm': 1.0792346000671387, 'learning_rate': 0.000160386119257087, 'epoch': 0.97}
+{'loss': 0.4845, 'grad_norm': 1.5082753896713257, 'learning_rate': 0.00016036168132942325, 'epoch': 0.97}
+{'loss': 0.4439, 'grad_norm': 1.629244089126587, 'learning_rate': 0.0001603372434017595, 'epoch': 0.97}
+{'loss': 0.7095, 'grad_norm': 2.6280176639556885, 'learning_rate': 0.00016031280547409578, 'epoch': 0.97}
+{'loss': 0.6459, 'grad_norm': 1.1485744714736938, 'learning_rate': 0.00016028836754643206, 'epoch': 0.97}
+{'loss': 0.4601, 'grad_norm': 3.294517755508423, 'learning_rate': 0.0001602639296187683, 'epoch': 0.98}
+{'loss': 0.9412, 'grad_norm': 2.4806206226348877, 'learning_rate': 0.0001602394916911046, 'epoch': 0.98}
+ 49%|████▉     | 6231/12776 [1:06:10<31:58,  3.41it/s] 49%|████▉     | 6232/12776 [1:06:10<33:08,  3.29it/s]                                                       49%|████▉     | 6232/12776 [1:06:10<33:08,  3.29it/s] 49%|████▉     | 6233/12776 [1:06:10<31:12,  3.49it/s]                                                       49%|████▉     | 6233/12776 [1:06:10<31:12,  3.49it/s] 49%|████▉     | 6234/12776 [1:06:10<29:41,  3.67it/s]                                                       49%|████▉     | 6234/12776 [1:06:10<29:41,  3.67it/s] 49%|████▉     | 6235/12776 [1:06:11<28:26,  3.83it/s]                                                       49%|████▉     | 6235/12776 [1:06:11<28:26,  3.83it/s] 49%|████▉     | 6236/12776 [1:06:11<31:35,  3.45it/s]                                                       49%|████▉     | 6236/12776 [1:06:11<31:35,  3.45it/s] 49%|████▉     | 6237/12776 [1:06:11<29:26,  3.70it/s]                                                       49%|████▉     | 6237/12776 [1:06:11<29:26,  3.70it/s] 49%|████▉     | 6238/12776 [1:06:11<27:44,  3.93it/s]                                                       49%|████▉     | 6238/12776 [1:06:11<27:44,  3.93it/s] 49%|████▉     | 6239/12776 [1:06:12<26:18,  4.14it/s]                                                       49%|████▉     | 6239/12776 [1:06:12<26:18,  4.14it/s] 49%|████▉     | 6240/12776 [1:06:12<25:20,  4.30it/s]                                                       49%|████▉     | 6240/12776 [1:06:12<25:20,  4.30it/s] 49%|████▉     | 6241/12776 [1:06:12<26:48,  4.06it/s]                                                       49%|████▉     | 6241/12776 [1:06:12<26:48,  4.06it/s] 49%|████▉     | 6242/12776 [1:06:12<25:29,  4.27it/s]                                                       49%|████▉     | 6242/12776 [1:06:12<25:29,  4.27it/s] 49%|████▉     | 6243/12776 [1:06:13<24:29,  4.45it/s]                                                       49%|████▉     | 6243/12776 [1:06:13<24:29,  4.45it/s] 49%|████▉     | 6244/12776 [1:06:13<23:43,  4.59it/s]                                                       49%|████▉     | 6244/12776 [1:06:13<23:43,  4.59it/s] 49%|████▉     | 6245/12776 [1:06:13<23:04,  4.72it/s]                                                       49%|████▉     | 6245/12776 [1:06:13<23:04,  4.72it/s] 49%|████▉     | 6246/12776 [1:06:13<26:30,  4.10it/s]                                                       49%|████▉     | 6246/12776 [1:06:13<26:30,  4.10it/s] 49%|████▉     | 6247/12776 [1:06:13<24:51,  4.38it/s]                                                       49%|████▉     | 6247/12776 [1:06:13<24:51,  4.38it/s] 49%|████▉     | 6248/12776 [1:06:14<23:30,  4.63it/s]                                                       49%|████▉     | 6248/12776 [1:06:14<23:30,  4.63it/s] 49%|████▉     | 6249/12776 [1:06:14<22:31,  4.83it/s]                                                       49%|████▉     | 6249/12776 [1:06:14<22:31,  4.83it/s] 49%|████▉     | 6250/12776 [1:06:15<39:25,  2.76it/s]                                                       49%|████▉     | 6250/12776 [1:06:15<39:25,  2.76it/s] 49%|████▉     | 6251/12776 [1:06:16<1:18:12,  1.39it/s]                                                         49%|████▉     | 6251/12776 [1:06:16<1:18:12,  1.39it/s] 49%|████▉     | 6252/12776 [1:06:17<1:26:19,  1.26it/s]                                                         49%|████▉     | 6252/12776 [1:06:17<1:26:19,  1.26it/s] 49%|████▉     | 6253/12776 [1:06:18<1:31:00,  1.19it/s]                                                         49%|████▉     | 6253/12776 [1:06:18<1:31:00,  1.19it/s] 49%|████▉     | 6254/12776 [1:06:19<1:29:56,  1.21it/s]                                                         49%|████▉     | 6254/12776 [1:06:19<1:29:56,  1.21it/s] 49%|████▉     | 6255/12776 [1:06:20<1:27:14,  1.25it/s]                                                         49%|████▉     | 6255/12776 [1:06:20<1:27:14,  1.25it/s] 49%|████▉     | 6256/12776 [1:06:20<1:26:48,  1.25it/s]                                                         49%|████▉     | 6256/12776 [1:06:20<1:26:48,  1.25it/s] 49%|████▉     | 6257/12776 [1:06:21<1:24:40,  1.28it/s]                                                         49%|████▉     | 6257/12776 [1:06:21<1:24:40,  1.28it/s] 49%|████▉     | 6258/12776 [1:06:22<1:20:01,  1.36it/s]                                                         49%|████▉     | 6258/12776 [1:06:22<1:20:01,  1.36it/s] 49%|████▉     | 6259/12776 [1:06:22<1:21:10,  1.34it/s]                                                         49%|████▉     | 6259/12776 [1:06:22<1:21:10,  1.34it/s] 49%|████▉     | 6260/12776 [1:06:23<1:15:28,  1.44it/s]                                                         49%|█���██▉     | 6260/12776 [1:06:23<1:15:28,  1.44it/s] 49%|████▉     | 6261/12776 [1:06:24<1:13:46,  1.47it/s]                                                         49%|████▉     | 6261/12776 [1:06:24<1:13:46,  1.47it/s] 49%|████▉     | 6262/12776 [1:06:24<1:08:56,  1.57it/s]                                                         49%|████▉     | 6262/12776 [1:06:24<1:08:56,  1.57it/s] 49%|████▉     | 6263/12776 [1:06:25<1:07:40,  1.60it/s]                                                         49%|████▉     | 6263/12776 [1:06:25<1:07:40,  1.60it/s] 49%|████▉     | 6264/12776 [1:06:25<1:02:47,  1.73it/s]                                                         49%|████▉     | 6264/12776 [1:06:25<1:02:47,  1.73it/s] 49%|████▉     | 6265/12776 [1:06:26<1:01:48,  1.76it/s]                                                         49%|████▉     | 6265/12776 [1:06:26<1:01:48,  1.76it/s] 49%|████▉     | 6266/12776 [1:06:26<57:27,  1.89it/s]                                                         49%|████▉     | 6266/12776 [1:06:26<57:27,  1.89it/s] 49%|████▉     | 6267/12776 [1:06:27<56:55,  1.91it/s]                                                       49%|████▉     | 6267/12776 [1:06:27<56:55,  1.91it/s] 49%|████▉     | 6268/12776 [1:06:27<52:38,  2.06it/s]                                                       49%|████▉     | 6268/12776 [1:06:27<52:38,  2.06it/s] 49%|████▉     | 6269/12776 [1:06:28<49:12,  2.20it/s]                                                       49%|████▉     | 6269/12776 [1:06:28<49:12,  2.20it/s] 49%|████▉     | 6270/12776 [1:06:28<47:47,  2.27it/s]                                                       49%|████▉     | 6270/12776 [1:06:28<47:47,  2.27it/s] 49%|████▉     | 6271/12776 [1:06:28<45:10,  2.40it/s]                                                       49%|████▉     | 6271/12776 [1:06:28<45:10,  2.40it/s] 49%|████▉     | 6272/12776 [1:06:29<42:58,  2.52it/s]                                                       49%|████▉     | 6272/12776 [1:06:29<42:58,  2.52it/s] 49%|████▉     | 6273/12776 [1:06:29<43:47,  2.47it/s]                                                       49%|████▉     | 6273/12776 [1:06:29<43:47,  2.47it/s] 49%|████▉     | 6274/12776 [1:06:29<41:29,  2.61it/s]                                                       49%|████▉     | 6274/12776 [1:06:29<41:29,  2.61it/s] 49%|████▉     | 6275/12776 [1:06:30<39:32,  2.74it/s]                                                       49%|████▉     | 6275/12776 [1:06:30<39:32,  2.74it/s] 49%|████▉     | 6276/12776 [1:06:30<37:37,  2.88it/s]                                                       49%|████▉     | 6276/12776 [1:06:30<37:37,  2.88it/s] 49%|████▉     | 6277/12776 [1:06:30<36:59,  2.93it/s]                                                       49%|████▉     | 6277/12776 [1:06:30<36:59,  2.93it/s] 49%|████▉     | 6278/12776 [1:06:31<35:17,  3.07it/s]                                                       49%|████▉     | 6278/12776 [1:06:31<35:17,  3.07it/s] 49%|████▉     | 6279/12776 [1:06:31<33:45,  3.21it/s]                                                       49%|████▉     | 6279/12776 [1:06:31<33:45,  3.21it/s] 49%|████▉     | 6280/12776 [1:06:31<32:19,  3.35it/s]                                                       49%|████▉     | 6280/12776 [1:06:31<32:19,  3.35it/s] 49%|████▉     | 6281/12776 [1:06:32<32:17,  3.35it/s]                                                       49%|████▉     | 6281/12776 [1:06:32<32:17,  3.35it/s] 49%|████▉     | 6282/12776 [1:06:32<30:52,  3.51it/s]                                                       49%|████▉     | 6282/12776 [1:06:32<30:52,  3.51it/s] 49%|████▉     | 6283/12776 [1:06:32<29:40,  3.65it/s]                                                       49%|████▉     | 6283/12776 [1:06:32<29:40,  3.65it/s] 49%|████▉     | 6284/12776 [1:06:32<28:44,  3.76it/s]                                                       49%|████▉     | 6284/12776 [1:06:32<28:44,  3.76it/s] 49%|████▉     | 6285/12776 [1:06:33<32:11,  3.36it/s]                                                       49%|████▉     | 6285/12776 [1:06:33<32:11,  3.36it/s] 49%|████▉     | 6286/12776 [1:06:33<30:07,  3.59it/s]                                                       49%|████▉     | 6286/12776 [1:06:33<30:07,  3.59it/s] 49%|████▉     | 6287/12776 [1:06:33<28:34,  3.78it/s]                                                       49%|████▉     | 6287/12776 [1:06:33<28:34,  3.78it/s] 49%|████▉     | 6288/12776 [1:06:33<27:06,  3.99it/s]                                                       49%|████▉     | 6288/12776 [1:06:33<27:06,  3.99it/s] 49%|████▉     | 6289/12776 [1:06:34<29:05,  3.72it/s]                                                       49%|████▉     | 6289/12776 [1:06:34<29:05,  3.72it/s] 49%|████▉     | 6290/12776 [1:06:34<27:14,  3.97it/s]                                                       49%|████▉     | 6290/12776 [1:06:34<27:14,  3.97it/s] 49%|████▉     | 6291/12776 [1:06:34<26:03,  4.15it/s]                                                       49%|████▉     | 6291/12776 [1:06:34<26:03,  4.15it/s] 49%|████▉     | 6292/12776 [1:06:34<25:01,  4.32it/s]                                                       49%|████▉     | 6292/12776 [1:06:34<25:01,  4.32it/s] 49%|████▉     | 6293/12776 [1:06:35<24:08,  4.48it/s]                                                       49%|████▉     | 6293/12776 [1:06:35<24:08,  4.48it/s] 49%|████▉     | 6294/12776 [1:06:35<26:42,  4.04it/s]                                                       49%|████▉     | 6294/12776 [1:06:35<26:42,  4.04it/s] 49%|████▉     | 6295/12776 [1:06:35<25:12,  4.29it/s]                                                       49%|████▉     | 6295/12776 [1:06:35<25:12,  4.29it/s] 49%|████▉     | 6296/12776 [1:06:35<24:00,  4.50it/s]                                                       49%|████▉     | 6296/12776 [1:06:35<24:00,  4.50it/s] 49%|████▉     | 6297/12776 [1:06:35<23:06,  4.67it/s]                                                       49%|████▉     | 6297/12776 [1:06:35<23:06,  4.67it/s] 49%|████▉     | 6298/12776 [1:06:36<22:22,  4.83it/s]                                                       49%|████▉     | 6298/12776 [1:06:36<22:22,  4.83it/s] 49%|████▉     | 6299/12776 [1:06:36<21:41,  4.98it/s]                                                       49%|████▉     | 6299/12776 [1:06:36<21:41,  4.98it/s] 49%|████▉     | 6300/12776 [1:06:36<38:22,  2.81it/s]                                                       49%|████▉     | 6300/12776 [1:06:37<38:22,  2.81it/s] 49%|████▉     | 6301/12776 [1:06:38<1:14:13,  1.45it/s]                                                         49%|████▉     | 6301/12776 [1:06:38<1:14:13,  1.45it/s] 49%|████▉     | 6302/12776 [1:06:39<1:24:21,  1.28it/s]                                                         49%|████▉     | 6302/12776 [1:06:39<1:24:21,  1.28it/s] 49%|████▉     | 6303/12776 [1:06:40<1:28:43,  1.22it/s]                                                         49%|████▉     | 6303/12776 [1:06:40<1:28:43,  1.22it/s] 49%|████▉     | 6304/12776 [1:06:41<1:32:23,  1.17it/s]                                                         49%|████▉     | 6304/12776 [1:06:41<1:32:23,  1.17it/s] 49%|████▉     | 6305/12776 [1:06:42<1:30:44,  1.19it/s]                                                         49%|████▉     | 6305/12776 [1:06:42<1:30:44,  1.19it/s] 49%|████▉     | 6306/12776 [1:06:42<1:27:27,  1.23it/s]                                                         49%|████▉     | 6306/12776 [1:06:42<1:27:27,  1.23it/s] 49%|████▉     | 6307/12776 [1:06:43<1:25:51,  1.26it/s]                                                         49%|████▉     | 6307/12776 [1:06:43<1:25:51,  1.26it/s] 49%|████▉     | 6308/12776 [1:06:44<1:23:34,  1.29it/s]                                                        {'loss': 0.6169, 'grad_norm': 1.6524327993392944, 'learning_rate': 0.00016021505376344087, 'epoch': 0.98}
+{'loss': 0.7836, 'grad_norm': 1.4262551069259644, 'learning_rate': 0.0001601906158357771, 'epoch': 0.98}
+{'loss': 0.3791, 'grad_norm': 1.37894868850708, 'learning_rate': 0.00016016617790811337, 'epoch': 0.98}
+{'loss': 1.2778, 'grad_norm': 3.260524272918701, 'learning_rate': 0.00016014173998044965, 'epoch': 0.98}
+{'loss': 0.5008, 'grad_norm': 1.1274797916412354, 'learning_rate': 0.0001601173020527859, 'epoch': 0.98}
+{'loss': 1.2082, 'grad_norm': 3.6673662662506104, 'learning_rate': 0.00016009286412512218, 'epoch': 0.98}
+{'loss': 0.8466, 'grad_norm': 2.9804086685180664, 'learning_rate': 0.00016006842619745846, 'epoch': 0.98}
+{'loss': 1.1366, 'grad_norm': 2.351480007171631, 'learning_rate': 0.0001600439882697947, 'epoch': 0.98}
+{'loss': 0.8895, 'grad_norm': 2.855189561843872, 'learning_rate': 0.00016001955034213098, 'epoch': 0.98}
+{'loss': 1.5563, 'grad_norm': 3.6381757259368896, 'learning_rate': 0.00015999511241446726, 'epoch': 0.98}
+{'loss': 1.0543, 'grad_norm': 1.6015832424163818, 'learning_rate': 0.0001599706744868035, 'epoch': 0.98}
+{'loss': 0.8863, 'grad_norm': 1.6288787126541138, 'learning_rate': 0.00015994623655913977, 'epoch': 0.98}
+{'loss': 1.1518, 'grad_norm': 3.075251579284668, 'learning_rate': 0.00015992179863147604, 'epoch': 0.98}
+{'loss': 1.1322, 'grad_norm': 1.8009350299835205, 'learning_rate': 0.0001598973607038123, 'epoch': 0.98}
+{'loss': 0.5735, 'grad_norm': 1.083701491355896, 'learning_rate': 0.00015987292277614857, 'epoch': 0.98}
+{'loss': 1.359, 'grad_norm': 3.801417589187622, 'learning_rate': 0.00015984848484848485, 'epoch': 0.98}
+{'loss': 1.027, 'grad_norm': 3.566204309463501, 'learning_rate': 0.0001598240469208211, 'epoch': 0.98}
+{'loss': 1.3133, 'grad_norm': 1.9416583776474, 'learning_rate': 0.00015979960899315738, 'epoch': 0.98}
+{'loss': 1.1721, 'grad_norm': 4.796366214752197, 'learning_rate': 0.00015977517106549363, 'epoch': 0.98}
+{'loss': 1.2208, 'grad_norm': 2.206047296524048, 'learning_rate': 0.00015975073313782988, 'epoch': 0.98}
+{'loss': 0.3513, 'grad_norm': 0.7884498834609985, 'learning_rate': 0.00015972629521016616, 'epoch': 0.98}
+{'loss': 0.2797, 'grad_norm': 0.5849615931510925, 'learning_rate': 0.00015970185728250244, 'epoch': 0.98}
+{'loss': 0.3183, 'grad_norm': 0.5109837651252747, 'learning_rate': 0.0001596774193548387, 'epoch': 0.98}
+{'loss': 0.2323, 'grad_norm': 0.5798192620277405, 'learning_rate': 0.00015965298142717497, 'epoch': 0.98}
+{'loss': 0.259, 'grad_norm': 0.5868884921073914, 'learning_rate': 0.00015962854349951125, 'epoch': 0.98}
+{'loss': 0.2945, 'grad_norm': 0.7878360748291016, 'learning_rate': 0.00015960410557184747, 'epoch': 0.98}
+{'loss': 0.2745, 'grad_norm': 0.6286665201187134, 'learning_rate': 0.00015957966764418375, 'epoch': 0.98}
+{'loss': 0.4147, 'grad_norm': 1.0530362129211426, 'learning_rate': 0.00015955522971652003, 'epoch': 0.98}
+{'loss': 0.3675, 'grad_norm': 1.6230087280273438, 'learning_rate': 0.00015953079178885628, 'epoch': 0.98}
+{'loss': 0.4494, 'grad_norm': 0.9370027184486389, 'learning_rate': 0.00015950635386119256, 'epoch': 0.98}
+{'loss': 0.3247, 'grad_norm': 0.6508410573005676, 'learning_rate': 0.00015948191593352884, 'epoch': 0.98}
+{'loss': 0.4053, 'grad_norm': 0.7678089141845703, 'learning_rate': 0.0001594574780058651, 'epoch': 0.98}
+{'loss': 0.4773, 'grad_norm': 1.5006393194198608, 'learning_rate': 0.00015943304007820137, 'epoch': 0.98}
+{'loss': 0.4695, 'grad_norm': 1.076756238937378, 'learning_rate': 0.00015940860215053765, 'epoch': 0.98}
+{'loss': 0.4956, 'grad_norm': 1.24697744846344, 'learning_rate': 0.00015938416422287387, 'epoch': 0.98}
+{'loss': 0.4614, 'grad_norm': 1.1420774459838867, 'learning_rate': 0.00015935972629521015, 'epoch': 0.98}
+{'loss': 0.512, 'grad_norm': 1.9122599363327026, 'learning_rate': 0.00015933528836754643, 'epoch': 0.98}
+{'loss': 0.4518, 'grad_norm': 1.2293574810028076, 'learning_rate': 0.00015931085043988268, 'epoch': 0.98}
+{'loss': 0.5496, 'grad_norm': 2.1564345359802246, 'learning_rate': 0.00015928641251221896, 'epoch': 0.98}
+{'loss': 0.6363, 'grad_norm': 1.1697052717208862, 'learning_rate': 0.00015926197458455523, 'epoch': 0.98}
+{'loss': 0.3766, 'grad_norm': 1.1160368919372559, 'learning_rate': 0.00015923753665689149, 'epoch': 0.98}
+{'loss': 0.7845, 'grad_norm': 1.3110209703445435, 'learning_rate': 0.00015921309872922774, 'epoch': 0.98}
+{'loss': 0.3716, 'grad_norm': 1.1179189682006836, 'learning_rate': 0.00015918866080156402, 'epoch': 0.98}
+{'loss': 0.4372, 'grad_norm': 0.8737648129463196, 'learning_rate': 0.00015916422287390027, 'epoch': 0.98}
+{'loss': 0.7372, 'grad_norm': 1.6135969161987305, 'learning_rate': 0.00015913978494623654, 'epoch': 0.98}
+{'loss': 0.713, 'grad_norm': 2.5550239086151123, 'learning_rate': 0.00015911534701857282, 'epoch': 0.98}
+{'loss': 0.967, 'grad_norm': 2.2706096172332764, 'learning_rate': 0.00015909090909090907, 'epoch': 0.98}
+{'loss': 0.706, 'grad_norm': 2.025991678237915, 'learning_rate': 0.00015906647116324535, 'epoch': 0.98}
+{'loss': 0.5235, 'grad_norm': 1.7579374313354492, 'learning_rate': 0.00015904203323558163, 'epoch': 0.98}
+{'loss': 0.608, 'grad_norm': 1.4392457008361816, 'learning_rate': 0.00015901759530791786, 'epoch': 0.98}
+{'loss': 1.2177, 'grad_norm': 1.8196216821670532, 'learning_rate': 0.00015899315738025413, 'epoch': 0.98}
+{'loss': 0.6056, 'grad_norm': 4.727586269378662, 'learning_rate': 0.0001589687194525904, 'epoch': 0.98}
+{'loss': 0.7133, 'grad_norm': 1.8476710319519043, 'learning_rate': 0.00015894428152492666, 'epoch': 0.98}
+{'loss': 1.0066, 'grad_norm': 2.2918593883514404, 'learning_rate': 0.00015891984359726294, 'epoch': 0.98}
+{'loss': 0.7832, 'grad_norm': 1.7023167610168457, 'learning_rate': 0.00015889540566959922, 'epoch': 0.98}
+{'loss': 0.8247, 'grad_norm': 1.6587579250335693, 'learning_rate': 0.00015887096774193547, 'epoch': 0.98}
+{'loss': 1.1478, 'grad_norm': 2.7378196716308594, 'learning_rate': 0.00015884652981427175, 'epoch': 0.98}
+{'loss': 0.6136, 'grad_norm': 5.176706314086914, 'learning_rate': 0.00015882209188660803, 'epoch': 0.98}
+{'loss': 0.8779, 'grad_norm': 2.3430936336517334, 'learning_rate': 0.00015879765395894425, 'epoch': 0.98}
+{'loss': 1.3216, 'grad_norm': 2.579249382019043, 'learning_rate': 0.00015877321603128053, 'epoch': 0.98}
+{'loss': 1.1508, 'grad_norm': 2.4976139068603516, 'learning_rate': 0.0001587487781036168, 'epoch': 0.98}
+{'loss': 0.8853, 'grad_norm': 1.357665777206421, 'learning_rate': 0.00015872434017595306, 'epoch': 0.98}
+{'loss': 0.814, 'grad_norm': 2.1828815937042236, 'learning_rate': 0.00015869990224828934, 'epoch': 0.99}
+{'loss': 1.0242, 'grad_norm': 3.8649942874908447, 'learning_rate': 0.00015867546432062562, 'epoch': 0.99}
+{'loss': 1.4216, 'grad_norm': 2.397716760635376, 'learning_rate': 0.00015865102639296187, 'epoch': 0.99}
+{'loss': 0.9754, 'grad_norm': 2.9856514930725098, 'learning_rate': 0.00015862658846529812, 'epoch': 0.99}
+{'loss': 1.3687, 'grad_norm': 4.637380123138428, 'learning_rate': 0.0001586021505376344, 'epoch': 0.99}
+{'loss': 1.0291, 'grad_norm': 3.106740713119507, 'learning_rate': 0.00015857771260997065, 'epoch': 0.99}
+{'loss': 0.6318, 'grad_norm': 2.850269317626953, 'learning_rate': 0.00015855327468230693, 'epoch': 0.99}
+{'loss': 0.6223, 'grad_norm': 1.4052451848983765, 'learning_rate': 0.0001585288367546432, 'epoch': 0.99}
+{'loss': 0.3596, 'grad_norm': 0.5287259817123413, 'learning_rate': 0.00015850439882697946, 'epoch': 0.99}
+{'loss': 0.2857, 'grad_norm': 0.48303574323654175, 'learning_rate': 0.00015847996089931574, 'epoch': 0.99}
+{'loss': 0.3439, 'grad_norm': 0.5090639591217041, 'learning_rate': 0.000158455522971652, 'epoch': 0.99}
+{'loss': 0.2502, 'grad_norm': 0.6524388790130615, 'learning_rate': 0.00015843108504398824, 'epoch': 0.99}
+{'loss': 0.2383, 'grad_norm': 0.5750242471694946, 'learning_rate': 0.00015840664711632452, 'epoch': 0.99}
+{'loss': 0.3987, 'grad_norm': 1.1469411849975586, 'learning_rate': 0.0001583822091886608, 'epoch': 0.99}
+{'loss': 0.4182, 'grad_norm': 0.6706929206848145, 'learning_rate': 0.00015835777126099705, 'epoch': 0.99}
+ 49%|████▉     | 6308/12776 [1:06:44<1:23:34,  1.29it/s] 49%|████▉     | 6309/12776 [1:06:44<1:18:45,  1.37it/s]                                                         49%|████▉     | 6309/12776 [1:06:44<1:18:45,  1.37it/s] 49%|████▉     | 6310/12776 [1:06:45<1:13:38,  1.46it/s]                                                         49%|████▉     | 6310/12776 [1:06:45<1:13:38,  1.46it/s] 49%|████▉     | 6311/12776 [1:06:46<1:09:33,  1.55it/s]                                                         49%|████▉     | 6311/12776 [1:06:46<1:09:33,  1.55it/s] 49%|████▉     | 6312/12776 [1:06:46<1:07:36,  1.59it/s]                                                         49%|████▉     | 6312/12776 [1:06:46<1:07:36,  1.59it/s] 49%|████▉     | 6313/12776 [1:06:47<1:03:51,  1.69it/s]                                                         49%|████▉     | 6313/12776 [1:06:47<1:03:51,  1.69it/s] 49%|████▉     | 6314/12776 [1:06:47<1:00:13,  1.79it/s]                                                         49%|████▉     | 6314/12776 [1:06:47<1:00:13,  1.79it/s] 49%|████▉     | 6315/12776 [1:06:48<57:37,  1.87it/s]                                                         49%|████▉     | 6315/12776 [1:06:48<57:37,  1.87it/s] 49%|████▉     | 6316/12776 [1:06:48<54:43,  1.97it/s]                                                       49%|████▉     | 6316/12776 [1:06:48<54:43,  1.97it/s] 49%|████▉     | 6317/12776 [1:06:49<52:30,  2.05it/s]                                                       49%|████▉     | 6317/12776 [1:06:49<52:30,  2.05it/s] 49%|████▉     | 6318/12776 [1:06:49<49:55,  2.16it/s]                                                       49%|████▉     | 6318/12776 [1:06:49<49:55,  2.16it/s] 49%|████▉     | 6319/12776 [1:06:49<47:41,  2.26it/s]                                                       49%|████▉     | 6319/12776 [1:06:49<47:41,  2.26it/s] 49%|████▉     | 6320/12776 [1:06:50<49:48,  2.16it/s]                                                       49%|████▉     | 6320/12776 [1:06:50<49:48,  2.16it/s] 49%|████▉     | 6321/12776 [1:06:50<46:22,  2.32it/s]                                                       49%|████▉     | 6321/12776 [1:06:50<46:22,  2.32it/s] 49%|████▉     | 6322/12776 [1:06:51<43:44,  2.46it/s]                                                       49%|████▉     | 6322/12776 [1:06:51<43:44,  2.46it/s] 49%|████▉     | 6323/12776 [1:06:51<44:14,  2.43it/s]                                                       49%|████▉     | 6323/12776 [1:06:51<44:14,  2.43it/s] 49%|████▉     | 6324/12776 [1:06:51<42:28,  2.53it/s]                                                       49%|████▉     | 6324/12776 [1:06:51<42:28,  2.53it/s] 50%|████▉     | 6325/12776 [1:06:52<40:08,  2.68it/s]                                                       50%|████▉     | 6325/12776 [1:06:52<40:08,  2.68it/s] 50%|████▉     | 6326/12776 [1:06:52<40:27,  2.66it/s]                                                       50%|████▉     | 6326/12776 [1:06:52<40:27,  2.66it/s] 50%|████▉     | 6327/12776 [1:06:52<37:47,  2.84it/s]                                                       50%|████▉     | 6327/12776 [1:06:52<37:47,  2.84it/s] 50%|████▉     | 6328/12776 [1:06:53<35:33,  3.02it/s]                                                       50%|████▉     | 6328/12776 [1:06:53<35:33,  3.02it/s] 50%|████▉     | 6329/12776 [1:06:53<33:39,  3.19it/s]                                                       50%|████▉     | 6329/12776 [1:06:53<33:39,  3.19it/s] 50%|████▉     | 6330/12776 [1:06:53<31:57,  3.36it/s]                                                       50%|████▉     | 6330/12776 [1:06:53<31:57,  3.36it/s] 50%|████▉     | 6331/12776 [1:06:53<30:27,  3.53it/s]                                                       50%|████▉     | 6331/12776 [1:06:53<30:27,  3.53it/s] 50%|████▉     | 6332/12776 [1:06:54<29:26,  3.65it/s]                                                       50%|████▉     | 6332/12776 [1:06:54<29:26,  3.65it/s] 50%|████▉     | 6333/12776 [1:06:54<28:31,  3.76it/s]                                                       50%|████▉     | 6333/12776 [1:06:54<28:31,  3.76it/s] 50%|████▉     | 6334/12776 [1:06:54<30:45,  3.49it/s]                                                       50%|████▉     | 6334/12776 [1:06:54<30:45,  3.49it/s] 50%|████▉     | 6335/12776 [1:06:54<29:03,  3.69it/s]                                                       50%|████▉     | 6335/12776 [1:06:54<29:03,  3.69it/s] 50%|████▉     | 6336/12776 [1:06:55<27:42,  3.87it/s]                                                       50%|████▉     | 6336/12776 [1:06:55<27:42,  3.87it/s] 50%|████▉     | 6337/12776 [1:06:55<26:34,  4.04it/s]                                                       50%|████▉     | 6337/12776 [1:06:55<26:34,  4.04it/s] 50%|████▉     | 6338/12776 [1:06:55<27:47,  3.86it/s]                                                       50%|████▉     | 6338/12776 [1:06:55<27:47,  3.86it/s] 50%|████▉     | 6339/12776 [1:06:55<26:16,  4.08it/s]                                                       50%|████▉     | 6339/12776 [1:06:55<26:16,  4.08it/s] 50%|████▉     | 6340/12776 [1:06:56<25:13,  4.25it/s]                                                       50%|████▉     | 6340/12776 [1:06:56<25:13,  4.25it/s] 50%|████▉     | 6341/12776 [1:06:56<24:28,  4.38it/s]                                                       50%|████▉     | 6341/12776 [1:06:56<24:28,  4.38it/s] 50%|████▉     | 6342/12776 [1:06:56<23:45,  4.52it/s]                                                       50%|████▉     | 6342/12776 [1:06:56<23:45,  4.52it/s] 50%|████▉     | 6343/12776 [1:06:56<25:41,  4.17it/s]                                                       50%|████▉     | 6343/12776 [1:06:56<25:41,  4.17it/s] 50%|████▉     | 6344/12776 [1:06:57<24:18,  4.41it/s]                                                       50%|████▉     | 6344/12776 [1:06:57<24:18,  4.41it/s] 50%|████▉     | 6345/12776 [1:06:57<23:18,  4.60it/s]                                                       50%|████▉     | 6345/12776 [1:06:57<23:18,  4.60it/s] 50%|████▉     | 6346/12776 [1:06:57<22:28,  4.77it/s]                                                       50%|████▉     | 6346/12776 [1:06:57<22:28,  4.77it/s] 50%|████▉     | 6347/12776 [1:06:57<23:27,  4.57it/s]                                                       50%|████▉     | 6347/12776 [1:06:57<23:27,  4.57it/s] 50%|████▉     | 6348/12776 [1:06:57<26:35,  4.03it/s]                                                       50%|████▉     | 6348/12776 [1:06:57<26:35,  4.03it/s] 50%|████▉     | 6349/12776 [1:06:58<24:37,  4.35it/s]                                                       50%|████▉     | 6349/12776 [1:06:58<24:37,  4.35it/s] 50%|████▉     | 6350/12776 [1:06:58<39:47,  2.69it/s]                                                       50%|████▉     | 6350/12776 [1:06:58<39:47,  2.69it/s] 50%|████▉     | 6351/12776 [1:07:00<1:15:38,  1.42it/s]                                                         50%|████▉     | 6351/12776 [1:07:00<1:15:38,  1.42it/s] 50%|████▉     | 6352/12776 [1:07:01<1:27:02,  1.23it/s]                                                         50%|████▉     | 6352/12776 [1:07:01<1:27:02,  1.23it/s] 50%|████▉     | 6353/12776 [1:07:02<1:26:20,  1.24it/s]                                                         50%|████▉     | 6353/12776 [1:07:02<1:26:20,  1.24it/s] 50%|████▉     | 6354/12776 [1:07:02<1:25:01,  1.26it/s]                                                         50%|████▉     | 6354/12776 [1:07:02<1:25:01,  1.26it/s] 50%|████▉     | 6355/12776 [1:07:03<1:22:58,  1.29it/s]                                                         50%|████▉     | 6355/12776 [1:07:03<1:22:58,  1.29it/s] 50%|████▉     | 6356/12776 [1:07:04<1:18:12,  1.37it/s]                                                         50%|████▉     | 6356/12776 [1:07:04<1:18:12,  1.37it/s] 50%|████▉     | 6357/12776 [1:07:04<1:13:03,  1.46it/s]                                                         50%|████▉     | 6357/12776 [1:07:04<1:13:03,  1.46it/s] 50%|████▉     | 6358/12776 [1:07:05<1:08:29,  1.56it/s]                                                         50%|████▉     | 6358/12776 [1:07:05<1:08:29,  1.56it/s] 50%|████▉     | 6359/12776 [1:07:06<1:06:46,  1.60it/s]                                                         50%|████▉     | 6359/12776 [1:07:06<1:06:46,  1.60it/s] 50%|████▉     | 6360/12776 [1:07:06<1:01:28,  1.74it/s]                                                         50%|████▉     | 6360/12776 [1:07:06<1:01:28,  1.74it/s] 50%|████▉     | 6361/12776 [1:07:06<57:02,  1.87it/s]                                                         50%|████▉     | 6361/12776 [1:07:06<57:02,  1.87it/s] 50%|████▉     | 6362/12776 [1:07:07<55:03,  1.94it/s]                                                       50%|████▉     | 6362/12776 [1:07:07<55:03,  1.94it/s] 50%|████▉     | 6363/12776 [1:07:07<51:08,  2.09it/s]                                                       50%|████▉     | 6363/12776 [1:07:07<51:08,  2.09it/s] 50%|████▉     | 6364/12776 [1:07:08<48:02,  2.22it/s]                                                       50%|████▉     | 6364/12776 [1:07:08<48:02,  2.22it/s] 50%|████▉     | 6365/12776 [1:07:08<45:16,  2.36it/s]                                                       50%|████▉     | 6365/12776 [1:07:08<45:16,  2.36it/s] 50%|████▉     | 6366/12776 [1:07:08<42:50,  2.49it/s]                                                       50%|████▉     | 6366/12776 [1:07:08<42:50,  2.49it/s] 50%|████▉     | 6367/12776 [1:07:09<40:43,  2.62it/s]                                                       50%|████▉     | 6367/12776 [1:07:09<40:43,  2.62it/s] 50%|████▉     | 6368/12776 [1:07:09<41:30,  2.57it/s]                                                       50%|████▉     | 6368/12776 [1:07:09<41:30,  2.57it/s] 50%|████▉     | 6369/12776 [1:07:09<39:19,  2.71it/s]                                                       50%|████▉     | 6369/12776 [1:07:09<39:19,  2.71it/s] 50%|████▉     | 6370/12776 [1:07:10<37:17,  2.86it/s]                                                       50%|████▉     | 6370/12776 [1:07:10<37:17,  2.86it/s] 50%|████▉     | 6371/12776 [1:07:10<37:08,  2.87it/s]                                                       50%|████▉     | 6371/12776 [1:07:10<37:08,  2.87it/s] 50%|████▉     | 6372/12776 [1:07:10<34:51,  3.06it/s]                                                       50%|████▉     | 6372/12776 [1:07:10<34:51,  3.06it/s] 50%|████▉     | 6373/12776 [1:07:11<32:55,  3.24it/s]                                                       50%|████▉     | 6373/12776 [1:07:11<32:55,  3.24it/s] 50%|████▉     | 6374/12776 [1:07:11<31:17,  3.41it/s]                                                       50%|████▉     | 6374/12776 [1:07:11<31:17,  3.41it/s] 50%|████▉     | 6375/12776 [1:07:11<33:44,  3.16it/s]                                                       50%|████▉     | 6375/12776 [1:07:11<33:44,  3.16it/s] 50%|████▉     | 6376/12776 [1:07:12<31:27,  3.39it/s]                                                       50%|████▉     | 6376/12776 [1:07:12<31:27,  3.39it/s] 50%|████▉     | 6377/12776 [1:07:12<29:32,  3.61it/s]                                                       50%|████▉     | 6377/12776 [1:07:12<29:32,  3.61it/s] 50%|████▉     | 6378/12776 [1:07:12<27:50,  3.83it/s]                                                       50%|████▉     | 6378/12776 [1:07:12<27:50,  3.83it/s] 50%|████▉     | 6379/12776 [1:07:12<26:21,  4.05it/s]                                                       50%|████▉     | 6379/12776 [1:07:12<26:21,  4.05it/s] 50%|████▉     | 6380/12776 [1:07:13<28:38,  3.72it/s]                                                       50%|████▉     | 6380/12776 [1:07:13<28:38,  3.72it/s] 50%|████▉     | 6381/12776 [1:07:13<26:47,  3.98it/s]                                                       50%|████▉     | 6381/12776 [1:07:13<26:47,  3.98it/s] 50%|████▉     | 6382/12776 [1:07:13<25:17,  4.21it/s]                                                       50%|████▉     | 6382/12776 [1:07:13<25:17,  4.21it/s] 50%|████▉     | 6383/12776 [1:07:13<24:06,  4.42it/s]                                                       50%|████▉     | 6383/12776 [1:07:13<24:06,  4.42it/s] 50%|████▉     | 6384/12776 [1:07:13<23:12,  4.59it/s]                                                       50%|████▉     | 6384/12776 [1:07:13<23:12,  4.59it/s] 50%|████▉     | 6385/12776 [1:07:14<26:27,  4.03it/s]                                                       50%|████▉     | 6385/12776 [1:07:14<26:27,  4.03it/s] 50%|████▉     | 6386/12776 [1:07:14<24:36,  4.33it/s]                                                      {'loss': 0.3501, 'grad_norm': 0.9753531813621521, 'learning_rate': 0.00015833333333333332, 'epoch': 0.99}
+{'loss': 0.3439, 'grad_norm': 0.6613764762878418, 'learning_rate': 0.0001583088954056696, 'epoch': 0.99}
+{'loss': 0.3267, 'grad_norm': 0.5925769209861755, 'learning_rate': 0.00015828445747800585, 'epoch': 0.99}
+{'loss': 0.2879, 'grad_norm': 1.1225662231445312, 'learning_rate': 0.00015826001955034213, 'epoch': 0.99}
+{'loss': 0.517, 'grad_norm': 1.3221330642700195, 'learning_rate': 0.0001582355816226784, 'epoch': 0.99}
+{'loss': 0.4133, 'grad_norm': 0.802402675151825, 'learning_rate': 0.00015821114369501463, 'epoch': 0.99}
+{'loss': 0.54, 'grad_norm': 1.222743272781372, 'learning_rate': 0.0001581867057673509, 'epoch': 0.99}
+{'loss': 0.4293, 'grad_norm': 0.8106583952903748, 'learning_rate': 0.0001581622678396872, 'epoch': 0.99}
+{'loss': 0.5579, 'grad_norm': 1.7479103803634644, 'learning_rate': 0.00015813782991202344, 'epoch': 0.99}
+{'loss': 0.2549, 'grad_norm': 0.8788287043571472, 'learning_rate': 0.00015811339198435972, 'epoch': 0.99}
+{'loss': 1.0898, 'grad_norm': 4.599166393280029, 'learning_rate': 0.000158088954056696, 'epoch': 0.99}
+{'loss': 0.5713, 'grad_norm': 1.6982215642929077, 'learning_rate': 0.00015806451612903225, 'epoch': 0.99}
+{'loss': 0.3639, 'grad_norm': 1.5406105518341064, 'learning_rate': 0.0001580400782013685, 'epoch': 0.99}
+{'loss': 0.4535, 'grad_norm': 0.9149010181427002, 'learning_rate': 0.00015801564027370478, 'epoch': 0.99}
+{'loss': 1.0454, 'grad_norm': 2.5321741104125977, 'learning_rate': 0.00015799120234604103, 'epoch': 0.99}
+{'loss': 0.6215, 'grad_norm': 1.6757642030715942, 'learning_rate': 0.0001579667644183773, 'epoch': 0.99}
+{'loss': 0.5722, 'grad_norm': 4.210154056549072, 'learning_rate': 0.0001579423264907136, 'epoch': 0.99}
+{'loss': 0.558, 'grad_norm': 2.43436598777771, 'learning_rate': 0.00015791788856304984, 'epoch': 0.99}
+{'loss': 0.4534, 'grad_norm': 1.8142644166946411, 'learning_rate': 0.00015789345063538612, 'epoch': 0.99}
+{'loss': 0.4666, 'grad_norm': 1.3764431476593018, 'learning_rate': 0.0001578690127077224, 'epoch': 0.99}
+{'loss': 0.8198, 'grad_norm': 1.3100895881652832, 'learning_rate': 0.00015784457478005862, 'epoch': 0.99}
+{'loss': 0.7068, 'grad_norm': 1.911555528640747, 'learning_rate': 0.0001578201368523949, 'epoch': 0.99}
+{'loss': 0.8239, 'grad_norm': 2.208777666091919, 'learning_rate': 0.00015779569892473118, 'epoch': 0.99}
+{'loss': 0.3357, 'grad_norm': 2.607349157333374, 'learning_rate': 0.00015777126099706743, 'epoch': 0.99}
+{'loss': 0.6867, 'grad_norm': 2.01904559135437, 'learning_rate': 0.0001577468230694037, 'epoch': 0.99}
+{'loss': 1.0985, 'grad_norm': 3.4520325660705566, 'learning_rate': 0.00015772238514173998, 'epoch': 0.99}
+{'loss': 0.4234, 'grad_norm': 1.0999062061309814, 'learning_rate': 0.00015769794721407624, 'epoch': 0.99}
+{'loss': 0.865, 'grad_norm': 6.374632358551025, 'learning_rate': 0.00015767350928641251, 'epoch': 0.99}
+{'loss': 0.8057, 'grad_norm': 2.246000051498413, 'learning_rate': 0.0001576490713587488, 'epoch': 0.99}
+{'loss': 0.8103, 'grad_norm': 2.388260841369629, 'learning_rate': 0.00015762463343108502, 'epoch': 0.99}
+{'loss': 1.3118, 'grad_norm': 2.392380952835083, 'learning_rate': 0.0001576001955034213, 'epoch': 0.99}
+{'loss': 1.4997, 'grad_norm': 2.8762900829315186, 'learning_rate': 0.00015757575757575757, 'epoch': 0.99}
+{'loss': 1.0189, 'grad_norm': 2.6236648559570312, 'learning_rate': 0.00015755131964809382, 'epoch': 0.99}
+{'loss': 1.0106, 'grad_norm': 2.0896835327148438, 'learning_rate': 0.0001575268817204301, 'epoch': 0.99}
+{'loss': 0.7675, 'grad_norm': 1.5402086973190308, 'learning_rate': 0.00015750244379276638, 'epoch': 0.99}
+{'loss': 1.0019, 'grad_norm': 5.9841179847717285, 'learning_rate': 0.0001574780058651026, 'epoch': 0.99}
+{'loss': 1.1758, 'grad_norm': 2.565718412399292, 'learning_rate': 0.00015745356793743888, 'epoch': 0.99}
+{'loss': 1.1052, 'grad_norm': 2.391500473022461, 'learning_rate': 0.00015742913000977516, 'epoch': 0.99}
+{'loss': 0.9472, 'grad_norm': 1.9955416917800903, 'learning_rate': 0.0001574046920821114, 'epoch': 0.99}
+{'loss': 0.6311, 'grad_norm': 2.492854356765747, 'learning_rate': 0.0001573802541544477, 'epoch': 0.99}
+{'loss': 0.3773, 'grad_norm': 1.4435681104660034, 'learning_rate': 0.00015735581622678397, 'epoch': 0.99}
+{'loss': 0.9125, 'grad_norm': 3.8596134185791016, 'learning_rate': 0.00015733137829912022, 'epoch': 0.99}
+{'loss': 1.5958, 'grad_norm': 2.726219415664673, 'learning_rate': 0.0001573069403714565, 'epoch': 0.99}
+{'loss': 0.3087, 'grad_norm': 0.686896562576294, 'learning_rate': 0.00015728250244379278, 'epoch': 0.99}
+{'loss': 0.3721, 'grad_norm': 0.6529414057731628, 'learning_rate': 0.000157258064516129, 'epoch': 0.99}
+{'loss': 0.3123, 'grad_norm': 0.6401044726371765, 'learning_rate': 0.00015723362658846528, 'epoch': 0.99}
+{'loss': 0.2896, 'grad_norm': 0.6423803567886353, 'learning_rate': 0.00015720918866080156, 'epoch': 0.99}
+{'loss': 0.2859, 'grad_norm': 0.576648473739624, 'learning_rate': 0.0001571847507331378, 'epoch': 0.99}
+{'loss': 0.3422, 'grad_norm': 0.7672856450080872, 'learning_rate': 0.0001571603128054741, 'epoch': 0.99}
+{'loss': 0.294, 'grad_norm': 1.0555219650268555, 'learning_rate': 0.00015713587487781037, 'epoch': 1.0}
+{'loss': 0.3567, 'grad_norm': 0.8917555212974548, 'learning_rate': 0.00015711143695014662, 'epoch': 1.0}
+{'loss': 0.3465, 'grad_norm': 0.863348126411438, 'learning_rate': 0.0001570869990224829, 'epoch': 1.0}
+{'loss': 0.4107, 'grad_norm': 1.0300931930541992, 'learning_rate': 0.00015706256109481917, 'epoch': 1.0}
+{'loss': 0.5715, 'grad_norm': 2.140244483947754, 'learning_rate': 0.0001570381231671554, 'epoch': 1.0}
+{'loss': 0.6483, 'grad_norm': 1.4137368202209473, 'learning_rate': 0.00015701368523949168, 'epoch': 1.0}
+{'loss': 0.4882, 'grad_norm': 1.0269410610198975, 'learning_rate': 0.00015698924731182796, 'epoch': 1.0}
+{'loss': 0.4823, 'grad_norm': 2.0102436542510986, 'learning_rate': 0.0001569648093841642, 'epoch': 1.0}
+{'loss': 0.5051, 'grad_norm': 2.118197441101074, 'learning_rate': 0.00015694037145650049, 'epoch': 1.0}
+{'loss': 0.2953, 'grad_norm': 1.0090569257736206, 'learning_rate': 0.00015691593352883676, 'epoch': 1.0}
+{'loss': 0.6598, 'grad_norm': 2.38740873336792, 'learning_rate': 0.000156891495601173, 'epoch': 1.0}
+{'loss': 0.5072, 'grad_norm': 1.2782179117202759, 'learning_rate': 0.00015686705767350927, 'epoch': 1.0}
+{'loss': 0.5702, 'grad_norm': 1.9059410095214844, 'learning_rate': 0.00015684261974584554, 'epoch': 1.0}
+{'loss': 0.6943, 'grad_norm': 2.062696933746338, 'learning_rate': 0.0001568181818181818, 'epoch': 1.0}
+{'loss': 0.5178, 'grad_norm': 1.2728642225265503, 'learning_rate': 0.00015679374389051807, 'epoch': 1.0}
+{'loss': 1.1807, 'grad_norm': 2.6865758895874023, 'learning_rate': 0.00015676930596285435, 'epoch': 1.0}
+{'loss': 1.0252, 'grad_norm': 2.504348039627075, 'learning_rate': 0.0001567448680351906, 'epoch': 1.0}
+{'loss': 1.1543, 'grad_norm': 3.697204351425171, 'learning_rate': 0.00015672043010752688, 'epoch': 1.0}
+{'loss': 0.8737, 'grad_norm': 1.612819790840149, 'learning_rate': 0.00015669599217986316, 'epoch': 1.0}
+{'loss': 0.8725, 'grad_norm': 3.1941633224487305, 'learning_rate': 0.00015667155425219938, 'epoch': 1.0}
+{'loss': 1.1326, 'grad_norm': 2.2976832389831543, 'learning_rate': 0.00015664711632453566, 'epoch': 1.0}
+{'loss': 0.5624, 'grad_norm': 3.410414457321167, 'learning_rate': 0.00015662267839687194, 'epoch': 1.0}
+{'loss': 0.9388, 'grad_norm': 3.318864107131958, 'learning_rate': 0.0001565982404692082, 'epoch': 1.0}
+{'loss': 0.9887, 'grad_norm': 1.7484426498413086, 'learning_rate': 0.00015657380254154447, 'epoch': 1.0}
+{'loss': 1.4972, 'grad_norm': 3.5230112075805664, 'learning_rate': 0.00015654936461388075, 'epoch': 1.0}
+{'loss': 1.1754, 'grad_norm': 5.785534381866455, 'learning_rate': 0.000156524926686217, 'epoch': 1.0}
+{'loss': 1.2539, 'grad_norm': 2.6626009941101074, 'learning_rate': 0.00015650048875855328, 'epoch': 1.0}
+{'loss': 1.0544, 'grad_norm': 1.2295653820037842, 'learning_rate': 0.00015647605083088953, 'epoch': 1.0}
+{'loss': 0.5378, 'grad_norm': 1.510327935218811, 'learning_rate': 0.00015645161290322578, 'epoch': 1.0}
+ 50%|████▉     | 6386/12776 [1:07:14<24:36,  4.33it/s] 50%|████▉     | 6387/12776 [1:07:14<22:59,  4.63it/s]                                                       50%|████▉     | 6387/12776 [1:07:14<22:59,  4.63it/s] 50%|█████     | 6388/12776 [1:07:14<21:43,  4.90it/s]                                                       50%|█████     | 6388/12776 [1:07:14<21:43,  4.90it/s] 50%|█████     | 6389/12776 [1:07:16<1:22:39,  1.29it/s]                                                         50%|█████     | 6389/12776 [1:07:16<1:22:39,  1.29it/s] 50%|█████     | 6390/12776 [1:07:17<1:34:14,  1.13it/s]                                                         50%|█████     | 6390/12776 [1:07:17<1:34:14,  1.13it/s] 50%|█████     | 6391/12776 [1:07:18<1:34:00,  1.13it/s]                                                         50%|█████     | 6391/12776 [1:07:18<1:34:00,  1.13it/s] 50%|█████     | 6392/12776 [1:07:19<1:32:22,  1.15it/s]                                                         50%|█████     | 6392/12776 [1:07:19<1:32:22,  1.15it/s] 50%|█████     | 6393/12776 [1:07:20<1:28:45,  1.20it/s]                                                         50%|█████     | 6393/12776 [1:07:20<1:28:45,  1.20it/s] 50%|█████     | 6394/12776 [1:07:21<1:24:39,  1.26it/s]                                                         50%|█████     | 6394/12776 [1:07:21<1:24:39,  1.26it/s] 50%|█████     | 6395/12776 [1:07:21<1:20:17,  1.32it/s]                                                         50%|█████     | 6395/12776 [1:07:21<1:20:17,  1.32it/s] 50%|█████     | 6396/12776 [1:07:22<1:18:16,  1.36it/s]                                                         50%|█████     | 6396/12776 [1:07:22<1:18:16,  1.36it/s] 50%|█████     | 6397/12776 [1:07:23<1:13:41,  1.44it/s]                                                         50%|█████     | 6397/12776 [1:07:23<1:13:41,  1.44it/s] 50%|█████     | 6398/12776 [1:07:23<1:10:27,  1.51it/s]                                                         50%|█████     | 6398/12776 [1:07:23<1:10:27,  1.51it/s] 50%|█████     | 6399/12776 [1:07:24<1:06:22,  1.60it/s]                                                         50%|█████     | 6399/12776 [1:07:24<1:06:22,  1.60it/s] 50%|█████     | 6400/12776 [1:07:24<1:06:02,  1.61it/s]                                                         50%|█████     | 6400/12776 [1:07:24<1:06:02,  1.61it/s]Saving model checkpoint to ./checkpoint-6400
+Configuration saved in ./checkpoint-6400/config.json
+Model weights saved in ./checkpoint-6400/model.safetensors
+Feature extractor saved in ./checkpoint-6400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-6400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-6400/special_tokens_map.json
+added tokens file saved in ./checkpoint-6400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-5200] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 50%|█████     | 6401/12776 [1:07:30<3:49:58,  2.16s/it]                                                         50%|█████     | 6401/12776 [1:07:30<3:49:58,  2.16s/it] 50%|█████     | 6402/12776 [1:07:31<2:54:42,  1.64s/it]                                                         50%|█████     | 6402/12776 [1:07:31<2:54:42,  1.64s/it] 50%|█████     | 6403/12776 [1:07:31<2:18:31,  1.30s/it]                                                         50%|█████     | 6403/12776 [1:07:31<2:18:31,  1.30s/it] 50%|█████     | 6404/12776 [1:07:31<1:49:33,  1.03s/it]                                                         50%|█████     | 6404/12776 [1:07:31<1:49:33,  1.03s/it] 50%|█████     | 6405/12776 [1:07:32<1:33:16,  1.14it/s]                                                         50%|█████     | 6405/12776 [1:07:32<1:33:16,  1.14it/s] 50%|█████     | 6406/12776 [1:07:32<1:17:19,  1.37it/s]                                                         50%|█████     | 6406/12776 [1:07:32<1:17:19,  1.37it/s] 50%|█████     | 6407/12776 [1:07:33<1:05:37,  1.62it/s]                                                         50%|█████     | 6407/12776 [1:07:33<1:05:37,  1.62it/s] 50%|█████     | 6408/12776 [1:07:33<58:51,  1.80it/s]                                                         50%|█████     | 6408/12776 [1:07:33<58:51,  1.80it/s] 50%|█████     | 6409/12776 [1:07:33<51:32,  2.06it/s]                                                       50%|█████     | 6409/12776 [1:07:33<51:32,  2.06it/s] 50%|█████     | 6410/12776 [1:07:34<46:06,  2.30it/s]                                                       50%|█████     | 6410/12776 [1:07:34<46:06,  2.30it/s] 50%|█████     | 6411/12776 [1:07:34<42:00,  2.53it/s]                                                       50%|█████     | 6411/12776 [1:07:34<42:00,  2.53it/s] 50%|█████     | 6412/12776 [1:07:34<41:52,  2.53it/s]                                                       50%|█████     | 6412/12776 [1:07:34<41:52,  2.53it/s] 50%|█████     | 6413/12776 [1:07:35<38:08,  2.78it/s]                                                       50%|█████     | 6413/12776 [1:07:35<38:08,  2.78it/s] 50%|█████     | 6414/12776 [1:07:35<35:21,  3.00it/s]                                                       50%|█████     | 6414/12776 [1:07:35<35:21,  3.00it/s] 50%|█████     | 6415/12776 [1:07:35<37:10,  2.85it/s]                                                       50%|█████     | 6415/12776 [1:07:35<37:10,  2.85it/s] 50%|█████     | 6416/12776 [1:07:36<34:15,  3.09it/s]                                                       50%|█████     | 6416/12776 [1:07:36<34:15,  3.09it/s] 50%|█████     | 6417/12776 [1:07:36<31:58,  3.31it/s]                                                       50%|█████     | 6417/12776 [1:07:36<31:58,  3.31it/s] 50%|█████     | 6418/12776 [1:07:36<30:13,  3.51it/s]                                                       50%|█████     | 6418/12776 [1:07:36<30:13,  3.51it/s] 50%|█████     | 6419/12776 [1:07:36<28:44,  3.69it/s]                                                       50%|█████     | 6419/12776 [1:07:36<28:44,  3.69it/s] 50%|█████     | 6420/12776 [1:07:37<29:09,  3.63it/s]                                                       50%|█████     | 6420/12776 [1:07:37<29:09,  3.63it/s] 50%|█████     | 6421/12776 [1:07:37<27:27,  3.86it/s]                                                       50%|█████     | 6421/12776 [1:07:37<27:27,  3.86it/s] 50%|█████     | 6422/12776 [1:07:37<26:11,  4.04it/s]                                                       50%|█████     | 6422/12776 [1:07:37<26:11,  4.04it/s] 50%|█████     | 6423/12776 [1:07:37<25:02,  4.23it/s]                                                       50%|█████     | 6423/12776 [1:07:37<25:02,  4.23it/s] 50%|█████     | 6424/12776 [1:07:38<28:08,  3.76it/s]                                                       50%|█████     | 6424/12776 [1:07:38<28:08,  3.76it/s] 50%|█████     | 6425/12776 [1:07:38<26:07,  4.05it/s]                                                       50%|█████     | 6425/12776 [1:07:38<26:07,  4.05it/s] 50%|█████     | 6426/12776 [1:07:38<24:29,  4.32it/s]                                                       50%|█████     | 6426/12776 [1:07:38<24:29,  4.32it/s] 50%|█████     | 6427/12776 [1:07:38<23:11,  4.56it/s]                                                       50%|█████     | 6427/12776 [1:07:38<23:11,  4.56it/s] 50%|█████     | 6428/12776 [1:07:38<22:19,  4.74it/s]                                                       50%|█████     | 6428/12776 [1:07:38<22:19,  4.74it/s] 50%|█████     | 6429/12776 [1:07:39<21:32,  4.91it/s]                                                       50%|█████     | 6429/12776 [1:07:39<21:32,  4.91it/s] 50%|█████     | 6430/12776 [1:07:39<23:55,  4.42it/s]                                                       50%|█████     | 6430/12776 [1:07:39<23:55,  4.42it/s] 50%|█████     | 6431/12776 [1:07:39<22:25,  4.72it/s]                                                       50%|█████     | 6431/12776 [1:07:39<22:25,  4.72it/s] 50%|█████     | 6432/12776 [1:07:39<21:20,  4.95it/s]                                                       50%|█████     | 6432/12776 [1:07:39<21:20,  4.95it/s] 50%|█████     | 6433/12776 [1:07:39<20:33,  5.14it/s]                                                       50%|█████     | 6433/12776 [1:07:39<20:33,  5.14it/s] 50%|█████     | 6434/12776 [1:07:40<19:53,  5.31it/s]                                                       50%|█████     | 6434/12776 [1:07:40<19:53,  5.31it/s] 50%|█████     | 6435/12776 [1:07:40<19:31,  5.41it/s]                                                       50%|█████     | 6435/12776 [1:07:40<19:31,  5.41it/s] 50%|█████     | 6436/12776 [1:07:40<21:54,  4.82it/s]                                                       50%|█████     | 6436/12776 [1:07:40<21:54,  4.82it/s] 50%|█████     | 6437/12776 [1:07:40<20:32,  5.14it/s]                                                       50%|█████     | 6437/12776 [1:07:40<20:32,  5.14it/s] 50%|█████     | 6438/12776 [1:07:41<37:25,  2.82it/s]                                                       50%|█████     | 6438/12776 [1:07:41<37:25,  2.82it/s] 50%|█████     | 6439/12776 [1:07:42<1:14:15,  1.42it/s]                                                         50%|█████     | 6439/12776 [1:07:42<1:14:15,  1.42it/s] 50%|█████     | 6440/12776 [1:07:43<1:23:24,  1.27it/s]                                                         50%|█████     | 6440/12776 [1:07:43<1:23:24,  1.27it/s] 50%|█████     | 6441/12776 [1:07:44<1:26:11,  1.23it/s]                                                         50%|█████     | 6441/12776 [1:07:44<1:26:11,  1.23it/s] 50%|█████     | 6442/12776 [1:07:45<1:24:40,  1.25it/s]                                                         50%|█████     | 6442/12776 [1:07:45<1:24:40,  1.25it/s] 50%|█████     | 6443/12776 [1:07:46<1:26:12,  1.22it/s]                                                         50%|█████     | 6443/12776 [1:07:46<1:26:12,  1.22it/s] 50%|█████     | 6444/12776 [1:07:47<1:21:28,  1.30it/s]                                                         50%|█████     | 6444/12776 [1:07:47<1:21:28,  1.30it/s] 50%|█████     | 6445/12776 [1:07:47<1:16:37,  1.38it/s]                                                         50%|█████     | 6445/12776 [1:07:47<1:16:37,  1.38it/s] 50%|█████     | 6446/12776 [1:07:48<1:18:39,  1.34it/s]                                                         50%|█████     | 6446/12776 [1:07:48<1:18:39,  1.34it/s] 50%|█████     | 6447/12776 [1:07:49<1:12:36,  1.45it/s]                                                         50%|█████     | 6447/12776 [1:07:49<1:12:36,  1.45it/s] 50%|█████     | 6448/12776 [1:07:49<1:09:20,  1.52it/s]                                                         50%|█████     | 6448/12776 [1:07:49<1:09:20,  1.52it/s] 50%|█████     | 6449/12776 [1:07:50<1:04:14,  1.64it/s]                                                         50%|█████     | 6449/12776 [1:07:50<1:04:14,  1.64it/s] 50%|█████     | 6450/12776 [1:07:50<59:42,  1.77it/s]                                                         50%|█████     | 6450/12776 [1:07:50<59:42,  1.77it/s] 50%|█████     | 6451/12776 [1:07:51<56:35,  1.86it/s]                                                       50%|█████     | 6451/12776 [1:07:51<56:35,  1.86it/s] 51%|█████     | 6452/12776 [1:07:51<52:44,  2.00it/s]                                                       51%|█████     | 6452/12776 [1:07:51<52:44,  2.00it/s] 51%|█████     | 6453/12776 [1:07:52<54:03,  1.95it/s]                                                       51%|█████     | 6453/12776 [1:07:52<54:03,  1.95it/s] 51%|█████     | 6454/12776 [1:07:52<49:53,  2.11it/s]                                                       51%|█████     | 6454/12776 [1:07:52<49:53,  2.11it/s] 51%|█████     | 6455/12776 [1:07:52<46:41,  2.26it/s]                                                       51%|█████     | 6455/12776 [1:07:52<46:41,  2.26it/s] 51%|█████     | 6456/12776 [1:07:53<45:38,  2.31it/s]                                                       51%|█████     | 6456/12776 [1:07:53<45:38,  2.31it/s] 51%|█████     | 6457/12776 [1:07:53<42:58,  2.45it/s]                                                       51%|█████     | 6457/12776 [1:07:53<42:58,  2.45it/s] 51%|█████     | 6458/12776 [1:07:53<40:37,  2.59it/s]                                                       51%|█████     | 6458/12776 [1:07:53<40:37,  2.59it/s] 51%|█████     | 6459/12776 [1:07:54<43:26,  2.42it/s]                                                       51%|█████     | 6459/12776 [1:07:54<43:26,  2.42it/s] 51%|█████     | 6460/12776 [1:07:54<40:09,  2.62it/s]                                                       51%|█████     | 6460/12776 [1:07:54<40:09,  2.62it/s] 51%|█████     | 6461/12776 [1:07:54<37:55,  2.78it/s]                                                       51%|█████     | 6461/12776 [1:07:54<37:55,  2.78it/s] 51%|█████     | 6462/12776 [1:07:55<35:45,  2.94it/s]                                                       51%|█████     | 6462/12776 [1:07:55<35:45,  2.94it/s] 51%|█████     | 6463/12776 [1:07:55<35:47,  2.94it/s]                                                       51%|█████     | 6463/12776 [1:07:55<35:47,  2.94it/s] 51%|█████     | 6464/12776 [1:07:55<33:37,  3.13it/s]                                                      {'loss': 0.8904, 'grad_norm': 1.7747565507888794, 'learning_rate': 0.00015642717497556206, 'epoch': 1.0}
+{'loss': 1.7167, 'grad_norm': 6.381943225860596, 'learning_rate': 0.00015640273704789834, 'epoch': 1.0}
+{'loss': 1.0733, 'grad_norm': 2.6135759353637695, 'learning_rate': 0.0001563782991202346, 'epoch': 1.0}
+{'loss': 0.3242, 'grad_norm': 0.6401867866516113, 'learning_rate': 0.00015635386119257087, 'epoch': 1.0}
+{'loss': 0.2188, 'grad_norm': 0.46278223395347595, 'learning_rate': 0.00015632942326490715, 'epoch': 1.0}
+{'loss': 0.2602, 'grad_norm': 0.5220637917518616, 'learning_rate': 0.00015630498533724337, 'epoch': 1.0}
+{'loss': 0.3363, 'grad_norm': 0.6111396551132202, 'learning_rate': 0.00015628054740957965, 'epoch': 1.0}
+{'loss': 0.2684, 'grad_norm': 0.6467208862304688, 'learning_rate': 0.00015625610948191593, 'epoch': 1.0}
+{'loss': 0.2703, 'grad_norm': 0.5766490697860718, 'learning_rate': 0.00015623167155425218, 'epoch': 1.0}
+{'loss': 0.3272, 'grad_norm': 0.6601141095161438, 'learning_rate': 0.00015620723362658846, 'epoch': 1.0}
+{'loss': 0.2942, 'grad_norm': 0.61875981092453, 'learning_rate': 0.0001561827956989247, 'epoch': 1.0}
+{'loss': 0.3894, 'grad_norm': 0.9664865732192993, 'learning_rate': 0.00015615835777126099, 'epoch': 1.0}
+{'loss': 0.1922, 'grad_norm': 0.5463569760322571, 'learning_rate': 0.00015613391984359726, 'epoch': 1.0}
+{'loss': 0.3852, 'grad_norm': 1.0034434795379639, 'learning_rate': 0.0001561094819159335, 'epoch': 1.0}
+{'loss': 0.3026, 'grad_norm': 0.9108190536499023, 'learning_rate': 0.00015608504398826977, 'epoch': 1.0}
+{'loss': 0.4889, 'grad_norm': 1.1288667917251587, 'learning_rate': 0.00015606060606060605, 'epoch': 1.0}
+{'loss': 0.5495, 'grad_norm': 0.8430051207542419, 'learning_rate': 0.0001560361681329423, 'epoch': 1.0}
+{'loss': 0.3109, 'grad_norm': 1.0893449783325195, 'learning_rate': 0.00015601173020527857, 'epoch': 1.0}
+{'loss': 0.5138, 'grad_norm': 1.9297972917556763, 'learning_rate': 0.00015598729227761485, 'epoch': 1.0}
+{'loss': 0.3637, 'grad_norm': 1.0670981407165527, 'learning_rate': 0.0001559628543499511, 'epoch': 1.0}
+{'loss': 0.4379, 'grad_norm': 1.307648777961731, 'learning_rate': 0.00015593841642228738, 'epoch': 1.0}
+{'loss': 0.4889, 'grad_norm': 1.2096155881881714, 'learning_rate': 0.00015591397849462366, 'epoch': 1.0}
+{'loss': 0.2895, 'grad_norm': 1.7205982208251953, 'learning_rate': 0.00015588954056695989, 'epoch': 1.0}
+{'loss': 0.4063, 'grad_norm': 1.0032813549041748, 'learning_rate': 0.00015586510263929616, 'epoch': 1.0}
+{'loss': 0.4581, 'grad_norm': 2.1352264881134033, 'learning_rate': 0.00015584066471163244, 'epoch': 1.0}
+{'loss': 0.4092, 'grad_norm': 1.522994041442871, 'learning_rate': 0.0001558162267839687, 'epoch': 1.0}
+{'loss': 0.3602, 'grad_norm': 1.0274169445037842, 'learning_rate': 0.00015579178885630497, 'epoch': 1.0}
+{'loss': 0.7179, 'grad_norm': 1.5904802083969116, 'learning_rate': 0.00015576735092864125, 'epoch': 1.0}
+{'loss': 0.5017, 'grad_norm': 1.4016789197921753, 'learning_rate': 0.00015574291300097747, 'epoch': 1.0}
+{'loss': 0.8605, 'grad_norm': 5.12330436706543, 'learning_rate': 0.00015571847507331375, 'epoch': 1.0}
+{'loss': 0.49, 'grad_norm': 2.253363609313965, 'learning_rate': 0.00015569403714565003, 'epoch': 1.0}
+{'loss': 0.6846, 'grad_norm': 8.99915599822998, 'learning_rate': 0.00015566959921798628, 'epoch': 1.0}
+{'loss': 0.7329, 'grad_norm': 2.108546018600464, 'learning_rate': 0.00015564516129032256, 'epoch': 1.0}
+{'loss': 0.7537, 'grad_norm': 2.5911080837249756, 'learning_rate': 0.00015562072336265884, 'epoch': 1.0}
+{'loss': 1.2488, 'grad_norm': 9.908422470092773, 'learning_rate': 0.0001555962854349951, 'epoch': 1.01}
+{'loss': 0.872, 'grad_norm': 2.6049554347991943, 'learning_rate': 0.00015557184750733137, 'epoch': 1.01}
+{'loss': 1.3471, 'grad_norm': 4.485533237457275, 'learning_rate': 0.00015554740957966765, 'epoch': 1.01}
+{'loss': 0.7273, 'grad_norm': 3.122943162918091, 'learning_rate': 0.00015552297165200387, 'epoch': 1.01}
+{'loss': 1.014, 'grad_norm': 2.615659475326538, 'learning_rate': 0.00015549853372434015, 'epoch': 1.01}
+{'loss': 0.3932, 'grad_norm': 1.3930341005325317, 'learning_rate': 0.00015547409579667643, 'epoch': 1.01}
+{'loss': 1.1018, 'grad_norm': 2.3153088092803955, 'learning_rate': 0.00015544965786901268, 'epoch': 1.01}
+{'loss': 1.1748, 'grad_norm': 4.314839839935303, 'learning_rate': 0.00015542521994134896, 'epoch': 1.01}
+{'loss': 1.2124, 'grad_norm': 3.0916011333465576, 'learning_rate': 0.00015540078201368524, 'epoch': 1.01}
+{'loss': 1.0817, 'grad_norm': 4.510741233825684, 'learning_rate': 0.0001553763440860215, 'epoch': 1.01}
+{'loss': 1.6207, 'grad_norm': 5.305693626403809, 'learning_rate': 0.00015535190615835777, 'epoch': 1.01}
+{'loss': 1.512, 'grad_norm': 2.222895383834839, 'learning_rate': 0.00015532746823069402, 'epoch': 1.01}
+{'loss': 1.712, 'grad_norm': 3.7936456203460693, 'learning_rate': 0.00015530303030303027, 'epoch': 1.01}
+{'loss': 1.0667, 'grad_norm': 1.699407696723938, 'learning_rate': 0.00015527859237536655, 'epoch': 1.01}
+{'loss': 1.1835, 'grad_norm': 2.3237552642822266, 'learning_rate': 0.00015525415444770282, 'epoch': 1.01}
+{'loss': 0.3155, 'grad_norm': 0.950505793094635, 'learning_rate': 0.00015522971652003908, 'epoch': 1.01}
+{'loss': 0.5834, 'grad_norm': 3.0012030601501465, 'learning_rate': 0.00015520527859237535, 'epoch': 1.01}
+{'loss': 0.8096, 'grad_norm': 1.9200234413146973, 'learning_rate': 0.00015518084066471163, 'epoch': 1.01}
+{'loss': 1.6, 'grad_norm': 3.209449291229248, 'learning_rate': 0.00015515640273704786, 'epoch': 1.01}
+{'loss': 0.2785, 'grad_norm': 0.4958963990211487, 'learning_rate': 0.00015513196480938413, 'epoch': 1.01}
+{'loss': 0.2305, 'grad_norm': 0.461821585893631, 'learning_rate': 0.0001551075268817204, 'epoch': 1.01}
+{'loss': 0.1765, 'grad_norm': 0.5686135292053223, 'learning_rate': 0.00015508308895405666, 'epoch': 1.01}
+{'loss': 0.1702, 'grad_norm': 0.5753898024559021, 'learning_rate': 0.00015505865102639294, 'epoch': 1.01}
+{'loss': 0.1378, 'grad_norm': 0.2971111536026001, 'learning_rate': 0.00015503421309872922, 'epoch': 1.01}
+{'loss': 0.2316, 'grad_norm': 0.6010518074035645, 'learning_rate': 0.00015500977517106547, 'epoch': 1.01}
+{'loss': 0.1997, 'grad_norm': 0.7052372097969055, 'learning_rate': 0.00015498533724340175, 'epoch': 1.01}
+{'loss': 0.2774, 'grad_norm': 0.728542685508728, 'learning_rate': 0.00015496089931573803, 'epoch': 1.01}
+{'loss': 0.2283, 'grad_norm': 0.6369313597679138, 'learning_rate': 0.00015493646138807425, 'epoch': 1.01}
+{'loss': 0.336, 'grad_norm': 0.7698948383331299, 'learning_rate': 0.00015491202346041053, 'epoch': 1.01}
+{'loss': 0.275, 'grad_norm': 0.579069197177887, 'learning_rate': 0.0001548875855327468, 'epoch': 1.01}
+{'loss': 0.2888, 'grad_norm': 0.8608638644218445, 'learning_rate': 0.00015486314760508306, 'epoch': 1.01}
+{'loss': 0.2552, 'grad_norm': 0.651512622833252, 'learning_rate': 0.00015483870967741934, 'epoch': 1.01}
+{'loss': 0.5846, 'grad_norm': 1.6202661991119385, 'learning_rate': 0.00015481427174975562, 'epoch': 1.01}
+{'loss': 0.29, 'grad_norm': 1.3742421865463257, 'learning_rate': 0.00015478983382209187, 'epoch': 1.01}
+{'loss': 0.4051, 'grad_norm': 1.502272605895996, 'learning_rate': 0.00015476539589442815, 'epoch': 1.01}
+{'loss': 0.4372, 'grad_norm': 0.8512781262397766, 'learning_rate': 0.0001547409579667644, 'epoch': 1.01}
+{'loss': 0.4414, 'grad_norm': 1.3594355583190918, 'learning_rate': 0.00015471652003910065, 'epoch': 1.01}
+{'loss': 0.5294, 'grad_norm': 1.8934544324874878, 'learning_rate': 0.00015469208211143693, 'epoch': 1.01}
+{'loss': 0.2968, 'grad_norm': 1.089806318283081, 'learning_rate': 0.0001546676441837732, 'epoch': 1.01}
+{'loss': 0.2608, 'grad_norm': 1.3797799348831177, 'learning_rate': 0.00015464320625610946, 'epoch': 1.01}
+{'loss': 0.4816, 'grad_norm': 1.3458391427993774, 'learning_rate': 0.00015461876832844574, 'epoch': 1.01}
+{'loss': 0.5628, 'grad_norm': 1.3416376113891602, 'learning_rate': 0.00015459433040078201, 'epoch': 1.01}
+{'loss': 0.4349, 'grad_norm': 1.862665057182312, 'learning_rate': 0.00015456989247311824, 'epoch': 1.01}
+{'loss': 0.4393, 'grad_norm': 1.254166603088379, 'learning_rate': 0.00015454545454545452, 'epoch': 1.01}
+ 51%|█████     | 6464/12776 [1:07:55<33:37,  3.13it/s] 51%|█████     | 6465/12776 [1:07:56<31:58,  3.29it/s]                                                       51%|█████     | 6465/12776 [1:07:56<31:58,  3.29it/s] 51%|█████     | 6466/12776 [1:07:56<30:35,  3.44it/s]                                                       51%|█████     | 6466/12776 [1:07:56<30:35,  3.44it/s] 51%|█████     | 6467/12776 [1:07:56<31:47,  3.31it/s]                                                       51%|█████     | 6467/12776 [1:07:56<31:47,  3.31it/s] 51%|█████     | 6468/12776 [1:07:56<29:56,  3.51it/s]                                                       51%|█████     | 6468/12776 [1:07:56<29:56,  3.51it/s] 51%|█████     | 6469/12776 [1:07:57<28:18,  3.71it/s]                                                       51%|█████     | 6469/12776 [1:07:57<28:18,  3.71it/s] 51%|█████     | 6470/12776 [1:07:57<26:59,  3.89it/s]                                                       51%|█████     | 6470/12776 [1:07:57<26:59,  3.89it/s] 51%|█████     | 6471/12776 [1:07:57<28:03,  3.74it/s]                                                       51%|█████     | 6471/12776 [1:07:57<28:03,  3.74it/s] 51%|█████     | 6472/12776 [1:07:57<26:32,  3.96it/s]                                                       51%|█████     | 6472/12776 [1:07:57<26:32,  3.96it/s] 51%|█████     | 6473/12776 [1:07:58<25:22,  4.14it/s]                                                       51%|█████     | 6473/12776 [1:07:58<25:22,  4.14it/s] 51%|█████     | 6474/12776 [1:07:58<24:20,  4.31it/s]                                                       51%|█████     | 6474/12776 [1:07:58<24:20,  4.31it/s] 51%|█████     | 6475/12776 [1:07:58<23:20,  4.50it/s]                                                       51%|█████     | 6475/12776 [1:07:58<23:20,  4.50it/s] 51%|█████     | 6476/12776 [1:07:58<24:38,  4.26it/s]                                                       51%|█████     | 6476/12776 [1:07:58<24:38,  4.26it/s] 51%|█████     | 6477/12776 [1:07:59<23:25,  4.48it/s]                                                       51%|█████     | 6477/12776 [1:07:59<23:25,  4.48it/s] 51%|█████     | 6478/12776 [1:07:59<22:20,  4.70it/s]                                                       51%|█████     | 6478/12776 [1:07:59<22:20,  4.70it/s] 51%|█████     | 6479/12776 [1:07:59<21:34,  4.87it/s]                                                       51%|█████     | 6479/12776 [1:07:59<21:34,  4.87it/s] 51%|█████     | 6480/12776 [1:07:59<21:33,  4.87it/s]                                                       51%|█████     | 6480/12776 [1:07:59<21:33,  4.87it/s] 51%|█████     | 6481/12776 [1:07:59<20:46,  5.05it/s]                                                       51%|█████     | 6481/12776 [1:07:59<20:46,  5.05it/s] 51%|█████     | 6482/12776 [1:08:00<22:47,  4.60it/s]                                                       51%|█████     | 6482/12776 [1:08:00<22:47,  4.60it/s] 51%|█████     | 6483/12776 [1:08:00<22:14,  4.71it/s]                                                       51%|█████     | 6483/12776 [1:08:00<22:14,  4.71it/s] 51%|█████     | 6484/12776 [1:08:00<21:42,  4.83it/s]                                                       51%|█████     | 6484/12776 [1:08:00<21:42,  4.83it/s] 51%|█████     | 6485/12776 [1:08:00<21:17,  4.92it/s]                                                       51%|█████     | 6485/12776 [1:08:00<21:17,  4.92it/s] 51%|█████     | 6486/12776 [1:08:00<20:45,  5.05it/s]                                                       51%|█████     | 6486/12776 [1:08:00<20:45,  5.05it/s] 51%|█████     | 6487/12776 [1:08:01<22:13,  4.72it/s]                                                       51%|█████     | 6487/12776 [1:08:01<22:13,  4.72it/s] 51%|█████     | 6488/12776 [1:08:01<41:12,  2.54it/s]                                                       51%|█████     | 6488/12776 [1:08:01<41:12,  2.54it/s] 51%|█████     | 6489/12776 [1:08:03<1:17:38,  1.35it/s]                                                         51%|█████     | 6489/12776 [1:08:03<1:17:38,  1.35it/s] 51%|█████     | 6490/12776 [1:08:04<1:30:53,  1.15it/s]                                                         51%|█████     | 6490/12776 [1:08:04<1:30:53,  1.15it/s] 51%|█████     | 6491/12776 [1:08:05<1:31:19,  1.15it/s]                                                         51%|█████     | 6491/12776 [1:08:05<1:31:19,  1.15it/s] 51%|█████     | 6492/12776 [1:08:06<1:29:20,  1.17it/s]                                                         51%|█████     | 6492/12776 [1:08:06<1:29:20,  1.17it/s] 51%|█████     | 6493/12776 [1:08:07<1:27:43,  1.19it/s]                                                         51%|█████     | 6493/12776 [1:08:07<1:27:43,  1.19it/s] 51%|█████     | 6494/12776 [1:08:07<1:27:54,  1.19it/s]                                                         51%|█████     | 6494/12776 [1:08:07<1:27:54,  1.19it/s] 51%|█████     | 6495/12776 [1:08:08<1:22:38,  1.27it/s]                                                         51%|█████     | 6495/12776 [1:08:08<1:22:38,  1.27it/s] 51%|█████     | 6496/12776 [1:08:09<1:21:26,  1.29it/s]                                                         51%|█████     | 6496/12776 [1:08:09<1:21:26,  1.29it/s] 51%|█████     | 6497/12776 [1:08:09<1:16:05,  1.38it/s]                                                         51%|█████     | 6497/12776 [1:08:09<1:16:05,  1.38it/s] 51%|█████     | 6498/12776 [1:08:10<1:11:52,  1.46it/s]                                                         51%|█████     | 6498/12776 [1:08:10<1:11:52,  1.46it/s] 51%|█████     | 6499/12776 [1:08:11<1:07:06,  1.56it/s]                                                         51%|█████     | 6499/12776 [1:08:11<1:07:06,  1.56it/s] 51%|█████     | 6500/12776 [1:08:11<1:05:01,  1.61it/s]                                                         51%|█████     | 6500/12776 [1:08:11<1:05:01,  1.61it/s] 51%|█████     | 6501/12776 [1:08:12<1:00:28,  1.73it/s]                                                         51%|█████     | 6501/12776 [1:08:12<1:00:28,  1.73it/s] 51%|█████     | 6502/12776 [1:08:12<59:56,  1.74it/s]                                                         51%|█████     | 6502/12776 [1:08:12<59:56,  1.74it/s] 51%|█████     | 6503/12776 [1:08:13<55:38,  1.88it/s]                                                       51%|█████     | 6503/12776 [1:08:13<55:38,  1.88it/s] 51%|█████     | 6504/12776 [1:08:13<55:00,  1.90it/s]                                                       51%|█████     | 6504/12776 [1:08:13<55:00,  1.90it/s] 51%|█████     | 6505/12776 [1:08:14<51:06,  2.05it/s]                                                       51%|█████     | 6505/12776 [1:08:14<51:06,  2.05it/s] 51%|█████     | 6506/12776 [1:08:14<47:54,  2.18it/s]                                                       51%|█████     | 6506/12776 [1:08:14<47:54,  2.18it/s] 51%|█████     | 6507/12776 [1:08:14<45:42,  2.29it/s]                                                       51%|█████     | 6507/12776 [1:08:14<45:42,  2.29it/s] 51%|█████     | 6508/12776 [1:08:15<43:09,  2.42it/s]                                                       51%|█████     | 6508/12776 [1:08:15<43:09,  2.42it/s] 51%|█████     | 6509/12776 [1:08:15<41:04,  2.54it/s]                                                       51%|█████     | 6509/12776 [1:08:15<41:04,  2.54it/s] 51%|█████     | 6510/12776 [1:08:15<42:37,  2.45it/s]                                                       51%|█████     | 6510/12776 [1:08:15<42:37,  2.45it/s] 51%|█████     | 6511/12776 [1:08:16<40:16,  2.59it/s]                                                       51%|█████     | 6511/12776 [1:08:16<40:16,  2.59it/s] 51%|█████     | 6512/12776 [1:08:16<38:26,  2.72it/s]                                                       51%|█████     | 6512/12776 [1:08:16<38:26,  2.72it/s] 51%|█████     | 6513/12776 [1:08:16<36:33,  2.86it/s]                                                       51%|█████     | 6513/12776 [1:08:16<36:33,  2.86it/s] 51%|█████     | 6514/12776 [1:08:17<35:49,  2.91it/s]                                                       51%|█████     | 6514/12776 [1:08:17<35:49,  2.91it/s] 51%|█████     | 6515/12776 [1:08:17<34:08,  3.06it/s]                                                       51%|█████     | 6515/12776 [1:08:17<34:08,  3.06it/s] 51%|█████     | 6516/12776 [1:08:17<32:46,  3.18it/s]                                                       51%|█████     | 6516/12776 [1:08:17<32:46,  3.18it/s] 51%|█████     | 6517/12776 [1:08:18<31:27,  3.32it/s]                                                       51%|█████     | 6517/12776 [1:08:18<31:27,  3.32it/s] 51%|█████     | 6518/12776 [1:08:18<30:31,  3.42it/s]                                                       51%|█████     | 6518/12776 [1:08:18<30:31,  3.42it/s] 51%|█████     | 6519/12776 [1:08:18<29:22,  3.55it/s]                                                       51%|█████     | 6519/12776 [1:08:18<29:22,  3.55it/s] 51%|█████     | 6520/12776 [1:08:18<28:27,  3.66it/s]                                                       51%|█████     | 6520/12776 [1:08:18<28:27,  3.66it/s] 51%|█████     | 6521/12776 [1:08:19<28:03,  3.72it/s]                                                       51%|█████     | 6521/12776 [1:08:19<28:03,  3.72it/s] 51%|█████     | 6522/12776 [1:08:19<30:04,  3.47it/s]                                                       51%|█████     | 6522/12776 [1:08:19<30:04,  3.47it/s] 51%|█████     | 6523/12776 [1:08:19<28:18,  3.68it/s]                                                       51%|█████     | 6523/12776 [1:08:19<28:18,  3.68it/s] 51%|█████     | 6524/12776 [1:08:19<26:57,  3.87it/s]                                                       51%|█████     | 6524/12776 [1:08:19<26:57,  3.87it/s] 51%|█████     | 6525/12776 [1:08:20<25:42,  4.05it/s]                                                       51%|█████     | 6525/12776 [1:08:20<25:42,  4.05it/s] 51%|█████     | 6526/12776 [1:08:20<28:20,  3.68it/s]                                                       51%|█████     | 6526/12776 [1:08:20<28:20,  3.68it/s] 51%|█████     | 6527/12776 [1:08:20<26:26,  3.94it/s]                                                       51%|█████     | 6527/12776 [1:08:20<26:26,  3.94it/s] 51%|█████     | 6528/12776 [1:08:20<25:05,  4.15it/s]                                                       51%|█████     | 6528/12776 [1:08:20<25:05,  4.15it/s] 51%|█████     | 6529/12776 [1:08:21<24:01,  4.33it/s]                                                       51%|█████     | 6529/12776 [1:08:21<24:01,  4.33it/s] 51%|█████     | 6530/12776 [1:08:21<23:09,  4.49it/s]                                                       51%|█████     | 6530/12776 [1:08:21<23:09,  4.49it/s] 51%|█████     | 6531/12776 [1:08:21<24:55,  4.18it/s]                                                       51%|█████     | 6531/12776 [1:08:21<24:55,  4.18it/s] 51%|█████     | 6532/12776 [1:08:21<23:37,  4.40it/s]                                                       51%|█████     | 6532/12776 [1:08:21<23:37,  4.40it/s] 51%|█████     | 6533/12776 [1:08:22<22:38,  4.59it/s]                                                       51%|█████     | 6533/12776 [1:08:22<22:38,  4.59it/s] 51%|█████     | 6534/12776 [1:08:22<21:53,  4.75it/s]                                                       51%|█████     | 6534/12776 [1:08:22<21:53,  4.75it/s] 51%|█████     | 6535/12776 [1:08:22<21:17,  4.88it/s]                                                       51%|█████     | 6535/12776 [1:08:22<21:17,  4.88it/s] 51%|█████     | 6536/12776 [1:08:22<20:43,  5.02it/s]                                                       51%|█████     | 6536/12776 [1:08:22<20:43,  5.02it/s] 51%|█████     | 6537/12776 [1:08:22<22:31,  4.62it/s]                                                       51%|█████     | 6537/12776 [1:08:22<22:31,  4.62it/s] 51%|█████     | 6538/12776 [1:08:23<39:39,  2.62it/s]                                                       51%|█████     | 6538/12776 [1:08:23<39:39,  2.62it/s] 51%|█████     | 6539/12776 [1:08:25<1:16:11,  1.36it/s]                                                         51%|█████     | 6539/12776 [1:08:25<1:16:11,  1.36it/s] 51%|█████     | 6540/12776 [1:08:26<1:25:01,  1.22it/s]                                                         51%|█████     | 6540/12776 [1:08:26<1:25:01,  1.22it/s] 51%|█████     | 6541/12776 [1:08:27<1:27:10,  1.19it/s]                                                        {'loss': 0.5046, 'grad_norm': 1.7121856212615967, 'learning_rate': 0.0001545210166177908, 'epoch': 1.01}
+{'loss': 0.5843, 'grad_norm': 1.5799607038497925, 'learning_rate': 0.00015449657869012705, 'epoch': 1.01}
+{'loss': 1.143, 'grad_norm': 2.453345775604248, 'learning_rate': 0.00015447214076246333, 'epoch': 1.01}
+{'loss': 0.6528, 'grad_norm': 2.5280091762542725, 'learning_rate': 0.0001544477028347996, 'epoch': 1.01}
+{'loss': 0.8534, 'grad_norm': 1.8263561725616455, 'learning_rate': 0.00015442326490713585, 'epoch': 1.01}
+{'loss': 0.7371, 'grad_norm': 1.4700437784194946, 'learning_rate': 0.00015439882697947213, 'epoch': 1.01}
+{'loss': 0.9834, 'grad_norm': 2.2002007961273193, 'learning_rate': 0.0001543743890518084, 'epoch': 1.01}
+{'loss': 0.7919, 'grad_norm': 2.333889961242676, 'learning_rate': 0.00015434995112414464, 'epoch': 1.01}
+{'loss': 0.8916, 'grad_norm': 5.423836708068848, 'learning_rate': 0.00015432551319648091, 'epoch': 1.01}
+{'loss': 0.7162, 'grad_norm': 1.438730001449585, 'learning_rate': 0.0001543010752688172, 'epoch': 1.01}
+{'loss': 1.0373, 'grad_norm': 1.9986294507980347, 'learning_rate': 0.00015427663734115344, 'epoch': 1.01}
+{'loss': 1.1194, 'grad_norm': 4.634669303894043, 'learning_rate': 0.00015425219941348972, 'epoch': 1.01}
+{'loss': 0.9643, 'grad_norm': 2.2568159103393555, 'learning_rate': 0.000154227761485826, 'epoch': 1.01}
+{'loss': 0.6398, 'grad_norm': 1.3911081552505493, 'learning_rate': 0.00015420332355816225, 'epoch': 1.01}
+{'loss': 1.4445, 'grad_norm': 2.0544090270996094, 'learning_rate': 0.00015417888563049853, 'epoch': 1.01}
+{'loss': 0.7207, 'grad_norm': 1.972231388092041, 'learning_rate': 0.00015415444770283478, 'epoch': 1.01}
+{'loss': 1.3453, 'grad_norm': 2.4303290843963623, 'learning_rate': 0.00015413000977517103, 'epoch': 1.01}
+{'loss': 1.5838, 'grad_norm': 5.570343017578125, 'learning_rate': 0.0001541055718475073, 'epoch': 1.01}
+{'loss': 1.1335, 'grad_norm': 1.2938313484191895, 'learning_rate': 0.0001540811339198436, 'epoch': 1.01}
+{'loss': 0.6899, 'grad_norm': 2.1745591163635254, 'learning_rate': 0.00015405669599217984, 'epoch': 1.01}
+{'loss': 0.8181, 'grad_norm': 3.9206926822662354, 'learning_rate': 0.00015403225806451612, 'epoch': 1.02}
+{'loss': 0.2675, 'grad_norm': 3.452662706375122, 'learning_rate': 0.0001540078201368524, 'epoch': 1.02}
+{'loss': 0.7693, 'grad_norm': 1.481506109237671, 'learning_rate': 0.00015398338220918862, 'epoch': 1.02}
+{'loss': 0.4043, 'grad_norm': 3.136730670928955, 'learning_rate': 0.0001539589442815249, 'epoch': 1.02}
+{'loss': 1.0601, 'grad_norm': 4.050405502319336, 'learning_rate': 0.00015393450635386118, 'epoch': 1.02}
+{'loss': 0.312, 'grad_norm': 0.4689655601978302, 'learning_rate': 0.00015391006842619743, 'epoch': 1.02}
+{'loss': 0.1755, 'grad_norm': 0.32348453998565674, 'learning_rate': 0.0001538856304985337, 'epoch': 1.02}
+{'loss': 0.2209, 'grad_norm': 0.456193745136261, 'learning_rate': 0.00015386119257086999, 'epoch': 1.02}
+{'loss': 0.3865, 'grad_norm': 0.6613138914108276, 'learning_rate': 0.00015383675464320624, 'epoch': 1.02}
+{'loss': 0.2318, 'grad_norm': 0.45760709047317505, 'learning_rate': 0.00015381231671554252, 'epoch': 1.02}
+{'loss': 0.2009, 'grad_norm': 0.4187608063220978, 'learning_rate': 0.0001537878787878788, 'epoch': 1.02}
+{'loss': 0.3943, 'grad_norm': 0.9235534071922302, 'learning_rate': 0.00015376344086021502, 'epoch': 1.02}
+{'loss': 0.3704, 'grad_norm': 0.9163877367973328, 'learning_rate': 0.0001537390029325513, 'epoch': 1.02}
+{'loss': 0.2252, 'grad_norm': 0.5193113684654236, 'learning_rate': 0.00015371456500488757, 'epoch': 1.02}
+{'loss': 0.3135, 'grad_norm': 0.8574435710906982, 'learning_rate': 0.00015369012707722383, 'epoch': 1.02}
+{'loss': 0.3573, 'grad_norm': 1.2782654762268066, 'learning_rate': 0.0001536656891495601, 'epoch': 1.02}
+{'loss': 0.9878, 'grad_norm': 4.555233955383301, 'learning_rate': 0.00015364125122189638, 'epoch': 1.02}
+{'loss': 0.212, 'grad_norm': 0.6440590620040894, 'learning_rate': 0.00015361681329423263, 'epoch': 1.02}
+{'loss': 0.5993, 'grad_norm': 1.6863186359405518, 'learning_rate': 0.00015359237536656889, 'epoch': 1.02}
+{'loss': 0.3741, 'grad_norm': 1.924282431602478, 'learning_rate': 0.00015356793743890516, 'epoch': 1.02}
+{'loss': 0.4833, 'grad_norm': 0.9348188638687134, 'learning_rate': 0.00015354349951124141, 'epoch': 1.02}
+{'loss': 0.3112, 'grad_norm': 1.7575799226760864, 'learning_rate': 0.0001535190615835777, 'epoch': 1.02}
+{'loss': 0.4143, 'grad_norm': 1.014967679977417, 'learning_rate': 0.00015349462365591397, 'epoch': 1.02}
+{'loss': 0.4754, 'grad_norm': 1.0853968858718872, 'learning_rate': 0.00015347018572825022, 'epoch': 1.02}
+{'loss': 0.3803, 'grad_norm': 1.5835812091827393, 'learning_rate': 0.0001534457478005865, 'epoch': 1.02}
+{'loss': 0.4094, 'grad_norm': 0.9677704572677612, 'learning_rate': 0.00015342130987292278, 'epoch': 1.02}
+{'loss': 0.4622, 'grad_norm': 1.162556767463684, 'learning_rate': 0.000153396871945259, 'epoch': 1.02}
+{'loss': 0.361, 'grad_norm': 1.3809646368026733, 'learning_rate': 0.00015337243401759528, 'epoch': 1.02}
+{'loss': 0.4641, 'grad_norm': 1.8124253749847412, 'learning_rate': 0.00015334799608993156, 'epoch': 1.02}
+{'loss': 0.3851, 'grad_norm': 1.3627941608428955, 'learning_rate': 0.0001533235581622678, 'epoch': 1.02}
+{'loss': 1.1882, 'grad_norm': 4.081453800201416, 'learning_rate': 0.0001532991202346041, 'epoch': 1.02}
+{'loss': 0.4868, 'grad_norm': 1.3441665172576904, 'learning_rate': 0.00015327468230694037, 'epoch': 1.02}
+{'loss': 0.6887, 'grad_norm': 3.412243366241455, 'learning_rate': 0.00015325024437927662, 'epoch': 1.02}
+{'loss': 0.3311, 'grad_norm': 0.8700874447822571, 'learning_rate': 0.0001532258064516129, 'epoch': 1.02}
+{'loss': 0.607, 'grad_norm': 1.8043177127838135, 'learning_rate': 0.00015320136852394918, 'epoch': 1.02}
+{'loss': 0.8962, 'grad_norm': 2.9217605590820312, 'learning_rate': 0.0001531769305962854, 'epoch': 1.02}
+{'loss': 0.4919, 'grad_norm': 1.854831337928772, 'learning_rate': 0.00015315249266862168, 'epoch': 1.02}
+{'loss': 0.9187, 'grad_norm': 2.2908477783203125, 'learning_rate': 0.00015312805474095796, 'epoch': 1.02}
+{'loss': 0.4282, 'grad_norm': 3.0967938899993896, 'learning_rate': 0.0001531036168132942, 'epoch': 1.02}
+{'loss': 0.6401, 'grad_norm': 1.6227751970291138, 'learning_rate': 0.0001530791788856305, 'epoch': 1.02}
+{'loss': 1.1405, 'grad_norm': 1.9775352478027344, 'learning_rate': 0.00015305474095796676, 'epoch': 1.02}
+{'loss': 0.6276, 'grad_norm': 2.77632212638855, 'learning_rate': 0.00015303030303030302, 'epoch': 1.02}
+{'loss': 1.3446, 'grad_norm': 5.503850936889648, 'learning_rate': 0.00015300586510263927, 'epoch': 1.02}
+{'loss': 1.1199, 'grad_norm': 4.683707237243652, 'learning_rate': 0.00015298142717497555, 'epoch': 1.02}
+{'loss': 1.3954, 'grad_norm': 2.9078924655914307, 'learning_rate': 0.0001529569892473118, 'epoch': 1.02}
+{'loss': 0.5525, 'grad_norm': 1.9573535919189453, 'learning_rate': 0.00015293255131964808, 'epoch': 1.02}
+{'loss': 1.9003, 'grad_norm': 2.339132308959961, 'learning_rate': 0.00015290811339198435, 'epoch': 1.02}
+{'loss': 1.2449, 'grad_norm': 2.6952342987060547, 'learning_rate': 0.0001528836754643206, 'epoch': 1.02}
+{'loss': 2.0955, 'grad_norm': 6.4942216873168945, 'learning_rate': 0.00015285923753665688, 'epoch': 1.02}
+{'loss': 1.3895, 'grad_norm': 2.6404001712799072, 'learning_rate': 0.00015283479960899316, 'epoch': 1.02}
+{'loss': 0.4467, 'grad_norm': 1.230292558670044, 'learning_rate': 0.00015281036168132939, 'epoch': 1.02}
+{'loss': 0.5228, 'grad_norm': 1.6380767822265625, 'learning_rate': 0.00015278592375366566, 'epoch': 1.02}
+{'loss': 0.4542, 'grad_norm': 1.788313388824463, 'learning_rate': 0.00015276148582600194, 'epoch': 1.02}
+{'loss': 0.8112, 'grad_norm': 2.3561313152313232, 'learning_rate': 0.0001527370478983382, 'epoch': 1.02}
+{'loss': 1.1279, 'grad_norm': 2.4350478649139404, 'learning_rate': 0.00015271260997067447, 'epoch': 1.02}
+{'loss': 0.3027, 'grad_norm': 1.0181922912597656, 'learning_rate': 0.00015268817204301075, 'epoch': 1.02}
+{'loss': 0.2241, 'grad_norm': 1.0185344219207764, 'learning_rate': 0.000152663734115347, 'epoch': 1.02}
+ 51%|█████     | 6541/12776 [1:08:27<1:27:10,  1.19it/s] 51%|█████     | 6542/12776 [1:08:27<1:25:14,  1.22it/s]                                                         51%|█████     | 6542/12776 [1:08:27<1:25:14,  1.22it/s] 51%|█████     | 6543/12776 [1:08:28<1:22:08,  1.26it/s]                                                         51%|█████     | 6543/12776 [1:08:28<1:22:08,  1.26it/s] 51%|█████     | 6544/12776 [1:08:29<1:18:50,  1.32it/s]                                                         51%|█████     | 6544/12776 [1:08:29<1:18:50,  1.32it/s] 51%|█████     | 6545/12776 [1:08:29<1:18:25,  1.32it/s]                                                         51%|█████     | 6545/12776 [1:08:29<1:18:25,  1.32it/s] 51%|█████     | 6546/12776 [1:08:30<1:18:18,  1.33it/s]                                                         51%|█████     | 6546/12776 [1:08:30<1:18:18,  1.33it/s] 51%|█████     | 6547/12776 [1:08:31<1:13:42,  1.41it/s]                                                         51%|█████     | 6547/12776 [1:08:31<1:13:42,  1.41it/s] 51%|█████▏    | 6548/12776 [1:08:31<1:09:39,  1.49it/s]                                                         51%|█████▏    | 6548/12776 [1:08:31<1:09:39,  1.49it/s] 51%|█████▏    | 6549/12776 [1:08:32<1:05:34,  1.58it/s]                                                         51%|█████▏    | 6549/12776 [1:08:32<1:05:34,  1.58it/s] 51%|█████▏    | 6550/12776 [1:08:33<1:04:23,  1.61it/s]                                                         51%|█████▏    | 6550/12776 [1:08:33<1:04:23,  1.61it/s] 51%|█████▏    | 6551/12776 [1:08:33<1:00:18,  1.72it/s]                                                         51%|█████▏    | 6551/12776 [1:08:33<1:00:18,  1.72it/s] 51%|█████▏    | 6552/12776 [1:08:34<56:20,  1.84it/s]                                                         51%|█████▏    | 6552/12776 [1:08:34<56:20,  1.84it/s] 51%|█████▏    | 6553/12776 [1:08:34<54:49,  1.89it/s]                                                       51%|█████▏    | 6553/12776 [1:08:34<54:49,  1.89it/s] 51%|█████▏    | 6554/12776 [1:08:34<51:32,  2.01it/s]                                                       51%|█████▏    | 6554/12776 [1:08:34<51:32,  2.01it/s] 51%|█████▏    | 6555/12776 [1:08:35<50:41,  2.05it/s]                                                       51%|█████▏    | 6555/12776 [1:08:35<50:41,  2.05it/s] 51%|█████▏    | 6556/12776 [1:08:35<47:51,  2.17it/s]                                                       51%|█████▏    | 6556/12776 [1:08:35<47:51,  2.17it/s] 51%|█████▏    | 6557/12776 [1:08:36<45:33,  2.27it/s]                                                       51%|█████▏    | 6557/12776 [1:08:36<45:33,  2.27it/s] 51%|█████▏    | 6558/12776 [1:08:36<47:43,  2.17it/s]                                                       51%|█████▏    | 6558/12776 [1:08:36<47:43,  2.17it/s] 51%|█████▏    | 6559/12776 [1:08:37<44:30,  2.33it/s]                                                       51%|█████▏    | 6559/12776 [1:08:37<44:30,  2.33it/s] 51%|█████▏    | 6560/12776 [1:08:37<41:55,  2.47it/s]                                                       51%|█████▏    | 6560/12776 [1:08:37<41:55,  2.47it/s] 51%|█████▏    | 6561/12776 [1:08:37<42:05,  2.46it/s]                                                       51%|█████▏    | 6561/12776 [1:08:37<42:05,  2.46it/s] 51%|█████▏    | 6562/12776 [1:08:38<39:38,  2.61it/s]                                                       51%|█████▏    | 6562/12776 [1:08:38<39:38,  2.61it/s] 51%|█████▏    | 6563/12776 [1:08:38<37:15,  2.78it/s]                                                       51%|█████▏    | 6563/12776 [1:08:38<37:15,  2.78it/s] 51%|█████▏    | 6564/12776 [1:08:38<37:29,  2.76it/s]                                                       51%|█████▏    | 6564/12776 [1:08:38<37:29,  2.76it/s] 51%|█████▏    | 6565/12776 [1:08:39<35:24,  2.92it/s]                                                       51%|█████▏    | 6565/12776 [1:08:39<35:24,  2.92it/s] 51%|█████▏    | 6566/12776 [1:08:39<33:46,  3.06it/s]                                                       51%|█████▏    | 6566/12776 [1:08:39<33:46,  3.06it/s] 51%|█████▏    | 6567/12776 [1:08:39<32:21,  3.20it/s]                                                       51%|█████▏    | 6567/12776 [1:08:39<32:21,  3.20it/s] 51%|█████▏    | 6568/12776 [1:08:40<34:27,  3.00it/s]                                                       51%|█████▏    | 6568/12776 [1:08:40<34:27,  3.00it/s] 51%|█████▏    | 6569/12776 [1:08:40<32:17,  3.20it/s]                                                       51%|█████���    | 6569/12776 [1:08:40<32:17,  3.20it/s] 51%|█████▏    | 6570/12776 [1:08:40<30:25,  3.40it/s]                                                       51%|█████▏    | 6570/12776 [1:08:40<30:25,  3.40it/s] 51%|█████▏    | 6571/12776 [1:08:40<28:54,  3.58it/s]                                                       51%|█████▏    | 6571/12776 [1:08:40<28:54,  3.58it/s] 51%|█████▏    | 6572/12776 [1:08:41<31:08,  3.32it/s]                                                       51%|█████▏    | 6572/12776 [1:08:41<31:08,  3.32it/s] 51%|█████▏    | 6573/12776 [1:08:41<29:00,  3.56it/s]                                                       51%|█████▏    | 6573/12776 [1:08:41<29:00,  3.56it/s] 51%|█████▏    | 6574/12776 [1:08:41<27:29,  3.76it/s]                                                       51%|█████▏    | 6574/12776 [1:08:41<27:29,  3.76it/s] 51%|█████▏    | 6575/12776 [1:08:41<26:06,  3.96it/s]                                                       51%|█████▏    | 6575/12776 [1:08:41<26:06,  3.96it/s] 51%|█████▏    | 6576/12776 [1:08:42<28:54,  3.57it/s]                                                       51%|█████▏    | 6576/12776 [1:08:42<28:54,  3.57it/s] 51%|█████▏    | 6577/12776 [1:08:42<26:52,  3.84it/s]                                                       51%|█████▏    | 6577/12776 [1:08:42<26:52,  3.84it/s] 51%|█████▏    | 6578/12776 [1:08:42<25:14,  4.09it/s]                                                       51%|█████▏    | 6578/12776 [1:08:42<25:14,  4.09it/s] 51%|█████▏    | 6579/12776 [1:08:42<24:12,  4.27it/s]                                                       51%|█████▏    | 6579/12776 [1:08:42<24:12,  4.27it/s] 52%|█████▏    | 6580/12776 [1:08:43<23:18,  4.43it/s]                                                       52%|█████▏    | 6580/12776 [1:08:43<23:18,  4.43it/s] 52%|█████▏    | 6581/12776 [1:08:43<26:05,  3.96it/s]                                                       52%|█████▏    | 6581/12776 [1:08:43<26:05,  3.96it/s] 52%|█████▏    | 6582/12776 [1:08:43<24:27,  4.22it/s]                                                       52%|█████▏    | 6582/12776 [1:08:43<24:27,  4.22it/s] 52%|█████▏    | 6583/12776 [1:08:43<23:12,  4.45it/s]                                                       52%|█████▏    | 6583/12776 [1:08:43<23:12,  4.45it/s] 52%|█████▏    | 6584/12776 [1:08:43<22:18,  4.63it/s]                                                       52%|█████▏    | 6584/12776 [1:08:43<22:18,  4.63it/s] 52%|█████▏    | 6585/12776 [1:08:44<21:30,  4.80it/s]                                                       52%|█████▏    | 6585/12776 [1:08:44<21:30,  4.80it/s] 52%|█████▏    | 6586/12776 [1:08:44<20:50,  4.95it/s]                                                       52%|█████▏    | 6586/12776 [1:08:44<20:50,  4.95it/s] 52%|█████▏    | 6587/12776 [1:08:44<23:39,  4.36it/s]                                                       52%|█████▏    | 6587/12776 [1:08:44<23:39,  4.36it/s] 52%|█████▏    | 6588/12776 [1:08:45<39:04,  2.64it/s]                                                       52%|█████▏    | 6588/12776 [1:08:45<39:04,  2.64it/s] 52%|█████▏    | 6589/12776 [1:08:46<1:12:37,  1.42it/s]                                                         52%|█████▏    | 6589/12776 [1:08:46<1:12:37,  1.42it/s] 52%|█████▏    | 6590/12776 [1:08:47<1:19:56,  1.29it/s]                                                         52%|█████▏    | 6590/12776 [1:08:47<1:19:56,  1.29it/s] 52%|█████▏    | 6591/12776 [1:08:48<1:22:54,  1.24it/s]                                                         52%|█████▏    | 6591/12776 [1:08:48<1:22:54,  1.24it/s] 52%|█████▏    | 6592/12776 [1:08:49<1:22:40,  1.25it/s]                                                         52%|█████▏    | 6592/12776 [1:08:49<1:22:40,  1.25it/s] 52%|█████▏    | 6593/12776 [1:08:50<1:23:31,  1.23it/s]                                                         52%|█████▏    | 6593/12776 [1:08:50<1:23:31,  1.23it/s] 52%|█████▏    | 6594/12776 [1:08:50<1:19:24,  1.30it/s]                                                         52%|█████▏    | 6594/12776 [1:08:50<1:19:24,  1.30it/s] 52%|█████▏    | 6595/12776 [1:08:51<1:20:24,  1.28it/s]                                                         52%|█████▏    | 6595/12776 [1:08:51<1:20:24,  1.28it/s] 52%|█████▏    | 6596/12776 [1:08:52<1:15:23,  1.37it/s]                                                         52%|█████▏    | 6596/12776 [1:08:52<1:15:23,  1.37it/s] 52%|█████▏    | 6597/12776 [1:08:52<1:11:07,  1.45it/s]                                                         52%|█████▏    | 6597/12776 [1:08:52<1:11:07,  1.45it/s] 52%|█████▏    | 6598/12776 [1:08:53<1:06:42,  1.54it/s]                                                         52%|█████▏    | 6598/12776 [1:08:53<1:06:42,  1.54it/s] 52%|█████▏    | 6599/12776 [1:08:54<1:05:35,  1.57it/s]                                                         52%|█████▏    | 6599/12776 [1:08:54<1:05:35,  1.57it/s] 52%|█████▏    | 6600/12776 [1:08:54<1:01:40,  1.67it/s]                                                         52%|█████▏    | 6600/12776 [1:08:54<1:01:40,  1.67it/s] 52%|█████▏    | 6601/12776 [1:08:55<1:01:23,  1.68it/s]                                                         52%|█████▏    | 6601/12776 [1:08:55<1:01:23,  1.68it/s] 52%|█████▏    | 6602/12776 [1:08:55<56:59,  1.81it/s]                                                         52%|█████▏    | 6602/12776 [1:08:55<56:59,  1.81it/s] 52%|█████▏    | 6603/12776 [1:08:56<57:43,  1.78it/s]                                                       52%|█████▏    | 6603/12776 [1:08:56<57:43,  1.78it/s] 52%|█████▏    | 6604/12776 [1:08:56<53:37,  1.92it/s]                                                       52%|█████▏    | 6604/12776 [1:08:56<53:37,  1.92it/s] 52%|█████▏    | 6605/12776 [1:08:57<54:23,  1.89it/s]                                                       52%|█████▏    | 6605/12776 [1:08:57<54:23,  1.89it/s] 52%|█████▏    | 6606/12776 [1:08:57<50:10,  2.05it/s]                                                       52%|█████▏    | 6606/12776 [1:08:57<50:10,  2.05it/s] 52%|█████▏    | 6607/12776 [1:08:57<46:51,  2.19it/s]                                                       52%|█████▏    | 6607/12776 [1:08:57<46:51,  2.19it/s] 52%|█████▏    | 6608/12776 [1:08:58<44:58,  2.29it/s]                                                       52%|█████▏    | 6608/12776 [1:08:58<44:58,  2.29it/s] 52%|█████▏    | 6609/12776 [1:08:58<42:20,  2.43it/s]                                                       52%|█████▏    | 6609/12776 [1:08:58<42:20,  2.43it/s] 52%|█████▏    | 6610/12776 [1:08:59<40:20,  2.55it/s]                                                       52%|█████▏    | 6610/12776 [1:08:59<40:20,  2.55it/s] 52%|█████▏    | 6611/12776 [1:08:59<40:59,  2.51it/s]                                                       52%|█████▏    | 6611/12776 [1:08:59<40:59,  2.51it/s] 52%|█████▏    | 6612/12776 [1:08:59<38:44,  2.65it/s]                                                       52%|█████▏    | 6612/12776 [1:08:59<38:44,  2.65it/s] 52%|█████▏    | 6613/12776 [1:09:00<36:27,  2.82it/s]                                                       52%|█████▏    | 6613/12776 [1:09:00<36:27,  2.82it/s] 52%|█████▏    | 6614/12776 [1:09:00<34:39,  2.96it/s]                                                       52%|█████▏    | 6614/12776 [1:09:00<34:39,  2.96it/s] 52%|█████▏    | 6615/12776 [1:09:00<35:26,  2.90it/s]                                                       52%|█████▏    | 6615/12776 [1:09:00<35:26,  2.90it/s] 52%|█████▏    | 6616/12776 [1:09:01<33:21,  3.08it/s]                                                       52%|█████▏    | 6616/12776 [1:09:01<33:21,  3.08it/s] 52%|█████▏    | 6617/12776 [1:09:01<31:51,  3.22it/s]                                                       52%|█████▏    | 6617/12776 [1:09:01<31:51,  3.22it/s] 52%|█████▏    | 6618/12776 [1:09:01<30:36,  3.35it/s]                                                      {'loss': 0.3597, 'grad_norm': 1.063295841217041, 'learning_rate': 0.00015263929618768328, 'epoch': 1.02}
+{'loss': 0.252, 'grad_norm': 0.785956621170044, 'learning_rate': 0.00015261485826001956, 'epoch': 1.02}
+{'loss': 0.2251, 'grad_norm': 0.5662931203842163, 'learning_rate': 0.00015259042033235578, 'epoch': 1.02}
+{'loss': 0.1643, 'grad_norm': 0.4908407926559448, 'learning_rate': 0.00015256598240469206, 'epoch': 1.02}
+{'loss': 0.2976, 'grad_norm': 0.8578366041183472, 'learning_rate': 0.00015254154447702834, 'epoch': 1.02}
+{'loss': 0.2358, 'grad_norm': 0.5953052639961243, 'learning_rate': 0.0001525171065493646, 'epoch': 1.02}
+{'loss': 0.1725, 'grad_norm': 0.401744544506073, 'learning_rate': 0.00015249266862170087, 'epoch': 1.02}
+{'loss': 0.3137, 'grad_norm': 0.6543681621551514, 'learning_rate': 0.00015246823069403715, 'epoch': 1.03}
+{'loss': 0.2601, 'grad_norm': 0.9384623765945435, 'learning_rate': 0.0001524437927663734, 'epoch': 1.03}
+{'loss': 0.3343, 'grad_norm': 1.2204899787902832, 'learning_rate': 0.00015241935483870965, 'epoch': 1.03}
+{'loss': 0.3626, 'grad_norm': 0.8273910880088806, 'learning_rate': 0.00015239491691104593, 'epoch': 1.03}
+{'loss': 0.2757, 'grad_norm': 0.8227601051330566, 'learning_rate': 0.00015237047898338218, 'epoch': 1.03}
+{'loss': 0.3225, 'grad_norm': 0.7043795585632324, 'learning_rate': 0.00015234604105571846, 'epoch': 1.03}
+{'loss': 0.2981, 'grad_norm': 1.460331916809082, 'learning_rate': 0.00015232160312805474, 'epoch': 1.03}
+{'loss': 0.7302, 'grad_norm': 1.7429531812667847, 'learning_rate': 0.000152297165200391, 'epoch': 1.03}
+{'loss': 0.4463, 'grad_norm': 0.9695984125137329, 'learning_rate': 0.00015227272727272727, 'epoch': 1.03}
+{'loss': 0.4064, 'grad_norm': 0.9565237164497375, 'learning_rate': 0.00015224828934506354, 'epoch': 1.03}
+{'loss': 0.3796, 'grad_norm': 1.8992421627044678, 'learning_rate': 0.00015222385141739977, 'epoch': 1.03}
+{'loss': 0.3354, 'grad_norm': 1.4258291721343994, 'learning_rate': 0.00015219941348973605, 'epoch': 1.03}
+{'loss': 0.4635, 'grad_norm': 2.1398561000823975, 'learning_rate': 0.00015217497556207232, 'epoch': 1.03}
+{'loss': 0.5687, 'grad_norm': 1.383025884628296, 'learning_rate': 0.00015215053763440858, 'epoch': 1.03}
+{'loss': 0.5766, 'grad_norm': 1.5760375261306763, 'learning_rate': 0.00015212609970674485, 'epoch': 1.03}
+{'loss': 0.5502, 'grad_norm': 1.6190121173858643, 'learning_rate': 0.00015210166177908113, 'epoch': 1.03}
+{'loss': 0.6046, 'grad_norm': 1.2204431295394897, 'learning_rate': 0.00015207722385141738, 'epoch': 1.03}
+{'loss': 0.3985, 'grad_norm': 1.0577452182769775, 'learning_rate': 0.00015205278592375366, 'epoch': 1.03}
+{'loss': 0.4528, 'grad_norm': 1.111060619354248, 'learning_rate': 0.00015202834799608994, 'epoch': 1.03}
+{'loss': 0.8267, 'grad_norm': 2.2486462593078613, 'learning_rate': 0.00015200391006842616, 'epoch': 1.03}
+{'loss': 0.8305, 'grad_norm': 1.4202734231948853, 'learning_rate': 0.00015197947214076244, 'epoch': 1.03}
+{'loss': 0.7312, 'grad_norm': 2.8072428703308105, 'learning_rate': 0.00015195503421309872, 'epoch': 1.03}
+{'loss': 0.8986, 'grad_norm': 3.127408504486084, 'learning_rate': 0.00015193059628543497, 'epoch': 1.03}
+{'loss': 0.8522, 'grad_norm': 1.911171793937683, 'learning_rate': 0.00015190615835777125, 'epoch': 1.03}
+{'loss': 0.9098, 'grad_norm': 2.793549060821533, 'learning_rate': 0.00015188172043010753, 'epoch': 1.03}
+{'loss': 1.2318, 'grad_norm': 1.8251924514770508, 'learning_rate': 0.00015185728250244375, 'epoch': 1.03}
+{'loss': 0.9744, 'grad_norm': 7.607204437255859, 'learning_rate': 0.00015183284457478003, 'epoch': 1.03}
+{'loss': 1.8618, 'grad_norm': 3.5871646404266357, 'learning_rate': 0.0001518084066471163, 'epoch': 1.03}
+{'loss': 1.4547, 'grad_norm': 3.6757612228393555, 'learning_rate': 0.00015178396871945256, 'epoch': 1.03}
+{'loss': 0.8764, 'grad_norm': 2.2011477947235107, 'learning_rate': 0.00015175953079178884, 'epoch': 1.03}
+{'loss': 0.9784, 'grad_norm': 2.396167278289795, 'learning_rate': 0.00015173509286412512, 'epoch': 1.03}
+{'loss': 1.4503, 'grad_norm': 2.8914072513580322, 'learning_rate': 0.00015171065493646137, 'epoch': 1.03}
+{'loss': 1.3224, 'grad_norm': 2.1105411052703857, 'learning_rate': 0.00015168621700879765, 'epoch': 1.03}
+{'loss': 0.9473, 'grad_norm': 1.6223315000534058, 'learning_rate': 0.00015166177908113393, 'epoch': 1.03}
+{'loss': 1.2726, 'grad_norm': 2.1070399284362793, 'learning_rate': 0.00015163734115347015, 'epoch': 1.03}
+{'loss': 1.6022, 'grad_norm': 2.866163492202759, 'learning_rate': 0.00015161290322580643, 'epoch': 1.03}
+{'loss': 0.6449, 'grad_norm': 1.343387484550476, 'learning_rate': 0.0001515884652981427, 'epoch': 1.03}
+{'loss': 0.6767, 'grad_norm': 2.003796339035034, 'learning_rate': 0.00015156402737047896, 'epoch': 1.03}
+{'loss': 0.8415, 'grad_norm': 1.329639196395874, 'learning_rate': 0.00015153958944281524, 'epoch': 1.03}
+{'loss': 0.717, 'grad_norm': 2.692793607711792, 'learning_rate': 0.00015151515151515152, 'epoch': 1.03}
+{'loss': 0.7249, 'grad_norm': 1.7069381475448608, 'learning_rate': 0.00015149071358748777, 'epoch': 1.03}
+{'loss': 0.2442, 'grad_norm': 0.498267263174057, 'learning_rate': 0.00015146627565982404, 'epoch': 1.03}
+{'loss': 0.3577, 'grad_norm': 0.7013705372810364, 'learning_rate': 0.0001514418377321603, 'epoch': 1.03}
+{'loss': 0.2867, 'grad_norm': 1.0541874170303345, 'learning_rate': 0.00015141739980449655, 'epoch': 1.03}
+{'loss': 0.2876, 'grad_norm': 1.2763069868087769, 'learning_rate': 0.00015139296187683283, 'epoch': 1.03}
+{'loss': 0.3681, 'grad_norm': 0.702283501625061, 'learning_rate': 0.0001513685239491691, 'epoch': 1.03}
+{'loss': 0.2794, 'grad_norm': 0.6756613850593567, 'learning_rate': 0.00015134408602150536, 'epoch': 1.03}
+{'loss': 0.2589, 'grad_norm': 1.0558266639709473, 'learning_rate': 0.00015131964809384163, 'epoch': 1.03}
+{'loss': 0.4377, 'grad_norm': 0.8309182524681091, 'learning_rate': 0.0001512952101661779, 'epoch': 1.03}
+{'loss': 0.2772, 'grad_norm': 0.7502772212028503, 'learning_rate': 0.00015127077223851414, 'epoch': 1.03}
+{'loss': 0.3705, 'grad_norm': 1.0841580629348755, 'learning_rate': 0.00015124633431085041, 'epoch': 1.03}
+{'loss': 0.2433, 'grad_norm': 0.5633625388145447, 'learning_rate': 0.0001512218963831867, 'epoch': 1.03}
+{'loss': 0.2835, 'grad_norm': 0.6718013882637024, 'learning_rate': 0.00015119745845552294, 'epoch': 1.03}
+{'loss': 0.3426, 'grad_norm': 0.8712942600250244, 'learning_rate': 0.00015117302052785922, 'epoch': 1.03}
+{'loss': 0.311, 'grad_norm': 0.9348687529563904, 'learning_rate': 0.0001511485826001955, 'epoch': 1.03}
+{'loss': 0.3267, 'grad_norm': 0.8929564356803894, 'learning_rate': 0.00015112414467253175, 'epoch': 1.03}
+{'loss': 0.491, 'grad_norm': 1.1071451902389526, 'learning_rate': 0.00015109970674486803, 'epoch': 1.03}
+{'loss': 0.2884, 'grad_norm': 0.7876406908035278, 'learning_rate': 0.0001510752688172043, 'epoch': 1.03}
+{'loss': 0.4125, 'grad_norm': 1.126440405845642, 'learning_rate': 0.00015105083088954053, 'epoch': 1.03}
+{'loss': 0.1574, 'grad_norm': 0.6488103866577148, 'learning_rate': 0.0001510263929618768, 'epoch': 1.03}
+{'loss': 0.3864, 'grad_norm': 2.044353485107422, 'learning_rate': 0.0001510019550342131, 'epoch': 1.03}
+{'loss': 0.4484, 'grad_norm': 1.4567679166793823, 'learning_rate': 0.00015097751710654934, 'epoch': 1.03}
+{'loss': 0.9196, 'grad_norm': 2.9722204208374023, 'learning_rate': 0.00015095307917888562, 'epoch': 1.03}
+{'loss': 0.7483, 'grad_norm': 1.4675703048706055, 'learning_rate': 0.0001509286412512219, 'epoch': 1.03}
+{'loss': 0.586, 'grad_norm': 1.8821780681610107, 'learning_rate': 0.00015090420332355815, 'epoch': 1.04}
+{'loss': 0.6357, 'grad_norm': 2.960568428039551, 'learning_rate': 0.00015087976539589443, 'epoch': 1.04}
+{'loss': 0.5007, 'grad_norm': 1.3695021867752075, 'learning_rate': 0.00015085532746823068, 'epoch': 1.04}
+{'loss': 0.5556, 'grad_norm': 1.4479131698608398, 'learning_rate': 0.00015083088954056693, 'epoch': 1.04}
+{'loss': 0.9825, 'grad_norm': 1.7650444507598877, 'learning_rate': 0.0001508064516129032, 'epoch': 1.04}
+{'loss': 0.6728, 'grad_norm': 2.19905424118042, 'learning_rate': 0.00015078201368523949, 'epoch': 1.04}
+ 52%|█████▏    | 6618/12776 [1:09:01<30:36,  3.35it/s] 52%|█████▏    | 6619/12776 [1:09:01<31:10,  3.29it/s]                                                       52%|█████▏    | 6619/12776 [1:09:01<31:10,  3.29it/s] 52%|█████▏    | 6620/12776 [1:09:02<29:30,  3.48it/s]                                                       52%|█████▏    | 6620/12776 [1:09:02<29:30,  3.48it/s] 52%|█████▏    | 6621/12776 [1:09:02<28:10,  3.64it/s]                                                       52%|█████▏    | 6621/12776 [1:09:02<28:10,  3.64it/s] 52%|█████▏    | 6622/12776 [1:09:02<27:09,  3.78it/s]                                                       52%|█████▏    | 6622/12776 [1:09:02<27:09,  3.78it/s] 52%|█████▏    | 6623/12776 [1:09:03<29:33,  3.47it/s]                                                       52%|█████▏    | 6623/12776 [1:09:03<29:33,  3.47it/s] 52%|█████▏    | 6624/12776 [1:09:03<27:49,  3.69it/s]                                                       52%|█████▏    | 6624/12776 [1:09:03<27:49,  3.69it/s] 52%|█████▏    | 6625/12776 [1:09:03<26:23,  3.88it/s]                                                       52%|█████▏    | 6625/12776 [1:09:03<26:23,  3.88it/s] 52%|█████▏    | 6626/12776 [1:09:03<25:09,  4.07it/s]                                                       52%|█████▏    | 6626/12776 [1:09:03<25:09,  4.07it/s] 52%|█████▏    | 6627/12776 [1:09:04<27:37,  3.71it/s]                                                       52%|█████▏    | 6627/12776 [1:09:04<27:37,  3.71it/s] 52%|█████▏    | 6628/12776 [1:09:04<25:45,  3.98it/s]                                                       52%|█████▏    | 6628/12776 [1:09:04<25:45,  3.98it/s] 52%|█████▏    | 6629/12776 [1:09:04<24:29,  4.18it/s]                                                       52%|█████▏    | 6629/12776 [1:09:04<24:29,  4.18it/s] 52%|█████▏    | 6630/12776 [1:09:04<23:31,  4.36it/s]                                                       52%|█████▏    | 6630/12776 [1:09:04<23:31,  4.36it/s] 52%|█████▏    | 6631/12776 [1:09:04<22:38,  4.52it/s]                                                       52%|█████▏    | 6631/12776 [1:09:04<22:38,  4.52it/s] 52%|█████▏    | 6632/12776 [1:09:05<25:43,  3.98it/s]                                                       52%|█████▏    | 6632/12776 [1:09:05<25:43,  3.98it/s] 52%|█████▏    | 6633/12776 [1:09:05<24:07,  4.24it/s]                                                       52%|█████▏    | 6633/12776 [1:09:05<24:07,  4.24it/s] 52%|█████▏    | 6634/12776 [1:09:05<22:48,  4.49it/s]                                                       52%|█████▏    | 6634/12776 [1:09:05<22:48,  4.49it/s] 52%|█████▏    | 6635/12776 [1:09:05<21:52,  4.68it/s]                                                       52%|█████▏    | 6635/12776 [1:09:05<21:52,  4.68it/s] 52%|█████▏    | 6636/12776 [1:09:05<21:10,  4.83it/s]                                                       52%|█████▏    | 6636/12776 [1:09:05<21:10,  4.83it/s] 52%|█████▏    | 6637/12776 [1:09:06<20:33,  4.98it/s]                                                       52%|█████▏    | 6637/12776 [1:09:06<20:33,  4.98it/s] 52%|█████▏    | 6638/12776 [1:09:06<35:48,  2.86it/s]                                                       52%|█████▏    | 6638/12776 [1:09:06<35:48,  2.86it/s] 52%|█████▏    | 6639/12776 [1:09:08<1:05:31,  1.56it/s]                                                         52%|█████▏    | 6639/12776 [1:09:08<1:05:31,  1.56it/s] 52%|█████▏    | 6640/12776 [1:09:09<1:18:00,  1.31it/s]                                                         52%|█████▏    | 6640/12776 [1:09:09<1:18:00,  1.31it/s] 52%|█████▏    | 6641/12776 [1:09:10<1:25:27,  1.20it/s]                                                         52%|█████▏    | 6641/12776 [1:09:10<1:25:27,  1.20it/s] 52%|█████▏    | 6642/12776 [1:09:11<1:24:53,  1.20it/s]                                                         52%|█████▏    | 6642/12776 [1:09:11<1:24:53,  1.20it/s] 52%|█████▏    | 6643/12776 [1:09:11<1:22:35,  1.24it/s]                                                         52%|█████▏    | 6643/12776 [1:09:11<1:22:35,  1.24it/s] 52%|█████▏    | 6644/12776 [1:09:12<1:21:43,  1.25it/s]                                                         52%|█████▏    | 6644/12776 [1:09:12<1:21:43,  1.25it/s] 52%|█████▏    | 6645/12776 [1:09:13<1:21:41,  1.25it/s]                                                         52%|█████▏    | 6645/12776 [1:09:13<1:21:41,  1.25it/s] 52%|█████▏    | 6646/12776 [1:09:13<1:16:49,  1.33it/s]                                                         52%|█████▏    | 6646/12776 [1:09:13<1:16:49,  1.33it/s] 52%|█████▏    | 6647/12776 [1:09:14<1:11:17,  1.43it/s]                                                         52%|█████▏    | 6647/12776 [1:09:14<1:11:17,  1.43it/s] 52%|█████▏    | 6648/12776 [1:09:15<1:06:51,  1.53it/s]                                                         52%|█████▏    | 6648/12776 [1:09:15<1:06:51,  1.53it/s] 52%|█████▏    | 6649/12776 [1:09:15<1:05:50,  1.55it/s]                                                         52%|█████▏    | 6649/12776 [1:09:15<1:05:50,  1.55it/s] 52%|█████▏    | 6650/12776 [1:09:16<1:01:34,  1.66it/s]                                                         52%|█████▏    | 6650/12776 [1:09:16<1:01:34,  1.66it/s] 52%|█████▏    | 6651/12776 [1:09:16<1:01:22,  1.66it/s]                                                         52%|█████▏    | 6651/12776 [1:09:16<1:01:22,  1.66it/s] 52%|█████▏    | 6652/12776 [1:09:17<56:58,  1.79it/s]                                                         52%|█████▏    | 6652/12776 [1:09:17<56:58,  1.79it/s] 52%|█████▏    | 6653/12776 [1:09:17<53:36,  1.90it/s]                                                       52%|█████▏    | 6653/12776 [1:09:17<53:36,  1.90it/s] 52%|█████▏    | 6654/12776 [1:09:18<54:25,  1.87it/s]                                                       52%|█████▏    | 6654/12776 [1:09:18<54:25,  1.87it/s] 52%|█████▏    | 6655/12776 [1:09:18<51:11,  1.99it/s]                                                       52%|█████▏    | 6655/12776 [1:09:18<51:11,  1.99it/s] 52%|█████▏    | 6656/12776 [1:09:19<49:15,  2.07it/s]                                                       52%|█████▏    | 6656/12776 [1:09:19<49:15,  2.07it/s] 52%|█████▏    | 6657/12776 [1:09:19<46:16,  2.20it/s]                                                       52%|█████▏    | 6657/12776 [1:09:19<46:16,  2.20it/s] 52%|█████▏    | 6658/12776 [1:09:19<43:38,  2.34it/s]                                                       52%|█████▏    | 6658/12776 [1:09:19<43:38,  2.34it/s] 52%|█████▏    | 6659/12776 [1:09:20<42:49,  2.38it/s]                                                       52%|█████▏    | 6659/12776 [1:09:20<42:49,  2.38it/s] 52%|█████▏    | 6660/12776 [1:09:20<40:29,  2.52it/s]                                                       52%|█████▏    | 6660/12776 [1:09:20<40:29,  2.52it/s] 52%|█████▏    | 6661/12776 [1:09:20<38:45,  2.63it/s]                                                       52%|█████▏    | 6661/12776 [1:09:20<38:45,  2.63it/s] 52%|█████▏    | 6662/12776 [1:09:21<40:04,  2.54it/s]                                                       52%|█████▏    | 6662/12776 [1:09:21<40:04,  2.54it/s] 52%|█████▏    | 6663/12776 [1:09:21<37:40,  2.70it/s]                                                       52%|█████▏    | 6663/12776 [1:09:21<37:40,  2.70it/s] 52%|█████▏    | 6664/12776 [1:09:22<35:33,  2.87it/s]                                                       52%|█████▏    | 6664/12776 [1:09:22<35:33,  2.87it/s] 52%|█████▏    | 6665/12776 [1:09:22<33:50,  3.01it/s]                                                       52%|█████▏    | 6665/12776 [1:09:22<33:50,  3.01it/s] 52%|█████▏    | 6666/12776 [1:09:22<33:59,  3.00it/s]                                                       52%|█████▏    | 6666/12776 [1:09:22<33:59,  3.00it/s] 52%|█████▏    | 6667/12776 [1:09:22<32:12,  3.16it/s]                                                       52%|█████▏    | 6667/12776 [1:09:22<32:12,  3.16it/s] 52%|█████▏    | 6668/12776 [1:09:23<30:56,  3.29it/s]                                                       52%|█████▏    | 6668/12776 [1:09:23<30:56,  3.29it/s] 52%|█████▏    | 6669/12776 [1:09:23<29:35,  3.44it/s]                                                       52%|█████▏    | 6669/12776 [1:09:23<29:35,  3.44it/s] 52%|█████▏    | 6670/12776 [1:09:23<31:00,  3.28it/s]                                                       52%|█████▏    | 6670/12776 [1:09:23<31:00,  3.28it/s] 52%|█████▏    | 6671/12776 [1:09:24<29:13,  3.48it/s]                                                       52%|█████▏    | 6671/12776 [1:09:24<29:13,  3.48it/s] 52%|█████▏    | 6672/12776 [1:09:24<27:54,  3.65it/s]                                                       52%|█████▏    | 6672/12776 [1:09:24<27:54,  3.65it/s] 52%|█████▏    | 6673/12776 [1:09:24<26:36,  3.82it/s]                                                       52%|█████▏    | 6673/12776 [1:09:24<26:36,  3.82it/s] 52%|█████▏    | 6674/12776 [1:09:24<29:38,  3.43it/s]                                                       52%|█████▏    | 6674/12776 [1:09:24<29:38,  3.43it/s] 52%|█████▏    | 6675/12776 [1:09:25<27:31,  3.69it/s]                                                       52%|█████▏    | 6675/12776 [1:09:25<27:31,  3.69it/s] 52%|█████▏    | 6676/12776 [1:09:25<25:48,  3.94it/s]                                                       52%|█████▏    | 6676/12776 [1:09:25<25:48,  3.94it/s] 52%|█████▏    | 6677/12776 [1:09:25<24:27,  4.16it/s]                                                       52%|█████▏    | 6677/12776 [1:09:25<24:27,  4.16it/s] 52%|█████▏    | 6678/12776 [1:09:25<23:38,  4.30it/s]                                                       52%|█████▏    | 6678/12776 [1:09:25<23:38,  4.30it/s] 52%|█████▏    | 6679/12776 [1:09:26<25:07,  4.05it/s]                                                       52%|█████▏    | 6679/12776 [1:09:26<25:07,  4.05it/s] 52%|█████▏    | 6680/12776 [1:09:26<23:47,  4.27it/s]                                                       52%|█████▏    | 6680/12776 [1:09:26<23:47,  4.27it/s] 52%|█████▏    | 6681/12776 [1:09:26<22:50,  4.45it/s]                                                       52%|█████▏    | 6681/12776 [1:09:26<22:50,  4.45it/s] 52%|█████▏    | 6682/12776 [1:09:26<22:14,  4.57it/s]                                                       52%|█████▏    | 6682/12776 [1:09:26<22:14,  4.57it/s] 52%|█████▏    | 6683/12776 [1:09:26<21:36,  4.70it/s]                                                       52%|█████▏    | 6683/12776 [1:09:26<21:36,  4.70it/s] 52%|█████▏    | 6684/12776 [1:09:27<24:52,  4.08it/s]                                                       52%|█████▏    | 6684/12776 [1:09:27<24:52,  4.08it/s] 52%|█████▏    | 6685/12776 [1:09:27<23:20,  4.35it/s]                                                       52%|█████▏    | 6685/12776 [1:09:27<23:20,  4.35it/s] 52%|█████▏    | 6686/12776 [1:09:27<22:00,  4.61it/s]                                                       52%|█████▏    | 6686/12776 [1:09:27<22:00,  4.61it/s] 52%|█████▏    | 6687/12776 [1:09:27<21:03,  4.82it/s]                                                       52%|█████▏    | 6687/12776 [1:09:27<21:03,  4.82it/s] 52%|█████▏    | 6688/12776 [1:09:28<37:13,  2.73it/s]                                                       52%|█████▏    | 6688/12776 [1:09:28<37:13,  2.73it/s] 52%|█████▏    | 6689/12776 [1:09:29<1:09:21,  1.46it/s]                                                         52%|█████▏    | 6689/12776 [1:09:29<1:09:21,  1.46it/s] 52%|█████▏    | 6690/12776 [1:09:30<1:17:50,  1.30it/s]                                                         52%|█████▏    | 6690/12776 [1:09:30<1:17:50,  1.30it/s] 52%|█████▏    | 6691/12776 [1:09:31<1:21:21,  1.25it/s]                                                         52%|█████▏    | 6691/12776 [1:09:31<1:21:21,  1.25it/s] 52%|█████▏    | 6692/12776 [1:09:32<1:20:45,  1.26it/s]                                                         52%|█████▏    | 6692/12776 [1:09:32<1:20:45,  1.26it/s] 52%|█████▏    | 6693/12776 [1:09:33<1:18:39,  1.29it/s]                                                         52%|█████▏    | 6693/12776 [1:09:33<1:18:39,  1.29it/s] 52%|█████▏    | 6694/12776 [1:09:33<1:16:18,  1.33it/s]                                                         52%|█████▏    | 6694/12776 [1:09:33<1:16:18,  1.33it/s] 52%|█████▏    | 6695/12776 [1:09:34<1:12:53,  1.39it/s]                                                        {'loss': 0.7472, 'grad_norm': 1.4833353757858276, 'learning_rate': 0.00015075757575757574, 'epoch': 1.04}
+{'loss': 0.7654, 'grad_norm': 1.978433609008789, 'learning_rate': 0.00015073313782991202, 'epoch': 1.04}
+{'loss': 1.1002, 'grad_norm': 1.5188682079315186, 'learning_rate': 0.0001507086999022483, 'epoch': 1.04}
+{'loss': 0.746, 'grad_norm': 2.915623188018799, 'learning_rate': 0.00015068426197458452, 'epoch': 1.04}
+{'loss': 0.6476, 'grad_norm': 3.499290704727173, 'learning_rate': 0.0001506598240469208, 'epoch': 1.04}
+{'loss': 0.8943, 'grad_norm': 2.9197161197662354, 'learning_rate': 0.00015063538611925707, 'epoch': 1.04}
+{'loss': 0.7917, 'grad_norm': 1.2932711839675903, 'learning_rate': 0.00015061094819159333, 'epoch': 1.04}
+{'loss': 0.6509, 'grad_norm': 2.556731939315796, 'learning_rate': 0.0001505865102639296, 'epoch': 1.04}
+{'loss': 1.0172, 'grad_norm': 3.067978620529175, 'learning_rate': 0.00015056207233626588, 'epoch': 1.04}
+{'loss': 0.5996, 'grad_norm': 2.2281928062438965, 'learning_rate': 0.00015053763440860213, 'epoch': 1.04}
+{'loss': 0.6033, 'grad_norm': 2.0016701221466064, 'learning_rate': 0.0001505131964809384, 'epoch': 1.04}
+{'loss': 1.0168, 'grad_norm': 2.9594852924346924, 'learning_rate': 0.0001504887585532747, 'epoch': 1.04}
+{'loss': 1.1409, 'grad_norm': 2.735733985900879, 'learning_rate': 0.00015046432062561091, 'epoch': 1.04}
+{'loss': 1.7845, 'grad_norm': 4.281361103057861, 'learning_rate': 0.0001504398826979472, 'epoch': 1.04}
+{'loss': 1.0803, 'grad_norm': 4.471887111663818, 'learning_rate': 0.00015041544477028347, 'epoch': 1.04}
+{'loss': 1.1523, 'grad_norm': 3.5582237243652344, 'learning_rate': 0.00015039100684261972, 'epoch': 1.04}
+{'loss': 0.6606, 'grad_norm': 2.951169490814209, 'learning_rate': 0.000150366568914956, 'epoch': 1.04}
+{'loss': 0.6368, 'grad_norm': 1.8436496257781982, 'learning_rate': 0.00015034213098729228, 'epoch': 1.04}
+{'loss': 0.4588, 'grad_norm': 2.94608473777771, 'learning_rate': 0.00015031769305962853, 'epoch': 1.04}
+{'loss': 0.4628, 'grad_norm': 2.0353431701660156, 'learning_rate': 0.0001502932551319648, 'epoch': 1.04}
+{'loss': 0.7833, 'grad_norm': 2.008845329284668, 'learning_rate': 0.00015026881720430106, 'epoch': 1.04}
+{'loss': 0.194, 'grad_norm': 0.664910078048706, 'learning_rate': 0.0001502443792766373, 'epoch': 1.04}
+{'loss': 0.1953, 'grad_norm': 0.388094037771225, 'learning_rate': 0.0001502199413489736, 'epoch': 1.04}
+{'loss': 0.1952, 'grad_norm': 0.3977845311164856, 'learning_rate': 0.00015019550342130987, 'epoch': 1.04}
+{'loss': 0.1599, 'grad_norm': 0.4224265515804291, 'learning_rate': 0.00015017106549364612, 'epoch': 1.04}
+{'loss': 0.2271, 'grad_norm': 0.7730469703674316, 'learning_rate': 0.0001501466275659824, 'epoch': 1.04}
+{'loss': 0.328, 'grad_norm': 0.7396877408027649, 'learning_rate': 0.00015012218963831868, 'epoch': 1.04}
+{'loss': 0.2924, 'grad_norm': 0.7163776755332947, 'learning_rate': 0.0001500977517106549, 'epoch': 1.04}
+{'loss': 0.2665, 'grad_norm': 0.7537462115287781, 'learning_rate': 0.00015007331378299118, 'epoch': 1.04}
+{'loss': 1.0396, 'grad_norm': 4.295088768005371, 'learning_rate': 0.00015004887585532746, 'epoch': 1.04}
+{'loss': 0.2891, 'grad_norm': 0.830767810344696, 'learning_rate': 0.0001500244379276637, 'epoch': 1.04}
+{'loss': 0.3978, 'grad_norm': 0.9357022643089294, 'learning_rate': 0.00015, 'epoch': 1.04}
+{'loss': 0.2655, 'grad_norm': 0.6923136115074158, 'learning_rate': 0.00014997556207233624, 'epoch': 1.04}
+{'loss': 0.2939, 'grad_norm': 1.0531424283981323, 'learning_rate': 0.00014995112414467252, 'epoch': 1.04}
+{'loss': 0.4025, 'grad_norm': 0.8230469822883606, 'learning_rate': 0.0001499266862170088, 'epoch': 1.04}
+{'loss': 0.3075, 'grad_norm': 0.7676156163215637, 'learning_rate': 0.00014990224828934505, 'epoch': 1.04}
+{'loss': 0.5837, 'grad_norm': 2.29850697517395, 'learning_rate': 0.00014987781036168132, 'epoch': 1.04}
+{'loss': 0.4561, 'grad_norm': 0.798516571521759, 'learning_rate': 0.00014985337243401758, 'epoch': 1.04}
+{'loss': 0.5438, 'grad_norm': 1.210540771484375, 'learning_rate': 0.00014982893450635385, 'epoch': 1.04}
+{'loss': 0.6722, 'grad_norm': 1.6626873016357422, 'learning_rate': 0.00014980449657869013, 'epoch': 1.04}
+{'loss': 0.5357, 'grad_norm': 0.6938304305076599, 'learning_rate': 0.00014978005865102638, 'epoch': 1.04}
+{'loss': 0.4106, 'grad_norm': 0.9663745164871216, 'learning_rate': 0.00014975562072336263, 'epoch': 1.04}
+{'loss': 0.3207, 'grad_norm': 0.8333467245101929, 'learning_rate': 0.0001497311827956989, 'epoch': 1.04}
+{'loss': 0.3437, 'grad_norm': 0.7671467065811157, 'learning_rate': 0.00014970674486803516, 'epoch': 1.04}
+{'loss': 0.4651, 'grad_norm': 0.9945407509803772, 'learning_rate': 0.00014968230694037144, 'epoch': 1.04}
+{'loss': 0.5176, 'grad_norm': 1.5871999263763428, 'learning_rate': 0.00014965786901270772, 'epoch': 1.04}
+{'loss': 0.7892, 'grad_norm': 3.3692855834960938, 'learning_rate': 0.00014963343108504397, 'epoch': 1.04}
+{'loss': 0.4972, 'grad_norm': 2.7460215091705322, 'learning_rate': 0.00014960899315738022, 'epoch': 1.04}
+{'loss': 0.4056, 'grad_norm': 2.0710816383361816, 'learning_rate': 0.0001495845552297165, 'epoch': 1.04}
+{'loss': 0.6009, 'grad_norm': 2.1053054332733154, 'learning_rate': 0.00014956011730205278, 'epoch': 1.04}
+{'loss': 0.8748, 'grad_norm': 2.5301668643951416, 'learning_rate': 0.00014953567937438903, 'epoch': 1.04}
+{'loss': 0.6859, 'grad_norm': 1.9505361318588257, 'learning_rate': 0.0001495112414467253, 'epoch': 1.04}
+{'loss': 0.4152, 'grad_norm': 1.7739278078079224, 'learning_rate': 0.00014948680351906156, 'epoch': 1.04}
+{'loss': 0.5335, 'grad_norm': 2.2372188568115234, 'learning_rate': 0.00014946236559139784, 'epoch': 1.04}
+{'loss': 0.9134, 'grad_norm': 3.2507851123809814, 'learning_rate': 0.00014943792766373412, 'epoch': 1.04}
+{'loss': 0.9072, 'grad_norm': 1.7486919164657593, 'learning_rate': 0.00014941348973607037, 'epoch': 1.04}
+{'loss': 0.8984, 'grad_norm': 1.5472897291183472, 'learning_rate': 0.00014938905180840662, 'epoch': 1.04}
+{'loss': 0.6659, 'grad_norm': 2.7646539211273193, 'learning_rate': 0.0001493646138807429, 'epoch': 1.04}
+{'loss': 1.1551, 'grad_norm': 2.3693439960479736, 'learning_rate': 0.00014934017595307918, 'epoch': 1.05}
+{'loss': 1.4091, 'grad_norm': 4.565879821777344, 'learning_rate': 0.00014931573802541543, 'epoch': 1.05}
+{'loss': 1.1402, 'grad_norm': 2.8540971279144287, 'learning_rate': 0.0001492913000977517, 'epoch': 1.05}
+{'loss': 1.2866, 'grad_norm': 3.433985948562622, 'learning_rate': 0.00014926686217008796, 'epoch': 1.05}
+{'loss': 0.4375, 'grad_norm': 1.5474061965942383, 'learning_rate': 0.00014924242424242424, 'epoch': 1.05}
+{'loss': 0.9397, 'grad_norm': 1.9934204816818237, 'learning_rate': 0.00014921798631476051, 'epoch': 1.05}
+{'loss': 1.5848, 'grad_norm': 4.180722236633301, 'learning_rate': 0.00014919354838709677, 'epoch': 1.05}
+{'loss': 1.2605, 'grad_norm': 2.291654348373413, 'learning_rate': 0.00014916911045943302, 'epoch': 1.05}
+{'loss': 0.4326, 'grad_norm': 2.387073516845703, 'learning_rate': 0.0001491446725317693, 'epoch': 1.05}
+{'loss': 0.4884, 'grad_norm': 2.3532493114471436, 'learning_rate': 0.00014912023460410555, 'epoch': 1.05}
+{'loss': 1.0124, 'grad_norm': 2.2486379146575928, 'learning_rate': 0.00014909579667644183, 'epoch': 1.05}
+{'loss': 1.0389, 'grad_norm': 2.661993980407715, 'learning_rate': 0.0001490713587487781, 'epoch': 1.05}
+{'loss': 0.4506, 'grad_norm': 2.130023717880249, 'learning_rate': 0.00014904692082111435, 'epoch': 1.05}
+{'loss': 0.2949, 'grad_norm': 0.6266974806785583, 'learning_rate': 0.0001490224828934506, 'epoch': 1.05}
+{'loss': 0.2782, 'grad_norm': 0.5856244564056396, 'learning_rate': 0.00014899804496578688, 'epoch': 1.05}
+{'loss': 0.244, 'grad_norm': 0.578836977481842, 'learning_rate': 0.00014897360703812316, 'epoch': 1.05}
+{'loss': 0.2312, 'grad_norm': 0.5507057905197144, 'learning_rate': 0.00014894916911045941, 'epoch': 1.05}
+{'loss': 0.2004, 'grad_norm': 0.4590769112110138, 'learning_rate': 0.0001489247311827957, 'epoch': 1.05}
+{'loss': 0.2628, 'grad_norm': 0.5535408854484558, 'learning_rate': 0.00014890029325513194, 'epoch': 1.05}
+ 52%|█████▏    | 6695/12776 [1:09:34<1:12:53,  1.39it/s] 52%|█████▏    | 6696/12776 [1:09:35<1:13:18,  1.38it/s]                                                         52%|█████▏    | 6696/12776 [1:09:35<1:13:18,  1.38it/s] 52%|█████▏    | 6697/12776 [1:09:35<1:08:59,  1.47it/s]                                                         52%|█████▏    | 6697/12776 [1:09:35<1:08:59,  1.47it/s] 52%|█████▏    | 6698/12776 [1:09:36<1:08:05,  1.49it/s]                                                         52%|█████▏    | 6698/12776 [1:09:36<1:08:05,  1.49it/s] 52%|█████▏    | 6699/12776 [1:09:37<1:03:52,  1.59it/s]                                                         52%|█████▏    | 6699/12776 [1:09:37<1:03:52,  1.59it/s] 52%|█████▏    | 6700/12776 [1:09:37<1:02:55,  1.61it/s]                                                         52%|█████▏    | 6700/12776 [1:09:37<1:02:55,  1.61it/s] 52%|█████▏    | 6701/12776 [1:09:38<58:06,  1.74it/s]                                                         52%|█████▏    | 6701/12776 [1:09:38<58:06,  1.74it/s] 52%|█████▏    | 6702/12776 [1:09:38<59:29,  1.70it/s]                                                       52%|█████▏    | 6702/12776 [1:09:38<59:29,  1.70it/s] 52%|█████▏    | 6703/12776 [1:09:39<54:55,  1.84it/s]                                                       52%|█████▏    | 6703/12776 [1:09:39<54:55,  1.84it/s] 52%|█████▏    | 6704/12776 [1:09:39<54:37,  1.85it/s]                                                       52%|█████▏    | 6704/12776 [1:09:39<54:37,  1.85it/s] 52%|█████▏    | 6705/12776 [1:09:40<50:25,  2.01it/s]                                                       52%|█████▏    | 6705/12776 [1:09:40<50:25,  2.01it/s] 52%|█████▏    | 6706/12776 [1:09:40<47:10,  2.14it/s]                                                       52%|█████▏    | 6706/12776 [1:09:40<47:10,  2.14it/s] 52%|█████▏    | 6707/12776 [1:09:41<49:02,  2.06it/s]                                                       52%|█████▏    | 6707/12776 [1:09:41<49:02,  2.06it/s] 53%|█████▎    | 6708/12776 [1:09:41<45:21,  2.23it/s]                                                       53%|█████▎    | 6708/12776 [1:09:41<45:21,  2.23it/s] 53%|█████▎    | 6709/12776 [1:09:41<42:31,  2.38it/s]                                                       53%|█████▎    | 6709/12776 [1:09:41<42:31,  2.38it/s] 53%|█████▎    | 6710/12776 [1:09:42<41:41,  2.42it/s]                                                       53%|█████▎    | 6710/12776 [1:09:42<41:41,  2.42it/s] 53%|█████▎    | 6711/12776 [1:09:42<39:16,  2.57it/s]                                                       53%|█████▎    | 6711/12776 [1:09:42<39:16,  2.57it/s] 53%|█████▎    | 6712/12776 [1:09:42<37:20,  2.71it/s]                                                       53%|█████▎    | 6712/12776 [1:09:42<37:20,  2.71it/s] 53%|█████▎    | 6713/12776 [1:09:43<36:28,  2.77it/s]                                                       53%|█████▎    | 6713/12776 [1:09:43<36:28,  2.77it/s] 53%|█████▎    | 6714/12776 [1:09:43<34:31,  2.93it/s]                                                       53%|█████▎    | 6714/12776 [1:09:43<34:31,  2.93it/s] 53%|█████▎    | 6715/12776 [1:09:43<32:55,  3.07it/s]                                                       53%|█████▎    | 6715/12776 [1:09:43<32:55,  3.07it/s] 53%|█████▎    | 6716/12776 [1:09:44<31:23,  3.22it/s]                                                       53%|█████▎    | 6716/12776 [1:09:44<31:23,  3.22it/s] 53%|█████▎    | 6717/12776 [1:09:44<33:57,  2.97it/s]                                                       53%|█████▎    | 6717/12776 [1:09:44<33:57,  2.97it/s] 53%|█████▎    | 6718/12776 [1:09:44<31:44,  3.18it/s]                                                       53%|█████▎    | 6718/12776 [1:09:44<31:44,  3.18it/s] 53%|█████▎    | 6719/12776 [1:09:44<29:52,  3.38it/s]                                                       53%|█████▎    | 6719/12776 [1:09:44<29:52,  3.38it/s] 53%|█████▎    | 6720/12776 [1:09:45<28:40,  3.52it/s]                                                       53%|█████▎    | 6720/12776 [1:09:45<28:40,  3.52it/s] 53%|█████▎    | 6721/12776 [1:09:45<30:56,  3.26it/s]                                                       53%|█████▎    | 6721/12776 [1:09:45<30:56,  3.26it/s] 53%|█████▎    | 6722/12776 [1:09:45<28:57,  3.49it/s]                                                       53%|█████▎    | 6722/12776 [1:09:45<28:57,  3.49it/s] 53%|█████▎    | 6723/12776 [1:09:46<27:10,  3.71it/s]                                                       53%|█████▎    | 6723/12776 [1:09:46<27:10,  3.71it/s] 53%|█████▎    | 6724/12776 [1:09:46<25:38,  3.93it/s]                                                       53%|█████▎    | 6724/12776 [1:09:46<25:38,  3.93it/s] 53%|█████▎    | 6725/12776 [1:09:46<27:34,  3.66it/s]                                                       53%|█████▎    | 6725/12776 [1:09:46<27:34,  3.66it/s] 53%|█████▎    | 6726/12776 [1:09:46<25:47,  3.91it/s]                                                       53%|█████▎    | 6726/12776 [1:09:46<25:47,  3.91it/s] 53%|█████▎    | 6727/12776 [1:09:46<24:24,  4.13it/s]                                                       53%|█████▎    | 6727/12776 [1:09:47<24:24,  4.13it/s] 53%|█████▎    | 6728/12776 [1:09:47<23:27,  4.30it/s]                                                       53%|█████▎    | 6728/12776 [1:09:47<23:27,  4.30it/s] 53%|█████▎    | 6729/12776 [1:09:47<22:40,  4.44it/s]                                                       53%|█████▎    | 6729/12776 [1:09:47<22:40,  4.44it/s] 53%|█████▎    | 6730/12776 [1:09:47<24:53,  4.05it/s]                                                       53%|█████▎    | 6730/12776 [1:09:47<24:53,  4.05it/s] 53%|█████▎    | 6731/12776 [1:09:47<23:31,  4.28it/s]                                                       53%|█████▎    | 6731/12776 [1:09:47<23:31,  4.28it/s] 53%|█████▎    | 6732/12776 [1:09:48<22:29,  4.48it/s]                                                       53%|█████▎    | 6732/12776 [1:09:48<22:29,  4.48it/s] 53%|█████▎    | 6733/12776 [1:09:48<21:43,  4.64it/s]                                                       53%|█████▎    | 6733/12776 [1:09:48<21:43,  4.64it/s] 53%|█████▎    | 6734/12776 [1:09:48<21:02,  4.79it/s]                                                       53%|█████▎    | 6734/12776 [1:09:48<21:02,  4.79it/s] 53%|█████▎    | 6735/12776 [1:09:48<20:34,  4.89it/s]                                                       53%|█████▎    | 6735/12776 [1:09:48<20:34,  4.89it/s] 53%|█████▎    | 6736/12776 [1:09:48<21:47,  4.62it/s]                                                       53%|█████▎    | 6736/12776 [1:09:48<21:47,  4.62it/s] 53%|█████▎    | 6737/12776 [1:09:49<20:51,  4.83it/s]                                                       53%|█████▎    | 6737/12776 [1:09:49<20:51,  4.83it/s] 53%|█████▎    | 6738/12776 [1:09:49<35:28,  2.84it/s]                                                       53%|█████▎    | 6738/12776 [1:09:49<35:28,  2.84it/s] 53%|█████▎    | 6739/12776 [1:09:51<1:05:22,  1.54it/s]                                                         53%|█████▎    | 6739/12776 [1:09:51<1:05:22,  1.54it/s] 53%|█████▎    | 6740/12776 [1:09:52<1:12:44,  1.38it/s]                                                         53%|█████▎    | 6740/12776 [1:09:52<1:12:44,  1.38it/s] 53%|█████▎    | 6741/12776 [1:09:52<1:14:28,  1.35it/s]                                                         53%|█████▎    | 6741/12776 [1:09:52<1:14:28,  1.35it/s] 53%|█████▎    | 6742/12776 [1:09:53<1:18:43,  1.28it/s]                                                         53%|█████▎    | 6742/12776 [1:09:53<1:18:43,  1.28it/s] 53%|█████▎    | 6743/12776 [1:09:54<1:16:06,  1.32it/s]                                                         53%|█████▎    | 6743/12776 [1:09:54<1:16:06,  1.32it/s] 53%|█████▎    | 6744/12776 [1:09:55<1:13:08,  1.37it/s]                                                         53%|█████▎    | 6744/12776 [1:09:55<1:13:08,  1.37it/s] 53%|█████▎    | 6745/12776 [1:09:55<1:12:08,  1.39it/s]                                                         53%|█████▎    | 6745/12776 [1:09:55<1:12:08,  1.39it/s] 53%|█████▎    | 6746/12776 [1:09:56<1:08:48,  1.46it/s]                                                         53%|█████▎    | 6746/12776 [1:09:56<1:08:48,  1.46it/s] 53%|█████▎    | 6747/12776 [1:09:56<1:06:17,  1.52it/s]                                                         53%|█████▎    | 6747/12776 [1:09:56<1:06:17,  1.52it/s] 53%|█████▎    | 6748/12776 [1:09:57<1:02:38,  1.60it/s]                                                         53%|█████▎    | 6748/12776 [1:09:57<1:02:38,  1.60it/s] 53%|█████▎    | 6749/12776 [1:09:58<1:02:47,  1.60it/s]                                                         53%|█████▎    | 6749/12776 [1:09:58<1:02:47,  1.60it/s] 53%|█████▎    | 6750/12776 [1:09:58<58:50,  1.71it/s]                                                         53%|█████▎    | 6750/12776 [1:09:58<58:50,  1.71it/s] 53%|█████▎    | 6751/12776 [1:09:59<55:00,  1.83it/s]                                                       53%|█████▎    | 6751/12776 [1:09:59<55:00,  1.83it/s] 53%|█████▎    | 6752/12776 [1:09:59<52:26,  1.91it/s]                                                       53%|█████▎    | 6752/12776 [1:09:59<52:26,  1.91it/s] 53%|█████▎    | 6753/12776 [1:10:00<49:56,  2.01it/s]                                                       53%|█████▎    | 6753/12776 [1:10:00<49:56,  2.01it/s] 53%|█████▎    | 6754/12776 [1:10:00<48:35,  2.07it/s]                                                       53%|█████▎    | 6754/12776 [1:10:00<48:35,  2.07it/s] 53%|█████▎    | 6755/12776 [1:10:00<46:06,  2.18it/s]                                                       53%|█████▎    | 6755/12776 [1:10:00<46:06,  2.18it/s] 53%|█████▎    | 6756/12776 [1:10:01<43:50,  2.29it/s]                                                       53%|█████▎    | 6756/12776 [1:10:01<43:50,  2.29it/s] 53%|█████▎    | 6757/12776 [1:10:01<45:19,  2.21it/s]                                                       53%|█████▎    | 6757/12776 [1:10:01<45:19,  2.21it/s] 53%|█████▎    | 6758/12776 [1:10:02<42:10,  2.38it/s]                                                       53%|█████▎    | 6758/12776 [1:10:02<42:10,  2.38it/s] 53%|█████▎    | 6759/12776 [1:10:02<39:46,  2.52it/s]                                                       53%|█████▎    | 6759/12776 [1:10:02<39:46,  2.52it/s] 53%|█████▎    | 6760/12776 [1:10:02<41:11,  2.43it/s]                                                       53%|█████▎    | 6760/12776 [1:10:02<41:11,  2.43it/s] 53%|█████▎    | 6761/12776 [1:10:03<38:18,  2.62it/s]                                                       53%|█████▎    | 6761/12776 [1:10:03<38:18,  2.62it/s] 53%|█████▎    | 6762/12776 [1:10:03<35:57,  2.79it/s]                                                       53%|█████▎    | 6762/12776 [1:10:03<35:57,  2.79it/s] 53%|█████▎    | 6763/12776 [1:10:03<36:09,  2.77it/s]                                                       53%|█████▎    | 6763/12776 [1:10:03<36:09,  2.77it/s] 53%|█████▎    | 6764/12776 [1:10:04<34:01,  2.94it/s]                                                       53%|█████▎    | 6764/12776 [1:10:04<34:01,  2.94it/s] 53%|█████▎    | 6765/12776 [1:10:04<32:14,  3.11it/s]                                                       53%|█████▎    | 6765/12776 [1:10:04<32:14,  3.11it/s] 53%|█████▎    | 6766/12776 [1:10:04<30:46,  3.25it/s]                                                       53%|█████▎    | 6766/12776 [1:10:04<30:46,  3.25it/s] 53%|█████▎    | 6767/12776 [1:10:05<31:10,  3.21it/s]                                                       53%|█████▎    | 6767/12776 [1:10:05<31:10,  3.21it/s] 53%|█████▎    | 6768/12776 [1:10:05<29:42,  3.37it/s]                                                       53%|█████▎    | 6768/12776 [1:10:05<29:42,  3.37it/s] 53%|█████▎    | 6769/12776 [1:10:05<28:24,  3.52it/s]                                                       53%|█████▎    | 6769/12776 [1:10:05<28:24,  3.52it/s] 53%|█████▎    | 6770/12776 [1:10:05<27:21,  3.66it/s]                                                       53%|█████▎    | 6770/12776 [1:10:05<27:21,  3.66it/s] 53%|█████▎    | 6771/12776 [1:10:06<30:16,  3.31it/s]                                                       53%|█████▎    | 6771/12776 [1:10:06<30:16,  3.31it/s] 53%|█████▎    | 6772/12776 [1:10:06<28:19,  3.53it/s]                                                      {'loss': 0.2208, 'grad_norm': 2.307926654815674, 'learning_rate': 0.00014887585532746822, 'epoch': 1.05}
+{'loss': 0.2184, 'grad_norm': 0.5684087872505188, 'learning_rate': 0.0001488514173998045, 'epoch': 1.05}
+{'loss': 0.2584, 'grad_norm': 0.6311387419700623, 'learning_rate': 0.00014882697947214075, 'epoch': 1.05}
+{'loss': 0.2507, 'grad_norm': 0.6801841259002686, 'learning_rate': 0.000148802541544477, 'epoch': 1.05}
+{'loss': 0.3224, 'grad_norm': 0.8191403150558472, 'learning_rate': 0.00014877810361681328, 'epoch': 1.05}
+{'loss': 0.3435, 'grad_norm': 0.9720447063446045, 'learning_rate': 0.00014875366568914956, 'epoch': 1.05}
+{'loss': 0.1308, 'grad_norm': 0.380145400762558, 'learning_rate': 0.0001487292277614858, 'epoch': 1.05}
+{'loss': 0.3108, 'grad_norm': 0.5885724425315857, 'learning_rate': 0.0001487047898338221, 'epoch': 1.05}
+{'loss': 0.3114, 'grad_norm': 0.8687223792076111, 'learning_rate': 0.00014868035190615834, 'epoch': 1.05}
+{'loss': 0.3741, 'grad_norm': 1.1362780332565308, 'learning_rate': 0.00014865591397849462, 'epoch': 1.05}
+{'loss': 0.2473, 'grad_norm': 1.4467766284942627, 'learning_rate': 0.00014863147605083087, 'epoch': 1.05}
+{'loss': 0.5426, 'grad_norm': 1.3377172946929932, 'learning_rate': 0.00014860703812316715, 'epoch': 1.05}
+{'loss': 0.4587, 'grad_norm': 1.023105502128601, 'learning_rate': 0.0001485826001955034, 'epoch': 1.05}
+{'loss': 0.3052, 'grad_norm': 2.0178208351135254, 'learning_rate': 0.00014855816226783968, 'epoch': 1.05}
+{'loss': 0.4834, 'grad_norm': 1.5888885259628296, 'learning_rate': 0.00014853372434017593, 'epoch': 1.05}
+{'loss': 0.6655, 'grad_norm': 2.150106430053711, 'learning_rate': 0.0001485092864125122, 'epoch': 1.05}
+{'loss': 0.6639, 'grad_norm': 1.21843421459198, 'learning_rate': 0.00014848484848484849, 'epoch': 1.05}
+{'loss': 0.3775, 'grad_norm': 1.0294426679611206, 'learning_rate': 0.00014846041055718474, 'epoch': 1.05}
+{'loss': 0.6612, 'grad_norm': 1.3102911710739136, 'learning_rate': 0.000148435972629521, 'epoch': 1.05}
+{'loss': 0.6148, 'grad_norm': 2.4257891178131104, 'learning_rate': 0.00014841153470185727, 'epoch': 1.05}
+{'loss': 0.7202, 'grad_norm': 1.66291344165802, 'learning_rate': 0.00014838709677419355, 'epoch': 1.05}
+{'loss': 0.7751, 'grad_norm': 2.0738868713378906, 'learning_rate': 0.0001483626588465298, 'epoch': 1.05}
+{'loss': 0.5269, 'grad_norm': 1.5332388877868652, 'learning_rate': 0.00014833822091886607, 'epoch': 1.05}
+{'loss': 0.8477, 'grad_norm': 1.8209433555603027, 'learning_rate': 0.00014831378299120233, 'epoch': 1.05}
+{'loss': 1.1148, 'grad_norm': 1.7813162803649902, 'learning_rate': 0.0001482893450635386, 'epoch': 1.05}
+{'loss': 0.7127, 'grad_norm': 1.9501174688339233, 'learning_rate': 0.00014826490713587488, 'epoch': 1.05}
+{'loss': 1.0263, 'grad_norm': 2.0499088764190674, 'learning_rate': 0.00014824046920821113, 'epoch': 1.05}
+{'loss': 0.8959, 'grad_norm': 2.545484781265259, 'learning_rate': 0.00014821603128054739, 'epoch': 1.05}
+{'loss': 1.346, 'grad_norm': 2.9357028007507324, 'learning_rate': 0.00014819159335288366, 'epoch': 1.05}
+{'loss': 1.0074, 'grad_norm': 1.6298185586929321, 'learning_rate': 0.00014816715542521994, 'epoch': 1.05}
+{'loss': 0.636, 'grad_norm': 2.8814504146575928, 'learning_rate': 0.0001481427174975562, 'epoch': 1.05}
+{'loss': 0.7553, 'grad_norm': 3.723740339279175, 'learning_rate': 0.00014811827956989247, 'epoch': 1.05}
+{'loss': 0.9195, 'grad_norm': 1.4765031337738037, 'learning_rate': 0.00014809384164222872, 'epoch': 1.05}
+{'loss': 1.3745, 'grad_norm': 1.9450255632400513, 'learning_rate': 0.000148069403714565, 'epoch': 1.05}
+{'loss': 1.0113, 'grad_norm': 2.2991535663604736, 'learning_rate': 0.00014804496578690125, 'epoch': 1.05}
+{'loss': 1.1969, 'grad_norm': 2.7434890270233154, 'learning_rate': 0.00014802052785923753, 'epoch': 1.05}
+{'loss': 1.1237, 'grad_norm': 2.5354650020599365, 'learning_rate': 0.00014799608993157378, 'epoch': 1.05}
+{'loss': 1.2223, 'grad_norm': 1.4999096393585205, 'learning_rate': 0.00014797165200391006, 'epoch': 1.05}
+{'loss': 1.1544, 'grad_norm': 2.2378246784210205, 'learning_rate': 0.0001479472140762463, 'epoch': 1.05}
+{'loss': 0.7526, 'grad_norm': 1.6880450248718262, 'learning_rate': 0.0001479227761485826, 'epoch': 1.05}
+{'loss': 1.2331, 'grad_norm': 5.19423246383667, 'learning_rate': 0.00014789833822091887, 'epoch': 1.05}
+{'loss': 0.4293, 'grad_norm': 3.7082395553588867, 'learning_rate': 0.00014787390029325512, 'epoch': 1.05}
+{'loss': 1.0767, 'grad_norm': 2.9954659938812256, 'learning_rate': 0.00014784946236559137, 'epoch': 1.05}
+{'loss': 0.4366, 'grad_norm': 1.1557728052139282, 'learning_rate': 0.00014782502443792765, 'epoch': 1.05}
+{'loss': 0.2985, 'grad_norm': 0.6128056049346924, 'learning_rate': 0.00014780058651026393, 'epoch': 1.05}
+{'loss': 0.3021, 'grad_norm': 0.6102817058563232, 'learning_rate': 0.00014777614858260018, 'epoch': 1.06}
+{'loss': 0.2417, 'grad_norm': 0.8309397101402283, 'learning_rate': 0.00014775171065493646, 'epoch': 1.06}
+{'loss': 0.2599, 'grad_norm': 0.7619971632957458, 'learning_rate': 0.0001477272727272727, 'epoch': 1.06}
+{'loss': 0.3468, 'grad_norm': 0.6904311180114746, 'learning_rate': 0.000147702834799609, 'epoch': 1.06}
+{'loss': 0.412, 'grad_norm': 1.885140061378479, 'learning_rate': 0.00014767839687194526, 'epoch': 1.06}
+{'loss': 0.2116, 'grad_norm': 0.6122669577598572, 'learning_rate': 0.00014765395894428152, 'epoch': 1.06}
+{'loss': 0.3224, 'grad_norm': 0.754728376865387, 'learning_rate': 0.00014762952101661777, 'epoch': 1.06}
+{'loss': 0.3732, 'grad_norm': 2.336873769760132, 'learning_rate': 0.00014760508308895405, 'epoch': 1.06}
+{'loss': 0.4202, 'grad_norm': 1.1394752264022827, 'learning_rate': 0.00014758064516129032, 'epoch': 1.06}
+{'loss': 0.2911, 'grad_norm': 0.6358504891395569, 'learning_rate': 0.00014755620723362658, 'epoch': 1.06}
+{'loss': 0.2091, 'grad_norm': 0.8128040432929993, 'learning_rate': 0.00014753176930596285, 'epoch': 1.06}
+{'loss': 0.2296, 'grad_norm': 1.233589768409729, 'learning_rate': 0.0001475073313782991, 'epoch': 1.06}
+{'loss': 0.4545, 'grad_norm': 1.3525103330612183, 'learning_rate': 0.00014748289345063538, 'epoch': 1.06}
+{'loss': 0.4035, 'grad_norm': 1.5476621389389038, 'learning_rate': 0.00014745845552297163, 'epoch': 1.06}
+{'loss': 0.2726, 'grad_norm': 1.2331606149673462, 'learning_rate': 0.0001474340175953079, 'epoch': 1.06}
+{'loss': 0.5273, 'grad_norm': 1.3619499206542969, 'learning_rate': 0.00014740957966764416, 'epoch': 1.06}
+{'loss': 0.5354, 'grad_norm': 3.523498773574829, 'learning_rate': 0.00014738514173998044, 'epoch': 1.06}
+{'loss': 0.6084, 'grad_norm': 1.602575659751892, 'learning_rate': 0.0001473607038123167, 'epoch': 1.06}
+{'loss': 0.3558, 'grad_norm': 2.1576313972473145, 'learning_rate': 0.00014733626588465297, 'epoch': 1.06}
+{'loss': 0.4306, 'grad_norm': 1.1257455348968506, 'learning_rate': 0.00014731182795698925, 'epoch': 1.06}
+{'loss': 0.5438, 'grad_norm': 1.3478970527648926, 'learning_rate': 0.0001472873900293255, 'epoch': 1.06}
+{'loss': 0.8583, 'grad_norm': 2.616065263748169, 'learning_rate': 0.00014726295210166175, 'epoch': 1.06}
+{'loss': 0.2882, 'grad_norm': 1.401318907737732, 'learning_rate': 0.00014723851417399803, 'epoch': 1.06}
+{'loss': 0.9521, 'grad_norm': 2.02484130859375, 'learning_rate': 0.0001472140762463343, 'epoch': 1.06}
+{'loss': 0.4258, 'grad_norm': 1.5256311893463135, 'learning_rate': 0.00014718963831867056, 'epoch': 1.06}
+{'loss': 0.4773, 'grad_norm': 1.3191006183624268, 'learning_rate': 0.00014716520039100684, 'epoch': 1.06}
+{'loss': 0.6198, 'grad_norm': 2.7704293727874756, 'learning_rate': 0.0001471407624633431, 'epoch': 1.06}
+{'loss': 0.646, 'grad_norm': 1.2429702281951904, 'learning_rate': 0.00014711632453567937, 'epoch': 1.06}
+{'loss': 0.6263, 'grad_norm': 1.98096764087677, 'learning_rate': 0.00014709188660801565, 'epoch': 1.06}
+{'loss': 1.4979, 'grad_norm': 5.404338836669922, 'learning_rate': 0.0001470674486803519, 'epoch': 1.06}
+{'loss': 0.7869, 'grad_norm': 2.1490466594696045, 'learning_rate': 0.00014704301075268815, 'epoch': 1.06}
+{'loss': 0.8592, 'grad_norm': 1.6337779760360718, 'learning_rate': 0.00014701857282502443, 'epoch': 1.06}
+ 53%|█████▎    | 6772/12776 [1:10:06<28:19,  3.53it/s] 53%|█████▎    | 6773/12776 [1:10:06<26:36,  3.76it/s]                                                       53%|█████▎    | 6773/12776 [1:10:06<26:36,  3.76it/s] 53%|█████▎    | 6774/12776 [1:10:06<25:08,  3.98it/s]                                                       53%|█████▎    | 6774/12776 [1:10:06<25:08,  3.98it/s] 53%|█████▎    | 6775/12776 [1:10:07<24:05,  4.15it/s]                                                       53%|█████▎    | 6775/12776 [1:10:07<24:05,  4.15it/s] 53%|█████▎    | 6776/12776 [1:10:07<25:08,  3.98it/s]                                                       53%|█████▎    | 6776/12776 [1:10:07<25:08,  3.98it/s] 53%|█████▎    | 6777/12776 [1:10:07<23:56,  4.18it/s]                                                       53%|█████▎    | 6777/12776 [1:10:07<23:56,  4.18it/s] 53%|█████▎    | 6778/12776 [1:10:07<22:56,  4.36it/s]                                                       53%|█████▎    | 6778/12776 [1:10:07<22:56,  4.36it/s] 53%|█████▎    | 6779/12776 [1:10:07<22:07,  4.52it/s]                                                       53%|█████▎    | 6779/12776 [1:10:07<22:07,  4.52it/s] 53%|█████▎    | 6780/12776 [1:10:08<21:28,  4.65it/s]                                                       53%|█████▎    | 6780/12776 [1:10:08<21:28,  4.65it/s] 53%|█████▎    | 6781/12776 [1:10:08<23:49,  4.19it/s]                                                       53%|█████▎    | 6781/12776 [1:10:08<23:49,  4.19it/s] 53%|█████▎    | 6782/12776 [1:10:08<22:35,  4.42it/s]                                                       53%|█████▎    | 6782/12776 [1:10:08<22:35,  4.42it/s] 53%|█████▎    | 6783/12776 [1:10:08<21:39,  4.61it/s]                                                       53%|█████▎    | 6783/12776 [1:10:08<21:39,  4.61it/s] 53%|█████▎    | 6784/12776 [1:10:09<21:22,  4.67it/s]                                                       53%|█████▎    | 6784/12776 [1:10:09<21:22,  4.67it/s] 53%|█████▎    | 6785/12776 [1:10:09<20:43,  4.82it/s]                                                       53%|█████▎    | 6785/12776 [1:10:09<20:43,  4.82it/s] 53%|█████▎    | 6786/12776 [1:10:09<20:11,  4.95it/s]                                                       53%|█████▎    | 6786/12776 [1:10:09<20:11,  4.95it/s] 53%|█████▎    | 6787/12776 [1:10:09<21:30,  4.64it/s]                                                       53%|█████▎    | 6787/12776 [1:10:09<21:30,  4.64it/s] 53%|█████▎    | 6788/12776 [1:10:10<36:47,  2.71it/s]                                                       53%|█████▎    | 6788/12776 [1:10:10<36:47,  2.71it/s] 53%|█████▎    | 6789/12776 [1:10:11<1:11:31,  1.39it/s]                                                         53%|█████▎    | 6789/12776 [1:10:11<1:11:31,  1.39it/s] 53%|█████▎    | 6790/12776 [1:10:13<1:22:36,  1.21it/s]                                                         53%|█████▎    | 6790/12776 [1:10:13<1:22:36,  1.21it/s] 53%|█████▎    | 6791/12776 [1:10:13<1:22:52,  1.20it/s]                                                         53%|█████▎    | 6791/12776 [1:10:13<1:22:52,  1.20it/s] 53%|█████▎    | 6792/12776 [1:10:14<1:20:53,  1.23it/s]                                                         53%|█████▎    | 6792/12776 [1:10:14<1:20:53,  1.23it/s] 53%|█████▎    | 6793/12776 [1:10:15<1:18:05,  1.28it/s]                                                         53%|█████▎    | 6793/12776 [1:10:15<1:18:05,  1.28it/s] 53%|█████▎    | 6794/12776 [1:10:16<1:14:53,  1.33it/s]                                                         53%|█████▎    | 6794/12776 [1:10:16<1:14:53,  1.33it/s] 53%|█████▎    | 6795/12776 [1:10:16<1:13:29,  1.36it/s]                                                         53%|█████▎    | 6795/12776 [1:10:16<1:13:29,  1.36it/s] 53%|█████▎    | 6796/12776 [1:10:17<1:09:43,  1.43it/s]                                                         53%|█████▎    | 6796/12776 [1:10:17<1:09:43,  1.43it/s] 53%|█████▎    | 6797/12776 [1:10:17<1:06:41,  1.49it/s]                                                         53%|█████▎    | 6797/12776 [1:10:17<1:06:41,  1.49it/s] 53%|█████▎    | 6798/12776 [1:10:18<1:03:16,  1.57it/s]                                                         53%|█████▎    | 6798/12776 [1:10:18<1:03:16,  1.57it/s] 53%|█████▎    | 6799/12776 [1:10:19<1:02:02,  1.61it/s]                                                         53%|█████▎    | 6799/12776 [1:10:19<1:02:02,  1.61it/s] 53%|█████▎    | 6800/12776 [1:10:19<58:31,  1.70it/s]                                                         53%|█████▎    | 6800/12776 [1:10:19<58:31,  1.70it/s]Saving model checkpoint to ./checkpoint-6800
+Configuration saved in ./checkpoint-6800/config.json
+Model weights saved in ./checkpoint-6800/model.safetensors
+Feature extractor saved in ./checkpoint-6800/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-6800/tokenizer_config.json
+Special tokens file saved in ./checkpoint-6800/special_tokens_map.json
+added tokens file saved in ./checkpoint-6800/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-5600] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 53%|█████▎    | 6801/12776 [1:10:25<3:36:59,  2.18s/it]                                                         53%|█████▎    | 6801/12776 [1:10:25<3:36:59,  2.18s/it] 53%|█████▎    | 6802/12776 [1:10:25<2:44:26,  1.65s/it]                                                         53%|█████▎    | 6802/12776 [1:10:25<2:44:26,  1.65s/it] 53%|█████▎    | 6803/12776 [1:10:26<2:07:12,  1.28s/it]                                                         53%|█████▎    | 6803/12776 [1:10:26<2:07:12,  1.28s/it] 53%|█████▎    | 6804/12776 [1:10:26<1:44:10,  1.05s/it]                                                         53%|█████▎    | 6804/12776 [1:10:26<1:44:10,  1.05s/it] 53%|█████▎    | 6805/12776 [1:10:27<1:24:54,  1.17it/s]                                                         53%|█████▎    | 6805/12776 [1:10:27<1:24:54,  1.17it/s] 53%|█████▎    | 6806/12776 [1:10:27<1:10:11,  1.42it/s]                                                         53%|█████▎    | 6806/12776 [1:10:27<1:10:11,  1.42it/s] 53%|█████▎    | 6807/12776 [1:10:27<1:01:55,  1.61it/s]                                                         53%|█████▎    | 6807/12776 [1:10:27<1:01:55,  1.61it/s] 53%|█████▎    | 6808/12776 [1:10:28<53:12,  1.87it/s]                                                         53%|█████▎    | 6808/12776 [1:10:28<53:12,  1.87it/s] 53%|█████▎    | 6809/12776 [1:10:28<46:51,  2.12it/s]                                                       53%|█████▎    | 6809/12776 [1:10:28<46:51,  2.12it/s] 53%|█████▎    | 6810/12776 [1:10:29<45:55,  2.16it/s]                                                       53%|█████▎    | 6810/12776 [1:10:29<45:55,  2.16it/s] 53%|█████▎    | 6811/12776 [1:10:29<41:23,  2.40it/s]                                                       53%|█████▎    | 6811/12776 [1:10:29<41:23,  2.40it/s] 53%|█████▎    | 6812/12776 [1:10:29<37:53,  2.62it/s]                                                       53%|█████▎    | 6812/12776 [1:10:29<37:53,  2.62it/s] 53%|█████▎    | 6813/12776 [1:10:30<37:30,  2.65it/s]                                                       53%|█████▎    | 6813/12776 [1:10:30<37:30,  2.65it/s] 53%|█████▎    | 6814/12776 [1:10:30<34:32,  2.88it/s]                                                       53%|█████▎    | 6814/12776 [1:10:30<34:32,  2.88it/s] 53%|█████▎    | 6815/12776 [1:10:30<32:10,  3.09it/s]                                                       53%|█████▎    | 6815/12776 [1:10:30<32:10,  3.09it/s] 53%|█████▎    | 6816/12776 [1:10:30<30:21,  3.27it/s]                                                       53%|█████▎    | 6816/12776 [1:10:30<30:21,  3.27it/s] 53%|█████▎    | 6817/12776 [1:10:31<32:35,  3.05it/s]                                                       53%|█████▎    | 6817/12776 [1:10:31<32:35,  3.05it/s] 53%|█████▎    | 6818/12776 [1:10:31<30:18,  3.28it/s]                                                       53%|█████▎    | 6818/12776 [1:10:31<30:18,  3.28it/s] 53%|█████▎    | 6819/12776 [1:10:31<28:48,  3.45it/s]                                                       53%|█████▎    | 6819/12776 [1:10:31<28:48,  3.45it/s] 53%|█████▎    | 6820/12776 [1:10:31<27:04,  3.67it/s]                                                       53%|█████▎    | 6820/12776 [1:10:31<27:04,  3.67it/s] 53%|█████▎    | 6821/12776 [1:10:32<25:38,  3.87it/s]                                                       53%|█████▎    | 6821/12776 [1:10:32<25:38,  3.87it/s] 53%|█████▎    | 6822/12776 [1:10:32<25:46,  3.85it/s]                                                       53%|█████▎    | 6822/12776 [1:10:32<25:46,  3.85it/s] 53%|█████▎    | 6823/12776 [1:10:32<24:15,  4.09it/s]                                                       53%|█████▎    | 6823/12776 [1:10:32<24:15,  4.09it/s] 53%|█████▎    | 6824/12776 [1:10:32<23:11,  4.28it/s]                                                       53%|█████▎    | 6824/12776 [1:10:32<23:11,  4.28it/s] 53%|█████▎    | 6825/12776 [1:10:33<22:10,  4.47it/s]                                                       53%|█████▎    | 6825/12776 [1:10:33<22:10,  4.47it/s] 53%|█████▎    | 6826/12776 [1:10:33<21:15,  4.66it/s]                                                       53%|█████▎    | 6826/12776 [1:10:33<21:15,  4.66it/s] 53%|█████▎    | 6827/12776 [1:10:33<23:40,  4.19it/s]                                                       53%|█████▎    | 6827/12776 [1:10:33<23:40,  4.19it/s] 53%|█████▎    | 6828/12776 [1:10:33<22:14,  4.46it/s]                                                       53%|█████▎    | 6828/12776 [1:10:33<22:14,  4.46it/s] 53%|█████▎    | 6829/12776 [1:10:33<21:11,  4.68it/s]                                                       53%|█████▎    | 6829/12776 [1:10:33<21:11,  4.68it/s] 53%|█████▎    | 6830/12776 [1:10:34<20:17,  4.88it/s]                                                       53%|█████▎    | 6830/12776 [1:10:34<20:17,  4.88it/s] 53%|█████▎    | 6831/12776 [1:10:34<19:28,  5.09it/s]                                                       53%|█████▎    | 6831/12776 [1:10:34<19:28,  5.09it/s] 53%|█████▎    | 6832/12776 [1:10:34<22:39,  4.37it/s]                                                       53%|█████▎    | 6832/12776 [1:10:34<22:39,  4.37it/s] 53%|█████▎    | 6833/12776 [1:10:34<21:03,  4.71it/s]                                                       53%|█████▎    | 6833/12776 [1:10:34<21:03,  4.71it/s] 53%|█████▎    | 6834/12776 [1:10:34<19:51,  4.99it/s]                                                       53%|█████▎    | 6834/12776 [1:10:34<19:51,  4.99it/s] 53%|█████▎    | 6835/12776 [1:10:35<18:57,  5.22it/s]                                                       53%|█████▎    | 6835/12776 [1:10:35<18:57,  5.22it/s] 54%|█████▎    | 6836/12776 [1:10:35<18:07,  5.46it/s]                                                       54%|█████▎    | 6836/12776 [1:10:35<18:07,  5.46it/s] 54%|█████▎    | 6837/12776 [1:10:35<17:28,  5.66it/s]                                                       54%|█████▎    | 6837/12776 [1:10:35<17:28,  5.66it/s] 54%|█████▎    | 6838/12776 [1:10:36<33:16,  2.97it/s]                                                       54%|█████▎    | 6838/12776 [1:10:36<33:16,  2.97it/s] 54%|█████▎    | 6839/12776 [1:10:37<1:07:00,  1.48it/s]                                                         54%|█████▎    | 6839/12776 [1:10:37<1:07:00,  1.48it/s] 54%|█████▎    | 6840/12776 [1:10:38<1:16:44,  1.29it/s]                                                         54%|█████▎    | 6840/12776 [1:10:38<1:16:44,  1.29it/s] 54%|█████▎    | 6841/12776 [1:10:39<1:21:53,  1.21it/s]                                                         54%|█████▎    | 6841/12776 [1:10:39<1:21:53,  1.21it/s] 54%|█████▎    | 6842/12776 [1:10:40<1:21:48,  1.21it/s]                                                         54%|█████▎    | 6842/12776 [1:10:40<1:21:48,  1.21it/s] 54%|█████▎    | 6843/12776 [1:10:41<1:18:03,  1.27it/s]                                                         54%|█████▎    | 6843/12776 [1:10:41<1:18:03,  1.27it/s] 54%|█████▎    | 6844/12776 [1:10:41<1:15:43,  1.31it/s]                                                         54%|█████▎    | 6844/12776 [1:10:41<1:15:43,  1.31it/s] 54%|█████▎    | 6845/12776 [1:10:42<1:11:28,  1.38it/s]                                                         54%|█████▎    | 6845/12776 [1:10:42<1:11:28,  1.38it/s] 54%|█████▎    | 6846/12776 [1:10:43<1:07:17,  1.47it/s]                                                         54%|█████▎    | 6846/12776 [1:10:43<1:07:17,  1.47it/s] 54%|█████▎    | 6847/12776 [1:10:43<1:03:03,  1.57it/s]                                                         54%|█████▎    | 6847/12776 [1:10:43<1:03:03,  1.57it/s] 54%|█████▎    | 6848/12776 [1:10:44<1:00:45,  1.63it/s]                                                         54%|█████▎    | 6848/12776 [1:10:44<1:00:45,  1.63it/s] 54%|█████▎    | 6849/12776 [1:10:44<57:18,  1.72it/s]                                                        {'loss': 0.9336, 'grad_norm': 3.1310315132141113, 'learning_rate': 0.0001469941348973607, 'epoch': 1.06}
+{'loss': 1.043, 'grad_norm': 2.745962619781494, 'learning_rate': 0.00014696969696969696, 'epoch': 1.06}
+{'loss': 1.1677, 'grad_norm': 2.7593672275543213, 'learning_rate': 0.00014694525904203324, 'epoch': 1.06}
+{'loss': 1.777, 'grad_norm': 5.965583801269531, 'learning_rate': 0.0001469208211143695, 'epoch': 1.06}
+{'loss': 1.0561, 'grad_norm': 3.3984930515289307, 'learning_rate': 0.00014689638318670574, 'epoch': 1.06}
+{'loss': 0.6857, 'grad_norm': 2.305185317993164, 'learning_rate': 0.00014687194525904202, 'epoch': 1.06}
+{'loss': 0.8298, 'grad_norm': 2.9663422107696533, 'learning_rate': 0.0001468475073313783, 'epoch': 1.06}
+{'loss': 0.9201, 'grad_norm': 2.1952407360076904, 'learning_rate': 0.00014682306940371455, 'epoch': 1.06}
+{'loss': 1.1013, 'grad_norm': 2.043056011199951, 'learning_rate': 0.0001467986314760508, 'epoch': 1.06}
+{'loss': 1.3418, 'grad_norm': 1.94023597240448, 'learning_rate': 0.00014677419354838708, 'epoch': 1.06}
+{'loss': 0.8855, 'grad_norm': 2.6357297897338867, 'learning_rate': 0.00014674975562072335, 'epoch': 1.06}
+{'loss': 0.9809, 'grad_norm': 2.302753448486328, 'learning_rate': 0.0001467253176930596, 'epoch': 1.06}
+{'loss': 0.4587, 'grad_norm': 2.690652370452881, 'learning_rate': 0.00014670087976539588, 'epoch': 1.06}
+{'loss': 1.0801, 'grad_norm': 3.5731654167175293, 'learning_rate': 0.00014667644183773214, 'epoch': 1.06}
+{'loss': 0.7417, 'grad_norm': 2.150663375854492, 'learning_rate': 0.00014665200391006841, 'epoch': 1.06}
+{'loss': 1.2934, 'grad_norm': 2.761242151260376, 'learning_rate': 0.0001466275659824047, 'epoch': 1.06}
+{'loss': 0.5649, 'grad_norm': 2.148921251296997, 'learning_rate': 0.00014660312805474094, 'epoch': 1.06}
+{'loss': 0.287, 'grad_norm': 0.691116452217102, 'learning_rate': 0.0001465786901270772, 'epoch': 1.06}
+{'loss': 0.3013, 'grad_norm': 0.4150255024433136, 'learning_rate': 0.00014655425219941347, 'epoch': 1.06}
+{'loss': 0.3082, 'grad_norm': 0.48139479756355286, 'learning_rate': 0.00014652981427174975, 'epoch': 1.06}
+{'loss': 0.2616, 'grad_norm': 0.808285117149353, 'learning_rate': 0.000146505376344086, 'epoch': 1.06}
+{'loss': 0.2752, 'grad_norm': 0.7357711791992188, 'learning_rate': 0.00014648093841642228, 'epoch': 1.06}
+{'loss': 0.26, 'grad_norm': 0.6319572329521179, 'learning_rate': 0.00014645650048875853, 'epoch': 1.06}
+{'loss': 0.2896, 'grad_norm': 0.698928952217102, 'learning_rate': 0.0001464320625610948, 'epoch': 1.06}
+{'loss': 0.3453, 'grad_norm': 1.5302014350891113, 'learning_rate': 0.0001464076246334311, 'epoch': 1.06}
+{'loss': 0.3171, 'grad_norm': 1.11781907081604, 'learning_rate': 0.00014638318670576734, 'epoch': 1.06}
+{'loss': 0.2568, 'grad_norm': 1.107818365097046, 'learning_rate': 0.0001463587487781036, 'epoch': 1.06}
+{'loss': 0.2953, 'grad_norm': 1.2923407554626465, 'learning_rate': 0.00014633431085043987, 'epoch': 1.06}
+{'loss': 0.4314, 'grad_norm': 1.0053560733795166, 'learning_rate': 0.00014630987292277612, 'epoch': 1.06}
+{'loss': 0.1903, 'grad_norm': 0.73367840051651, 'learning_rate': 0.0001462854349951124, 'epoch': 1.06}
+{'loss': 0.2807, 'grad_norm': 0.6966777443885803, 'learning_rate': 0.00014626099706744868, 'epoch': 1.06}
+{'loss': 0.5498, 'grad_norm': 1.2056246995925903, 'learning_rate': 0.00014623655913978493, 'epoch': 1.06}
+{'loss': 0.3544, 'grad_norm': 1.8048405647277832, 'learning_rate': 0.00014621212121212118, 'epoch': 1.07}
+{'loss': 0.3607, 'grad_norm': 1.2983676195144653, 'learning_rate': 0.00014618768328445746, 'epoch': 1.07}
+{'loss': 0.2121, 'grad_norm': 0.5953453183174133, 'learning_rate': 0.00014616324535679374, 'epoch': 1.07}
+{'loss': 0.7027, 'grad_norm': 1.4588236808776855, 'learning_rate': 0.00014613880742913, 'epoch': 1.07}
+{'loss': 0.4823, 'grad_norm': 1.0992096662521362, 'learning_rate': 0.00014611436950146627, 'epoch': 1.07}
+{'loss': 0.5958, 'grad_norm': 1.8514833450317383, 'learning_rate': 0.00014608993157380252, 'epoch': 1.07}
+{'loss': 0.495, 'grad_norm': 1.5322425365447998, 'learning_rate': 0.0001460654936461388, 'epoch': 1.07}
+{'loss': 0.3159, 'grad_norm': 0.9982278347015381, 'learning_rate': 0.00014604105571847507, 'epoch': 1.07}
+{'loss': 0.5013, 'grad_norm': 1.4766974449157715, 'learning_rate': 0.00014601661779081133, 'epoch': 1.07}
+{'loss': 0.5875, 'grad_norm': 1.6511269807815552, 'learning_rate': 0.00014599217986314758, 'epoch': 1.07}
+{'loss': 0.7228, 'grad_norm': 1.9279791116714478, 'learning_rate': 0.00014596774193548386, 'epoch': 1.07}
+{'loss': 0.6166, 'grad_norm': 2.1245806217193604, 'learning_rate': 0.00014594330400782013, 'epoch': 1.07}
+{'loss': 0.6317, 'grad_norm': 2.9913034439086914, 'learning_rate': 0.00014591886608015638, 'epoch': 1.07}
+{'loss': 0.352, 'grad_norm': 1.126991868019104, 'learning_rate': 0.00014589442815249266, 'epoch': 1.07}
+{'loss': 0.5466, 'grad_norm': 1.3619285821914673, 'learning_rate': 0.00014586999022482891, 'epoch': 1.07}
+{'loss': 0.7798, 'grad_norm': 1.5446709394454956, 'learning_rate': 0.0001458455522971652, 'epoch': 1.07}
+{'loss': 0.7226, 'grad_norm': 1.6549172401428223, 'learning_rate': 0.00014582111436950144, 'epoch': 1.07}
+{'loss': 0.8854, 'grad_norm': 1.4883651733398438, 'learning_rate': 0.00014579667644183772, 'epoch': 1.07}
+{'loss': 0.9569, 'grad_norm': 1.6428464651107788, 'learning_rate': 0.00014577223851417397, 'epoch': 1.07}
+{'loss': 1.1195, 'grad_norm': 2.9931986331939697, 'learning_rate': 0.00014574780058651025, 'epoch': 1.07}
+{'loss': 0.6068, 'grad_norm': 1.3941583633422852, 'learning_rate': 0.0001457233626588465, 'epoch': 1.07}
+{'loss': 0.5655, 'grad_norm': 1.757436752319336, 'learning_rate': 0.00014569892473118278, 'epoch': 1.07}
+{'loss': 0.8291, 'grad_norm': 2.191127300262451, 'learning_rate': 0.00014567448680351906, 'epoch': 1.07}
+{'loss': 0.9857, 'grad_norm': 3.134359121322632, 'learning_rate': 0.0001456500488758553, 'epoch': 1.07}
+{'loss': 0.7704, 'grad_norm': 1.7476389408111572, 'learning_rate': 0.00014562561094819156, 'epoch': 1.07}
+{'loss': 1.404, 'grad_norm': 2.1934890747070312, 'learning_rate': 0.00014560117302052784, 'epoch': 1.07}
+{'loss': 0.8811, 'grad_norm': 3.2194011211395264, 'learning_rate': 0.00014557673509286412, 'epoch': 1.07}
+{'loss': 0.781, 'grad_norm': 2.342510461807251, 'learning_rate': 0.00014555229716520037, 'epoch': 1.07}
+{'loss': 0.8468, 'grad_norm': 2.317560911178589, 'learning_rate': 0.00014552785923753665, 'epoch': 1.07}
+{'loss': 0.8117, 'grad_norm': 2.0507400035858154, 'learning_rate': 0.0001455034213098729, 'epoch': 1.07}
+{'loss': 0.7835, 'grad_norm': 2.625714063644409, 'learning_rate': 0.00014547898338220918, 'epoch': 1.07}
+{'loss': 0.5083, 'grad_norm': 2.597598075866699, 'learning_rate': 0.00014545454545454546, 'epoch': 1.07}
+{'loss': 1.1845, 'grad_norm': 2.5991411209106445, 'learning_rate': 0.0001454301075268817, 'epoch': 1.07}
+{'loss': 0.414, 'grad_norm': 1.9454537630081177, 'learning_rate': 0.00014540566959921796, 'epoch': 1.07}
+{'loss': 0.6621, 'grad_norm': 1.6982367038726807, 'learning_rate': 0.00014538123167155424, 'epoch': 1.07}
+{'loss': 0.3462, 'grad_norm': 0.6477040648460388, 'learning_rate': 0.00014535679374389052, 'epoch': 1.07}
+{'loss': 0.1706, 'grad_norm': 0.3390127420425415, 'learning_rate': 0.00014533235581622677, 'epoch': 1.07}
+{'loss': 0.3586, 'grad_norm': 0.8520225882530212, 'learning_rate': 0.00014530791788856305, 'epoch': 1.07}
+{'loss': 0.1433, 'grad_norm': 0.6063798069953918, 'learning_rate': 0.0001452834799608993, 'epoch': 1.07}
+{'loss': 0.5062, 'grad_norm': 0.7696285247802734, 'learning_rate': 0.00014525904203323557, 'epoch': 1.07}
+{'loss': 0.2092, 'grad_norm': 0.47626444697380066, 'learning_rate': 0.00014523460410557183, 'epoch': 1.07}
+{'loss': 0.295, 'grad_norm': 0.7897009253501892, 'learning_rate': 0.0001452101661779081, 'epoch': 1.07}
+{'loss': 0.2313, 'grad_norm': 0.6091282963752747, 'learning_rate': 0.00014518572825024436, 'epoch': 1.07}
+{'loss': 0.2965, 'grad_norm': 0.9005686044692993, 'learning_rate': 0.00014516129032258063, 'epoch': 1.07}
+{'loss': 0.2936, 'grad_norm': 0.9107126593589783, 'learning_rate': 0.00014513685239491689, 'epoch': 1.07}
+ 54%|█████▎    | 6849/12776 [1:10:44<57:18,  1.72it/s] 54%|█████▎    | 6850/12776 [1:10:45<56:06,  1.76it/s]                                                       54%|█████▎    | 6850/12776 [1:10:45<56:06,  1.76it/s] 54%|█████▎    | 6851/12776 [1:10:45<52:53,  1.87it/s]                                                       54%|█████▎    | 6851/12776 [1:10:45<52:53,  1.87it/s] 54%|█████▎    | 6852/12776 [1:10:46<52:17,  1.89it/s]                                                       54%|█████▎    | 6852/12776 [1:10:46<52:17,  1.89it/s] 54%|█████▎    | 6853/12776 [1:10:46<48:56,  2.02it/s]                                                       54%|█████▎    | 6853/12776 [1:10:46<48:56,  2.02it/s] 54%|█████▎    | 6854/12776 [1:10:46<45:56,  2.15it/s]                                                       54%|█████▎    | 6854/12776 [1:10:46<45:56,  2.15it/s] 54%|█████▎    | 6855/12776 [1:10:47<46:43,  2.11it/s]                                                       54%|█████▎    | 6855/12776 [1:10:47<46:43,  2.11it/s] 54%|█████▎    | 6856/12776 [1:10:47<43:38,  2.26it/s]                                                       54%|█████▎    | 6856/12776 [1:10:47<43:38,  2.26it/s] 54%|█████▎    | 6857/12776 [1:10:48<41:01,  2.40it/s]                                                       54%|█████▎    | 6857/12776 [1:10:48<41:01,  2.40it/s] 54%|█████▎    | 6858/12776 [1:10:48<41:11,  2.39it/s]                                                       54%|█████▎    | 6858/12776 [1:10:48<41:11,  2.39it/s] 54%|█████▎    | 6859/12776 [1:10:48<38:33,  2.56it/s]                                                       54%|█████▎    | 6859/12776 [1:10:48<38:33,  2.56it/s] 54%|█████▎    | 6860/12776 [1:10:49<36:26,  2.71it/s]                                                       54%|█████▎    | 6860/12776 [1:10:49<36:26,  2.71it/s] 54%|█████▎    | 6861/12776 [1:10:49<35:54,  2.75it/s]                                                       54%|█████▎    | 6861/12776 [1:10:49<35:54,  2.75it/s] 54%|█████▎    | 6862/12776 [1:10:49<33:48,  2.92it/s]                                                       54%|█████▎    | 6862/12776 [1:10:49<33:48,  2.92it/s] 54%|█████▎    | 6863/12776 [1:10:50<31:59,  3.08it/s]                                                       54%|█████▎    | 6863/12776 [1:10:50<31:59,  3.08it/s] 54%|█████▎    | 6864/12776 [1:10:50<31:02,  3.18it/s]                                                       54%|█████▎    | 6864/12776 [1:10:50<31:02,  3.18it/s] 54%|█████▎    | 6865/12776 [1:10:50<34:21,  2.87it/s]                                                       54%|█████▎    | 6865/12776 [1:10:50<34:21,  2.87it/s] 54%|█████▎    | 6866/12776 [1:10:51<32:16,  3.05it/s]                                                       54%|█████▎    | 6866/12776 [1:10:51<32:16,  3.05it/s] 54%|█████▎    | 6867/12776 [1:10:51<30:33,  3.22it/s]                                                       54%|█████▎    | 6867/12776 [1:10:51<30:33,  3.22it/s] 54%|█████▍    | 6868/12776 [1:10:51<29:10,  3.38it/s]                                                       54%|█████▍    | 6868/12776 [1:10:51<29:10,  3.38it/s] 54%|█████▍    | 6869/12776 [1:10:52<31:18,  3.14it/s]                                                       54%|█████▍    | 6869/12776 [1:10:52<31:18,  3.14it/s] 54%|█████▍    | 6870/12776 [1:10:52<29:16,  3.36it/s]                                                       54%|█████▍    | 6870/12776 [1:10:52<29:16,  3.36it/s] 54%|█████▍    | 6871/12776 [1:10:52<27:36,  3.56it/s]                                                       54%|█████▍    | 6871/12776 [1:10:52<27:36,  3.56it/s] 54%|█████▍    | 6872/12776 [1:10:52<26:21,  3.73it/s]                                                       54%|█████▍    | 6872/12776 [1:10:52<26:21,  3.73it/s] 54%|█████▍    | 6873/12776 [1:10:53<28:37,  3.44it/s]                                                       54%|█████▍    | 6873/12776 [1:10:53<28:37,  3.44it/s] 54%|█████▍    | 6874/12776 [1:10:53<26:40,  3.69it/s]                                                       54%|█████▍    | 6874/12776 [1:10:53<26:40,  3.69it/s] 54%|█████▍    | 6875/12776 [1:10:53<25:04,  3.92it/s]                                                       54%|█████▍    | 6875/12776 [1:10:53<25:04,  3.92it/s] 54%|█████▍    | 6876/12776 [1:10:53<23:49,  4.13it/s]                                                       54%|█████▍    | 6876/12776 [1:10:53<23:49,  4.13it/s] 54%|█████▍    | 6877/12776 [1:10:54<22:15,  4.42it/s]                                                       54%|█████▍    | 6877/12776 [1:10:54<22:15,  4.42it/s] 54%|█████▍    | 6878/12776 [1:10:54<24:21,  4.04it/s]                                                       54%|█████▍    | 6878/12776 [1:10:54<24:21,  4.04it/s] 54%|█████▍    | 6879/12776 [1:10:54<22:28,  4.37it/s]                                                       54%|█████▍    | 6879/12776 [1:10:54<22:28,  4.37it/s] 54%|█████▍    | 6880/12776 [1:10:54<21:03,  4.67it/s]                                                       54%|█████▍    | 6880/12776 [1:10:54<21:03,  4.67it/s] 54%|█████▍    | 6881/12776 [1:10:54<19:59,  4.92it/s]                                                       54%|█████▍    | 6881/12776 [1:10:54<19:59,  4.92it/s] 54%|█████▍    | 6882/12776 [1:10:55<19:19,  5.08it/s]                                                       54%|█████▍    | 6882/12776 [1:10:55<19:19,  5.08it/s] 54%|█████▍    | 6883/12776 [1:10:55<22:10,  4.43it/s]                                                       54%|█████▍    | 6883/12776 [1:10:55<22:10,  4.43it/s] 54%|█████▍    | 6884/12776 [1:10:55<20:34,  4.77it/s]                                                       54%|█████▍    | 6884/12776 [1:10:55<20:34,  4.77it/s] 54%|█████▍    | 6885/12776 [1:10:55<19:21,  5.07it/s]                                                       54%|█████▍    | 6885/12776 [1:10:55<19:21,  5.07it/s] 54%|█████▍    | 6886/12776 [1:10:55<18:26,  5.32it/s]                                                       54%|█████▍    | 6886/12776 [1:10:55<18:26,  5.32it/s] 54%|█████▍    | 6887/12776 [1:10:55<17:37,  5.57it/s]                                                       54%|█████▍    | 6887/12776 [1:10:55<17:37,  5.57it/s] 54%|█████▍    | 6888/12776 [1:10:56<33:43,  2.91it/s]                                                       54%|█████▍    | 6888/12776 [1:10:56<33:43,  2.91it/s] 54%|█████▍    | 6889/12776 [1:10:58<1:10:32,  1.39it/s]                                                         54%|█████▍    | 6889/12776 [1:10:58<1:10:32,  1.39it/s] 54%|█████▍    | 6890/12776 [1:10:59<1:18:37,  1.25it/s]                                                         54%|█████▍    | 6890/12776 [1:10:59<1:18:37,  1.25it/s] 54%|█████▍    | 6891/12776 [1:11:00<1:22:55,  1.18it/s]                                                         54%|█████▍    | 6891/12776 [1:11:00<1:22:55,  1.18it/s] 54%|█████▍    | 6892/12776 [1:11:01<1:22:14,  1.19it/s]                                                         54%|█████▍    | 6892/12776 [1:11:01<1:22:14,  1.19it/s] 54%|█████▍    | 6893/12776 [1:11:01<1:20:10,  1.22it/s]                                                         54%|█████▍    | 6893/12776 [1:11:01<1:20:10,  1.22it/s] 54%|█████▍    | 6894/12776 [1:11:02<1:20:07,  1.22it/s]                                                         54%|█████▍    | 6894/12776 [1:11:02<1:20:07,  1.22it/s] 54%|█████▍    | 6895/12776 [1:11:03<1:17:50,  1.26it/s]                                                         54%|█████▍    | 6895/12776 [1:11:03<1:17:50,  1.26it/s] 54%|█████▍    | 6896/12776 [1:11:04<1:13:27,  1.33it/s]                                                         54%|█████▍    | 6896/12776 [1:11:04<1:13:27,  1.33it/s] 54%|█████▍    | 6897/12776 [1:11:04<1:13:55,  1.33it/s]                                                         54%|█████▍    | 6897/12776 [1:11:04<1:13:55,  1.33it/s] 54%|█████▍    | 6898/12776 [1:11:05<1:08:41,  1.43it/s]                                                         54%|█████▍    | 6898/12776 [1:11:05<1:08:41,  1.43it/s] 54%|█████▍    | 6899/12776 [1:11:06<1:06:38,  1.47it/s]                                                         54%|█████▍    | 6899/12776 [1:11:06<1:06:38,  1.47it/s] 54%|█████▍    | 6900/12776 [1:11:06<1:02:10,  1.58it/s]                                                         54%|█████▍    | 6900/12776 [1:11:06<1:02:10,  1.58it/s] 54%|█████▍    | 6901/12776 [1:11:07<1:00:30,  1.62it/s]                                                         54%|█████▍    | 6901/12776 [1:11:07<1:00:30,  1.62it/s] 54%|█████▍    | 6902/12776 [1:11:07<55:54,  1.75it/s]                                                         54%|█████▍    | 6902/12776 [1:11:07<55:54,  1.75it/s] 54%|█████▍    | 6903/12776 [1:11:08<56:27,  1.73it/s]                                                       54%|█████▍    | 6903/12776 [1:11:08<56:27,  1.73it/s] 54%|█████▍    | 6904/12776 [1:11:08<52:09,  1.88it/s]                                                       54%|█████▍    | 6904/12776 [1:11:08<52:09,  1.88it/s] 54%|█████▍    | 6905/12776 [1:11:09<51:36,  1.90it/s]                                                       54%|█████▍    | 6905/12776 [1:11:09<51:36,  1.90it/s] 54%|█████▍    | 6906/12776 [1:11:09<47:40,  2.05it/s]                                                       54%|█████▍    | 6906/12776 [1:11:09<47:40,  2.05it/s] 54%|█████▍    | 6907/12776 [1:11:09<44:27,  2.20it/s]                                                       54%|█████▍    | 6907/12776 [1:11:09<44:27,  2.20it/s] 54%|█████▍    | 6908/12776 [1:11:10<42:13,  2.32it/s]                                                       54%|█████▍    | 6908/12776 [1:11:10<42:13,  2.32it/s] 54%|█████▍    | 6909/12776 [1:11:10<39:57,  2.45it/s]                                                       54%|█████▍    | 6909/12776 [1:11:10<39:57,  2.45it/s] 54%|█████▍    | 6910/12776 [1:11:10<37:48,  2.59it/s]                                                       54%|█████▍    | 6910/12776 [1:11:10<37:48,  2.59it/s] 54%|█████▍    | 6911/12776 [1:11:11<38:51,  2.52it/s]                                                       54%|█████▍    | 6911/12776 [1:11:11<38:51,  2.52it/s] 54%|█████▍    | 6912/12776 [1:11:11<36:29,  2.68it/s]                                                       54%|█████▍    | 6912/12776 [1:11:11<36:29,  2.68it/s] 54%|█████▍    | 6913/12776 [1:11:12<34:23,  2.84it/s]                                                       54%|█████▍    | 6913/12776 [1:11:12<34:23,  2.84it/s] 54%|█████▍    | 6914/12776 [1:11:12<32:41,  2.99it/s]                                                       54%|█████▍    | 6914/12776 [1:11:12<32:41,  2.99it/s] 54%|█████▍    | 6915/12776 [1:11:12<33:23,  2.93it/s]                                                       54%|█████▍    | 6915/12776 [1:11:12<33:23,  2.93it/s] 54%|█████▍    | 6916/12776 [1:11:12<31:23,  3.11it/s]                                                       54%|█████▍    | 6916/12776 [1:11:12<31:23,  3.11it/s] 54%|█████▍    | 6917/12776 [1:11:13<29:44,  3.28it/s]                                                       54%|█████▍    | 6917/12776 [1:11:13<29:44,  3.28it/s] 54%|█████▍    | 6918/12776 [1:11:13<28:29,  3.43it/s]                                                       54%|█████▍    | 6918/12776 [1:11:13<28:29,  3.43it/s] 54%|█████▍    | 6919/12776 [1:11:13<29:36,  3.30it/s]                                                       54%|█████▍    | 6919/12776 [1:11:13<29:36,  3.30it/s] 54%|█████▍    | 6920/12776 [1:11:14<28:03,  3.48it/s]                                                       54%|█████▍    | 6920/12776 [1:11:14<28:03,  3.48it/s] 54%|█████▍    | 6921/12776 [1:11:14<26:44,  3.65it/s]                                                       54%|█████▍    | 6921/12776 [1:11:14<26:44,  3.65it/s] 54%|█████▍    | 6922/12776 [1:11:14<25:42,  3.80it/s]                                                       54%|█████▍    | 6922/12776 [1:11:14<25:42,  3.80it/s] 54%|█████▍    | 6923/12776 [1:11:14<28:33,  3.42it/s]                                                       54%|█████▍    | 6923/12776 [1:11:14<28:33,  3.42it/s] 54%|█████▍    | 6924/12776 [1:11:15<26:34,  3.67it/s]                                                       54%|█████▍    | 6924/12776 [1:11:15<26:34,  3.67it/s] 54%|█████▍    | 6925/12776 [1:11:15<25:10,  3.87it/s]                                                       54%|█████▍    | 6925/12776 [1:11:15<25:10,  3.87it/s] 54%|█████▍    | 6926/12776 [1:11:15<23:49,  4.09it/s]                                                      {'loss': 0.2276, 'grad_norm': 0.5607098340988159, 'learning_rate': 0.00014511241446725316, 'epoch': 1.07}
+{'loss': 0.3193, 'grad_norm': 13.283991813659668, 'learning_rate': 0.00014508797653958944, 'epoch': 1.07}
+{'loss': 0.422, 'grad_norm': 1.3404076099395752, 'learning_rate': 0.0001450635386119257, 'epoch': 1.07}
+{'loss': 0.4161, 'grad_norm': 0.9615288376808167, 'learning_rate': 0.00014503910068426194, 'epoch': 1.07}
+{'loss': 0.3811, 'grad_norm': 1.6815721988677979, 'learning_rate': 0.00014501466275659822, 'epoch': 1.07}
+{'loss': 0.5096, 'grad_norm': 0.926059901714325, 'learning_rate': 0.0001449902248289345, 'epoch': 1.07}
+{'loss': 0.4286, 'grad_norm': 0.8995077610015869, 'learning_rate': 0.00014496578690127075, 'epoch': 1.07}
+{'loss': 0.5821, 'grad_norm': 0.9982707500457764, 'learning_rate': 0.00014494134897360703, 'epoch': 1.07}
+{'loss': 0.3978, 'grad_norm': 1.031527042388916, 'learning_rate': 0.00014491691104594328, 'epoch': 1.07}
+{'loss': 0.3313, 'grad_norm': 0.7758069038391113, 'learning_rate': 0.00014489247311827956, 'epoch': 1.07}
+{'loss': 0.2895, 'grad_norm': 1.3925272226333618, 'learning_rate': 0.00014486803519061584, 'epoch': 1.07}
+{'loss': 0.4096, 'grad_norm': 1.40436851978302, 'learning_rate': 0.0001448435972629521, 'epoch': 1.07}
+{'loss': 0.4963, 'grad_norm': 1.4296326637268066, 'learning_rate': 0.00014481915933528834, 'epoch': 1.07}
+{'loss': 0.4817, 'grad_norm': 1.4773507118225098, 'learning_rate': 0.00014479472140762462, 'epoch': 1.07}
+{'loss': 0.6741, 'grad_norm': 1.4051315784454346, 'learning_rate': 0.0001447702834799609, 'epoch': 1.07}
+{'loss': 0.2722, 'grad_norm': 1.041042447090149, 'learning_rate': 0.00014474584555229715, 'epoch': 1.07}
+{'loss': 0.764, 'grad_norm': 2.3967373371124268, 'learning_rate': 0.00014472140762463343, 'epoch': 1.07}
+{'loss': 0.4523, 'grad_norm': 4.08719539642334, 'learning_rate': 0.00014469696969696968, 'epoch': 1.07}
+{'loss': 0.5414, 'grad_norm': 1.987392783164978, 'learning_rate': 0.00014467253176930596, 'epoch': 1.07}
+{'loss': 0.8434, 'grad_norm': 3.570604085922241, 'learning_rate': 0.0001446480938416422, 'epoch': 1.08}
+{'loss': 0.7103, 'grad_norm': 7.099319934844971, 'learning_rate': 0.0001446236559139785, 'epoch': 1.08}
+{'loss': 0.5604, 'grad_norm': 3.188019037246704, 'learning_rate': 0.00014459921798631474, 'epoch': 1.08}
+{'loss': 0.5358, 'grad_norm': 1.420037865638733, 'learning_rate': 0.00014457478005865102, 'epoch': 1.08}
+{'loss': 0.6757, 'grad_norm': 1.5740822553634644, 'learning_rate': 0.00014455034213098727, 'epoch': 1.08}
+{'loss': 1.1542, 'grad_norm': 2.2425143718719482, 'learning_rate': 0.00014452590420332355, 'epoch': 1.08}
+{'loss': 0.8133, 'grad_norm': 3.6810855865478516, 'learning_rate': 0.00014450146627565982, 'epoch': 1.08}
+{'loss': 0.8714, 'grad_norm': 2.337358236312866, 'learning_rate': 0.00014447702834799608, 'epoch': 1.08}
+{'loss': 0.9102, 'grad_norm': 3.2725279331207275, 'learning_rate': 0.00014445259042033233, 'epoch': 1.08}
+{'loss': 1.4275, 'grad_norm': 5.1807355880737305, 'learning_rate': 0.0001444281524926686, 'epoch': 1.08}
+{'loss': 0.6962, 'grad_norm': 3.2198193073272705, 'learning_rate': 0.00014440371456500488, 'epoch': 1.08}
+{'loss': 1.4533, 'grad_norm': 5.173696041107178, 'learning_rate': 0.00014437927663734113, 'epoch': 1.08}
+{'loss': 1.139, 'grad_norm': 2.3174338340759277, 'learning_rate': 0.0001443548387096774, 'epoch': 1.08}
+{'loss': 1.0105, 'grad_norm': 2.363914966583252, 'learning_rate': 0.00014433040078201366, 'epoch': 1.08}
+{'loss': 0.8286, 'grad_norm': 2.0799217224121094, 'learning_rate': 0.00014430596285434994, 'epoch': 1.08}
+{'loss': 0.6723, 'grad_norm': 2.2222182750701904, 'learning_rate': 0.00014428152492668622, 'epoch': 1.08}
+{'loss': 0.8287, 'grad_norm': 2.70257568359375, 'learning_rate': 0.00014425708699902247, 'epoch': 1.08}
+{'loss': 0.3816, 'grad_norm': 2.3015530109405518, 'learning_rate': 0.00014423264907135872, 'epoch': 1.08}
+{'loss': 0.9525, 'grad_norm': 2.944084644317627, 'learning_rate': 0.000144208211143695, 'epoch': 1.08}
+{'loss': 0.489, 'grad_norm': 2.2739789485931396, 'learning_rate': 0.00014418377321603128, 'epoch': 1.08}
+{'loss': 0.9746, 'grad_norm': 6.7248969078063965, 'learning_rate': 0.00014415933528836753, 'epoch': 1.08}
+{'loss': 0.2798, 'grad_norm': 0.4254947006702423, 'learning_rate': 0.0001441348973607038, 'epoch': 1.08}
+{'loss': 0.1879, 'grad_norm': 0.3661378026008606, 'learning_rate': 0.00014411045943304006, 'epoch': 1.08}
+{'loss': 0.1684, 'grad_norm': 0.5312639474868774, 'learning_rate': 0.0001440860215053763, 'epoch': 1.08}
+{'loss': 0.2838, 'grad_norm': 0.9988065361976624, 'learning_rate': 0.0001440615835777126, 'epoch': 1.08}
+{'loss': 0.2015, 'grad_norm': 0.5143894553184509, 'learning_rate': 0.00014403714565004887, 'epoch': 1.08}
+{'loss': 0.2814, 'grad_norm': 0.6396801471710205, 'learning_rate': 0.00014401270772238512, 'epoch': 1.08}
+{'loss': 0.3326, 'grad_norm': 0.8477953672409058, 'learning_rate': 0.0001439882697947214, 'epoch': 1.08}
+{'loss': 0.1656, 'grad_norm': 0.47463440895080566, 'learning_rate': 0.00014396383186705765, 'epoch': 1.08}
+{'loss': 0.2643, 'grad_norm': 0.7857781052589417, 'learning_rate': 0.00014393939393939393, 'epoch': 1.08}
+{'loss': 0.3092, 'grad_norm': 0.7944388389587402, 'learning_rate': 0.0001439149560117302, 'epoch': 1.08}
+{'loss': 0.2997, 'grad_norm': 1.1828808784484863, 'learning_rate': 0.00014389051808406646, 'epoch': 1.08}
+{'loss': 0.2784, 'grad_norm': 0.6055023074150085, 'learning_rate': 0.0001438660801564027, 'epoch': 1.08}
+{'loss': 0.3366, 'grad_norm': 0.6134934425354004, 'learning_rate': 0.000143841642228739, 'epoch': 1.08}
+{'loss': 0.5705, 'grad_norm': 1.3209537267684937, 'learning_rate': 0.00014381720430107527, 'epoch': 1.08}
+{'loss': 0.3105, 'grad_norm': 0.7764045000076294, 'learning_rate': 0.00014379276637341152, 'epoch': 1.08}
+{'loss': 0.2681, 'grad_norm': 1.349913477897644, 'learning_rate': 0.0001437683284457478, 'epoch': 1.08}
+{'loss': 0.3003, 'grad_norm': 0.920647919178009, 'learning_rate': 0.00014374389051808405, 'epoch': 1.08}
+{'loss': 0.3485, 'grad_norm': 1.2609316110610962, 'learning_rate': 0.00014371945259042033, 'epoch': 1.08}
+{'loss': 0.4673, 'grad_norm': 1.3429254293441772, 'learning_rate': 0.0001436950146627566, 'epoch': 1.08}
+{'loss': 0.4167, 'grad_norm': 1.207157850265503, 'learning_rate': 0.00014367057673509285, 'epoch': 1.08}
+{'loss': 0.4355, 'grad_norm': 0.8418388366699219, 'learning_rate': 0.0001436461388074291, 'epoch': 1.08}
+{'loss': 0.627, 'grad_norm': 1.242262363433838, 'learning_rate': 0.00014362170087976538, 'epoch': 1.08}
+{'loss': 0.9752, 'grad_norm': 2.378530979156494, 'learning_rate': 0.00014359726295210166, 'epoch': 1.08}
+{'loss': 0.8568, 'grad_norm': 2.7604053020477295, 'learning_rate': 0.00014357282502443791, 'epoch': 1.08}
+{'loss': 0.6182, 'grad_norm': 1.2572295665740967, 'learning_rate': 0.0001435483870967742, 'epoch': 1.08}
+{'loss': 0.6046, 'grad_norm': 1.6974389553070068, 'learning_rate': 0.00014352394916911044, 'epoch': 1.08}
+{'loss': 0.7052, 'grad_norm': 2.8349547386169434, 'learning_rate': 0.0001434995112414467, 'epoch': 1.08}
+{'loss': 0.8496, 'grad_norm': 3.3643503189086914, 'learning_rate': 0.00014347507331378297, 'epoch': 1.08}
+{'loss': 0.9502, 'grad_norm': 2.5238869190216064, 'learning_rate': 0.00014345063538611925, 'epoch': 1.08}
+{'loss': 1.0775, 'grad_norm': 2.205315113067627, 'learning_rate': 0.0001434261974584555, 'epoch': 1.08}
+{'loss': 0.9507, 'grad_norm': 5.002822399139404, 'learning_rate': 0.00014340175953079178, 'epoch': 1.08}
+{'loss': 0.8607, 'grad_norm': 3.2781922817230225, 'learning_rate': 0.00014337732160312803, 'epoch': 1.08}
+{'loss': 0.9261, 'grad_norm': 1.792444109916687, 'learning_rate': 0.0001433528836754643, 'epoch': 1.08}
+{'loss': 0.9687, 'grad_norm': 2.4402613639831543, 'learning_rate': 0.0001433284457478006, 'epoch': 1.08}
+{'loss': 1.0102, 'grad_norm': 2.88875412940979, 'learning_rate': 0.00014330400782013684, 'epoch': 1.08}
+{'loss': 0.962, 'grad_norm': 2.4586126804351807, 'learning_rate': 0.0001432795698924731, 'epoch': 1.08}
+{'loss': 1.0995, 'grad_norm': 2.9678328037261963, 'learning_rate': 0.00014325513196480937, 'epoch': 1.08}
+ 54%|█████▍    | 6926/12776 [1:11:15<23:49,  4.09it/s] 54%|█████▍    | 6927/12776 [1:11:15<22:49,  4.27it/s]                                                       54%|█████▍    | 6927/12776 [1:11:15<22:49,  4.27it/s] 54%|█████▍    | 6928/12776 [1:11:16<24:42,  3.94it/s]                                                       54%|█████▍    | 6928/12776 [1:11:16<24:42,  3.94it/s] 54%|█████▍    | 6929/12776 [1:11:16<23:20,  4.17it/s]                                                       54%|█████▍    | 6929/12776 [1:11:16<23:20,  4.17it/s] 54%|█████▍    | 6930/12776 [1:11:16<22:16,  4.37it/s]                                                       54%|█████▍    | 6930/12776 [1:11:16<22:16,  4.37it/s] 54%|█████▍    | 6931/12776 [1:11:16<21:24,  4.55it/s]                                                       54%|█████▍    | 6931/12776 [1:11:16<21:24,  4.55it/s] 54%|█████▍    | 6932/12776 [1:11:16<20:45,  4.69it/s]                                                       54%|█████▍    | 6932/12776 [1:11:16<20:45,  4.69it/s] 54%|█████▍    | 6933/12776 [1:11:17<22:55,  4.25it/s]                                                       54%|█████▍    | 6933/12776 [1:11:17<22:55,  4.25it/s] 54%|█████▍    | 6934/12776 [1:11:17<21:44,  4.48it/s]                                                       54%|█████▍    | 6934/12776 [1:11:17<21:44,  4.48it/s] 54%|█████▍    | 6935/12776 [1:11:17<20:49,  4.67it/s]                                                       54%|█████▍    | 6935/12776 [1:11:17<20:49,  4.67it/s] 54%|█████▍    | 6936/12776 [1:11:17<20:03,  4.85it/s]                                                       54%|█████▍    | 6936/12776 [1:11:17<20:03,  4.85it/s] 54%|█████▍    | 6937/12776 [1:11:17<19:28,  5.00it/s]                                                       54%|█████▍    | 6937/12776 [1:11:17<19:28,  5.00it/s] 54%|█████▍    | 6938/12776 [1:11:18<34:26,  2.82it/s]                                                       54%|█████▍    | 6938/12776 [1:11:18<34:26,  2.82it/s] 54%|█████▍    | 6939/12776 [1:11:20<1:04:39,  1.50it/s]                                                         54%|█████▍    | 6939/12776 [1:11:20<1:04:39,  1.50it/s] 54%|█████▍    | 6940/12776 [1:11:21<1:16:42,  1.27it/s]                                                         54%|█████▍    | 6940/12776 [1:11:21<1:16:42,  1.27it/s] 54%|█████▍    | 6941/12776 [1:11:22<1:20:19,  1.21it/s]                                                         54%|█████▍    | 6941/12776 [1:11:22<1:20:19,  1.21it/s] 54%|█████▍    | 6942/12776 [1:11:22<1:20:00,  1.22it/s]                                                         54%|█████▍    | 6942/12776 [1:11:22<1:20:00,  1.22it/s] 54%|█████▍    | 6943/12776 [1:11:23<1:17:56,  1.25it/s]                                                         54%|█████▍    | 6943/12776 [1:11:23<1:17:56,  1.25it/s] 54%|█████▍    | 6944/12776 [1:11:24<1:18:04,  1.24it/s]                                                         54%|█████▍    | 6944/12776 [1:11:24<1:18:04,  1.24it/s] 54%|█████▍    | 6945/12776 [1:11:25<1:16:20,  1.27it/s]                                                         54%|█████▍    | 6945/12776 [1:11:25<1:16:20,  1.27it/s] 54%|█████▍    | 6946/12776 [1:11:25<1:11:50,  1.35it/s]                                                         54%|█████▍    | 6946/12776 [1:11:25<1:11:50,  1.35it/s] 54%|█████▍    | 6947/12776 [1:11:26<1:06:48,  1.45it/s]                                                         54%|█████▍    | 6947/12776 [1:11:26<1:06:48,  1.45it/s] 54%|█████▍    | 6948/12776 [1:11:26<1:03:22,  1.53it/s]                                                         54%|█████▍    | 6948/12776 [1:11:26<1:03:22,  1.53it/s] 54%|█████▍    | 6949/12776 [1:11:27<1:00:43,  1.60it/s]                                                         54%|█████▍    | 6949/12776 [1:11:27<1:00:43,  1.60it/s] 54%|█████▍    | 6950/12776 [1:11:27<57:41,  1.68it/s]                                                         54%|█████▍    | 6950/12776 [1:11:27<57:41,  1.68it/s] 54%|█████▍    | 6951/12776 [1:11:28<59:30,  1.63it/s]                                                       54%|█████▍    | 6951/12776 [1:11:28<59:30,  1.63it/s] 54%|█████▍    | 6952/12776 [1:11:29<54:52,  1.77it/s]                                                       54%|█████▍    | 6952/12776 [1:11:29<54:52,  1.77it/s] 54%|█████▍    | 6953/12776 [1:11:29<51:00,  1.90it/s]                                                       54%|█████▍    | 6953/12776 [1:11:29<51:00,  1.90it/s] 54%|█████▍    | 6954/12776 [1:11:29<49:27,  1.96it/s]                                                       54%|█████▍    | 6954/12776 [1:11:29<49:27,  1.96it/s] 54%|█████▍    | 6955/12776 [1:11:30<46:12,  2.10it/s]                                                       54%|█████▍    | 6955/12776 [1:11:30<46:12,  2.10it/s] 54%|█████▍    | 6956/12776 [1:11:30<43:37,  2.22it/s]                                                       54%|█████▍    | 6956/12776 [1:11:30<43:37,  2.22it/s] 54%|█████▍    | 6957/12776 [1:11:31<42:44,  2.27it/s]                                                       54%|█████▍    | 6957/12776 [1:11:31<42:44,  2.27it/s] 54%|█████▍    | 6958/12776 [1:11:31<40:22,  2.40it/s]                                                       54%|█████▍    | 6958/12776 [1:11:31<40:22,  2.40it/s] 54%|█████▍    | 6959/12776 [1:11:31<38:31,  2.52it/s]                                                       54%|█████▍    | 6959/12776 [1:11:31<38:31,  2.52it/s] 54%|█████▍    | 6960/12776 [1:11:32<38:06,  2.54it/s]                                                       54%|█████▍    | 6960/12776 [1:11:32<38:06,  2.54it/s] 54%|█████▍    | 6961/12776 [1:11:32<36:24,  2.66it/s]                                                       54%|█████▍    | 6961/12776 [1:11:32<36:24,  2.66it/s] 54%|█████▍    | 6962/12776 [1:11:32<34:44,  2.79it/s]                                                       54%|█████▍    | 6962/12776 [1:11:32<34:44,  2.79it/s] 55%|█████▍    | 6963/12776 [1:11:33<35:31,  2.73it/s]                                                       55%|█████▍    | 6963/12776 [1:11:33<35:31,  2.73it/s] 55%|█████▍    | 6964/12776 [1:11:33<33:29,  2.89it/s]                                                       55%|██��██▍    | 6964/12776 [1:11:33<33:29,  2.89it/s] 55%|█████▍    | 6965/12776 [1:11:33<31:49,  3.04it/s]                                                       55%|█████▍    | 6965/12776 [1:11:33<31:49,  3.04it/s] 55%|█████▍    | 6966/12776 [1:11:34<33:21,  2.90it/s]                                                       55%|█████▍    | 6966/12776 [1:11:34<33:21,  2.90it/s] 55%|█████▍    | 6967/12776 [1:11:34<31:11,  3.10it/s]                                                       55%|█████▍    | 6967/12776 [1:11:34<31:11,  3.10it/s] 55%|█████▍    | 6968/12776 [1:11:34<29:29,  3.28it/s]                                                       55%|█████▍    | 6968/12776 [1:11:34<29:29,  3.28it/s] 55%|█████▍    | 6969/12776 [1:11:35<27:58,  3.46it/s]                                                       55%|█████▍    | 6969/12776 [1:11:35<27:58,  3.46it/s] 55%|█████▍    | 6970/12776 [1:11:35<29:37,  3.27it/s]                                                       55%|█████▍    | 6970/12776 [1:11:35<29:37,  3.27it/s] 55%|█████▍    | 6971/12776 [1:11:35<27:51,  3.47it/s]                                                       55%|█████▍    | 6971/12776 [1:11:35<27:51,  3.47it/s] 55%|█████▍    | 6972/12776 [1:11:35<26:22,  3.67it/s]                                                       55%|█████▍    | 6972/12776 [1:11:35<26:22,  3.67it/s] 55%|█████▍    | 6973/12776 [1:11:36<25:12,  3.84it/s]                                                       55%|█████▍    | 6973/12776 [1:11:36<25:12,  3.84it/s] 55%|█████▍    | 6974/12776 [1:11:36<24:19,  3.98it/s]                                                       55%|█████▍    | 6974/12776 [1:11:36<24:19,  3.98it/s] 55%|█████▍    | 6975/12776 [1:11:36<26:06,  3.70it/s]                                                       55%|█████▍    | 6975/12776 [1:11:36<26:06,  3.70it/s] 55%|█████▍    | 6976/12776 [1:11:36<24:30,  3.94it/s]                                                       55%|█████▍    | 6976/12776 [1:11:36<24:30,  3.94it/s] 55%|█████▍    | 6977/12776 [1:11:37<23:18,  4.15it/s]                                                       55%|█████▍    | 6977/12776 [1:11:37<23:18,  4.15it/s] 55%|█████▍    | 6978/12776 [1:11:37<22:25,  4.31it/s]                                                       55%|█████▍    | 6978/12776 [1:11:37<22:25,  4.31it/s] 55%|█████▍    | 6979/12776 [1:11:37<21:41,  4.45it/s]                                                       55%|█████▍    | 6979/12776 [1:11:37<21:41,  4.45it/s] 55%|█████▍    | 6980/12776 [1:11:37<23:42,  4.07it/s]                                                       55%|█████▍    | 6980/12776 [1:11:37<23:42,  4.07it/s] 55%|█████▍    | 6981/12776 [1:11:38<22:29,  4.29it/s]                                                       55%|█████▍    | 6981/12776 [1:11:38<22:29,  4.29it/s] 55%|█████▍    | 6982/12776 [1:11:38<21:29,  4.49it/s]                                                       55%|█████▍    | 6982/12776 [1:11:38<21:29,  4.49it/s] 55%|█████▍    | 6983/12776 [1:11:38<20:45,  4.65it/s]                                                       55%|█████▍    | 6983/12776 [1:11:38<20:45,  4.65it/s] 55%|█████▍    | 6984/12776 [1:11:38<20:15,  4.76it/s]                                                       55%|█████▍    | 6984/12776 [1:11:38<20:15,  4.76it/s] 55%|█████▍    | 6985/12776 [1:11:38<23:45,  4.06it/s]                                                       55%|█████▍    | 6985/12776 [1:11:38<23:45,  4.06it/s] 55%|█████▍    | 6986/12776 [1:11:39<22:12,  4.35it/s]                                                       55%|█████▍    | 6986/12776 [1:11:39<22:12,  4.35it/s] 55%|█████▍    | 6987/12776 [1:11:39<20:54,  4.61it/s]                                                       55%|█████▍    | 6987/12776 [1:11:39<20:54,  4.61it/s] 55%|█████▍    | 6988/12776 [1:11:40<36:34,  2.64it/s]                                                       55%|█████▍    | 6988/12776 [1:11:40<36:34,  2.64it/s] 55%|█████▍    | 6989/12776 [1:11:41<1:05:33,  1.47it/s]                                                         55%|█████▍    | 6989/12776 [1:11:41<1:05:33,  1.47it/s] 55%|█████▍    | 6990/12776 [1:11:42<1:13:12,  1.32it/s]                                                         55%|█████▍    | 6990/12776 [1:11:42<1:13:12,  1.32it/s] 55%|█████▍    | 6991/12776 [1:11:43<1:20:10,  1.20it/s]                                                         55%|█████▍    | 6991/12776 [1:11:43<1:20:10,  1.20it/s] 55%|█████▍    | 6992/12776 [1:11:44<1:18:09,  1.23it/s]                                                         55%|█████▍    | 6992/12776 [1:11:44<1:18:09,  1.23it/s] 55%|█████▍    | 6993/12776 [1:11:44<1:18:06,  1.23it/s]                                                         55%|█████▍    | 6993/12776 [1:11:44<1:18:06,  1.23it/s] 55%|█████▍    | 6994/12776 [1:11:45<1:15:35,  1.27it/s]                                                         55%|█████▍    | 6994/12776 [1:11:45<1:15:35,  1.27it/s] 55%|█████▍    | 6995/12776 [1:11:46<1:11:52,  1.34it/s]                                                         55%|█████▍    | 6995/12776 [1:11:46<1:11:52,  1.34it/s] 55%|█████▍    | 6996/12776 [1:11:47<1:11:59,  1.34it/s]                                                         55%|█████▍    | 6996/12776 [1:11:47<1:11:59,  1.34it/s] 55%|█████▍    | 6997/12776 [1:11:47<1:07:20,  1.43it/s]                                                         55%|█████▍    | 6997/12776 [1:11:47<1:07:20,  1.43it/s] 55%|█████▍    | 6998/12776 [1:11:48<1:06:20,  1.45it/s]                                                         55%|█████▍    | 6998/12776 [1:11:48<1:06:20,  1.45it/s] 55%|█████▍    | 6999/12776 [1:11:48<1:01:51,  1.56it/s]                                                         55%|█████▍    | 6999/12776 [1:11:48<1:01:51,  1.56it/s] 55%|█████▍    | 7000/12776 [1:11:49<1:00:18,  1.60it/s]                                                         55%|█████▍    | 7000/12776 [1:11:49<1:00:18,  1.60it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.9321, 'grad_norm': 2.3805899620056152, 'learning_rate': 0.00014323069403714565, 'epoch': 1.08}
+{'loss': 0.7621, 'grad_norm': 2.5837199687957764, 'learning_rate': 0.0001432062561094819, 'epoch': 1.08}
+{'loss': 0.9827, 'grad_norm': 2.366360664367676, 'learning_rate': 0.00014318181818181818, 'epoch': 1.08}
+{'loss': 1.1201, 'grad_norm': 3.093599796295166, 'learning_rate': 0.00014315738025415443, 'epoch': 1.08}
+{'loss': 1.1055, 'grad_norm': 3.7076659202575684, 'learning_rate': 0.0001431329423264907, 'epoch': 1.08}
+{'loss': 1.3536, 'grad_norm': 3.0820627212524414, 'learning_rate': 0.00014310850439882699, 'epoch': 1.09}
+{'loss': 1.7647, 'grad_norm': 8.682129859924316, 'learning_rate': 0.00014308406647116324, 'epoch': 1.09}
+{'loss': 1.6023, 'grad_norm': 2.9856128692626953, 'learning_rate': 0.0001430596285434995, 'epoch': 1.09}
+{'loss': 0.2357, 'grad_norm': 2.074152708053589, 'learning_rate': 0.00014303519061583577, 'epoch': 1.09}
+{'loss': 0.9179, 'grad_norm': 3.0086069107055664, 'learning_rate': 0.00014301075268817202, 'epoch': 1.09}
+{'loss': 1.0155, 'grad_norm': 4.519247531890869, 'learning_rate': 0.0001429863147605083, 'epoch': 1.09}
+{'loss': 0.6405, 'grad_norm': 1.4745041131973267, 'learning_rate': 0.00014296187683284457, 'epoch': 1.09}
+{'loss': 0.9687, 'grad_norm': 3.711125135421753, 'learning_rate': 0.00014293743890518083, 'epoch': 1.09}
+{'loss': 0.2144, 'grad_norm': 0.41256627440452576, 'learning_rate': 0.00014291300097751708, 'epoch': 1.09}
+{'loss': 0.2536, 'grad_norm': 0.39318832755088806, 'learning_rate': 0.00014288856304985336, 'epoch': 1.09}
+{'loss': 0.2521, 'grad_norm': 0.5901644229888916, 'learning_rate': 0.00014286412512218963, 'epoch': 1.09}
+{'loss': 0.2123, 'grad_norm': 0.5370203852653503, 'learning_rate': 0.00014283968719452589, 'epoch': 1.09}
+{'loss': 0.3049, 'grad_norm': 0.546731173992157, 'learning_rate': 0.00014281524926686216, 'epoch': 1.09}
+{'loss': 0.1984, 'grad_norm': 0.36624133586883545, 'learning_rate': 0.00014279081133919841, 'epoch': 1.09}
+{'loss': 0.2255, 'grad_norm': 0.6180028319358826, 'learning_rate': 0.0001427663734115347, 'epoch': 1.09}
+{'loss': 0.4502, 'grad_norm': 0.9784983396530151, 'learning_rate': 0.00014274193548387097, 'epoch': 1.09}
+{'loss': 0.27, 'grad_norm': 0.5127384066581726, 'learning_rate': 0.00014271749755620722, 'epoch': 1.09}
+{'loss': 0.2674, 'grad_norm': 0.6251503825187683, 'learning_rate': 0.00014269305962854347, 'epoch': 1.09}
+{'loss': 0.31, 'grad_norm': 0.6730063557624817, 'learning_rate': 0.00014266862170087975, 'epoch': 1.09}
+{'loss': 0.302, 'grad_norm': 0.7769110798835754, 'learning_rate': 0.00014264418377321603, 'epoch': 1.09}
+{'loss': 0.3961, 'grad_norm': 1.0139259099960327, 'learning_rate': 0.00014261974584555228, 'epoch': 1.09}
+{'loss': 0.2495, 'grad_norm': 0.7795376777648926, 'learning_rate': 0.00014259530791788856, 'epoch': 1.09}
+{'loss': 0.4041, 'grad_norm': 0.770647406578064, 'learning_rate': 0.0001425708699902248, 'epoch': 1.09}
+{'loss': 0.2077, 'grad_norm': 0.9551957845687866, 'learning_rate': 0.0001425464320625611, 'epoch': 1.09}
+{'loss': 0.4206, 'grad_norm': 0.8338924646377563, 'learning_rate': 0.00014252199413489737, 'epoch': 1.09}
+{'loss': 0.31, 'grad_norm': 0.7971686720848083, 'learning_rate': 0.00014249755620723362, 'epoch': 1.09}
+{'loss': 0.6022, 'grad_norm': 1.553481101989746, 'learning_rate': 0.00014247311827956987, 'epoch': 1.09}
+{'loss': 0.4746, 'grad_norm': 1.3536430597305298, 'learning_rate': 0.00014244868035190615, 'epoch': 1.09}
+{'loss': 0.6675, 'grad_norm': 0.8568630814552307, 'learning_rate': 0.0001424242424242424, 'epoch': 1.09}
+{'loss': 0.56, 'grad_norm': 0.9841766357421875, 'learning_rate': 0.00014239980449657868, 'epoch': 1.09}
+{'loss': 0.6362, 'grad_norm': 1.8160009384155273, 'learning_rate': 0.00014237536656891496, 'epoch': 1.09}
+{'loss': 0.387, 'grad_norm': 1.0871392488479614, 'learning_rate': 0.0001423509286412512, 'epoch': 1.09}
+{'loss': 0.5088, 'grad_norm': 2.788846731185913, 'learning_rate': 0.00014232649071358746, 'epoch': 1.09}
+{'loss': 0.6828, 'grad_norm': 2.411051034927368, 'learning_rate': 0.00014230205278592374, 'epoch': 1.09}
+{'loss': 0.7937, 'grad_norm': 1.6987364292144775, 'learning_rate': 0.00014227761485826002, 'epoch': 1.09}
+{'loss': 0.3764, 'grad_norm': 1.3906645774841309, 'learning_rate': 0.00014225317693059627, 'epoch': 1.09}
+{'loss': 0.4234, 'grad_norm': 1.8853800296783447, 'learning_rate': 0.00014222873900293255, 'epoch': 1.09}
+{'loss': 1.001, 'grad_norm': 4.4653639793396, 'learning_rate': 0.0001422043010752688, 'epoch': 1.09}
+{'loss': 0.6155, 'grad_norm': 1.9520578384399414, 'learning_rate': 0.00014217986314760508, 'epoch': 1.09}
+{'loss': 0.5802, 'grad_norm': 2.5212900638580322, 'learning_rate': 0.00014215542521994135, 'epoch': 1.09}
+{'loss': 0.819, 'grad_norm': 3.541121006011963, 'learning_rate': 0.0001421309872922776, 'epoch': 1.09}
+{'loss': 1.0289, 'grad_norm': 4.24415922164917, 'learning_rate': 0.00014210654936461386, 'epoch': 1.09}
+{'loss': 1.1641, 'grad_norm': 2.449824094772339, 'learning_rate': 0.00014208211143695013, 'epoch': 1.09}
+{'loss': 0.4655, 'grad_norm': 1.3512892723083496, 'learning_rate': 0.0001420576735092864, 'epoch': 1.09}
+{'loss': 1.4388, 'grad_norm': 3.9181694984436035, 'learning_rate': 0.00014203323558162266, 'epoch': 1.09}
+{'loss': 1.5037, 'grad_norm': 3.194889545440674, 'learning_rate': 0.00014200879765395894, 'epoch': 1.09}
+{'loss': 0.8793, 'grad_norm': 3.483785629272461, 'learning_rate': 0.0001419843597262952, 'epoch': 1.09}
+{'loss': 0.8187, 'grad_norm': 2.2053303718566895, 'learning_rate': 0.00014195992179863147, 'epoch': 1.09}
+{'loss': 0.9644, 'grad_norm': 3.390990734100342, 'learning_rate': 0.00014193548387096772, 'epoch': 1.09}
+{'loss': 1.2942, 'grad_norm': 3.826531171798706, 'learning_rate': 0.000141911045943304, 'epoch': 1.09}
+{'loss': 1.08, 'grad_norm': 2.1854097843170166, 'learning_rate': 0.00014188660801564025, 'epoch': 1.09}
+{'loss': 1.066, 'grad_norm': 1.7453972101211548, 'learning_rate': 0.00014186217008797653, 'epoch': 1.09}
+{'loss': 1.0875, 'grad_norm': 1.322583556175232, 'learning_rate': 0.00014183773216031278, 'epoch': 1.09}
+{'loss': 0.9858, 'grad_norm': 3.8100578784942627, 'learning_rate': 0.00014181329423264906, 'epoch': 1.09}
+{'loss': 1.0154, 'grad_norm': 1.9491938352584839, 'learning_rate': 0.00014178885630498534, 'epoch': 1.09}
+{'loss': 0.5821, 'grad_norm': 1.2058466672897339, 'learning_rate': 0.0001417644183773216, 'epoch': 1.09}
+{'loss': 0.3841, 'grad_norm': 1.4509241580963135, 'learning_rate': 0.00014173998044965784, 'epoch': 1.09}
+{'loss': 0.772, 'grad_norm': 1.7737778425216675, 'learning_rate': 0.00014171554252199412, 'epoch': 1.09}
+{'loss': 0.2943, 'grad_norm': 0.6553102135658264, 'learning_rate': 0.0001416911045943304, 'epoch': 1.09}
+{'loss': 0.1739, 'grad_norm': 0.4766181409358978, 'learning_rate': 0.00014166666666666665, 'epoch': 1.09}
+{'loss': 0.229, 'grad_norm': 0.475190132856369, 'learning_rate': 0.00014164222873900293, 'epoch': 1.09}
+{'loss': 0.2992, 'grad_norm': 0.4964744448661804, 'learning_rate': 0.00014161779081133918, 'epoch': 1.09}
+{'loss': 0.3533, 'grad_norm': 0.7098869681358337, 'learning_rate': 0.00014159335288367546, 'epoch': 1.09}
+{'loss': 0.3115, 'grad_norm': 0.6949512958526611, 'learning_rate': 0.00014156891495601174, 'epoch': 1.09}
+{'loss': 0.2106, 'grad_norm': 1.074985384941101, 'learning_rate': 0.000141544477028348, 'epoch': 1.1}
+{'loss': 0.2574, 'grad_norm': 1.048966884613037, 'learning_rate': 0.00014152003910068424, 'epoch': 1.1}
+{'loss': 0.2964, 'grad_norm': 0.6164217591285706, 'learning_rate': 0.00014149560117302052, 'epoch': 1.1}
+{'loss': 0.2955, 'grad_norm': 0.6958305239677429, 'learning_rate': 0.0001414711632453568, 'epoch': 1.1}
+{'loss': 0.332, 'grad_norm': 0.7299469709396362, 'learning_rate': 0.00014144672531769305, 'epoch': 1.1}
+{'loss': 0.3274, 'grad_norm': 2.961721181869507, 'learning_rate': 0.00014142228739002932, 'epoch': 1.1}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:07,  6.08it/s][A
+  0%|          | 3/774 [00:00<02:50,  4.52it/s][A
+  1%|          | 4/774 [00:00<03:16,  3.92it/s][A
+  1%|          | 5/774 [00:01<03:16,  3.92it/s][A
+  1%|          | 6/774 [00:01<03:29,  3.66it/s][A
+  1%|          | 7/774 [00:01<03:27,  3.70it/s][A
+  1%|          | 8/774 [00:02<03:29,  3.66it/s][A
+  1%|          | 9/774 [00:02<03:17,  3.86it/s][A
+  1%|▏         | 10/774 [00:02<03:17,  3.88it/s][A
+  1%|▏         | 11/774 [00:02<03:31,  3.61it/s][A
+  2%|▏         | 12/774 [00:03<03:18,  3.84it/s][A
+  2%|▏         | 13/774 [00:03<03:10,  3.99it/s][A
+  2%|▏         | 14/774 [00:03<03:22,  3.76it/s][A
+  2%|▏         | 15/774 [00:03<03:38,  3.47it/s][A
+  2%|▏         | 16/774 [00:04<03:37,  3.49it/s][A
+  2%|▏         | 17/774 [00:04<03:14,  3.90it/s][A
+  2%|▏         | 18/774 [00:04<03:06,  4.05it/s][A
+  2%|▏         | 19/774 [00:04<03:16,  3.85it/s][A
+  3%|▎         | 20/774 [00:05<03:11,  3.93it/s][A
+  3%|▎         | 21/774 [00:05<03:16,  3.84it/s][A
+  3%|▎         | 22/774 [00:05<03:21,  3.73it/s][A
+  3%|▎         | 23/774 [00:06<03:32,  3.53it/s][A
+  3%|▎         | 24/774 [00:06<03:29,  3.58it/s][A
+  3%|▎         | 25/774 [00:06<03:36,  3.47it/s][A
+  3%|▎         | 26/774 [00:06<03:34,  3.48it/s][A
+  3%|▎         | 27/774 [00:07<03:33,  3.50it/s][A
+  4%|▎         | 28/774 [00:07<03:38,  3.42it/s][A
+  4%|▎         | 29/774 [00:07<03:43,  3.33it/s][A
+  4%|▍         | 30/774 [00:08<03:31,  3.51it/s][A
+  4%|▍         | 31/774 [00:08<03:33,  3.48it/s][A
+  4%|▍         | 32/774 [00:08<04:36,  2.68it/s][A
+  4%|▍         | 33/774 [00:09<04:15,  2.90it/s][A
+  4%|▍         | 34/774 [00:09<03:56,  3.13it/s][A
+  5%|▍         | 35/774 [00:09<03:56,  3.12it/s][A
+  5%|▍         | 36/774 [00:10<03:54,  3.15it/s][A
+  5%|▍         | 37/774 [00:10<03:52,  3.17it/s][A
+  5%|▍         | 38/774 [00:10<03:41,  3.32it/s][A
+  5%|▌         | 39/774 [00:10<03:24,  3.59it/s][A
+  5%|▌         | 40/774 [00:11<03:28,  3.51it/s][A
+  5%|▌         | 41/774 [00:11<03:25,  3.56it/s][A
+  5%|▌         | 42/774 [00:11<03:13,  3.78it/s][A
+  6%|▌         | 43/774 [00:12<03:25,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:29,  3.48it/s][A
+  6%|▌         | 45/774 [00:12<03:18,  3.67it/s][A
+  6%|▌         | 46/774 [00:12<03:03,  3.97it/s][A
+  6%|▌         | 47/774 [00:12<02:51,  4.24it/s][A
+  6%|▌         | 48/774 [00:13<02:52,  4.21it/s][A
+  6%|▋         | 49/774 [00:13<02:53,  4.17it/s][A
+  6%|▋         | 50/774 [00:13<02:56,  4.11it/s][A
+  7%|▋         | 51/774 [00:13<02:57,  4.08it/s][A
+  7%|▋         | 52/774 [00:14<02:56,  4.10it/s][A
+  7%|▋         | 53/774 [00:14<03:03,  3.93it/s][A
+  7%|▋         | 54/774 [00:14<03:08,  3.82it/s][A
+  7%|▋         | 55/774 [00:15<03:17,  3.64it/s][A
+  7%|▋         | 56/774 [00:15<03:17,  3.63it/s][A
+  7%|▋         | 57/774 [00:15<03:22,  3.53it/s][A
+  7%|▋         | 58/774 [00:15<03:22,  3.53it/s][A
+  8%|▊         | 59/774 [00:16<03:06,  3.83it/s][A
+  8%|▊         | 60/774 [00:16<02:53,  4.12it/s][A
+  8%|▊         | 61/774 [00:16<02:31,  4.70it/s][A
+  8%|▊         | 62/774 [00:16<02:29,  4.75it/s][A
+  8%|▊         | 63/774 [00:17<02:55,  4.06it/s][A
+  8%|▊         | 64/774 [00:17<02:46,  4.27it/s][A
+  8%|▊         | 65/774 [00:17<02:48,  4.21it/s][A
+  9%|▊         | 66/774 [00:17<02:45,  4.27it/s][A
+  9%|▊         | 67/774 [00:17<02:38,  4.47it/s][A
+  9%|▉         | 68/774 [00:18<02:34,  4.56it/s][A
+  9%|▉         | 69/774 [00:18<02:27,  4.79it/s][A
+  9%|▉         | 70/774 [00:18<02:35,  4.53it/s][A
+  9%|▉         | 71/774 [00:18<02:30,  4.67it/s][A
+  9%|▉         | 72/774 [00:19<02:41,  4.35it/s][A
+  9%|▉         | 73/774 [00:19<02:51,  4.10it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.93it/s][A
+ 10%|▉         | 75/774 [00:19<03:04,  3.80it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.87it/s][A
+ 10%|▉         | 77/774 [00:20<03:12,  3.63it/s][A
+ 10%|█         | 78/774 [00:20<02:53,  4.01it/s][A
+ 10%|█         | 79/774 [00:20<02:41,  4.29it/s][A
+ 10%|█         | 80/774 [00:21<02:39,  4.36it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.02it/s][A
+ 11%|█         | 82/774 [00:21<02:17,  5.02it/s][A
+ 11%|█         | 83/774 [00:21<02:21,  4.90it/s][A
+ 11%|█         | 84/774 [00:21<02:27,  4.69it/s][A
+ 11%|█         | 85/774 [00:22<02:36,  4.41it/s][A
+ 11%|█         | 86/774 [00:22<02:43,  4.21it/s][A
+ 11%|█         | 87/774 [00:22<02:44,  4.19it/s][A
+ 11%|█▏        | 88/774 [00:22<02:32,  4.51it/s][A
+ 11%|█▏        | 89/774 [00:22<02:26,  4.68it/s][A
+ 12%|█▏        | 90/774 [00:23<02:34,  4.43it/s][A
+ 12%|█▏        | 91/774 [00:23<02:48,  4.06it/s][A
+ 12%|█▏        | 92/774 [00:23<03:00,  3.77it/s][A
+ 12%|█▏        | 93/774 [00:24<02:57,  3.85it/s][A
+ 12%|█▏        | 94/774 [00:24<03:00,  3.76it/s][A
+ 12%|█▏        | 95/774 [00:24<02:59,  3.79it/s][A
+ 12%|█▏        | 96/774 [00:24<02:54,  3.88it/s][A
+ 13%|█▎        | 97/774 [00:24<02:38,  4.28it/s][A
+ 13%|█▎        | 98/774 [00:25<02:31,  4.47it/s][A
+ 13%|█▎        | 99/774 [00:25<02:43,  4.12it/s][A
+ 13%|█▎        | 100/774 [00:25<02:55,  3.84it/s][A
+ 13%|█▎        | 101/774 [00:26<03:01,  3.71it/s][A
+ 13%|█▎        | 102/774 [00:26<03:13,  3.47it/s][A
+ 13%|█▎        | 103/774 [00:26<03:16,  3.42it/s][A
+ 13%|█▎        | 104/774 [00:26<03:15,  3.43it/s][A
+ 14%|█▎        | 105/774 [00:27<03:15,  3.42it/s][A
+ 14%|█▎        | 106/774 [00:27<03:35,  3.11it/s][A
+ 14%|█▍        | 107/774 [00:28<03:46,  2.95it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.06it/s][A
+ 14%|█▍        | 109/774 [00:28<03:34,  3.09it/s][A
+ 14%|█▍        | 110/774 [00:28<03:24,  3.25it/s][A
+ 14%|█▍        | 111/774 [00:29<03:23,  3.26it/s][A
+ 14%|█▍        | 112/774 [00:29<03:13,  3.43it/s][A
+ 15%|█▍        | 113/774 [00:29<03:16,  3.37it/s][A
+ 15%|█▍        | 114/774 [00:30<03:20,  3.29it/s][A
+ 15%|█▍        | 115/774 [00:30<03:14,  3.38it/s][A
+ 15%|█▍        | 116/774 [00:30<03:00,  3.65it/s][A
+ 15%|█▌        | 117/774 [00:30<03:05,  3.54it/s][A
+ 15%|█▌        | 118/774 [00:31<03:04,  3.55it/s][A
+ 15%|█▌        | 119/774 [00:31<02:57,  3.69it/s][A
+ 16%|█▌        | 120/774 [00:31<03:08,  3.48it/s][A
+ 16%|█▌        | 121/774 [00:32<03:03,  3.57it/s][A
+ 16%|█▌        | 122/774 [00:32<03:06,  3.50it/s][A
+ 16%|█▌        | 123/774 [00:32<02:58,  3.65it/s][A
+ 16%|█▌        | 124/774 [00:32<02:59,  3.62it/s][A
+ 16%|█▌        | 125/774 [00:33<03:01,  3.57it/s][A
+ 16%|█▋        | 126/774 [00:33<03:09,  3.42it/s][A
+ 16%|█▋        | 127/774 [00:33<03:17,  3.27it/s][A
+ 17%|█▋        | 128/774 [00:34<03:08,  3.43it/s][A
+ 17%|█▋        | 129/774 [00:34<03:09,  3.40it/s][A
+ 17%|█▋        | 130/774 [00:34<03:16,  3.27it/s][A
+ 17%|█▋        | 131/774 [00:34<03:07,  3.43it/s][A
+ 17%|█▋        | 132/774 [00:35<03:07,  3.42it/s][A
+ 17%|█▋        | 133/774 [00:35<03:03,  3.49it/s][A
+ 17%|█▋        | 134/774 [00:35<03:03,  3.49it/s][A
+ 17%|█▋        | 135/774 [00:36<03:19,  3.20it/s][A
+ 18%|█▊        | 136/774 [00:36<03:26,  3.09it/s][A
+ 18%|█▊        | 137/774 [00:36<03:25,  3.10it/s][A
+ 18%|█▊        | 138/774 [00:37<03:21,  3.16it/s][A
+ 18%|█▊        | 139/774 [00:37<03:22,  3.13it/s][A
+ 18%|█▊        | 140/774 [00:37<03:18,  3.19it/s][A
+ 18%|█▊        | 141/774 [00:38<03:09,  3.34it/s][A
+ 18%|█▊        | 142/774 [00:38<03:20,  3.16it/s][A
+ 18%|█▊        | 143/774 [00:38<03:17,  3.19it/s][A
+ 19%|█▊        | 144/774 [00:38<03:07,  3.36it/s][A
+ 19%|█▊        | 145/774 [00:39<02:59,  3.51it/s][A
+ 19%|█▉        | 146/774 [00:39<02:49,  3.71it/s][A
+ 19%|█▉        | 147/774 [00:39<02:40,  3.90it/s][A
+ 19%|█▉        | 148/774 [00:40<02:50,  3.67it/s][A
+ 19%|█▉        | 149/774 [00:40<03:01,  3.43it/s][A
+ 19%|█▉        | 150/774 [00:40<03:04,  3.37it/s][A
+ 20%|█▉        | 151/774 [00:40<02:55,  3.56it/s][A
+ 20%|█▉        | 152/774 [00:41<02:47,  3.72it/s][A
+ 20%|█▉        | 153/774 [00:41<02:54,  3.56it/s][A
+ 20%|█▉        | 154/774 [00:41<02:49,  3.66it/s][A
+ 20%|██        | 155/774 [00:41<02:46,  3.72it/s][A
+ 20%|██        | 156/774 [00:42<02:40,  3.84it/s][A
+ 20%|██        | 157/774 [00:42<02:34,  4.00it/s][A
+ 20%|██        | 158/774 [00:42<02:38,  3.90it/s][A
+ 21%|██        | 159/774 [00:42<02:41,  3.81it/s][A
+ 21%|██        | 160/774 [00:43<02:33,  4.01it/s][A
+ 21%|██        | 161/774 [00:43<02:42,  3.77it/s][A
+ 21%|██        | 162/774 [00:43<02:48,  3.63it/s][A
+ 21%|██        | 163/774 [00:44<02:46,  3.66it/s][A
+ 21%|██        | 164/774 [00:44<02:39,  3.82it/s][A
+ 21%|██▏       | 165/774 [00:44<02:37,  3.86it/s][A
+ 21%|██▏       | 166/774 [00:44<02:41,  3.76it/s][A
+ 22%|██▏       | 167/774 [00:45<02:42,  3.73it/s][A
+ 22%|██▏       | 168/774 [00:45<02:34,  3.92it/s][A
+ 22%|██▏       | 169/774 [00:45<02:27,  4.11it/s][A
+ 22%|██▏       | 170/774 [00:45<02:35,  3.87it/s][A
+ 22%|██▏       | 171/774 [00:46<02:46,  3.62it/s][A
+ 22%|██▏       | 172/774 [00:46<02:54,  3.46it/s][A
+ 22%|██▏       | 173/774 [00:46<02:50,  3.53it/s][A
+ 22%|██▏       | 174/774 [00:46<02:42,  3.68it/s][A
+ 23%|██▎       | 175/774 [00:47<02:43,  3.66it/s][A
+ 23%|██▎       | 176/774 [00:47<02:36,  3.82it/s][A
+ 23%|██▎       | 177/774 [00:47<02:50,  3.50it/s][A
+ 23%|██▎       | 178/774 [00:48<02:34,  3.85it/s][A
+ 23%|██▎       | 179/774 [00:48<02:21,  4.20it/s][A
+ 23%|██▎       | 180/774 [00:48<02:16,  4.34it/s][A
+ 23%|██▎       | 181/774 [00:48<02:20,  4.23it/s][A
+ 24%|██▎       | 182/774 [00:48<02:23,  4.12it/s][A
+ 24%|██▎       | 183/774 [00:49<02:25,  4.07it/s][A
+ 24%|██▍       | 184/774 [00:49<02:35,  3.79it/s][A
+ 24%|██▍       | 185/774 [00:49<02:42,  3.63it/s][A
+ 24%|██▍       | 186/774 [00:50<02:41,  3.65it/s][A
+ 24%|██▍       | 187/774 [00:50<02:34,  3.79it/s][A
+ 24%|██▍       | 188/774 [00:50<02:33,  3.81it/s][A
+ 24%|██▍       | 189/774 [00:50<02:32,  3.84it/s][A
+ 25%|██▍       | 190/774 [00:51<02:27,  3.95it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.82it/s][A
+ 25%|██▍       | 192/774 [00:51<02:37,  3.70it/s][A
+ 25%|██▍       | 193/774 [00:51<02:40,  3.62it/s][A
+ 25%|██▌       | 194/774 [00:52<02:49,  3.41it/s][A
+ 25%|██▌       | 195/774 [00:52<02:57,  3.26it/s][A
+ 25%|██▌       | 196/774 [00:52<02:58,  3.24it/s][A
+ 25%|██▌       | 197/774 [00:53<02:55,  3.29it/s][A
+ 26%|██▌       | 198/774 [00:53<02:45,  3.48it/s][A
+ 26%|██▌       | 199/774 [00:53<02:45,  3.47it/s][A
+ 26%|██▌       | 200/774 [00:54<02:40,  3.59it/s][A
+ 26%|██▌       | 201/774 [00:54<02:37,  3.65it/s][A
+ 26%|██▌       | 202/774 [00:54<02:33,  3.73it/s][A
+ 26%|██▌       | 203/774 [00:54<02:25,  3.91it/s][A
+ 26%|██▋       | 204/774 [00:55<02:29,  3.82it/s][A
+ 26%|██▋       | 205/774 [00:55<02:39,  3.58it/s][A
+ 27%|██▋       | 206/774 [00:55<02:33,  3.70it/s][A
+ 27%|██▋       | 207/774 [00:55<02:31,  3.75it/s][A
+ 27%|██▋       | 208/774 [00:56<02:31,  3.73it/s][A
+ 27%|██▋       | 209/774 [00:56<02:30,  3.75it/s][A
+ 27%|██▋       | 210/774 [00:56<02:29,  3.76it/s][A
+ 27%|██▋       | 211/774 [00:56<02:26,  3.84it/s][A
+ 27%|██▋       | 212/774 [00:57<02:16,  4.13it/s][A
+ 28%|██▊       | 213/774 [00:57<02:01,  4.63it/s][A
+ 28%|██▊       | 214/774 [00:57<02:03,  4.54it/s][A
+ 28%|██▊       | 215/774 [00:57<02:02,  4.55it/s][A
+ 28%|██▊       | 216/774 [00:57<02:00,  4.62it/s][A
+ 28%|██▊       | 217/774 [00:58<02:04,  4.47it/s][A
+ 28%|██▊       | 218/774 [00:58<02:11,  4.24it/s][A
+ 28%|██▊       | 219/774 [00:58<02:21,  3.93it/s][A
+ 28%|██▊       | 220/774 [00:58<02:20,  3.95it/s][A
+ 29%|██▊       | 221/774 [00:59<02:25,  3.81it/s][A
+ 29%|██▊       | 222/774 [00:59<02:33,  3.59it/s][A
+ 29%|██▉       | 223/774 [00:59<02:50,  3.23it/s][A
+ 29%|██▉       | 224/774 [01:00<02:58,  3.08it/s][A
+ 29%|██▉       | 225/774 [01:00<03:10,  2.88it/s][A
+ 29%|██▉       | 226/774 [01:01<03:15,  2.81it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.86it/s][A
+ 29%|██▉       | 228/774 [01:01<03:03,  2.97it/s][A
+ 30%|██▉       | 229/774 [01:02<03:18,  2.74it/s][A
+ 30%|██▉       | 230/774 [01:02<03:04,  2.95it/s][A
+ 30%|██▉       | 231/774 [01:02<03:00,  3.01it/s][A
+ 30%|██▉       | 232/774 [01:03<02:52,  3.14it/s][A
+ 30%|███       | 233/774 [01:03<03:08,  2.88it/s][A
+ 30%|███       | 234/774 [01:03<03:10,  2.83it/s][A
+ 30%|███       | 235/774 [01:04<03:09,  2.85it/s][A
+ 30%|███       | 236/774 [01:04<03:12,  2.79it/s][A
+ 31%|███       | 237/774 [01:04<03:09,  2.83it/s][A
+ 31%|███       | 238/774 [01:05<03:00,  2.97it/s][A
+ 31%|███       | 239/774 [01:05<02:58,  2.99it/s][A
+ 31%|███       | 240/774 [01:05<02:58,  2.99it/s][A
+ 31%|███       | 241/774 [01:06<03:00,  2.95it/s][A
+ 31%|███▏      | 242/774 [01:06<03:11,  2.78it/s][A
+ 31%|███▏      | 243/774 [01:07<03:21,  2.64it/s][A
+ 32%|███▏      | 244/774 [01:07<03:14,  2.72it/s][A
+ 32%|███▏      | 245/774 [01:07<03:07,  2.83it/s][A
+ 32%|███▏      | 246/774 [01:08<03:06,  2.83it/s][A
+ 32%|███▏      | 247/774 [01:08<03:44,  2.35it/s][A
+ 32%|███▏      | 248/774 [01:09<03:50,  2.28it/s][A
+ 32%|███▏      | 249/774 [01:09<03:27,  2.53it/s][A
+ 32%|███▏      | 250/774 [01:09<03:19,  2.62it/s][A
+ 32%|███▏      | 251/774 [01:10<03:17,  2.64it/s][A
+ 33%|███▎      | 252/774 [01:10<03:14,  2.68it/s][A
+ 33%|███▎      | 253/774 [01:10<03:11,  2.71it/s][A
+ 33%|███▎      | 254/774 [01:11<03:07,  2.78it/s][A
+ 33%|███▎      | 255/774 [01:11<03:02,  2.84it/s][A
+ 33%|███▎      | 256/774 [01:11<02:58,  2.90it/s][A
+ 33%|███▎      | 257/774 [01:12<02:57,  2.92it/s][A
+ 33%|███▎      | 258/774 [01:12<02:43,  3.16it/s][A
+ 33%|███▎      | 259/774 [01:12<02:25,  3.54it/s][A
+ 34%|███▎      | 260/774 [01:12<02:24,  3.55it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.46it/s][A
+ 34%|███▍      | 262/774 [01:13<02:13,  3.83it/s][A
+ 34%|███▍      | 263/774 [01:13<02:06,  4.05it/s][A
+ 34%|███▍      | 264/774 [01:13<02:15,  3.76it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.92it/s][A
+ 34%|███▍      | 266/774 [01:14<02:02,  4.13it/s][A
+ 34%|███▍      | 267/774 [01:14<02:01,  4.17it/s][A
+ 35%|███▍      | 268/774 [01:14<02:08,  3.94it/s][A
+ 35%|███▍      | 269/774 [01:15<02:14,  3.76it/s][A
+ 35%|███▍      | 270/774 [01:15<02:18,  3.63it/s][A
+ 35%|███▌      | 271/774 [01:15<02:15,  3.72it/s][A
+ 35%|███▌      | 272/774 [01:15<02:04,  4.03it/s][A
+ 35%|███▌      | 273/774 [01:16<02:01,  4.14it/s][A
+ 35%|███▌      | 274/774 [01:16<02:05,  3.98it/s][A
+ 36%|███▌      | 275/774 [01:16<01:58,  4.22it/s][A
+ 36%|███▌      | 276/774 [01:16<01:52,  4.41it/s][A
+ 36%|███▌      | 277/774 [01:17<01:56,  4.25it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.15it/s][A
+ 36%|███▌      | 279/774 [01:17<01:53,  4.36it/s][A
+ 36%|███▌      | 280/774 [01:17<01:54,  4.33it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.94it/s][A
+ 36%|███▋      | 282/774 [01:18<02:16,  3.61it/s][A
+ 37%|███▋      | 283/774 [01:18<02:12,  3.71it/s][A
+ 37%|███▋      | 284/774 [01:18<02:13,  3.68it/s][A
+ 37%|███▋      | 285/774 [01:19<02:06,  3.87it/s][A
+ 37%|███▋      | 286/774 [01:19<02:01,  4.03it/s][A
+ 37%|███▋      | 287/774 [01:19<02:12,  3.67it/s][A
+ 37%|███▋      | 288/774 [01:20<02:16,  3.56it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.64it/s][A
+ 37%|███▋      | 290/774 [01:20<02:09,  3.75it/s][A
+ 38%|███▊      | 291/774 [01:20<02:08,  3.76it/s][A
+ 38%|███▊      | 292/774 [01:21<02:05,  3.84it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.18it/s][A
+ 38%|███▊      | 294/774 [01:21<01:50,  4.34it/s][A
+ 38%|███▊      | 295/774 [01:21<01:49,  4.39it/s][A
+ 38%|███▊      | 296/774 [01:21<01:44,  4.60it/s][A
+ 38%|███▊      | 297/774 [01:22<01:38,  4.83it/s][A
+ 39%|███▊      | 298/774 [01:22<01:44,  4.58it/s][A
+ 39%|███▊      | 299/774 [01:22<01:46,  4.44it/s][A
+ 39%|███▉      | 300/774 [01:22<01:53,  4.18it/s][A
+ 39%|███▉      | 301/774 [01:23<01:46,  4.45it/s][A
+ 39%|███▉      | 302/774 [01:23<01:40,  4.69it/s][A
+ 39%|███▉      | 303/774 [01:23<01:37,  4.81it/s][A
+ 39%|███▉      | 304/774 [01:23<01:26,  5.42it/s][A
+ 39%|███▉      | 305/774 [01:23<01:24,  5.52it/s][A
+ 40%|███▉      | 306/774 [01:23<01:37,  4.80it/s][A
+ 40%|███▉      | 307/774 [01:24<01:42,  4.54it/s][A
+ 40%|███▉      | 308/774 [01:24<01:38,  4.75it/s][A
+ 40%|███▉      | 309/774 [01:24<01:38,  4.72it/s][A
+ 40%|████      | 310/774 [01:24<01:42,  4.51it/s][A
+ 40%|████      | 311/774 [01:25<01:41,  4.56it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.66it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.62it/s][A
+ 41%|████      | 314/774 [01:25<01:41,  4.55it/s][A
+ 41%|████      | 315/774 [01:26<01:48,  4.21it/s][A
+ 41%|████      | 316/774 [01:26<01:39,  4.58it/s][A
+ 41%|████      | 317/774 [01:26<01:32,  4.91it/s][A
+ 41%|████      | 318/774 [01:26<01:36,  4.73it/s][A
+ 41%|████      | 319/774 [01:26<01:38,  4.61it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.57it/s][A
+ 41%|████▏     | 321/774 [01:27<01:29,  5.04it/s][A
+ 42%|████▏     | 322/774 [01:27<01:24,  5.33it/s][A
+ 42%|████▏     | 323/774 [01:27<01:16,  5.90it/s][A
+ 42%|████▏     | 324/774 [01:27<01:23,  5.38it/s][A
+ 42%|████▏     | 325/774 [01:27<01:27,  5.10it/s][A
+ 42%|████▏     | 326/774 [01:28<01:25,  5.27it/s][A
+ 42%|████▏     | 327/774 [01:28<01:27,  5.09it/s][A
+ 42%|████▏     | 328/774 [01:28<01:25,  5.21it/s][A
+ 43%|████▎     | 329/774 [01:28<01:34,  4.72it/s][A
+ 43%|████▎     | 330/774 [01:28<01:30,  4.90it/s][A
+ 43%|████▎     | 331/774 [01:29<01:22,  5.38it/s][A
+ 43%|████▎     | 332/774 [01:29<01:20,  5.50it/s][A
+ 43%|████▎     | 333/774 [01:29<01:23,  5.31it/s][A
+ 43%|████▎     | 334/774 [01:29<01:27,  5.05it/s][A
+ 43%|████▎     | 335/774 [01:29<01:29,  4.90it/s][A
+ 43%|████▎     | 336/774 [01:30<01:27,  4.98it/s][A
+ 44%|████▎     | 337/774 [01:30<01:21,  5.37it/s][A
+ 44%|████▎     | 338/774 [01:30<01:16,  5.73it/s][A
+ 44%|████▍     | 339/774 [01:30<01:11,  6.09it/s][A
+ 44%|████▍     | 340/774 [01:30<01:10,  6.19it/s][A
+ 44%|████▍     | 341/774 [01:30<01:27,  4.93it/s][A
+ 44%|████▍     | 342/774 [01:31<01:36,  4.45it/s][A
+ 44%|████▍     | 343/774 [01:31<01:38,  4.38it/s][A
+ 44%|████▍     | 344/774 [01:31<01:42,  4.21it/s][A
+ 45%|████▍     | 345/774 [01:32<01:45,  4.08it/s][A
+ 45%|████▍     | 346/774 [01:32<01:47,  3.98it/s][A
+ 45%|████▍     | 347/774 [01:32<01:44,  4.10it/s][A
+ 45%|████▍     | 348/774 [01:32<01:39,  4.27it/s][A
+ 45%|████▌     | 349/774 [01:32<01:35,  4.46it/s][A
+ 45%|████▌     | 350/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 351/774 [01:33<01:38,  4.30it/s][A
+ 45%|████▌     | 352/774 [01:33<01:34,  4.48it/s][A
+ 46%|████▌     | 353/774 [01:33<01:34,  4.47it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.48it/s][A
+ 46%|████▌     | 355/774 [01:34<01:38,  4.24it/s][A
+ 46%|████▌     | 356/774 [01:34<01:48,  3.86it/s][A
+ 46%|████▌     | 357/774 [01:35<02:05,  3.34it/s][A
+ 46%|████▋     | 358/774 [01:35<02:08,  3.25it/s][A
+ 46%|████▋     | 359/774 [01:35<02:07,  3.26it/s][A
+ 47%|████▋     | 360/774 [01:35<02:07,  3.25it/s][A
+ 47%|████▋     | 361/774 [01:36<02:01,  3.40it/s][A
+ 47%|████▋     | 362/774 [01:36<02:07,  3.24it/s][A
+ 47%|████▋     | 363/774 [01:36<02:06,  3.26it/s][A
+ 47%|████▋     | 364/774 [01:37<02:07,  3.21it/s][A
+ 47%|████▋     | 365/774 [01:37<02:04,  3.29it/s][A
+ 47%|████▋     | 366/774 [01:37<01:54,  3.57it/s][A
+ 47%|████▋     | 367/774 [01:37<01:51,  3.65it/s][A
+ 48%|████▊     | 368/774 [01:38<01:48,  3.74it/s][A
+ 48%|████▊     | 369/774 [01:38<01:55,  3.50it/s][A
+ 48%|████▊     | 370/774 [01:38<02:08,  3.13it/s][A
+ 48%|████▊     | 371/774 [01:39<02:00,  3.35it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.32it/s][A
+ 48%|████▊     | 373/774 [01:39<01:59,  3.36it/s][A
+ 48%|████▊     | 374/774 [01:40<01:55,  3.46it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.44it/s][A
+ 49%|████▊     | 376/774 [01:40<02:00,  3.30it/s][A
+ 49%|████▊     | 377/774 [01:41<02:12,  2.99it/s][A
+ 49%|████▉     | 378/774 [01:41<02:13,  2.97it/s][A
+ 49%|████▉     | 379/774 [01:41<02:03,  3.19it/s][A
+ 49%|████▉     | 380/774 [01:41<01:53,  3.47it/s][A
+ 49%|████▉     | 381/774 [01:42<01:45,  3.72it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.84it/s][A
+ 49%|████▉     | 383/774 [01:42<01:40,  3.90it/s][A
+ 50%|████▉     | 384/774 [01:42<01:47,  3.62it/s][A
+ 50%|████▉     | 385/774 [01:43<01:56,  3.35it/s][A
+ 50%|████▉     | 386/774 [01:43<01:48,  3.57it/s][A
+ 50%|█████     | 387/774 [01:43<01:42,  3.79it/s][A
+ 50%|█████     | 388/774 [01:44<01:47,  3.60it/s][A
+ 50%|█████     | 389/774 [01:44<01:43,  3.72it/s][A
+ 50%|█████     | 390/774 [01:44<01:56,  3.29it/s][A
+ 51%|█████     | 391/774 [01:45<01:58,  3.24it/s][A
+ 51%|█████     | 392/774 [01:45<01:48,  3.52it/s][A
+ 51%|█████     | 393/774 [01:45<01:40,  3.80it/s][A
+ 51%|█████     | 394/774 [01:45<01:40,  3.78it/s][A
+ 51%|█████     | 395/774 [01:46<01:47,  3.52it/s][A
+ 51%|█████     | 396/774 [01:46<01:45,  3.59it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:48,  3.48it/s][A
+ 51%|█████▏    | 398/774 [01:46<01:43,  3.63it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:40,  3.72it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:33,  3.99it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:30,  4.12it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:31,  4.08it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:35,  3.90it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:40,  3.69it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:36,  3.82it/s][A
+ 52%|█████▏    | 406/774 [01:48<01:39,  3.71it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:45,  3.48it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:41,  3.62it/s][A
+ 53%|█████▎    | 409/774 [01:49<01:38,  3.72it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.67it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:39,  3.65it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:40,  3.60it/s][A
+ 53%|█████▎    | 413/774 [01:50<01:38,  3.66it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:36,  3.75it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:24,  4.23it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:25,  4.18it/s][A
+ 54%|█████▍    | 417/774 [01:51<01:24,  4.24it/s][A
+ 54%|█████▍    | 418/774 [01:51<01:17,  4.58it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:32,  3.85it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:36,  3.66it/s][A
+ 54%|█████▍    | 421/774 [01:52<01:36,  3.66it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:35,  3.68it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:36,  3.63it/s][A
+ 55%|█████▍    | 424/774 [01:53<01:34,  3.69it/s][A
+ 55%|█████▍    | 425/774 [01:53<01:24,  4.15it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:17,  4.46it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:13,  4.73it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:15,  4.57it/s][A
+ 55%|█████▌    | 429/774 [01:54<01:18,  4.42it/s][A
+ 56%|█████▌    | 430/774 [01:54<01:22,  4.15it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:35,  3.61it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:34,  3.63it/s][A
+ 56%|█████▌    | 433/774 [01:55<01:27,  3.90it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:22,  4.12it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:21,  4.15it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:22,  4.08it/s][A
+ 56%|█████▋    | 437/774 [01:56<01:19,  4.23it/s][A
+ 57%|█████▋    | 438/774 [01:56<01:15,  4.44it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:19,  4.23it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:23,  4.02it/s][A
+ 57%|█████▋    | 441/774 [01:57<01:26,  3.84it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:28,  3.74it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:26,  3.82it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:25,  3.88it/s][A
+ 57%|█████▋    | 445/774 [01:58<01:25,  3.86it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:23,  3.95it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:22,  3.98it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:14,  4.36it/s][A
+ 58%|█████▊    | 449/774 [01:59<01:15,  4.30it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:18,  4.13it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.26it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:11,  4.47it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:10,  4.53it/s][A
+ 59%|█████▊    | 454/774 [02:00<01:16,  4.16it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:22,  3.89it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:25,  3.70it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:19,  3.98it/s][A
+ 59%|█████▉    | 458/774 [02:01<01:19,  3.99it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:17,  4.06it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:21,  3.83it/s][A
+ 60%|█████▉    | 461/774 [02:02<01:29,  3.51it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:26,  3.60it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:24,  3.69it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:23,  3.69it/s][A
+ 60%|██████    | 465/774 [02:03<01:16,  4.06it/s][A
+ 60%|██████    | 466/774 [02:04<01:13,  4.21it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.45it/s][A
+ 60%|██████    | 468/774 [02:04<01:09,  4.41it/s][A
+ 61%|██████    | 469/774 [02:04<01:03,  4.81it/s][A
+ 61%|██████    | 470/774 [02:04<00:59,  5.09it/s][A
+ 61%|██████    | 471/774 [02:05<01:02,  4.87it/s][A
+ 61%|██████    | 472/774 [02:05<01:06,  4.51it/s][A
+ 61%|██████    | 473/774 [02:05<01:09,  4.30it/s][A
+ 61%|██████    | 474/774 [02:05<01:08,  4.39it/s][A
+ 61%|██████▏   | 475/774 [02:05<01:08,  4.33it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.86it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:31,  3.24it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:32,  3.19it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:30,  3.26it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:27,  3.37it/s][A
+ 62%|██████▏   | 481/774 [02:07<01:28,  3.31it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:26,  3.38it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:24,  3.46it/s][A
+ 63%|██████▎   | 484/774 [02:08<01:25,  3.39it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:27,  3.30it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:24,  3.43it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:25,  3.36it/s][A
+ 63%|██████▎   | 488/774 [02:09<01:23,  3.44it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:18,  3.65it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:18,  3.64it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:16,  3.68it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.58it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:19,  3.55it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:17,  3.60it/s][A
+ 64%|██████▍   | 495/774 [02:11<01:17,  3.60it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:22,  3.36it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.31it/s][A
+ 64%|██████▍   | 498/774 [02:12<01:22,  3.35it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:19,  3.44it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:17,  3.52it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:14,  3.64it/s][A
+ 65%|██████▍   | 502/774 [02:13<01:14,  3.66it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:19,  3.41it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:21,  3.31it/s][A
+ 65%|██████▌   | 505/774 [02:14<01:18,  3.41it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:18,  3.41it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:22,  3.22it/s][A
+ 66%|██████▌   | 508/774 [02:15<01:20,  3.29it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:19,  3.33it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.41it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:12,  3.61it/s][A
+ 66%|██████▌   | 512/774 [02:16<01:11,  3.68it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.49it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.40it/s][A
+ 67%|██████▋   | 515/774 [02:17<01:22,  3.15it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:16,  3.36it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.65it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:07,  3.77it/s][A
+ 67%|██████▋   | 519/774 [02:18<01:10,  3.62it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:09,  3.64it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.73it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:04,  3.93it/s][A
+ 68%|██████▊   | 523/774 [02:19<01:03,  3.97it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:06,  3.77it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:07,  3.69it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:10,  3.54it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:11,  3.45it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.49it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:06,  3.67it/s][A
+ 68%|██████▊   | 530/774 [02:21<01:05,  3.72it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:05,  3.73it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:03,  3.83it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.08it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:56,  4.23it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:59,  4.04it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.88it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:01,  3.82it/s][A
+ 70%|██████▉   | 538/774 [02:23<01:05,  3.59it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:04,  3.62it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.64it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:02,  3.74it/s][A
+ 70%|███████   | 542/774 [02:24<01:01,  3.75it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.65it/s][A
+ 70%|███████   | 544/774 [02:25<01:03,  3.63it/s][A
+ 70%|███████   | 545/774 [02:25<01:01,  3.75it/s][A
+ 71%|███████   | 546/774 [02:25<00:57,  3.95it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.10it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.16it/s][A
+ 71%|███████   | 549/774 [02:26<00:55,  4.07it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.83it/s][A
+ 71%|███████   | 551/774 [02:27<01:00,  3.68it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:03,  3.48it/s][A
+ 71%|███████▏  | 553/774 [02:27<01:07,  3.25it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:06,  3.29it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:06,  3.29it/s][A
+ 72%|███████▏  | 556/774 [02:28<01:02,  3.46it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:06,  3.25it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:01,  3.54it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:55,  3.86it/s][A
+ 72%|███████▏  | 560/774 [02:29<01:00,  3.54it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.72it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.02it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:50,  4.16it/s][A
+ 73%|███████▎  | 564/774 [02:30<00:52,  4.04it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:53,  3.88it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:49,  4.17it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:45,  4.51it/s][A
+ 73%|███████▎  | 568/774 [02:31<00:47,  4.31it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.25it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.23it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:52,  3.86it/s][A
+ 74%|██���████▍  | 572/774 [02:32<00:54,  3.71it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.75it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.84it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:51,  3.86it/s][A
+ 74%|███████▍  | 576/774 [02:33<00:56,  3.48it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:54,  3.60it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:53,  3.65it/s][A
+ 75%|███████▍  | 579/774 [02:34<00:56,  3.47it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.48it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:54,  3.53it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:52,  3.63it/s][A
+ 75%|███████▌  | 583/774 [02:35<00:50,  3.76it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.79it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:52,  3.62it/s][A
+ 76%|███████▌  | 586/774 [02:36<00:52,  3.58it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.63it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.70it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:48,  3.78it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.08it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:46,  3.92it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.68it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:50,  3.61it/s][A
+ 77%|███████▋  | 594/774 [02:38<00:49,  3.61it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:53,  3.33it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:56,  3.16it/s][A
+ 77%|███████▋  | 597/774 [02:39<00:56,  3.13it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.04it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:58,  3.00it/s][A
+ 78%|███████▊  | 600/774 [02:40<00:58,  2.99it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:57,  2.99it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:58,  2.96it/s][A
+ 78%|███████▊  | 603/774 [02:41<00:56,  3.00it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.95it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:56,  3.00it/s][A
+ 78%|███████▊  | 606/774 [02:42<00:57,  2.90it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:57,  2.92it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.94it/s][A
+ 79%|███████▊  | 609/774 [02:43<00:53,  3.06it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.98it/s][A
+ 79%|███████▉  | 611/774 [02:44<00:59,  2.76it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.64it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:56,  2.84it/s][A
+ 79%|███████▉  | 614/774 [02:45<00:55,  2.90it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.04it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.09it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:50,  3.10it/s][A
+ 80%|███████▉  | 618/774 [02:46<00:48,  3.25it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.41it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.45it/s][A
+ 80%|████████  | 621/774 [02:47<00:41,  3.71it/s][A
+ 80%|████████  | 622/774 [02:47<00:38,  3.97it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.97it/s][A
+ 81%|████████  | 624/774 [02:48<00:41,  3.63it/s][A
+ 81%|████████  | 625/774 [02:48<00:41,  3.57it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.31it/s][A
+ 81%|████████  | 627/774 [02:49<00:45,  3.25it/s][A
+ 81%|████████  | 628/774 [02:49<00:45,  3.23it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.33it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.55it/s][A
+ 82%|████████▏ | 631/774 [02:50<00:38,  3.73it/s][A
+ 82%|████████▏ | 632/774 [02:50<00:37,  3.74it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.56it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:40,  3.45it/s][A
+ 82%|████████▏ | 635/774 [02:51<00:39,  3.52it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:40,  3.44it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:39,  3.49it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:39,  3.47it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:44,  3.04it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:50,  2.65it/s][A
+ 83%|████████▎ | 641/774 [02:53<00:49,  2.68it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.88it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.90it/s][A
+ 83%|████████▎ | 644/774 [02:54<00:41,  3.12it/s][A
+ 83%|████████▎ | 645/774 [02:54<00:37,  3.41it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.64it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.90it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:31,  4.04it/s][A
+ 84%|████████▍ | 649/774 [02:55<00:30,  4.08it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.30it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:28,  4.25it/s][A
+ 84%|████████▍ | 652/774 [02:56<00:29,  4.13it/s][A
+ 84%|████████▍ | 653/774 [02:56<00:31,  3.85it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.09it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:27,  4.40it/s][A
+ 85%|████████▍ | 656/774 [02:57<00:27,  4.26it/s][A
+ 85%|████████▍ | 657/774 [02:57<00:26,  4.45it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.24it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.89it/s][A
+ 85%|████████▌ | 660/774 [02:58<00:30,  3.75it/s][A
+ 85%|████████▌ | 661/774 [02:58<00:30,  3.72it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.88it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.66it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.63it/s][A
+ 86%|████████▌ | 665/774 [02:59<00:27,  3.92it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:24,  4.35it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.62it/s][A
+ 86%|████████▋ | 668/774 [03:00<00:23,  4.45it/s][A
+ 86%|████████▋ | 669/774 [03:00<00:25,  4.19it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.31it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.89it/s][A
+ 87%|████████▋ | 672/774 [03:01<00:25,  3.96it/s][A
+ 87%|████████▋ | 673/774 [03:01<00:24,  4.06it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.00it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.22it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:22,  4.40it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:22,  4.36it/s][A
+ 88%|████████▊ | 678/774 [03:02<00:21,  4.40it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.17it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.16it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:20,  4.44it/s][A
+ 88%|████████▊ | 682/774 [03:03<00:20,  4.47it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:22,  4.11it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:23,  3.86it/s][A
+ 89%|████████▊ | 685/774 [03:04<00:23,  3.71it/s][A
+ 89%|████████▊ | 686/774 [03:04<00:23,  3.82it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.03it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  4.04it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:20,  4.18it/s][A
+ 89%|████████▉ | 690/774 [03:05<00:19,  4.33it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.42it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.48it/s][A
+ 90%|████████▉ | 693/774 [03:06<00:18,  4.49it/s][A
+ 90%|████████▉ | 694/774 [03:06<00:19,  4.20it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.89it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.98it/s][A
+ 90%|█████████ | 697/774 [03:07<00:19,  3.97it/s][A
+ 90%|█████████ | 698/774 [03:07<00:17,  4.36it/s][A
+ 90%|█████████ | 699/774 [03:07<00:15,  4.72it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.34it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.40it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.39it/s][A
+ 91%|█████████ | 703/774 [03:08<00:16,  4.39it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.25it/s][A
+ 91%|█████████ | 705/774 [03:09<00:15,  4.59it/s][A
+ 91%|█████████ | 706/774 [03:09<00:14,  4.75it/s][A
+ 91%|█████████▏| 707/774 [03:09<00:14,  4.66it/s][A
+ 91%|█████████▏| 708/774 [03:09<00:13,  4.90it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.75it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.66it/s][A
+ 92%|█████████▏| 711/774 [03:10<00:12,  4.88it/s][A
+ 92%|█████████▏| 712/774 [03:10<00:12,  5.11it/s][A
+ 92%|█████████▏| 713/774 [03:10<00:12,  4.91it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.63it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.73it/s][A
+ 93%|█████████▎| 716/774 [03:11<00:11,  5.22it/s][A
+ 93%|█████████▎| 717/774 [03:11<00:10,  5.35it/s][A
+ 93%|█████████▎| 718/774 [03:11<00:11,  4.76it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.62it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.94it/s][A
+ 93%|█████████▎| 721/774 [03:12<00:10,  5.21it/s][A
+ 93%|█████████▎| 722/774 [03:12<00:09,  5.61it/s][A
+ 93%|█████████▎| 723/774 [03:12<00:09,  5.48it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.39it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.54it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.58it/s][A
+ 94%|█████████▍| 727/774 [03:13<00:08,  5.37it/s][A
+ 94%|█████████▍| 728/774 [03:13<00:09,  4.81it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.16it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.43it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:07,  5.42it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.57it/s][A
+ 95%|█████████▍| 733/774 [03:14<00:07,  5.55it/s][A
+ 95%|█████████▍| 734/774 [03:14<00:07,  5.60it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.73it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.90it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.80it/s][A
+ 95%|█████████▌| 738/774 [03:15<00:06,  5.60it/s][A
+ 95%|█████████▌| 739/774 [03:15<00:06,  5.54it/s][A
+ 96%|█████████▌| 740/774 [03:15<00:06,  5.44it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.12it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:05,  5.36it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.66it/s][A
+ 96%|█████████▌| 744/774 [03:16<00:05,  5.45it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.52it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.90it/s][A
+ 97%|█████████▋| 747/774 [03:17<00:06,  4.14it/s][A
+ 97%|█████████▋| 748/774 [03:17<00:05,  4.34it/s][A
+ 97%|█████████▋| 749/774 [03:17<00:05,  4.63it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.30it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.49it/s][A
+ 97%|█████████▋| 752/774 [03:18<00:04,  4.44it/s][A
+ 97%|█████████▋| 753/774 [03:18<00:04,  4.72it/s][A
+ 97%|█████████▋| 754/774 [03:18<00:03,  5.35it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.64it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.48it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.31it/s][A
+ 98%|█████████▊| 758/774 [03:19<00:03,  5.21it/s][A
+ 98%|█████████▊| 759/774 [03:19<00:02,  5.55it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.48it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.91it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:01,  6.02it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.24it/s][A
+ 99%|█████████▊| 764/774 [03:20<00:01,  6.35it/s][A
+ 99%|█████████▉| 765/774 [03:20<00:01,  6.24it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.40it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.48it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.45it/s][A
+ 99%|█████████▉| 769/774 [03:21<00:00,  5.16it/s][A
+ 99%|█████████▉| 770/774 [03:21<00:00,  5.04it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.32it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  5.12it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.92it/s][A                                                        
+                                                 [A 55%|█████▍    | 7000/12776 [1:15:14<1:00:18,  1.60it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.92it/s][A
+                                                 [A 55%|█████▍    | 7001/12776 [1:15:15<99:46:19, 62.20s/it]                                                          55%|█████▍    | 7001/12776 [1:15:15<99:46:19, 62.20s/it] 55%|█████▍    | 7002/12776 [1:15:15<70:02:57, 43.67s/it]                                                          55%|█████▍    | 7002/12776 [1:15:15<70:02:57, 43.67s/it] 55%|█████▍    | 7003/12776 [1:15:16<49:13:56, 30.70s/it]                                                          55%|█████▍    | 7003/12776 [1:15:16<49:13:56, 30.70s/it] 55%|█████▍    | 7004/12776 [1:15:16<34:41:19, 21.64s/it]                                                          55%|█████▍    | 7004/12776 [1:15:16<34:41:19, 21.64s/it] 55%|█████▍    | 7005/12776 [1:15:17<24:28:16, 15.27s/it]                                                          55%|█████▍    | 7005/12776 [1:15:17<24:28:16, 15.27s/it] 55%|█████▍    | 7006/12776 [1:15:17<17:18:41, 10.80s/it]                                                          55%|█████▍    | 7006/12776 [1:15:17<17:18:41, 10.80s/it] 55%|█████▍    | 7007/12776 [1:15:17<12:17:54,  7.67s/it]                                                          55%|█████▍    | 7007/12776 [1:15:17<12:17:54,  7.67s/it] 55%|█████▍    | 7008/12776 [1:15:18<8:46:43,  5.48s/it]                                                          55%|█████▍    | 7008/12776 [1:15:18<8:46:43,  5.48s/it] 55%|█████▍    | 7009/12776 [1:15:18<6:18:38,  3.94s/it]                                                         55%|█████▍    | 7009/12776 [1:15:18<6:18:38,  3.94s/it] 55%|█████▍    | 7010/12776 [1:15:19<4:37:37,  2.89s/it]                                                         55%|█████▍    | 7010/12776 [1:15:19<4:37:37,  2.89s/it] 55%|█████▍    | 7011/12776 [1:15:19<3:23:45,  2.12s/it]                                                         55%|█████▍    | 7011/12776 [1:15:19<3:23:45,  2.12s/it] 55%|█████▍    | 7012/12776 [1:15:19<2:31:42,  1.58s/it]                                                         55%|█████▍    | 7012/12776 [1:15:19<2:31:42,  1.58s/it] 55%|█████▍    | 7013/12776 [1:15:19<1:54:51,  1.20s/it]                                                         55%|█████▍    | 7013/12776 [1:15:19<1:54:51,  1.20s/it] 55%|█████▍    | 7014/12776 [1:15:20<1:30:13,  1.06it/s]                                                         55%|█████▍    | 7014/12776 [1:15:20<1:30:13,  1.06it/s] 55%|█████▍    | 7015/12776 [1:15:20<1:11:31,  1.34it/s]                                                         55%|█████▍    | 7015/12776 [1:15:20<1:11:31,  1.34it/s] 55%|█████▍    | 7016/12776 [1:15:20<58:12,  1.65it/s]                                                         55%|█████▍    | 7016/12776 [1:15:20<58:12,  1.65it/s] 55%|█████▍    | 7017/12776 [1:15:21<48:36,  1.97it/s]                                                       55%|█████▍    | 7017/12776 [1:15:21<48:36,  1.97it/s] 55%|█████▍    | 7018/12776 [1:15:21<42:26,  2.26it/s]                                                       55%|█████▍    | 7018/12776 [1:15:21<42:26,  2.26it/s] 55%|█████▍    | 7019/12776 [1:15:21<37:00,  2.59it/s]                                                       55%|█████▍    | 7019/12776 [1:15:21<37:00,  2.59it/s] 55%|█████▍    | 7020/12776 [1:15:21<33:07,  2.90it/s]                                                       55%|█████▍    | 7020/12776 [1:15:21<33:07,  2.90it/s] 55%|█████▍    | 7021/12776 [1:15:22<30:17,  3.17it/s]                                                       55%|█████▍    | 7021/12776 [1:15:22<30:17,  3.17it/s] 55%|█████▍    | 7022/12776 [1:15:22<31:58,  3.00it/s]                                                       55%|█████▍    | 7022/12776 [1:15:22<31:58,  3.00it/s] 55%|█████▍    | 7023/12776 [1:15:22<29:12,  3.28it/s]                                                       55%|█████▍    | 7023/12776 [1:15:22<29:12,  3.28it/s] 55%|█████▍    | 7024/12776 [1:15:23<27:03,  3.54it/s]                                                       55%|█████▍    | 7024/12776 [1:15:23<27:03,  3.54it/s] 55%|█████▍    | 7025/12776 [1:15:23<25:22,  3.78it/s]                                                       55%|█████▍    | 7025/12776 [1:15:23<25:22,  3.78it/s] 55%|█████▍    | 7026/12776 [1:15:23<26:06,  3.67it/s]                                                       55%|█████▍    | 7026/12776 [1:15:23<26:06,  3.67it/s] 55%|█████▌    | 7027/12776 [1:15:23<24:23,  3.93it/s]                                                       55%|█████▌    | 7027/12776 [1:15:23<24:23,  3.93it/s] 55%|█████▌    | 7028/12776 [1:15:23<23:06,  4.14it/s]                                                       55%|█████▌    | 7028/12776 [1:15:23<23:06,  4.14it/s] 55%|█████▌    | 7029/12776 [1:15:24<22:14,  4.31it/s]                                                       55%|█████▌    | 7029/12776 [1:15:24<22:14,  4.31it/s] 55%|█████▌    | 7030/12776 [1:15:24<21:28,  4.46it/s]                                                       55%|█████▌    | 7030/12776 [1:15:24<21:28,  4.46it/s] 55%|█████▌    | 7031/12776 [1:15:24<23:43,  4.04it/s]                                                       55%|█████▌    | 7031/12776 [1:15:24<23:43,  4.04it/s] 55%|█████▌    | 7032/12776 [1:15:24<22:27,  4.26it/s]                                                       55%|█████▌    | 7032/12776 [1:15:24<22:27,  4.26it/s] 55%|█████▌    | 7033/12776 [1:15:25<21:24,  4.47it/s]                                                       55%|█████▌    | 7033/12776 [1:15:25<21:24,  4.47it/s] 55%|█████▌    | 7034/12776 [1:15:25<20:34,  4.65it/s]                                                       55%|█████▌    | 7034/12776 [1:15:25<20:34,  4.65it/s] 55%|█████▌    | 7035/12776 [1:15:25<19:54,  4.81it/s]                                                       55%|█████▌    | 7035/12776 [1:15:25<19:54,  4.81it/s] 55%|█████▌    | 7036/12776 [1:15:25<19:16,  4.96it/s]                                                       55%|█████▌    | 7036/12776 [1:15:25<19:16,  4.96it/s] 55%|█████▌    | 7037/12776 [1:15:25<21:54,  4.37it/s]                                                       55%|█████▌    | 7037/12776 [1:15:25<21:54,  4.37it/s] 55%|█████▌    | 7038/12776 [1:15:26<35:44,  2.68it/s]                                                       55%|█████▌    | 7038/12776 [1:15:26<35:44,  2.68it/s] 55%|█████▌    | 7039/12776 [1:15:28<1:07:56,  1.41it/s]                                                         55%|█████▌    | 7039/12776 [1:15:28<1:07:56,  1.41it/s] 55%|█████▌    | 7040/12776 [1:15:29<1:15:18,  1.27it/s]                                                         55%|█████▌    | 7040/12776 [1:15:29<1:15:18,  1.27it/s] 55%|█████▌    | 7041/12776 [1:15:30<1:17:18,  1.24it/s]                                                         55%|█████▌    | 7041/12776 [1:15:30<1:17:18,  1.24it/s] 55%|█████▌    | 7042/12776 [1:15:30<1:17:10,  1.24it/s]                                                         55%|█████▌    | 7042/12776 [1:15:30<1:17:10,  1.24it/s] 55%|█████▌    | 7043/12776 [1:15:31<1:18:15,  1.22it/s]                                                         55%|█████▌    | 7043/12776 [1:15:31<1:18:15,  1.22it/s] 55%|█████▌    | 7044/12776 [1:15:32<1:14:35,  1.28it/s]                                                         55%|█████▌    | 7044/12776 [1:15:32<1:14:35,  1.28it/s] 55%|█████▌    | 7045/12776 [1:15:33<1:13:00,  1.31it/s]                                                         55%|█████▌    | 7045/12776 [1:15:33<1:13:00,  1.31it/s] 55%|█████▌    | 7046/12776 [1:15:33<1:09:07,  1.38it/s]                                                         55%|█████▌    | 7046/12776 [1:15:33<1:09:07,  1.38it/s] 55%|█████▌    | 7047/12776 [1:15:34<1:06:01,  1.45it/s]                                                         55%|█████▌    | 7047/12776 [1:15:34<1:06:01,  1.45it/s] 55%|█████▌    | 7048/12776 [1:15:34<1:02:29,  1.53it/s]                                                         55%|█████▌    | 7048/12776 [1:15:34<1:02:29,  1.53it/s] 55%|█████▌    | 7049/12776 [1:15:35<59:10,  1.61it/s]                                                         55%|█████▌    | 7049/12776 [1:15:35<59:10,  1.61it/s] 55%|█████▌    | 7050/12776 [1:15:35<56:03,  1.70it/s]                                                       55%|█████▌    | 7050/12776 [1:15:35<56:03,  1.70it/s] 55%|█████▌    | 7051/12776 [1:15:36<55:52,  1.71it/s]                                                       55%|█████▌    | 7051/12776 [1:15:36<55:52,  1.71it/s] 55%|█████▌    | 7052/12776 [1:15:36<52:15,  1.83it/s]                                                       55%|█████▌    | 7052/12776 [1:15:36<52:15,  1.83it/s] 55%|█████▌    | 7053/12776 [1:15:37<54:39,  1.75it/s]                                                       55%|█████▌    | 7053/12776 [1:15:37<54:39,  1.75it/s] 55%|█████▌    | 7054/12776 [1:15:38<50:46,  1.88it/s]                                                       55%|█████▌    | 7054/12776 [1:15:38<50:46,  1.88it/s] 55%|█████▌    | 7055/12776 [1:15:38<50:09,  1.90it/s]                                                       55%|█████▌    | 7055/12776 [1:15:38<50:09,  1.90it/s] 55%|█████▌    | 7056/12776 [1:15:38<46:44,  2.04it/s]                                                       55%|█████▌    | 7056/12776 [1:15:38<46:44,  2.04it/s] 55%|█████▌    | 7057/12776 [1:15:39<44:03,  2.16it/s]                                                       55%|█████▌    | 7057/12776 [1:15:39<44:03,  2.16it/s] 55%|█████▌    | 7058/12776 [1:15:39<45:31,  2.09it/s]                                                       55%|█████▌    | 7058/12776 [1:15:39<45:31,  2.09it/s] 55%|█████▌    | 7059/12776 [1:15:40<42:23,  2.25it/s]                                                       55%|█████▌    | 7059/12776 [1:15:40<42:23,  2.25it/s] 55%|█████▌    | 7060/12776 [1:15:40<39:51,  2.39it/s]                                                       55%|█████▌    | 7060/12776 [1:15:40<39:51,  2.39it/s] 55%|█████▌    | 7061/12776 [1:15:41<41:01,  2.32it/s]                                                       55%|█████▌    | 7061/12776 [1:15:41<41:01,  2.32it/s] 55%|█████▌    | 7062/12776 [1:15:41<38:16,  2.49it/s]                                                       55%|█████▌    | 7062/12776 [1:15:41<38:16,  2.49it/s] 55%|█████▌    | 7063/12776 [1:15:41<36:09,  2.63it/s]                                                       55%|█████▌    | 7063/12776 [1:15:41<36:09,  2.63it/s] 55%|█████▌    | 7064/12776 [1:15:42<35:16,  2.70it/s]                                                       55%|█████▌    | 7064/12776 [1:15:42<35:16,  2.70it/s] 55%|█████▌    | 7065/12776 [1:15:42<33:17,  2.86it/s]                                                       55%|█████▌    | 7065/12776 [1:15:42<33:17,  2.86it/s] 55%|█████▌    | 7066/12776 [1:15:42<31:44,  3.00it/s]                                                       55%|█████▌    | 7066/12776 [1:15:42<31:44,  3.00it/s] 55%|█████▌    | 7067/12776 [1:15:42<30:24,  3.13it/s]                                                       55%|█████▌    | 7067/12776 [1:15:42<30:24,  3.13it/s] 55%|█████▌    | 7068/12776 [1:15:43<32:13,  2.95it/s]                                                       55%|█████▌    | 7068/12776 [1:15:43<32:13,  2.95it/s] 55%|█████▌    | 7069/12776 [1:15:43<30:14,  3.14it/s]                                                       55%|█████▌    | 7069/12776 [1:15:43<30:14,  3.14it/s] 55%|█████▌    | 7070/12776 [1:15:43<28:44,  3.31it/s]                                                       55%|█████▌    | 7070/12776 [1:15:43<28:44,  3.31it/s] 55%|█████▌    | 7071/12776 [1:15:44<27:24,  3.47it/s]                                                       55%|█████▌    | 7071/12776 [1:15:44<27:24,  3.47it/s] 55%|█████▌    | 7072/12776 [1:15:44<28:07,  3.38it/s]                                                       55%|█████▌    | 7072/12776 [1:15:44<28:07,  3.38it/s] 55%|█████▌    | 7073/12776 [1:15:44<26:38,  3.57it/s]                                                       55%|█████▌    | 7073/12776 [1:15:44<26:38,  3.57it/s] 55%|█████▌    | 7074/12776 [1:15:44<25:26,  3.73it/s]                                                       55%|█████▌    | 7074/12776 [1:15:44<25:26,  3.73it/s] 55%|█████▌    | 7075/12776 [1:15:45<24:18,  3.91it/s]                                                       55%|█████▌    | 7075/12776 [1:15:45<24:18,  3.91it/s] 55%|█████▌    | 7076/12776 [1:15:45<25:14,  3.76it/s]                                                       55%|█████▌    | 7076/12776 [1:15:45<25:14,  3.76it/s] 55%|█████▌    | 7077/12776 [1:15:45<23:48,  3.99it/s]                                                      {'eval_loss': 0.524387538433075, 'eval_wer': 0.33754807998809955, 'eval_runtime': 205.3671, 'eval_samples_per_second': 60.297, 'eval_steps_per_second': 3.769, 'epoch': 1.1}
+{'loss': 0.2684, 'grad_norm': 0.8560186624526978, 'learning_rate': 0.00014139784946236558, 'epoch': 1.1}
+{'loss': 0.3631, 'grad_norm': 0.9131744503974915, 'learning_rate': 0.00014137341153470185, 'epoch': 1.1}
+{'loss': 0.7318, 'grad_norm': 2.9472861289978027, 'learning_rate': 0.0001413489736070381, 'epoch': 1.1}
+{'loss': 0.3798, 'grad_norm': 1.0932270288467407, 'learning_rate': 0.00014132453567937438, 'epoch': 1.1}
+{'loss': 0.6468, 'grad_norm': 1.729344367980957, 'learning_rate': 0.00014130009775171064, 'epoch': 1.1}
+{'loss': 0.4306, 'grad_norm': 2.079954147338867, 'learning_rate': 0.00014127565982404691, 'epoch': 1.1}
+{'loss': 0.4454, 'grad_norm': 0.8679769039154053, 'learning_rate': 0.00014125122189638316, 'epoch': 1.1}
+{'loss': 0.3538, 'grad_norm': 0.751891553401947, 'learning_rate': 0.00014122678396871944, 'epoch': 1.1}
+{'loss': 0.3732, 'grad_norm': 2.451910972595215, 'learning_rate': 0.00014120234604105572, 'epoch': 1.1}
+{'loss': 0.6134, 'grad_norm': 1.3220970630645752, 'learning_rate': 0.00014117790811339197, 'epoch': 1.1}
+{'loss': 0.5123, 'grad_norm': 2.3990230560302734, 'learning_rate': 0.00014115347018572822, 'epoch': 1.1}
+{'loss': 0.8965, 'grad_norm': 3.5617446899414062, 'learning_rate': 0.0001411290322580645, 'epoch': 1.1}
+{'loss': 0.4453, 'grad_norm': 1.2367515563964844, 'learning_rate': 0.00014110459433040078, 'epoch': 1.1}
+{'loss': 0.6857, 'grad_norm': 1.6141263246536255, 'learning_rate': 0.00014108015640273703, 'epoch': 1.1}
+{'loss': 0.8078, 'grad_norm': 1.6750285625457764, 'learning_rate': 0.0001410557184750733, 'epoch': 1.1}
+{'loss': 0.3836, 'grad_norm': 1.4285156726837158, 'learning_rate': 0.00014103128054740956, 'epoch': 1.1}
+{'loss': 0.4486, 'grad_norm': 2.9705681800842285, 'learning_rate': 0.00014100684261974584, 'epoch': 1.1}
+{'loss': 0.5095, 'grad_norm': 1.3870073556900024, 'learning_rate': 0.00014098240469208212, 'epoch': 1.1}
+{'loss': 0.3421, 'grad_norm': 1.9252225160598755, 'learning_rate': 0.00014095796676441837, 'epoch': 1.1}
+{'loss': 0.9082, 'grad_norm': 3.1899898052215576, 'learning_rate': 0.00014093352883675462, 'epoch': 1.1}
+{'loss': 1.0056, 'grad_norm': 2.3937432765960693, 'learning_rate': 0.0001409090909090909, 'epoch': 1.1}
+{'loss': 0.7778, 'grad_norm': 1.9584646224975586, 'learning_rate': 0.00014088465298142718, 'epoch': 1.1}
+{'loss': 0.7928, 'grad_norm': 1.8999119997024536, 'learning_rate': 0.00014086021505376343, 'epoch': 1.1}
+{'loss': 0.3812, 'grad_norm': 1.100942611694336, 'learning_rate': 0.0001408357771260997, 'epoch': 1.1}
+{'loss': 0.8766, 'grad_norm': 3.0417444705963135, 'learning_rate': 0.00014081133919843596, 'epoch': 1.1}
+{'loss': 1.1982, 'grad_norm': 3.2357630729675293, 'learning_rate': 0.00014078690127077224, 'epoch': 1.1}
+{'loss': 1.1896, 'grad_norm': 1.8465913534164429, 'learning_rate': 0.0001407624633431085, 'epoch': 1.1}
+{'loss': 0.9382, 'grad_norm': 1.5905226469039917, 'learning_rate': 0.00014073802541544477, 'epoch': 1.1}
+{'loss': 1.0176, 'grad_norm': 2.8563072681427, 'learning_rate': 0.00014071358748778102, 'epoch': 1.1}
+{'loss': 1.497, 'grad_norm': 3.9410860538482666, 'learning_rate': 0.0001406891495601173, 'epoch': 1.1}
+{'loss': 1.0797, 'grad_norm': 2.426844358444214, 'learning_rate': 0.00014066471163245355, 'epoch': 1.1}
+{'loss': 1.0159, 'grad_norm': 2.4825949668884277, 'learning_rate': 0.00014064027370478983, 'epoch': 1.1}
+{'loss': 1.1596, 'grad_norm': 2.973639965057373, 'learning_rate': 0.00014061583577712608, 'epoch': 1.1}
+{'loss': 0.3454, 'grad_norm': 2.815019369125366, 'learning_rate': 0.00014059139784946236, 'epoch': 1.1}
+{'loss': 0.6438, 'grad_norm': 1.925430178642273, 'learning_rate': 0.0001405669599217986, 'epoch': 1.1}
+{'loss': 0.5383, 'grad_norm': 4.631162166595459, 'learning_rate': 0.00014054252199413488, 'epoch': 1.1}
+{'loss': 1.449, 'grad_norm': 3.7617523670196533, 'learning_rate': 0.00014051808406647116, 'epoch': 1.1}
+{'loss': 1.1796, 'grad_norm': 2.4548864364624023, 'learning_rate': 0.00014049364613880741, 'epoch': 1.1}
+{'loss': 0.2382, 'grad_norm': 0.4076089560985565, 'learning_rate': 0.00014046920821114367, 'epoch': 1.1}
+{'loss': 0.1617, 'grad_norm': 0.7186834216117859, 'learning_rate': 0.00014044477028347994, 'epoch': 1.1}
+{'loss': 0.1957, 'grad_norm': 0.419718861579895, 'learning_rate': 0.00014042033235581622, 'epoch': 1.1}
+{'loss': 0.2296, 'grad_norm': 0.6931477189064026, 'learning_rate': 0.00014039589442815247, 'epoch': 1.1}
+{'loss': 0.4824, 'grad_norm': 1.0758944749832153, 'learning_rate': 0.00014037145650048875, 'epoch': 1.1}
+{'loss': 0.3532, 'grad_norm': 0.721367359161377, 'learning_rate': 0.000140347018572825, 'epoch': 1.1}
+{'loss': 0.2786, 'grad_norm': 0.8995819091796875, 'learning_rate': 0.00014032258064516128, 'epoch': 1.1}
+{'loss': 0.2787, 'grad_norm': 0.5277887582778931, 'learning_rate': 0.00014029814271749756, 'epoch': 1.1}
+{'loss': 0.2092, 'grad_norm': 0.6943791508674622, 'learning_rate': 0.0001402737047898338, 'epoch': 1.1}
+{'loss': 0.3526, 'grad_norm': 1.1070524454116821, 'learning_rate': 0.00014024926686217006, 'epoch': 1.1}
+{'loss': 0.3538, 'grad_norm': 1.660226583480835, 'learning_rate': 0.00014022482893450634, 'epoch': 1.1}
+{'loss': 0.6886, 'grad_norm': 1.7828032970428467, 'learning_rate': 0.0001402003910068426, 'epoch': 1.1}
+{'loss': 0.3213, 'grad_norm': 1.3736459016799927, 'learning_rate': 0.00014017595307917887, 'epoch': 1.1}
+{'loss': 0.3819, 'grad_norm': 1.7011991739273071, 'learning_rate': 0.00014015151515151515, 'epoch': 1.1}
+{'loss': 0.3103, 'grad_norm': 1.1513324975967407, 'learning_rate': 0.0001401270772238514, 'epoch': 1.1}
+{'loss': 0.2689, 'grad_norm': 0.6520640254020691, 'learning_rate': 0.00014010263929618765, 'epoch': 1.1}
+{'loss': 0.7632, 'grad_norm': 1.3298591375350952, 'learning_rate': 0.00014007820136852393, 'epoch': 1.1}
+{'loss': 0.3654, 'grad_norm': 1.7201464176177979, 'learning_rate': 0.0001400537634408602, 'epoch': 1.1}
+{'loss': 0.3845, 'grad_norm': 0.8514449596405029, 'learning_rate': 0.00014002932551319646, 'epoch': 1.1}
+{'loss': 0.4584, 'grad_norm': 1.0151549577713013, 'learning_rate': 0.00014000488758553274, 'epoch': 1.1}
+{'loss': 0.5458, 'grad_norm': 0.9856629967689514, 'learning_rate': 0.000139980449657869, 'epoch': 1.11}
+{'loss': 0.2539, 'grad_norm': 0.9040552377700806, 'learning_rate': 0.00013995601173020527, 'epoch': 1.11}
+{'loss': 0.6513, 'grad_norm': 1.2553309202194214, 'learning_rate': 0.00013993157380254155, 'epoch': 1.11}
+{'loss': 0.6279, 'grad_norm': 3.3496036529541016, 'learning_rate': 0.0001399071358748778, 'epoch': 1.11}
+{'loss': 0.5166, 'grad_norm': 2.5765726566314697, 'learning_rate': 0.00013988269794721405, 'epoch': 1.11}
+{'loss': 0.4737, 'grad_norm': 1.883778691291809, 'learning_rate': 0.00013985826001955033, 'epoch': 1.11}
+{'loss': 0.3679, 'grad_norm': 0.8956629037857056, 'learning_rate': 0.0001398338220918866, 'epoch': 1.11}
+{'loss': 0.5382, 'grad_norm': 1.2945466041564941, 'learning_rate': 0.00013980938416422286, 'epoch': 1.11}
+{'loss': 0.3423, 'grad_norm': 3.4074881076812744, 'learning_rate': 0.00013978494623655913, 'epoch': 1.11}
+{'loss': 0.6275, 'grad_norm': 1.626989722251892, 'learning_rate': 0.00013976050830889539, 'epoch': 1.11}
+{'loss': 0.7302, 'grad_norm': 1.9460794925689697, 'learning_rate': 0.00013973607038123166, 'epoch': 1.11}
+{'loss': 0.6552, 'grad_norm': 1.4970815181732178, 'learning_rate': 0.00013971163245356794, 'epoch': 1.11}
+{'loss': 0.6406, 'grad_norm': 1.4219391345977783, 'learning_rate': 0.0001396871945259042, 'epoch': 1.11}
+{'loss': 0.8659, 'grad_norm': 2.354487180709839, 'learning_rate': 0.00013966275659824044, 'epoch': 1.11}
+{'loss': 0.7318, 'grad_norm': 3.323763370513916, 'learning_rate': 0.00013963831867057672, 'epoch': 1.11}
+{'loss': 1.239, 'grad_norm': 2.702415943145752, 'learning_rate': 0.00013961388074291297, 'epoch': 1.11}
+{'loss': 1.0215, 'grad_norm': 2.1730268001556396, 'learning_rate': 0.00013958944281524925, 'epoch': 1.11}
+{'loss': 0.7656, 'grad_norm': 1.8554517030715942, 'learning_rate': 0.00013956500488758553, 'epoch': 1.11}
+ 55%|█████▌    | 7077/12776 [1:15:45<23:48,  3.99it/s] 55%|█████▌    | 7078/12776 [1:15:45<22:42,  4.18it/s]                                                       55%|█████▌    | 7078/12776 [1:15:45<22:42,  4.18it/s] 55%|█████▌    | 7079/12776 [1:15:46<21:54,  4.33it/s]                                                       55%|█████▌    | 7079/12776 [1:15:46<21:54,  4.33it/s] 55%|█████▌    | 7080/12776 [1:15:46<21:14,  4.47it/s]                                                       55%|█████▌    | 7080/12776 [1:15:46<21:14,  4.47it/s] 55%|█████▌    | 7081/12776 [1:15:46<23:32,  4.03it/s]                                                       55%|█████▌    | 7081/12776 [1:15:46<23:32,  4.03it/s] 55%|█████▌    | 7082/12776 [1:15:46<22:11,  4.28it/s]                                                       55%|█████▌    | 7082/12776 [1:15:46<22:11,  4.28it/s] 55%|█████▌    | 7083/12776 [1:15:47<21:10,  4.48it/s]                                                       55%|█████▌    | 7083/12776 [1:15:47<21:10,  4.48it/s] 55%|█████▌    | 7084/12776 [1:15:47<20:22,  4.65it/s]                                                       55%|█████▌    | 7084/12776 [1:15:47<20:22,  4.65it/s] 55%|█████▌    | 7085/12776 [1:15:47<19:44,  4.81it/s]                                                       55%|█████▌    | 7085/12776 [1:15:47<19:44,  4.81it/s] 55%|█████▌    | 7086/12776 [1:15:47<19:14,  4.93it/s]                                                       55%|█████▌    | 7086/12776 [1:15:47<19:14,  4.93it/s] 55%|█████▌    | 7087/12776 [1:15:47<22:01,  4.30it/s]                                                       55%|█████▌    | 7087/12776 [1:15:47<22:01,  4.30it/s] 55%|█████▌    | 7088/12776 [1:15:48<34:46,  2.73it/s]                                                       55%|█████▌    | 7088/12776 [1:15:48<34:46,  2.73it/s] 55%|█████▌    | 7089/12776 [1:15:49<1:00:42,  1.56it/s]                                                         55%|█████▌    | 7089/12776 [1:15:49<1:00:42,  1.56it/s] 55%|█████▌    | 7090/12776 [1:15:50<1:09:06,  1.37it/s]                                                         55%|█████▌    | 7090/12776 [1:15:50<1:09:06,  1.37it/s] 56%|█████▌    | 7091/12776 [1:15:51<1:15:05,  1.26it/s]                                                         56%|█████▌    | 7091/12776 [1:15:51<1:15:05,  1.26it/s] 56%|█████▌    | 7092/12776 [1:15:52<1:14:06,  1.28it/s]                                                         56%|█████▌    | 7092/12776 [1:15:52<1:14:06,  1.28it/s] 56%|█��███▌    | 7093/12776 [1:15:53<1:12:52,  1.30it/s]                                                         56%|█████▌    | 7093/12776 [1:15:53<1:12:52,  1.30it/s] 56%|█████▌    | 7094/12776 [1:15:53<1:10:37,  1.34it/s]                                                         56%|█████▌    | 7094/12776 [1:15:53<1:10:37,  1.34it/s] 56%|█████▌    | 7095/12776 [1:15:54<1:09:41,  1.36it/s]                                                         56%|█████▌    | 7095/12776 [1:15:54<1:09:41,  1.36it/s] 56%|█████▌    | 7096/12776 [1:15:55<1:06:16,  1.43it/s]                                                         56%|█████▌    | 7096/12776 [1:15:55<1:06:16,  1.43it/s] 56%|█████▌    | 7097/12776 [1:15:55<1:02:43,  1.51it/s]                                                         56%|█████▌    | 7097/12776 [1:15:55<1:02:43,  1.51it/s] 56%|█████▌    | 7098/12776 [1:15:56<59:33,  1.59it/s]                                                         56%|█████▌    | 7098/12776 [1:15:56<59:33,  1.59it/s] 56%|█████▌    | 7099/12776 [1:15:56<58:43,  1.61it/s]                                                       56%|█████▌    | 7099/12776 [1:15:56<58:43,  1.61it/s] 56%|█████▌    | 7100/12776 [1:15:57<55:06,  1.72it/s]                                                       56%|█████▌    | 7100/12776 [1:15:57<55:06,  1.72it/s] 56%|█████▌    | 7101/12776 [1:15:57<51:29,  1.84it/s]                                                       56%|█████▌    | 7101/12776 [1:15:57<51:29,  1.84it/s] 56%|█████▌    | 7102/12776 [1:15:58<49:56,  1.89it/s]                                                       56%|█████▌    | 7102/12776 [1:15:58<49:56,  1.89it/s] 56%|█████▌    | 7103/12776 [1:15:58<47:02,  2.01it/s]                                                       56%|█████▌    | 7103/12776 [1:15:58<47:02,  2.01it/s] 56%|█████▌    | 7104/12776 [1:15:59<45:44,  2.07it/s]                                                       56%|█████▌    | 7104/12776 [1:15:59<45:44,  2.07it/s] 56%|█████▌    | 7105/12776 [1:15:59<43:21,  2.18it/s]                                                       56%|█████▌    | 7105/12776 [1:15:59<43:21,  2.18it/s] 56%|█████▌    | 7106/12776 [1:16:00<41:19,  2.29it/s]                                                       56%|█████▌    | 7106/12776 [1:16:00<41:19,  2.29it/s] 56%|█████▌    | 7107/12776 [1:16:00<42:58,  2.20it/s]                                                       56%|█████▌    | 7107/12776 [1:16:00<42:58,  2.20it/s] 56%|█████▌    | 7108/12776 [1:16:00<40:13,  2.35it/s]                                                       56%|█████▌    | 7108/12776 [1:16:00<40:13,  2.35it/s] 56%|█████▌    | 7109/12776 [1:16:01<38:03,  2.48it/s]                                                       56%|█████▌    | 7109/12776 [1:16:01<38:03,  2.48it/s] 56%|█████▌    | 7110/12776 [1:16:01<38:51,  2.43it/s]                                                       56%|█████▌    | 7110/12776 [1:16:01<38:51,  2.43it/s] 56%|█████▌    | 7111/12776 [1:16:02<36:36,  2.58it/s]                                                       56%|█████▌    | 7111/12776 [1:16:02<36:36,  2.58it/s] 56%|█████▌    | 7112/12776 [1:16:02<34:54,  2.70it/s]                                                       56%|█████▌    | 7112/12776 [1:16:02<34:54,  2.70it/s] 56%|█████▌    | 7113/12776 [1:16:02<33:47,  2.79it/s]                                                       56%|█████▌    | 7113/12776 [1:16:02<33:47,  2.79it/s] 56%|█████▌    | 7114/12776 [1:16:02<32:03,  2.94it/s]                                                       56%|█████▌    | 7114/12776 [1:16:02<32:03,  2.94it/s] 56%|█████▌    | 7115/12776 [1:16:03<30:43,  3.07it/s]                                                       56%|█████▌    | 7115/12776 [1:16:03<30:43,  3.07it/s] 56%|█████▌    | 7116/12776 [1:16:03<29:31,  3.20it/s]                                                       56%|█████▌    | 7116/12776 [1:16:03<29:31,  3.20it/s] 56%|█████▌    | 7117/12776 [1:16:03<31:18,  3.01it/s]                                                       56%|█████▌    | 7117/12776 [1:16:03<31:18,  3.01it/s] 56%|█████▌    | 7118/12776 [1:16:04<29:28,  3.20it/s]                                                       56%|█████▌    | 7118/12776 [1:16:04<29:28,  3.20it/s] 56%|█████▌    | 7119/12776 [1:16:04<27:51,  3.38it/s]                                                       56%|█████▌    | 7119/12776 [1:16:04<27:51,  3.38it/s] 56%|█████▌    | 7120/12776 [1:16:04<26:34,  3.55it/s]                                                       56%|█████▌    | 7120/12776 [1:16:04<26:34,  3.55it/s] 56%|█████▌    | 7121/12776 [1:16:05<28:56,  3.26it/s]                                                       56%|█████▌    | 7121/12776 [1:16:05<28:56,  3.26it/s] 56%|█████▌    | 7122/12776 [1:16:05<27:07,  3.47it/s]                                                       56%|█████▌    | 7122/12776 [1:16:05<27:07,  3.47it/s] 56%|█████▌    | 7123/12776 [1:16:05<25:39,  3.67it/s]                                                       56%|█████▌    | 7123/12776 [1:16:05<25:39,  3.67it/s] 56%|█████▌    | 7124/12776 [1:16:05<24:28,  3.85it/s]                                                       56%|█████▌    | 7124/12776 [1:16:05<24:28,  3.85it/s] 56%|█████▌    | 7125/12776 [1:16:06<25:02,  3.76it/s]                                                       56%|█████▌    | 7125/12776 [1:16:06<25:02,  3.76it/s] 56%|█████▌    | 7126/12776 [1:16:06<23:40,  3.98it/s]                                                       56%|█████▌    | 7126/12776 [1:16:06<23:40,  3.98it/s] 56%|█████▌    | 7127/12776 [1:16:06<22:33,  4.17it/s]                                                       56%|█████▌    | 7127/12776 [1:16:06<22:33,  4.17it/s] 56%|█████▌    | 7128/12776 [1:16:06<21:39,  4.35it/s]                                                       56%|█████▌    | 7128/12776 [1:16:06<21:39,  4.35it/s] 56%|█████▌    | 7129/12776 [1:16:06<21:02,  4.47it/s]                                                       56%|█████▌    | 7129/12776 [1:16:06<21:02,  4.47it/s] 56%|█████▌    | 7130/12776 [1:16:07<23:09,  4.06it/s]                                                       56%|█████▌    | 7130/12776 [1:16:07<23:09,  4.06it/s] 56%|█████▌    | 7131/12776 [1:16:07<21:52,  4.30it/s]                                                       56%|█████▌    | 7131/12776 [1:16:07<21:52,  4.30it/s] 56%|█████▌    | 7132/12776 [1:16:07<20:55,  4.50it/s]                                                       56%|█████▌    | 7132/12776 [1:16:07<20:55,  4.50it/s] 56%|█████▌    | 7133/12776 [1:16:07<20:10,  4.66it/s]                                                       56%|█████▌    | 7133/12776 [1:16:07<20:10,  4.66it/s] 56%|█████▌    | 7134/12776 [1:16:08<19:33,  4.81it/s]                                                       56%|█████▌    | 7134/12776 [1:16:08<19:33,  4.81it/s] 56%|█████▌    | 7135/12776 [1:16:08<19:08,  4.91it/s]                                                       56%|█████▌    | 7135/12776 [1:16:08<19:08,  4.91it/s] 56%|█████▌    | 7136/12776 [1:16:08<20:42,  4.54it/s]                                                       56%|█████▌    | 7136/12776 [1:16:08<20:42,  4.54it/s] 56%|█████▌    | 7137/12776 [1:16:08<19:44,  4.76it/s]                                                       56%|█████▌    | 7137/12776 [1:16:08<19:44,  4.76it/s] 56%|█████▌    | 7138/12776 [1:16:09<37:15,  2.52it/s]                                                       56%|█████▌    | 7138/12776 [1:16:09<37:15,  2.52it/s] 56%|█████▌    | 7139/12776 [1:16:10<1:06:16,  1.42it/s]                                                         56%|█████▌    | 7139/12776 [1:16:10<1:06:16,  1.42it/s] 56%|█████▌    | 7140/12776 [1:16:11<1:13:07,  1.28it/s]                                                         56%|█████▌    | 7140/12776 [1:16:11<1:13:07,  1.28it/s] 56%|█████▌    | 7141/12776 [1:16:12<1:15:26,  1.24it/s]                                                         56%|█████▌    | 7141/12776 [1:16:12<1:15:26,  1.24it/s] 56%|█████▌    | 7142/12776 [1:16:13<1:15:04,  1.25it/s]                                                         56%|█████▌    | 7142/12776 [1:16:13<1:15:04,  1.25it/s] 56%|█████▌    | 7143/12776 [1:16:14<1:17:01,  1.22it/s]                                                         56%|█████▌    | 7143/12776 [1:16:14<1:17:01,  1.22it/s] 56%|█████▌    | 7144/12776 [1:16:15<1:13:27,  1.28it/s]                                                         56%|█████▌    | 7144/12776 [1:16:15<1:13:27,  1.28it/s] 56%|█████▌    | 7145/12776 [1:16:15<1:09:34,  1.35it/s]                                                         56%|█████▌    | 7145/12776 [1:16:15<1:09:34,  1.35it/s] 56%|█████▌    | 7146/12776 [1:16:16<1:06:36,  1.41it/s]                                                         56%|█████▌    | 7146/12776 [1:16:16<1:06:36,  1.41it/s] 56%|█████▌    | 7147/12776 [1:16:16<1:02:33,  1.50it/s]                                                         56%|█████▌    | 7147/12776 [1:16:16<1:02:33,  1.50it/s] 56%|█████▌    | 7148/12776 [1:16:17<1:00:24,  1.55it/s]                                                         56%|█████▌    | 7148/12776 [1:16:17<1:00:24,  1.55it/s] 56%|█████▌    | 7149/12776 [1:16:18<57:02,  1.64it/s]                                                         56%|█████▌    | 7149/12776 [1:16:18<57:02,  1.64it/s] 56%|█████▌    | 7150/12776 [1:16:18<58:19,  1.61it/s]                                                       56%|█████▌    | 7150/12776 [1:16:18<58:19,  1.61it/s] 56%|█████▌    | 7151/12776 [1:16:19<54:04,  1.73it/s]                                                       56%|█████▌    | 7151/12776 [1:16:19<54:04,  1.73it/s] 56%|█████▌    | 7152/12776 [1:16:19<50:32,  1.85it/s]                                                       56%|█████▌    | 7152/12776 [1:16:19<50:32,  1.85it/s] 56%|█████▌    | 7153/12776 [1:16:20<49:03,  1.91it/s]                                                       56%|█████▌    | 7153/12776 [1:16:20<49:03,  1.91it/s] 56%|█████▌    | 7154/12776 [1:16:20<46:22,  2.02it/s]                                                      {'loss': 0.793, 'grad_norm': 3.0490665435791016, 'learning_rate': 0.00013954056695992178, 'epoch': 1.11}
+{'loss': 1.0286, 'grad_norm': 2.6447458267211914, 'learning_rate': 0.00013951612903225803, 'epoch': 1.11}
+{'loss': 0.5665, 'grad_norm': 0.89915931224823, 'learning_rate': 0.0001394916911045943, 'epoch': 1.11}
+{'loss': 0.6443, 'grad_norm': 1.9024105072021484, 'learning_rate': 0.0001394672531769306, 'epoch': 1.11}
+{'loss': 1.179, 'grad_norm': 1.5853981971740723, 'learning_rate': 0.00013944281524926684, 'epoch': 1.11}
+{'loss': 0.9776, 'grad_norm': 3.08502197265625, 'learning_rate': 0.00013941837732160312, 'epoch': 1.11}
+{'loss': 1.2178, 'grad_norm': 3.0830800533294678, 'learning_rate': 0.00013939393939393937, 'epoch': 1.11}
+{'loss': 1.0084, 'grad_norm': 2.8586432933807373, 'learning_rate': 0.00013936950146627565, 'epoch': 1.11}
+{'loss': 0.5648, 'grad_norm': 1.6677206754684448, 'learning_rate': 0.00013934506353861193, 'epoch': 1.11}
+{'loss': 0.6297, 'grad_norm': 2.1359286308288574, 'learning_rate': 0.00013932062561094818, 'epoch': 1.11}
+{'loss': 0.9579, 'grad_norm': 6.148825168609619, 'learning_rate': 0.00013929618768328443, 'epoch': 1.11}
+{'loss': 0.6473, 'grad_norm': 1.7311344146728516, 'learning_rate': 0.0001392717497556207, 'epoch': 1.11}
+{'loss': 0.1864, 'grad_norm': 0.3622504472732544, 'learning_rate': 0.000139247311827957, 'epoch': 1.11}
+{'loss': 0.189, 'grad_norm': 0.5219933986663818, 'learning_rate': 0.00013922287390029324, 'epoch': 1.11}
+{'loss': 0.2183, 'grad_norm': 0.3946225345134735, 'learning_rate': 0.00013919843597262952, 'epoch': 1.11}
+{'loss': 0.155, 'grad_norm': 0.47833681106567383, 'learning_rate': 0.00013917399804496577, 'epoch': 1.11}
+{'loss': 0.1816, 'grad_norm': 0.4084630608558655, 'learning_rate': 0.00013914956011730205, 'epoch': 1.11}
+{'loss': 0.2926, 'grad_norm': 0.45835641026496887, 'learning_rate': 0.0001391251221896383, 'epoch': 1.11}
+{'loss': 0.2767, 'grad_norm': 0.5102660655975342, 'learning_rate': 0.00013910068426197458, 'epoch': 1.11}
+{'loss': 0.2147, 'grad_norm': 0.914391040802002, 'learning_rate': 0.00013907624633431083, 'epoch': 1.11}
+{'loss': 0.2518, 'grad_norm': 0.6715912222862244, 'learning_rate': 0.0001390518084066471, 'epoch': 1.11}
+{'loss': 0.3206, 'grad_norm': 0.5033890604972839, 'learning_rate': 0.00013902737047898336, 'epoch': 1.11}
+{'loss': 0.2376, 'grad_norm': 0.7809054851531982, 'learning_rate': 0.00013900293255131963, 'epoch': 1.11}
+{'loss': 0.1966, 'grad_norm': 0.5782076120376587, 'learning_rate': 0.0001389784946236559, 'epoch': 1.11}
+{'loss': 0.3316, 'grad_norm': 1.2228553295135498, 'learning_rate': 0.00013895405669599216, 'epoch': 1.11}
+{'loss': 0.5064, 'grad_norm': 2.8982632160186768, 'learning_rate': 0.00013892961876832842, 'epoch': 1.11}
+{'loss': 0.3983, 'grad_norm': 1.0576456785202026, 'learning_rate': 0.0001389051808406647, 'epoch': 1.11}
+{'loss': 0.5427, 'grad_norm': 1.5845671892166138, 'learning_rate': 0.00013888074291300097, 'epoch': 1.11}
+{'loss': 0.4551, 'grad_norm': 0.8153006434440613, 'learning_rate': 0.00013885630498533722, 'epoch': 1.11}
+{'loss': 0.5381, 'grad_norm': 1.2537267208099365, 'learning_rate': 0.0001388318670576735, 'epoch': 1.11}
+{'loss': 0.3709, 'grad_norm': 1.0054357051849365, 'learning_rate': 0.00013880742913000975, 'epoch': 1.11}
+{'loss': 0.7605, 'grad_norm': 3.644198179244995, 'learning_rate': 0.00013878299120234603, 'epoch': 1.11}
+{'loss': 0.2972, 'grad_norm': 1.2919872999191284, 'learning_rate': 0.0001387585532746823, 'epoch': 1.11}
+{'loss': 0.4665, 'grad_norm': 1.0998269319534302, 'learning_rate': 0.00013873411534701856, 'epoch': 1.11}
+{'loss': 0.6518, 'grad_norm': 3.0610947608947754, 'learning_rate': 0.0001387096774193548, 'epoch': 1.11}
+{'loss': 0.5192, 'grad_norm': 2.023725986480713, 'learning_rate': 0.0001386852394916911, 'epoch': 1.11}
+{'loss': 0.5305, 'grad_norm': 2.0220577716827393, 'learning_rate': 0.00013866080156402737, 'epoch': 1.11}
+{'loss': 0.4925, 'grad_norm': 1.242210030555725, 'learning_rate': 0.00013863636363636362, 'epoch': 1.11}
+{'loss': 0.4868, 'grad_norm': 1.3182028532028198, 'learning_rate': 0.0001386119257086999, 'epoch': 1.11}
+{'loss': 0.4217, 'grad_norm': 1.934834361076355, 'learning_rate': 0.00013858748778103615, 'epoch': 1.11}
+{'loss': 0.5035, 'grad_norm': 1.3865692615509033, 'learning_rate': 0.00013856304985337243, 'epoch': 1.11}
+{'loss': 1.0412, 'grad_norm': 1.6283689737319946, 'learning_rate': 0.00013853861192570868, 'epoch': 1.11}
+{'loss': 1.0473, 'grad_norm': 1.3820260763168335, 'learning_rate': 0.00013851417399804496, 'epoch': 1.11}
+{'loss': 1.0301, 'grad_norm': 1.8468633890151978, 'learning_rate': 0.0001384897360703812, 'epoch': 1.11}
+{'loss': 0.4976, 'grad_norm': 1.0981626510620117, 'learning_rate': 0.0001384652981427175, 'epoch': 1.11}
+{'loss': 0.6123, 'grad_norm': 1.9592245817184448, 'learning_rate': 0.00013844086021505374, 'epoch': 1.11}
+{'loss': 1.0584, 'grad_norm': 4.474161624908447, 'learning_rate': 0.00013841642228739002, 'epoch': 1.12}
+{'loss': 0.6644, 'grad_norm': 2.5803914070129395, 'learning_rate': 0.0001383919843597263, 'epoch': 1.12}
+{'loss': 0.8667, 'grad_norm': 1.8294001817703247, 'learning_rate': 0.00013836754643206255, 'epoch': 1.12}
+{'loss': 0.8291, 'grad_norm': 3.3992698192596436, 'learning_rate': 0.0001383431085043988, 'epoch': 1.12}
+{'loss': 0.9821, 'grad_norm': 2.8989360332489014, 'learning_rate': 0.00013831867057673508, 'epoch': 1.12}
+{'loss': 0.6455, 'grad_norm': 2.251101016998291, 'learning_rate': 0.00013829423264907135, 'epoch': 1.12}
+{'loss': 1.1246, 'grad_norm': 4.921138763427734, 'learning_rate': 0.0001382697947214076, 'epoch': 1.12}
+{'loss': 0.997, 'grad_norm': 1.7035528421401978, 'learning_rate': 0.00013824535679374388, 'epoch': 1.12}
+{'loss': 0.9107, 'grad_norm': 5.489228248596191, 'learning_rate': 0.00013822091886608014, 'epoch': 1.12}
+{'loss': 0.7538, 'grad_norm': 3.2178499698638916, 'learning_rate': 0.00013819648093841641, 'epoch': 1.12}
+{'loss': 0.8375, 'grad_norm': 2.4438555240631104, 'learning_rate': 0.0001381720430107527, 'epoch': 1.12}
+{'loss': 1.1526, 'grad_norm': 3.7935683727264404, 'learning_rate': 0.00013814760508308894, 'epoch': 1.12}
+{'loss': 0.2834, 'grad_norm': 1.034727931022644, 'learning_rate': 0.0001381231671554252, 'epoch': 1.12}
+{'loss': 0.4101, 'grad_norm': 5.797002792358398, 'learning_rate': 0.00013809872922776147, 'epoch': 1.12}
+{'loss': 0.589, 'grad_norm': 1.3776600360870361, 'learning_rate': 0.00013807429130009775, 'epoch': 1.12}
+{'loss': 1.4797, 'grad_norm': 2.727193832397461, 'learning_rate': 0.000138049853372434, 'epoch': 1.12}
+{'loss': 0.3785, 'grad_norm': 0.7956806421279907, 'learning_rate': 0.00013802541544477028, 'epoch': 1.12}
+{'loss': 0.2568, 'grad_norm': 0.5914304256439209, 'learning_rate': 0.00013800097751710653, 'epoch': 1.12}
+{'loss': 0.2745, 'grad_norm': 0.49332305788993835, 'learning_rate': 0.00013797653958944278, 'epoch': 1.12}
+{'loss': 0.1904, 'grad_norm': 0.7752807140350342, 'learning_rate': 0.00013795210166177906, 'epoch': 1.12}
+{'loss': 0.2937, 'grad_norm': 0.6126279234886169, 'learning_rate': 0.00013792766373411534, 'epoch': 1.12}
+{'loss': 0.2494, 'grad_norm': 0.710114061832428, 'learning_rate': 0.0001379032258064516, 'epoch': 1.12}
+{'loss': 0.1989, 'grad_norm': 0.5817550420761108, 'learning_rate': 0.00013787878787878787, 'epoch': 1.12}
+{'loss': 0.2376, 'grad_norm': 0.6137755513191223, 'learning_rate': 0.00013785434995112412, 'epoch': 1.12}
+{'loss': 0.2195, 'grad_norm': 0.6869461536407471, 'learning_rate': 0.0001378299120234604, 'epoch': 1.12}
+{'loss': 0.1858, 'grad_norm': 0.5993041396141052, 'learning_rate': 0.00013780547409579668, 'epoch': 1.12}
+{'loss': 0.3344, 'grad_norm': 0.7191832065582275, 'learning_rate': 0.00013778103616813293, 'epoch': 1.12}
+{'loss': 2.0161, 'grad_norm': 8.964902877807617, 'learning_rate': 0.00013775659824046918, 'epoch': 1.12}
+{'loss': 0.3409, 'grad_norm': 0.8494324088096619, 'learning_rate': 0.00013773216031280546, 'epoch': 1.12}
+{'loss': 0.3471, 'grad_norm': 2.661954402923584, 'learning_rate': 0.00013770772238514174, 'epoch': 1.12}
+{'loss': 0.3072, 'grad_norm': 1.1184935569763184, 'learning_rate': 0.000137683284457478, 'epoch': 1.12}
+ 56%|█████▌    | 7154/12776 [1:16:20<46:22,  2.02it/s] 56%|█████▌    | 7155/12776 [1:16:20<45:04,  2.08it/s]                                                       56%|█████▌    | 7155/12776 [1:16:20<45:04,  2.08it/s] 56%|█████▌    | 7156/12776 [1:16:21<42:37,  2.20it/s]                                                       56%|█████▌    | 7156/12776 [1:16:21<42:37,  2.20it/s] 56%|█████▌    | 7157/12776 [1:16:21<40:32,  2.31it/s]                                                       56%|█████▌    | 7157/12776 [1:16:21<40:32,  2.31it/s] 56%|█████▌    | 7158/12776 [1:16:22<39:01,  2.40it/s]                                                       56%|█████▌    | 7158/12776 [1:16:22<39:01,  2.40it/s] 56%|█████▌    | 7159/12776 [1:16:22<37:04,  2.53it/s]                                                       56%|█████▌    | 7159/12776 [1:16:22<37:04,  2.53it/s] 56%|█████▌    | 7160/12776 [1:16:22<35:34,  2.63it/s]                                                       56%|█████▌    | 7160/12776 [1:16:22<35:34,  2.63it/s] 56%|█████▌    | 7161/12776 [1:16:23<36:57,  2.53it/s]                                                       56%|█████▌    | 7161/12776 [1:16:23<36:57,  2.53it/s] 56%|█████▌    | 7162/12776 [1:16:23<34:52,  2.68it/s]                                                       56%|█████▌    | 7162/12776 [1:16:23<34:52,  2.68it/s] 56%|█████▌    | 7163/12776 [1:16:23<32:57,  2.84it/s]                                                       56%|█████▌    | 7163/12776 [1:16:23<32:57,  2.84it/s] 56%|█████▌    | 7164/12776 [1:16:24<31:40,  2.95it/s]                                                       56%|█████▌    | 7164/12776 [1:16:24<31:40,  2.95it/s] 56%|█████▌    | 7165/12776 [1:16:24<31:42,  2.95it/s]                                                       56%|█████▌    | 7165/12776 [1:16:24<31:42,  2.95it/s] 56%|█████▌    | 7166/12776 [1:16:24<29:57,  3.12it/s]                                                       56%|█████▌    | 7166/12776 [1:16:24<29:57,  3.12it/s] 56%|█████▌    | 7167/12776 [1:16:25<28:36,  3.27it/s]                                                       56%|█████▌    | 7167/12776 [1:16:25<28:36,  3.27it/s] 56%|█████▌    | 7168/12776 [1:16:25<27:26,  3.41it/s]                                                       56%|█████▌    | 7168/12776 [1:16:25<27:26,  3.41it/s] 56%|█████▌    | 7169/12776 [1:16:25<28:04,  3.33it/s]                                                       56%|█████▌    | 7169/12776 [1:16:25<28:04,  3.33it/s] 56%|█████▌    | 7170/12776 [1:16:25<26:37,  3.51it/s]                                                       56%|█████▌    | 7170/12776 [1:16:25<26:37,  3.51it/s] 56%|█████▌    | 7171/12776 [1:16:26<25:26,  3.67it/s]                                                       56%|█████▌    | 7171/12776 [1:16:26<25:26,  3.67it/s] 56%|█████▌    | 7172/12776 [1:16:26<24:28,  3.82it/s]                                                       56%|█████▌    | 7172/12776 [1:16:26<24:28,  3.82it/s] 56%|█████▌    | 7173/12776 [1:16:26<26:20,  3.55it/s]                                                       56%|█████▌    | 7173/12776 [1:16:26<26:20,  3.55it/s] 56%|█████▌    | 7174/12776 [1:16:26<24:38,  3.79it/s]                                                       56%|█████▌    | 7174/12776 [1:16:26<24:38,  3.79it/s] 56%|█████▌    | 7175/12776 [1:16:27<23:13,  4.02it/s]                                                       56%|█████▌    | 7175/12776 [1:16:27<23:13,  4.02it/s] 56%|█████▌    | 7176/12776 [1:16:27<22:09,  4.21it/s]                                                       56%|█████▌    | 7176/12776 [1:16:27<22:09,  4.21it/s] 56%|█████▌    | 7177/12776 [1:16:27<21:23,  4.36it/s]                                                       56%|█████▌    | 7177/12776 [1:16:27<21:23,  4.36it/s] 56%|█████▌    | 7178/12776 [1:16:27<23:07,  4.03it/s]                                                       56%|█████▌    | 7178/12776 [1:16:27<23:07,  4.03it/s] 56%|█████▌    | 7179/12776 [1:16:28<21:50,  4.27it/s]                                                       56%|█████▌    | 7179/12776 [1:16:28<21:50,  4.27it/s] 56%|█████▌    | 7180/12776 [1:16:28<20:50,  4.47it/s]                                                       56%|█████▌    | 7180/12776 [1:16:28<20:50,  4.47it/s] 56%|█████▌    | 7181/12776 [1:16:28<20:06,  4.64it/s]                                                       56%|█████▌    | 7181/12776 [1:16:28<20:06,  4.64it/s] 56%|█████▌    | 7182/12776 [1:16:28<19:35,  4.76it/s]                                                       56%|█████▌    | 7182/12776 [1:16:28<19:35,  4.76it/s] 56%|█████▌    | 7183/12776 [1:16:28<22:50,  4.08it/s]                                                       56%|█████▌    | 7183/12776 [1:16:28<22:50,  4.08it/s] 56%|█████▌    | 7184/12776 [1:16:29<21:20,  4.37it/s]                                                       56%|█████▌    | 7184/12776 [1:16:29<21:20,  4.37it/s] 56%|█████▌    | 7185/12776 [1:16:29<20:07,  4.63it/s]                                                       56%|█████▌    | 7185/12776 [1:16:29<20:07,  4.63it/s] 56%|█████▌    | 7186/12776 [1:16:29<19:10,  4.86it/s]                                                       56%|█████▌    | 7186/12776 [1:16:29<19:10,  4.86it/s] 56%|█████▋    | 7187/12776 [1:16:29<18:29,  5.04it/s]                                                       56%|█████▋    | 7187/12776 [1:16:29<18:29,  5.04it/s] 56%|█████▋    | 7188/12776 [1:16:30<31:50,  2.92it/s]                                                       56%|█████▋    | 7188/12776 [1:16:30<31:50,  2.92it/s] 56%|█████▋    | 7189/12776 [1:16:31<1:03:13,  1.47it/s]                                                         56%|█████▋    | 7189/12776 [1:16:31<1:03:13,  1.47it/s] 56%|█████▋    | 7190/12776 [1:16:32<1:12:28,  1.28it/s]                                                         56%|█████▋    | 7190/12776 [1:16:32<1:12:28,  1.28it/s] 56%|█████▋    | 7191/12776 [1:16:33<1:17:14,  1.21it/s]                                                         56%|█████▋    | 7191/12776 [1:16:33<1:17:14,  1.21it/s] 56%|█████▋    | 7192/12776 [1:16:34<1:16:18,  1.22it/s]                                                         56%|█████▋    | 7192/12776 [1:16:34<1:16:18,  1.22it/s] 56%|█████▋    | 7193/12776 [1:16:35<1:13:03,  1.27it/s]                                                         56%|█████▋    | 7193/12776 [1:16:35<1:13:03,  1.27it/s] 56%|█████▋    | 7194/12776 [1:16:36<1:11:08,  1.31it/s]                                                         56%|█████▋    | 7194/12776 [1:16:36<1:11:08,  1.31it/s] 56%|█████▋    | 7195/12776 [1:16:36<1:07:40,  1.37it/s]                                                         56%|█████▋    | 7195/12776 [1:16:36<1:07:40,  1.37it/s] 56%|█████▋    | 7196/12776 [1:16:37<1:04:33,  1.44it/s]                                                         56%|█████▋    | 7196/12776 [1:16:37<1:04:33,  1.44it/s] 56%|█████▋    | 7197/12776 [1:16:37<1:01:04,  1.52it/s]                                                         56%|█████▋    | 7197/12776 [1:16:37<1:01:04,  1.52it/s] 56%|█████▋    | 7198/12776 [1:16:38<58:14,  1.60it/s]                                                         56%|█████▋    | 7198/12776 [1:16:38<58:14,  1.60it/s] 56%|█████▋    | 7199/12776 [1:16:38<55:25,  1.68it/s]                                                       56%|█████▋    | 7199/12776 [1:16:38<55:25,  1.68it/s] 56%|█████▋    | 7200/12776 [1:16:39<55:29,  1.67it/s]                                                       56%|█████▋    | 7200/12776 [1:16:39<55:29,  1.67it/s]Saving model checkpoint to ./checkpoint-7200
+Configuration saved in ./checkpoint-7200/config.json
+Model weights saved in ./checkpoint-7200/model.safetensors
+Feature extractor saved in ./checkpoint-7200/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-7200/tokenizer_config.json
+Special tokens file saved in ./checkpoint-7200/special_tokens_map.json
+added tokens file saved in ./checkpoint-7200/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-6000] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 56%|█████▋    | 7201/12776 [1:16:45<3:18:27,  2.14s/it]                                                         56%|█████▋    | 7201/12776 [1:16:45<3:18:27,  2.14s/it] 56%|█████▋    | 7202/12776 [1:16:45<2:30:38,  1.62s/it]                                                         56%|█████▋    | 7202/12776 [1:16:45<2:30:38,  1.62s/it] 56%|█████▋    | 7203/12776 [1:16:46<2:00:13,  1.29s/it]                                                         56%|█████▋    | 7203/12776 [1:16:46<2:00:13,  1.29s/it] 56%|█████▋    | 7204/12776 [1:16:46<1:35:31,  1.03s/it]                                                         56%|█████▋    | 7204/12776 [1:16:46<1:35:31,  1.03s/it] 56%|█████▋    | 7205/12776 [1:16:47<1:18:01,  1.19it/s]                                                         56%|█████▋    | 7205/12776 [1:16:47<1:18:01,  1.19it/s] 56%|█████▋    | 7206/12776 [1:16:47<1:05:27,  1.42it/s]                                                         56%|█████▋    | 7206/12776 [1:16:47<1:05:27,  1.42it/s] 56%|█████▋    | 7207/12776 [1:16:47<55:43,  1.67it/s]                                                         56%|█████▋    | 7207/12776 [1:16:47<55:43,  1.67it/s] 56%|█████▋    | 7208/12776 [1:16:48<48:28,  1.91it/s]                                                       56%|█████▋    | 7208/12776 [1:16:48<48:28,  1.91it/s] 56%|█████▋    | 7209/12776 [1:16:48<44:47,  2.07it/s]                                                       56%|█████▋    | 7209/12776 [1:16:48<44:47,  2.07it/s] 56%|█████▋    | 7210/12776 [1:16:48<40:12,  2.31it/s]                                                       56%|█████▋    | 7210/12776 [1:16:48<40:12,  2.31it/s] 56%|█████▋    | 7211/12776 [1:16:49<36:48,  2.52it/s]                                                       56%|█████▋    | 7211/12776 [1:16:49<36:48,  2.52it/s] 56%|█████▋    | 7212/12776 [1:16:49<35:51,  2.59it/s]                                                       56%|█████▋    | 7212/12776 [1:16:49<35:51,  2.59it/s] 56%|█████▋    | 7213/12776 [1:16:49<33:12,  2.79it/s]                                                       56%|█████▋    | 7213/12776 [1:16:49<33:12,  2.79it/s] 56%|█████▋    | 7214/12776 [1:16:50<30:51,  3.00it/s]                                                       56%|█████▋    | 7214/12776 [1:16:50<30:51,  3.00it/s] 56%|█████▋    | 7215/12776 [1:16:50<29:02,  3.19it/s]                                                       56%|█████▋    | 7215/12776 [1:16:50<29:02,  3.19it/s] 56%|█████▋    | 7216/12776 [1:16:50<28:56,  3.20it/s]                                                       56%|█████▋    | 7216/12776 [1:16:50<28:56,  3.20it/s] 56%|█████▋    | 7217/12776 [1:16:50<27:17,  3.39it/s]                                                       56%|█████▋    | 7217/12776 [1:16:50<27:17,  3.39it/s] 56%|█████▋    | 7218/12776 [1:16:51<26:00,  3.56it/s]                                                       56%|█████▋    | 7218/12776 [1:16:51<26:00,  3.56it/s] 57%|█████▋    | 7219/12776 [1:16:51<24:51,  3.72it/s]                                                       57%|█████▋    | 7219/12776 [1:16:51<24:51,  3.72it/s] 57%|█████▋    | 7220/12776 [1:16:51<24:01,  3.85it/s]                                                       57%|█████▋    | 7220/12776 [1:16:51<24:01,  3.85it/s] 57%|█████▋    | 7221/12776 [1:16:51<24:53,  3.72it/s]                                                       57%|█████▋    | 7221/12776 [1:16:51<24:53,  3.72it/s] 57%|█████▋    | 7222/12776 [1:16:52<23:26,  3.95it/s]                                                       57%|█████▋    | 7222/12776 [1:16:52<23:26,  3.95it/s] 57%|█████▋    | 7223/12776 [1:16:52<22:16,  4.15it/s]                                                       57%|█████▋    | 7223/12776 [1:16:52<22:16,  4.15it/s] 57%|█████▋    | 7224/12776 [1:16:52<21:14,  4.36it/s]                                                       57%|█████▋    | 7224/12776 [1:16:52<21:14,  4.36it/s] 57%|█████▋    | 7225/12776 [1:16:52<20:21,  4.55it/s]                                                       57%|█████▋    | 7225/12776 [1:16:52<20:21,  4.55it/s] 57%|█████▋    | 7226/12776 [1:16:53<21:34,  4.29it/s]                                                       57%|█████▋    | 7226/12776 [1:16:53<21:34,  4.29it/s] 57%|█████▋    | 7227/12776 [1:16:53<20:25,  4.53it/s]                                                       57%|█████▋    | 7227/12776 [1:16:53<20:25,  4.53it/s] 57%|█████▋    | 7228/12776 [1:16:53<19:29,  4.74it/s]                                                       57%|█████▋    | 7228/12776 [1:16:53<19:29,  4.74it/s] 57%|█████▋    | 7229/12776 [1:16:53<18:45,  4.93it/s]                                                       57%|█████▋    | 7229/12776 [1:16:53<18:45,  4.93it/s] 57%|█████▋    | 7230/12776 [1:16:53<18:17,  5.06it/s]                                                       57%|█████▋    | 7230/12776 [1:16:53<18:17,  5.06it/s] 57%|█████▋    | 7231/12776 [1:16:54<20:22,  4.54it/s]                                                      {'loss': 0.3564, 'grad_norm': 0.8442760109901428, 'learning_rate': 0.00013765884652981427, 'epoch': 1.12}
+{'loss': 0.537, 'grad_norm': 1.7543659210205078, 'learning_rate': 0.00013763440860215052, 'epoch': 1.12}
+{'loss': 0.5696, 'grad_norm': 4.0782060623168945, 'learning_rate': 0.0001376099706744868, 'epoch': 1.12}
+{'loss': 0.4484, 'grad_norm': 1.1545088291168213, 'learning_rate': 0.00013758553274682307, 'epoch': 1.12}
+{'loss': 0.5164, 'grad_norm': 1.2286568880081177, 'learning_rate': 0.00013756109481915933, 'epoch': 1.12}
+{'loss': 0.3714, 'grad_norm': 1.0465517044067383, 'learning_rate': 0.00013753665689149558, 'epoch': 1.12}
+{'loss': 0.2982, 'grad_norm': 1.1569772958755493, 'learning_rate': 0.00013751221896383186, 'epoch': 1.12}
+{'loss': 0.5142, 'grad_norm': 2.364917755126953, 'learning_rate': 0.00013748778103616813, 'epoch': 1.12}
+{'loss': 0.5519, 'grad_norm': 1.6103262901306152, 'learning_rate': 0.00013746334310850439, 'epoch': 1.12}
+{'loss': 0.4566, 'grad_norm': 2.0225682258605957, 'learning_rate': 0.00013743890518084066, 'epoch': 1.12}
+{'loss': 0.4267, 'grad_norm': 1.9558186531066895, 'learning_rate': 0.00013741446725317691, 'epoch': 1.12}
+{'loss': 0.6969, 'grad_norm': 3.2678587436676025, 'learning_rate': 0.00013739002932551317, 'epoch': 1.12}
+{'loss': 0.6032, 'grad_norm': 1.560784101486206, 'learning_rate': 0.00013736559139784944, 'epoch': 1.12}
+{'loss': 0.3596, 'grad_norm': 1.4639748334884644, 'learning_rate': 0.00013734115347018572, 'epoch': 1.12}
+{'loss': 0.7597, 'grad_norm': 1.3413069248199463, 'learning_rate': 0.00013731671554252197, 'epoch': 1.12}
+{'loss': 0.8581, 'grad_norm': 2.400970935821533, 'learning_rate': 0.00013729227761485825, 'epoch': 1.12}
+{'loss': 0.5273, 'grad_norm': 1.9984264373779297, 'learning_rate': 0.0001372678396871945, 'epoch': 1.12}
+{'loss': 1.0934, 'grad_norm': 2.334470510482788, 'learning_rate': 0.00013724340175953078, 'epoch': 1.12}
+{'loss': 1.0502, 'grad_norm': 6.798165321350098, 'learning_rate': 0.00013721896383186706, 'epoch': 1.12}
+{'loss': 0.806, 'grad_norm': 3.1501338481903076, 'learning_rate': 0.0001371945259042033, 'epoch': 1.12}
+{'loss': 1.0714, 'grad_norm': 5.227148056030273, 'learning_rate': 0.00013717008797653956, 'epoch': 1.12}
+{'loss': 1.4736, 'grad_norm': 3.071563482284546, 'learning_rate': 0.00013714565004887584, 'epoch': 1.12}
+{'loss': 1.5653, 'grad_norm': 4.112018585205078, 'learning_rate': 0.00013712121212121212, 'epoch': 1.12}
+{'loss': 0.8757, 'grad_norm': 3.3567934036254883, 'learning_rate': 0.00013709677419354837, 'epoch': 1.12}
+{'loss': 0.7167, 'grad_norm': 2.7976365089416504, 'learning_rate': 0.00013707233626588465, 'epoch': 1.12}
+{'loss': 1.4257, 'grad_norm': 3.194066286087036, 'learning_rate': 0.0001370478983382209, 'epoch': 1.12}
+{'loss': 0.8894, 'grad_norm': 5.53701114654541, 'learning_rate': 0.00013702346041055718, 'epoch': 1.12}
+{'loss': 1.0472, 'grad_norm': 1.579353928565979, 'learning_rate': 0.00013699902248289346, 'epoch': 1.12}
+{'loss': 1.0859, 'grad_norm': 2.461728811264038, 'learning_rate': 0.0001369745845552297, 'epoch': 1.12}
+{'loss': 0.7208, 'grad_norm': 4.658527851104736, 'learning_rate': 0.00013695014662756596, 'epoch': 1.12}
+{'loss': 1.3767, 'grad_norm': 2.3477227687835693, 'learning_rate': 0.00013692570869990224, 'epoch': 1.12}
+{'loss': 1.0941, 'grad_norm': 1.8970081806182861, 'learning_rate': 0.00013690127077223852, 'epoch': 1.12}
+{'loss': 0.7754, 'grad_norm': 5.8194355964660645, 'learning_rate': 0.00013687683284457477, 'epoch': 1.12}
+{'loss': 1.0529, 'grad_norm': 2.781679153442383, 'learning_rate': 0.00013685239491691105, 'epoch': 1.13}
+{'loss': 1.2293, 'grad_norm': 3.368130683898926, 'learning_rate': 0.0001368279569892473, 'epoch': 1.13}
+{'loss': 0.1819, 'grad_norm': 0.35694655776023865, 'learning_rate': 0.00013680351906158355, 'epoch': 1.13}
+{'loss': 0.4191, 'grad_norm': 0.6090511083602905, 'learning_rate': 0.00013677908113391983, 'epoch': 1.13}
+{'loss': 0.202, 'grad_norm': 0.5560696721076965, 'learning_rate': 0.0001367546432062561, 'epoch': 1.13}
+{'loss': 0.1424, 'grad_norm': 0.4044315218925476, 'learning_rate': 0.00013673020527859236, 'epoch': 1.13}
+{'loss': 0.2354, 'grad_norm': 0.5974329113960266, 'learning_rate': 0.00013670576735092863, 'epoch': 1.13}
+{'loss': 0.3103, 'grad_norm': 0.4592570960521698, 'learning_rate': 0.00013668132942326489, 'epoch': 1.13}
+{'loss': 0.6261, 'grad_norm': 0.8667699694633484, 'learning_rate': 0.00013665689149560116, 'epoch': 1.13}
+{'loss': 0.2512, 'grad_norm': 0.5696028470993042, 'learning_rate': 0.00013663245356793744, 'epoch': 1.13}
+{'loss': 0.1274, 'grad_norm': 0.41011011600494385, 'learning_rate': 0.0001366080156402737, 'epoch': 1.13}
+{'loss': 0.3106, 'grad_norm': 0.686440646648407, 'learning_rate': 0.00013658357771260995, 'epoch': 1.13}
+{'loss': 0.2809, 'grad_norm': 0.823204755783081, 'learning_rate': 0.00013655913978494622, 'epoch': 1.13}
+{'loss': 0.263, 'grad_norm': 0.6141254901885986, 'learning_rate': 0.0001365347018572825, 'epoch': 1.13}
+{'loss': 0.2794, 'grad_norm': 0.6794694066047668, 'learning_rate': 0.00013651026392961875, 'epoch': 1.13}
+{'loss': 0.2181, 'grad_norm': 0.7313904166221619, 'learning_rate': 0.00013648582600195503, 'epoch': 1.13}
+{'loss': 0.3791, 'grad_norm': 0.7755028009414673, 'learning_rate': 0.00013646138807429128, 'epoch': 1.13}
+{'loss': 0.3037, 'grad_norm': 0.8282299041748047, 'learning_rate': 0.00013643695014662756, 'epoch': 1.13}
+{'loss': 0.5826, 'grad_norm': 1.4426270723342896, 'learning_rate': 0.00013641251221896384, 'epoch': 1.13}
+{'loss': 0.3046, 'grad_norm': 1.3686167001724243, 'learning_rate': 0.0001363880742913001, 'epoch': 1.13}
+{'loss': 0.3509, 'grad_norm': 1.07740318775177, 'learning_rate': 0.00013636363636363634, 'epoch': 1.13}
+{'loss': 0.4518, 'grad_norm': 1.3811142444610596, 'learning_rate': 0.00013633919843597262, 'epoch': 1.13}
+{'loss': 0.4252, 'grad_norm': 1.2326154708862305, 'learning_rate': 0.00013631476050830887, 'epoch': 1.13}
+{'loss': 0.5223, 'grad_norm': 1.8412114381790161, 'learning_rate': 0.00013629032258064515, 'epoch': 1.13}
+{'loss': 0.5177, 'grad_norm': 1.9933035373687744, 'learning_rate': 0.00013626588465298143, 'epoch': 1.13}
+{'loss': 0.5853, 'grad_norm': 1.4402484893798828, 'learning_rate': 0.00013624144672531768, 'epoch': 1.13}
+{'loss': 0.4662, 'grad_norm': 1.115515947341919, 'learning_rate': 0.00013621700879765393, 'epoch': 1.13}
+{'loss': 0.7741, 'grad_norm': 2.486177444458008, 'learning_rate': 0.0001361925708699902, 'epoch': 1.13}
+{'loss': 0.2729, 'grad_norm': 1.2071272134780884, 'learning_rate': 0.0001361681329423265, 'epoch': 1.13}
+{'loss': 0.9455, 'grad_norm': 5.3013529777526855, 'learning_rate': 0.00013614369501466274, 'epoch': 1.13}
+{'loss': 0.9704, 'grad_norm': 2.959777593612671, 'learning_rate': 0.00013611925708699902, 'epoch': 1.13}
+{'loss': 0.8733, 'grad_norm': 3.2113773822784424, 'learning_rate': 0.00013609481915933527, 'epoch': 1.13}
+{'loss': 0.616, 'grad_norm': 1.5459614992141724, 'learning_rate': 0.00013607038123167155, 'epoch': 1.13}
+{'loss': 0.2964, 'grad_norm': 2.0902748107910156, 'learning_rate': 0.00013604594330400782, 'epoch': 1.13}
+{'loss': 0.5847, 'grad_norm': 3.8055615425109863, 'learning_rate': 0.00013602150537634408, 'epoch': 1.13}
+{'loss': 1.0193, 'grad_norm': 2.048724412918091, 'learning_rate': 0.00013599706744868033, 'epoch': 1.13}
+{'loss': 0.8073, 'grad_norm': 2.0475356578826904, 'learning_rate': 0.0001359726295210166, 'epoch': 1.13}
+{'loss': 0.6983, 'grad_norm': 2.0801010131835938, 'learning_rate': 0.00013594819159335288, 'epoch': 1.13}
+{'loss': 0.78, 'grad_norm': 2.2826006412506104, 'learning_rate': 0.00013592375366568914, 'epoch': 1.13}
+{'loss': 0.6017, 'grad_norm': 2.933180093765259, 'learning_rate': 0.00013589931573802541, 'epoch': 1.13}
+{'loss': 1.0778, 'grad_norm': 2.438169240951538, 'learning_rate': 0.00013587487781036166, 'epoch': 1.13}
+{'loss': 0.851, 'grad_norm': 1.5362167358398438, 'learning_rate': 0.00013585043988269794, 'epoch': 1.13}
+{'loss': 1.0545, 'grad_norm': 3.1869149208068848, 'learning_rate': 0.00013582600195503422, 'epoch': 1.13}
+{'loss': 0.9157, 'grad_norm': 1.230353593826294, 'learning_rate': 0.00013580156402737047, 'epoch': 1.13}
+ 57%|█████▋    | 7231/12776 [1:16:54<20:22,  4.54it/s] 57%|█████▋    | 7232/12776 [1:16:54<19:12,  4.81it/s]                                                       57%|█████▋    | 7232/12776 [1:16:54<19:12,  4.81it/s] 57%|█████▋    | 7233/12776 [1:16:54<18:20,  5.04it/s]                                                       57%|█████▋    | 7233/12776 [1:16:54<18:20,  5.04it/s] 57%|█████▋    | 7234/12776 [1:16:54<17:37,  5.24it/s]                                                       57%|█████▋    | 7234/12776 [1:16:54<17:37,  5.24it/s] 57%|█████▋    | 7235/12776 [1:16:54<17:05,  5.40it/s]                                                       57%|█████▋    | 7235/12776 [1:16:54<17:05,  5.40it/s] 57%|█████▋    | 7236/12776 [1:16:54<16:36,  5.56it/s]                                                       57%|█████▋    | 7236/12776 [1:16:54<16:36,  5.56it/s] 57%|█████▋    | 7237/12776 [1:16:55<18:33,  4.97it/s]                                                       57%|█████▋    | 7237/12776 [1:16:55<18:33,  4.97it/s] 57%|█████▋    | 7238/12776 [1:16:55<33:00,  2.80it/s]                                                       57%|█████▋    | 7238/12776 [1:16:55<33:00,  2.80it/s] 57%|█████▋    | 7239/12776 [1:16:57<1:06:36,  1.39it/s]                                                         57%|█████▋    | 7239/12776 [1:16:57<1:06:36,  1.39it/s] 57%|█████▋    | 7240/12776 [1:16:58<1:13:48,  1.25it/s]                                                         57%|█████▋    | 7240/12776 [1:16:58<1:13:48,  1.25it/s] 57%|█████▋    | 7241/12776 [1:16:59<1:16:34,  1.20it/s]                                                         57%|█████▋    | 7241/12776 [1:16:59<1:16:34,  1.20it/s] 57%|█████▋    | 7242/12776 [1:17:00<1:17:34,  1.19it/s]                                                         57%|█████▋    | 7242/12776 [1:17:00<1:17:34,  1.19it/s] 57%|█████▋    | 7243/12776 [1:17:00<1:15:27,  1.22it/s]                                                         57%|█████▋    | 7243/12776 [1:17:00<1:15:27,  1.22it/s] 57%|█████▋    | 7244/12776 [1:17:01<1:12:16,  1.28it/s]                                                         57%|█████▋    | 7244/12776 [1:17:01<1:12:16,  1.28it/s] 57%|█████▋    | 7245/12776 [1:17:02<1:09:36,  1.32it/s]                                                         57%|█████▋    | 7245/12776 [1:17:02<1:09:36,  1.32it/s] 57%|█████▋    | 7246/12776 [1:17:02<1:05:21,  1.41it/s]                                                         57%|█████▋    | 7246/12776 [1:17:02<1:05:21,  1.41it/s] 57%|█████▋    | 7247/12776 [1:17:03<1:02:24,  1.48it/s]                                                         57%|█████▋    | 7247/12776 [1:17:03<1:02:24,  1.48it/s] 57%|█████▋    | 7248/12776 [1:17:04<58:27,  1.58it/s]                                                         57%|█████▋    | 7248/12776 [1:17:04<58:27,  1.58it/s] 57%|█████▋    | 7249/12776 [1:17:04<57:29,  1.60it/s]                                                       57%|█████▋    | 7249/12776 [1:17:04<57:29,  1.60it/s] 57%|█████▋    | 7250/12776 [1:17:05<54:01,  1.70it/s]                                                       57%|█████▋    | 7250/12776 [1:17:05<54:01,  1.70it/s] 57%|█████▋    | 7251/12776 [1:17:05<52:21,  1.76it/s]                                                       57%|█████▋    | 7251/12776 [1:17:05<52:21,  1.76it/s] 57%|█████▋    | 7252/12776 [1:17:06<48:53,  1.88it/s]                                                       57%|█████▋    | 7252/12776 [1:17:06<48:53,  1.88it/s] 57%|█████▋    | 7253/12776 [1:17:06<48:29,  1.90it/s]                                                       57%|█████▋    | 7253/12776 [1:17:06<48:29,  1.90it/s] 57%|█████▋    | 7254/12776 [1:17:07<44:59,  2.05it/s]                                                       57%|█████▋    | 7254/12776 [1:17:07<44:59,  2.05it/s] 57%|█████▋    | 7255/12776 [1:17:07<42:03,  2.19it/s]                                                       57%|█████▋    | 7255/12776 [1:17:07<42:03,  2.19it/s] 57%|█████▋    | 7256/12776 [1:17:07<40:16,  2.28it/s]                                                       57%|█████▋    | 7256/12776 [1:17:07<40:16,  2.28it/s] 57%|█████▋    | 7257/12776 [1:17:08<37:57,  2.42it/s]                                                       57%|█████▋    | 7257/12776 [1:17:08<37:57,  2.42it/s] 57%|█████▋    | 7258/12776 [1:17:08<35:49,  2.57it/s]                                                       57%|█████▋    | 7258/12776 [1:17:08<35:49,  2.57it/s] 57%|█████▋    | 7259/12776 [1:17:08<37:24,  2.46it/s]                                                       57%|█████▋    | 7259/12776 [1:17:08<37:24,  2.46it/s] 57%|█████▋    | 7260/12776 [1:17:09<34:45,  2.65it/s]                                                       57%|█████▋    | 7260/12776 [1:17:09<34:45,  2.65it/s] 57%|█████▋    | 7261/12776 [1:17:09<32:38,  2.82it/s]                                                       57%|█████▋    | 7261/12776 [1:17:09<32:38,  2.82it/s] 57%|█████▋    | 7262/12776 [1:17:09<30:49,  2.98it/s]                                                       57%|█████▋    | 7262/12776 [1:17:09<30:49,  2.98it/s] 57%|█████▋    | 7263/12776 [1:17:10<32:09,  2.86it/s]                                                       57%|█████▋    | 7263/12776 [1:17:10<32:09,  2.86it/s] 57%|█████▋    | 7264/12776 [1:17:10<30:31,  3.01it/s]                                                       57%|█████▋    | 7264/12776 [1:17:10<30:31,  3.01it/s] 57%|█████▋    | 7265/12776 [1:17:10<29:15,  3.14it/s]                                                       57%|█████▋    | 7265/12776 [1:17:10<29:15,  3.14it/s] 57%|█████▋    | 7266/12776 [1:17:11<31:45,  2.89it/s]                                                       57%|█████▋    | 7266/12776 [1:17:11<31:45,  2.89it/s] 57%|█████▋    | 7267/12776 [1:17:11<29:42,  3.09it/s]                                                       57%|█████▋    | 7267/12776 [1:17:11<29:42,  3.09it/s] 57%|█████▋    | 7268/12776 [1:17:11<28:02,  3.27it/s]                                                       57%|█████��    | 7268/12776 [1:17:11<28:02,  3.27it/s] 57%|█████▋    | 7269/12776 [1:17:12<26:40,  3.44it/s]                                                       57%|█████▋    | 7269/12776 [1:17:12<26:40,  3.44it/s] 57%|█████▋    | 7270/12776 [1:17:12<28:08,  3.26it/s]                                                       57%|█████▋    | 7270/12776 [1:17:12<28:08,  3.26it/s] 57%|█████▋    | 7271/12776 [1:17:12<26:28,  3.47it/s]                                                       57%|█████▋    | 7271/12776 [1:17:12<26:28,  3.47it/s] 57%|█████▋    | 7272/12776 [1:17:12<25:06,  3.65it/s]                                                       57%|█████▋    | 7272/12776 [1:17:12<25:06,  3.65it/s] 57%|█████▋    | 7273/12776 [1:17:13<23:57,  3.83it/s]                                                       57%|█████▋    | 7273/12776 [1:17:13<23:57,  3.83it/s] 57%|█████▋    | 7274/12776 [1:17:13<22:47,  4.02it/s]                                                       57%|█████▋    | 7274/12776 [1:17:13<22:47,  4.02it/s] 57%|█████▋    | 7275/12776 [1:17:13<23:53,  3.84it/s]                                                       57%|█████▋    | 7275/12776 [1:17:13<23:53,  3.84it/s] 57%|█████▋    | 7276/12776 [1:17:13<22:07,  4.14it/s]                                                       57%|█████▋    | 7276/12776 [1:17:13<22:07,  4.14it/s] 57%|█████▋    | 7277/12776 [1:17:14<20:40,  4.43it/s]                                                       57%|█████▋    | 7277/12776 [1:17:14<20:40,  4.43it/s] 57%|█████▋    | 7278/12776 [1:17:14<19:36,  4.67it/s]                                                       57%|█████▋    | 7278/12776 [1:17:14<19:36,  4.67it/s] 57%|█████▋    | 7279/12776 [1:17:14<18:45,  4.89it/s]                                                       57%|█████▋    | 7279/12776 [1:17:14<18:45,  4.89it/s] 57%|█████▋    | 7280/12776 [1:17:14<19:59,  4.58it/s]                                                       57%|█████▋    | 7280/12776 [1:17:14<19:59,  4.58it/s] 57%|█████▋    | 7281/12776 [1:17:14<19:23,  4.72it/s]                                                       57%|█████▋    | 7281/12776 [1:17:14<19:23,  4.72it/s] 57%|█████▋    | 7282/12776 [1:17:15<18:36,  4.92it/s]                                                       57%|█████▋    | 7282/12776 [1:17:15<18:36,  4.92it/s] 57%|█████▋    | 7283/12776 [1:17:15<17:47,  5.15it/s]                                                       57%|█████▋    | 7283/12776 [1:17:15<17:47,  5.15it/s] 57%|█████▋    | 7284/12776 [1:17:15<17:14,  5.31it/s]                                                       57%|█████▋    | 7284/12776 [1:17:15<17:14,  5.31it/s] 57%|█████▋    | 7285/12776 [1:17:15<16:41,  5.48it/s]                                                       57%|█████▋    | 7285/12776 [1:17:15<16:41,  5.48it/s] 57%|█████▋    | 7286/12776 [1:17:15<17:48,  5.14it/s]                                                       57%|█████▋    | 7286/12776 [1:17:15<17:48,  5.14it/s] 57%|█████▋    | 7287/12776 [1:17:15<16:53,  5.42it/s]                                                       57%|█████▋    | 7287/12776 [1:17:15<16:53,  5.42it/s] 57%|█████▋    | 7288/12776 [1:17:16<32:59,  2.77it/s]                                                       57%|█████▋    | 7288/12776 [1:17:16<32:59,  2.77it/s] 57%|█████▋    | 7289/12776 [1:17:18<1:06:47,  1.37it/s]                                                         57%|█████▋    | 7289/12776 [1:17:18<1:06:47,  1.37it/s] 57%|█████▋    | 7290/12776 [1:17:19<1:12:33,  1.26it/s]                                                         57%|█████▋    | 7290/12776 [1:17:19<1:12:33,  1.26it/s] 57%|█████▋    | 7291/12776 [1:17:20<1:13:55,  1.24it/s]                                                         57%|█████▋    | 7291/12776 [1:17:20<1:13:55,  1.24it/s] 57%|█████▋    | 7292/12776 [1:17:20<1:12:39,  1.26it/s]                                                         57%|█████▋    | 7292/12776 [1:17:20<1:12:39,  1.26it/s] 57%|█████▋    | 7293/12776 [1:17:21<1:10:38,  1.29it/s]                                                         57%|█████▋    | 7293/12776 [1:17:21<1:10:38,  1.29it/s] 57%|█████▋    | 7294/12776 [1:17:22<1:08:05,  1.34it/s]                                                         57%|█████▋    | 7294/12776 [1:17:22<1:08:05,  1.34it/s] 57%|█████▋    | 7295/12776 [1:17:22<1:07:13,  1.36it/s]                                                         57%|█████▋    | 7295/12776 [1:17:22<1:07:13,  1.36it/s] 57%|█████▋    | 7296/12776 [1:17:23<1:04:16,  1.42it/s]                                                         57%|█████▋    | 7296/12776 [1:17:23<1:04:16,  1.42it/s] 57%|█████▋    | 7297/12776 [1:17:24<1:01:24,  1.49it/s]                                                         57%|█████▋    | 7297/12776 [1:17:24<1:01:24,  1.49it/s] 57%|█████▋    | 7298/12776 [1:17:24<58:22,  1.56it/s]                                                         57%|█████▋    | 7298/12776 [1:17:24<58:22,  1.56it/s] 57%|█████▋    | 7299/12776 [1:17:25<55:59,  1.63it/s]                                                       57%|█████▋    | 7299/12776 [1:17:25<55:59,  1.63it/s] 57%|█████▋    | 7300/12776 [1:17:25<53:53,  1.69it/s]                                                       57%|█████▋    | 7300/12776 [1:17:25<53:53,  1.69it/s] 57%|█████▋    | 7301/12776 [1:17:26<53:57,  1.69it/s]                                                       57%|█████▋    | 7301/12776 [1:17:26<53:57,  1.69it/s] 57%|█████▋    | 7302/12776 [1:17:26<50:36,  1.80it/s]                                                       57%|█████▋    | 7302/12776 [1:17:26<50:36,  1.80it/s] 57%|█████▋    | 7303/12776 [1:17:27<50:09,  1.82it/s]                                                       57%|█████▋    | 7303/12776 [1:17:27<50:09,  1.82it/s] 57%|█████▋    | 7304/12776 [1:17:27<46:56,  1.94it/s]                                                       57%|█████▋    | 7304/12776 [1:17:27<46:56,  1.94it/s] 57%|█████▋    | 7305/12776 [1:17:28<47:24,  1.92it/s]                                                       57%|█████▋    | 7305/12776 [1:17:28<47:24,  1.92it/s] 57%|█████▋    | 7306/12776 [1:17:28<44:15,  2.06it/s]                                                       57%|█████▋    | 7306/12776 [1:17:28<44:15,  2.06it/s] 57%|█████▋    | 7307/12776 [1:17:29<41:41,  2.19it/s]                                                       57%|█████▋    | 7307/12776 [1:17:29<41:41,  2.19it/s] 57%|█████▋    | 7308/12776 [1:17:29<43:13,  2.11it/s]                                                      {'loss': 1.0721, 'grad_norm': 1.9790470600128174, 'learning_rate': 0.00013577712609970672, 'epoch': 1.13}
+{'loss': 1.4632, 'grad_norm': 2.1077675819396973, 'learning_rate': 0.000135752688172043, 'epoch': 1.13}
+{'loss': 2.0202, 'grad_norm': 2.5034217834472656, 'learning_rate': 0.00013572825024437925, 'epoch': 1.13}
+{'loss': 0.8832, 'grad_norm': 1.6274192333221436, 'learning_rate': 0.00013570381231671553, 'epoch': 1.13}
+{'loss': 0.5584, 'grad_norm': 1.7369862794876099, 'learning_rate': 0.0001356793743890518, 'epoch': 1.13}
+{'loss': 0.8328, 'grad_norm': 2.9159839153289795, 'learning_rate': 0.00013565493646138806, 'epoch': 1.13}
+{'loss': 0.6125, 'grad_norm': 1.95712411403656, 'learning_rate': 0.0001356304985337243, 'epoch': 1.13}
+{'loss': 1.3286, 'grad_norm': 2.354888916015625, 'learning_rate': 0.0001356060606060606, 'epoch': 1.13}
+{'loss': 0.303, 'grad_norm': 0.7385460734367371, 'learning_rate': 0.00013558162267839687, 'epoch': 1.13}
+{'loss': 0.2112, 'grad_norm': 0.645324170589447, 'learning_rate': 0.00013555718475073312, 'epoch': 1.13}
+{'loss': 0.4106, 'grad_norm': 0.979313313961029, 'learning_rate': 0.0001355327468230694, 'epoch': 1.13}
+{'loss': 0.2252, 'grad_norm': 0.601364254951477, 'learning_rate': 0.00013550830889540565, 'epoch': 1.13}
+{'loss': 0.1746, 'grad_norm': 0.45826420187950134, 'learning_rate': 0.00013548387096774193, 'epoch': 1.13}
+{'loss': 0.3237, 'grad_norm': 0.6122409105300903, 'learning_rate': 0.0001354594330400782, 'epoch': 1.13}
+{'loss': 0.2608, 'grad_norm': 0.7006223201751709, 'learning_rate': 0.00013543499511241446, 'epoch': 1.13}
+{'loss': 0.2501, 'grad_norm': 0.7755613923072815, 'learning_rate': 0.0001354105571847507, 'epoch': 1.13}
+{'loss': 0.1775, 'grad_norm': 0.7348597645759583, 'learning_rate': 0.000135386119257087, 'epoch': 1.13}
+{'loss': 0.3969, 'grad_norm': 1.9953992366790771, 'learning_rate': 0.00013536168132942327, 'epoch': 1.13}
+{'loss': 0.3183, 'grad_norm': 0.8354098796844482, 'learning_rate': 0.00013533724340175952, 'epoch': 1.13}
+{'loss': 0.3691, 'grad_norm': 0.8279335498809814, 'learning_rate': 0.0001353128054740958, 'epoch': 1.13}
+{'loss': 0.2994, 'grad_norm': 0.7575170397758484, 'learning_rate': 0.00013528836754643205, 'epoch': 1.14}
+{'loss': 0.1918, 'grad_norm': 0.6239116787910461, 'learning_rate': 0.00013526392961876833, 'epoch': 1.14}
+{'loss': 0.45, 'grad_norm': 1.2877776622772217, 'learning_rate': 0.00013523949169110458, 'epoch': 1.14}
+{'loss': 0.3337, 'grad_norm': 1.0643291473388672, 'learning_rate': 0.00013521505376344086, 'epoch': 1.14}
+{'loss': 0.4557, 'grad_norm': 1.3853017091751099, 'learning_rate': 0.0001351906158357771, 'epoch': 1.14}
+{'loss': 0.3404, 'grad_norm': 1.043006181716919, 'learning_rate': 0.00013516617790811338, 'epoch': 1.14}
+{'loss': 0.3246, 'grad_norm': 1.343376636505127, 'learning_rate': 0.00013514173998044964, 'epoch': 1.14}
+{'loss': 0.3422, 'grad_norm': 1.1656047105789185, 'learning_rate': 0.00013511730205278591, 'epoch': 1.14}
+{'loss': 0.7908, 'grad_norm': 2.2045726776123047, 'learning_rate': 0.0001350928641251222, 'epoch': 1.14}
+{'loss': 0.3087, 'grad_norm': 1.6047229766845703, 'learning_rate': 0.00013506842619745844, 'epoch': 1.14}
+{'loss': 0.6155, 'grad_norm': 1.924752116203308, 'learning_rate': 0.0001350439882697947, 'epoch': 1.14}
+{'loss': 0.4651, 'grad_norm': 1.1636427640914917, 'learning_rate': 0.00013501955034213097, 'epoch': 1.14}
+{'loss': 0.3138, 'grad_norm': 1.3303437232971191, 'learning_rate': 0.00013499511241446725, 'epoch': 1.14}
+{'loss': 0.5431, 'grad_norm': 1.4854735136032104, 'learning_rate': 0.0001349706744868035, 'epoch': 1.14}
+{'loss': 0.7749, 'grad_norm': 1.4238706827163696, 'learning_rate': 0.00013494623655913978, 'epoch': 1.14}
+{'loss': 0.4627, 'grad_norm': 1.6186683177947998, 'learning_rate': 0.00013492179863147603, 'epoch': 1.14}
+{'loss': 0.6011, 'grad_norm': 2.7942728996276855, 'learning_rate': 0.0001348973607038123, 'epoch': 1.14}
+{'loss': 0.5155, 'grad_norm': 1.4344183206558228, 'learning_rate': 0.0001348729227761486, 'epoch': 1.14}
+{'loss': 0.5645, 'grad_norm': 1.8099091053009033, 'learning_rate': 0.00013484848484848484, 'epoch': 1.14}
+{'loss': 0.6922, 'grad_norm': 2.336829900741577, 'learning_rate': 0.0001348240469208211, 'epoch': 1.14}
+{'loss': 0.6654, 'grad_norm': 2.197136640548706, 'learning_rate': 0.00013479960899315737, 'epoch': 1.14}
+{'loss': 0.6727, 'grad_norm': 1.8803507089614868, 'learning_rate': 0.00013477517106549365, 'epoch': 1.14}
+{'loss': 0.6877, 'grad_norm': 2.4235281944274902, 'learning_rate': 0.0001347507331378299, 'epoch': 1.14}
+{'loss': 0.9574, 'grad_norm': 6.200526237487793, 'learning_rate': 0.00013472629521016618, 'epoch': 1.14}
+{'loss': 1.4685, 'grad_norm': 4.244660377502441, 'learning_rate': 0.00013470185728250243, 'epoch': 1.14}
+{'loss': 0.6895, 'grad_norm': 1.6314693689346313, 'learning_rate': 0.0001346774193548387, 'epoch': 1.14}
+{'loss': 1.0343, 'grad_norm': 2.2845335006713867, 'learning_rate': 0.00013465298142717496, 'epoch': 1.14}
+{'loss': 1.037, 'grad_norm': 2.063843250274658, 'learning_rate': 0.00013462854349951124, 'epoch': 1.14}
+{'loss': 1.1499, 'grad_norm': 3.8249752521514893, 'learning_rate': 0.0001346041055718475, 'epoch': 1.14}
+{'loss': 1.3537, 'grad_norm': 2.1687839031219482, 'learning_rate': 0.00013457966764418377, 'epoch': 1.14}
+{'loss': 1.3049, 'grad_norm': 1.7228095531463623, 'learning_rate': 0.00013455522971652002, 'epoch': 1.14}
+{'loss': 1.2728, 'grad_norm': 3.0871424674987793, 'learning_rate': 0.0001345307917888563, 'epoch': 1.14}
+{'loss': 0.8322, 'grad_norm': 1.9572391510009766, 'learning_rate': 0.00013450635386119258, 'epoch': 1.14}
+{'loss': 0.7581, 'grad_norm': 1.1564244031906128, 'learning_rate': 0.00013448191593352883, 'epoch': 1.14}
+{'loss': 0.7736, 'grad_norm': 2.321030378341675, 'learning_rate': 0.00013445747800586508, 'epoch': 1.14}
+{'loss': 0.3721, 'grad_norm': 3.072721481323242, 'learning_rate': 0.00013443304007820136, 'epoch': 1.14}
+{'loss': 0.6595, 'grad_norm': 2.0024454593658447, 'learning_rate': 0.00013440860215053763, 'epoch': 1.14}
+{'loss': 0.3166, 'grad_norm': 0.9508635401725769, 'learning_rate': 0.00013438416422287389, 'epoch': 1.14}
+{'loss': 0.2544, 'grad_norm': 0.390267014503479, 'learning_rate': 0.00013435972629521014, 'epoch': 1.14}
+{'loss': 0.2775, 'grad_norm': 0.6170450448989868, 'learning_rate': 0.00013433528836754642, 'epoch': 1.14}
+{'loss': 0.2613, 'grad_norm': 0.7190653681755066, 'learning_rate': 0.0001343108504398827, 'epoch': 1.14}
+{'loss': 0.1957, 'grad_norm': 0.6273028254508972, 'learning_rate': 0.00013428641251221894, 'epoch': 1.14}
+{'loss': 0.3157, 'grad_norm': 1.515903115272522, 'learning_rate': 0.00013426197458455522, 'epoch': 1.14}
+{'loss': 0.2462, 'grad_norm': 0.493855744600296, 'learning_rate': 0.00013423753665689147, 'epoch': 1.14}
+{'loss': 0.3651, 'grad_norm': 1.0583348274230957, 'learning_rate': 0.00013421309872922775, 'epoch': 1.14}
+{'loss': 0.2172, 'grad_norm': 0.5995838642120361, 'learning_rate': 0.00013418866080156403, 'epoch': 1.14}
+{'loss': 0.2177, 'grad_norm': 1.2257310152053833, 'learning_rate': 0.00013416422287390028, 'epoch': 1.14}
+{'loss': 0.1958, 'grad_norm': 0.7221239805221558, 'learning_rate': 0.00013413978494623653, 'epoch': 1.14}
+{'loss': 0.1939, 'grad_norm': 0.5078455209732056, 'learning_rate': 0.0001341153470185728, 'epoch': 1.14}
+{'loss': 0.3449, 'grad_norm': 0.8579664826393127, 'learning_rate': 0.00013409090909090906, 'epoch': 1.14}
+{'loss': 0.3881, 'grad_norm': 1.03110933303833, 'learning_rate': 0.00013406647116324534, 'epoch': 1.14}
+{'loss': 0.2653, 'grad_norm': 0.700508713722229, 'learning_rate': 0.00013404203323558162, 'epoch': 1.14}
+{'loss': 0.4662, 'grad_norm': 1.0958391427993774, 'learning_rate': 0.00013401759530791787, 'epoch': 1.14}
+{'loss': 0.2924, 'grad_norm': 1.1270815134048462, 'learning_rate': 0.00013399315738025412, 'epoch': 1.14}
+{'loss': 0.4216, 'grad_norm': 1.0125340223312378, 'learning_rate': 0.0001339687194525904, 'epoch': 1.14}
+{'loss': 0.5891, 'grad_norm': 3.5537991523742676, 'learning_rate': 0.00013394428152492668, 'epoch': 1.14}
+{'loss': 0.4454, 'grad_norm': 1.405709147453308, 'learning_rate': 0.00013391984359726293, 'epoch': 1.14}
+ 57%|█████▋    | 7308/12776 [1:17:29<43:13,  2.11it/s] 57%|█████▋    | 7309/12776 [1:17:30<40:02,  2.28it/s]                                                       57%|█████▋    | 7309/12776 [1:17:30<40:02,  2.28it/s] 57%|█████▋    | 7310/12776 [1:17:30<37:48,  2.41it/s]                                                       57%|█████▋    | 7310/12776 [1:17:30<37:48,  2.41it/s] 57%|█████▋    | 7311/12776 [1:17:30<37:47,  2.41it/s]                                                       57%|█████▋    | 7311/12776 [1:17:30<37:47,  2.41it/s] 57%|█████▋    | 7312/12776 [1:17:31<35:37,  2.56it/s]                                                       57%|█████▋    | 7312/12776 [1:17:31<35:37,  2.56it/s] 57%|█████▋    | 7313/12776 [1:17:31<33:57,  2.68it/s]                                                       57%|█████▋    | 7313/12776 [1:17:31<33:57,  2.68it/s] 57%|█████▋    | 7314/12776 [1:17:31<36:19,  2.51it/s]                                                       57%|█████▋    | 7314/12776 [1:17:31<36:19,  2.51it/s] 57%|█████▋    | 7315/12776 [1:17:32<33:40,  2.70it/s]                                                       57%|█████▋    | 7315/12776 [1:17:32<33:40,  2.70it/s] 57%|█████▋    | 7316/12776 [1:17:32<31:34,  2.88it/s]                                                       57%|█████▋    | 7316/12776 [1:17:32<31:34,  2.88it/s] 57%|█████▋    | 7317/12776 [1:17:32<33:04,  2.75it/s]                                                       57%|█████▋    | 7317/12776 [1:17:32<33:04,  2.75it/s] 57%|█████▋    | 7318/12776 [1:17:33<30:39,  2.97it/s]                                                       57%|█████▋    | 7318/12776 [1:17:33<30:39,  2.97it/s] 57%|█████▋    | 7319/12776 [1:17:33<28:40,  3.17it/s]                                                       57%|█████▋    | 7319/12776 [1:17:33<28:40,  3.17it/s] 57%|█████▋    | 7320/12776 [1:17:33<26:57,  3.37it/s]                                                       57%|█████▋    | 7320/12776 [1:17:33<26:57,  3.37it/s] 57%|█████▋    | 7321/12776 [1:17:34<28:25,  3.20it/s]                                                       57%|█████▋    | 7321/12776 [1:17:34<28:25,  3.20it/s] 57%|█████▋    | 7322/12776 [1:17:34<26:39,  3.41it/s]                                                       57%|█████▋    | 7322/12776 [1:17:34<26:39,  3.41it/s] 57%|█████▋    | 7323/12776 [1:17:34<25:18,  3.59it/s]                                                       57%|█████▋    | 7323/12776 [1:17:34<25:18,  3.59it/s] 57%|█████▋    | 7324/12776 [1:17:34<24:12,  3.75it/s]                                                       57%|█████▋    | 7324/12776 [1:17:34<24:12,  3.75it/s] 57%|█████▋    | 7325/12776 [1:17:35<23:12,  3.91it/s]                                                       57%|█████▋    | 7325/12776 [1:17:35<23:12,  3.91it/s] 57%|█████▋    | 7326/12776 [1:17:35<24:32,  3.70it/s]                                                       57%|█████▋    | 7326/12776 [1:17:35<24:32,  3.70it/s] 57%|█████▋    | 7327/12776 [1:17:35<23:06,  3.93it/s]                                                       57%|█████▋    | 7327/12776 [1:17:35<23:06,  3.93it/s] 57%|█████▋    | 7328/12776 [1:17:35<22:01,  4.12it/s]                                                       57%|█████▋    | 7328/12776 [1:17:35<22:01,  4.12it/s] 57%|█████▋    | 7329/12776 [1:17:36<21:12,  4.28it/s]                                                       57%|█████▋    | 7329/12776 [1:17:36<21:12,  4.28it/s] 57%|█████▋    | 7330/12776 [1:17:36<20:32,  4.42it/s]                                                       57%|█████▋    | 7330/12776 [1:17:36<20:32,  4.42it/s] 57%|█████▋    | 7331/12776 [1:17:36<21:25,  4.23it/s]                                                       57%|█████▋    | 7331/12776 [1:17:36<21:25,  4.23it/s] 57%|█████▋    | 7332/12776 [1:17:36<20:30,  4.42it/s]                                                       57%|█████▋    | 7332/12776 [1:17:36<20:30,  4.42it/s] 57%|█████▋    | 7333/12776 [1:17:36<19:46,  4.59it/s]                                                       57%|█████▋    | 7333/12776 [1:17:36<19:46,  4.59it/s] 57%|█████▋    | 7334/12776 [1:17:37<19:11,  4.73it/s]                                                       57%|█████▋    | 7334/12776 [1:17:37<19:11,  4.73it/s] 57%|█████▋    | 7335/12776 [1:17:37<18:40,  4.86it/s]                                                       57%|█████▋    | 7335/12776 [1:17:37<18:40,  4.86it/s] 57%|█████▋    | 7336/12776 [1:17:37<21:46,  4.16it/s]                                                       57%|█████▋    | 7336/12776 [1:17:37<21:46,  4.16it/s] 57%|█████▋    | 7337/12776 [1:17:37<20:17,  4.47it/s]                                                       57%|█████▋    | 7337/12776 [1:17:37<20:17,  4.47it/s] 57%|█████▋    | 7338/12776 [1:17:38<37:07,  2.44it/s]                                                       57%|█████▋    | 7338/12776 [1:17:38<37:07,  2.44it/s] 57%|█████▋    | 7339/12776 [1:17:40<1:08:16,  1.33it/s]                                                         57%|█████▋    | 7339/12776 [1:17:40<1:08:16,  1.33it/s] 57%|█████▋    | 7340/12776 [1:17:41<1:14:50,  1.21it/s]                                                         57%|█████▋    | 7340/12776 [1:17:41<1:14:50,  1.21it/s] 57%|█████▋    | 7341/12776 [1:17:42<1:17:44,  1.17it/s]                                                         57%|█████▋    | 7341/12776 [1:17:42<1:17:44,  1.17it/s] 57%|█████▋    | 7342/12776 [1:17:42<1:16:07,  1.19it/s]                                                         57%|█████▋    | 7342/12776 [1:17:42<1:16:07,  1.19it/s] 57%|█████▋    | 7343/12776 [1:17:43<1:14:37,  1.21it/s]                                                         57%|█████▋    | 7343/12776 [1:17:43<1:14:37,  1.21it/s] 57%|█████▋    | 7344/12776 [1:17:44<1:12:00,  1.26it/s]                                                         57%|█████▋    | 7344/12776 [1:17:44<1:12:00,  1.26it/s] 57%|█████▋    | 7345/12776 [1:17:45<1:08:37,  1.32it/s]                                                         57%|█████▋    | 7345/12776 [1:17:45<1:08:37,  1.32it/s] 57%|█████▋    | 7346/12776 [1:17:45<1:08:06,  1.33it/s]                                                         57%|█████▋    | 7346/12776 [1:17:45<1:08:06,  1.33it/s] 58%|█████▊    | 7347/12776 [1:17:46<1:03:57,  1.41it/s]                                                         58%|█████▊    | 7347/12776 [1:17:46<1:03:57,  1.41it/s] 58%|█████▊    | 7348/12776 [1:17:47<1:00:58,  1.48it/s]                                                         58%|█████▊    | 7348/12776 [1:17:47<1:00:58,  1.48it/s] 58%|█████▊    | 7349/12776 [1:17:47<58:05,  1.56it/s]                                                         58%|█████▊    | 7349/12776 [1:17:47<58:05,  1.56it/s] 58%|█████▊    | 7350/12776 [1:17:48<56:16,  1.61it/s]                                                       58%|█████▊    | 7350/12776 [1:17:48<56:16,  1.61it/s] 58%|█████▊    | 7351/12776 [1:17:48<53:12,  1.70it/s]                                                       58%|█████▊    | 7351/12776 [1:17:48<53:12,  1.70it/s] 58%|█████▊    | 7352/12776 [1:17:49<52:50,  1.71it/s]                                                       58%|█████▊    | 7352/12776 [1:17:49<52:50,  1.71it/s] 58%|█████▊    | 7353/12776 [1:17:49<49:05,  1.84it/s]                                                       58%|█████▊    | 7353/12776 [1:17:49<49:05,  1.84it/s] 58%|█████▊    | 7354/12776 [1:17:50<50:37,  1.78it/s]                                                       58%|█████▊    | 7354/12776 [1:17:50<50:37,  1.78it/s] 58%|█████▊    | 7355/12776 [1:17:50<46:38,  1.94it/s]                                                       58%|█████▊    | 7355/12776 [1:17:50<46:38,  1.94it/s] 58%|█████▊    | 7356/12776 [1:17:51<43:24,  2.08it/s]                                                       58%|█████▊    | 7356/12776 [1:17:51<43:24,  2.08it/s] 58%|█████▊    | 7357/12776 [1:17:51<42:27,  2.13it/s]                                                       58%|█████▊    | 7357/12776 [1:17:51<42:27,  2.13it/s] 58%|█████▊    | 7358/12776 [1:17:51<39:59,  2.26it/s]                                                       58%|█████▊    | 7358/12776 [1:17:51<39:59,  2.26it/s] 58%|█████▊    | 7359/12776 [1:17:52<37:37,  2.40it/s]                                                       58%|█████▊    | 7359/12776 [1:17:52<37:37,  2.40it/s] 58%|█████▊    | 7360/12776 [1:17:52<36:59,  2.44it/s]                                                       58%|█████▊    | 7360/12776 [1:17:52<36:59,  2.44it/s] 58%|█████▊    | 7361/12776 [1:17:53<34:57,  2.58it/s]                                                       58%|█████▊    | 7361/12776 [1:17:53<34:57,  2.58it/s] 58%|█████▊    | 7362/12776 [1:17:53<33:22,  2.70it/s]                                                       58%|█████▊    | 7362/12776 [1:17:53<33:22,  2.70it/s] 58%|█████▊    | 7363/12776 [1:17:53<34:42,  2.60it/s]                                                       58%|█████▊    | 7363/12776 [1:17:53<34:42,  2.60it/s] 58%|█████▊    | 7364/12776 [1:17:54<32:33,  2.77it/s]                                                       58%|█████▊    | 7364/12776 [1:17:54<32:33,  2.77it/s] 58%|█████▊    | 7365/12776 [1:17:54<30:53,  2.92it/s]                                                       58%|█████▊    | 7365/12776 [1:17:54<30:53,  2.92it/s] 58%|█████▊    | 7366/12776 [1:17:54<31:35,  2.85it/s]                                                       58%|█████▊    | 7366/12776 [1:17:54<31:35,  2.85it/s] 58%|█████▊    | 7367/12776 [1:17:55<29:48,  3.02it/s]                                                       58%|█████▊    | 7367/12776 [1:17:55<29:48,  3.02it/s] 58%|█████▊    | 7368/12776 [1:17:55<28:14,  3.19it/s]                                                       58%|█████▊    | 7368/12776 [1:17:55<28:14,  3.19it/s] 58%|█████▊    | 7369/12776 [1:17:55<26:54,  3.35it/s]                                                       58%|█████▊    | 7369/12776 [1:17:55<26:54,  3.35it/s] 58%|█████▊    | 7370/12776 [1:17:55<27:25,  3.29it/s]                                                       58%|█████▊    | 7370/12776 [1:17:55<27:25,  3.29it/s] 58%|█████▊    | 7371/12776 [1:17:56<25:59,  3.47it/s]                                                       58%|█████▊    | 7371/12776 [1:17:56<25:59,  3.47it/s] 58%|█████▊    | 7372/12776 [1:17:56<24:50,  3.63it/s]                                                       58%|█████▊    | 7372/12776 [1:17:56<24:50,  3.63it/s] 58%|█████▊    | 7373/12776 [1:17:56<23:54,  3.77it/s]                                                       58%|█████▊    | 7373/12776 [1:17:56<23:54,  3.77it/s] 58%|█████▊    | 7374/12776 [1:17:56<23:13,  3.88it/s]                                                       58%|█████▊    | 7374/12776 [1:17:56<23:13,  3.88it/s] 58%|█████▊    | 7375/12776 [1:17:57<24:03,  3.74it/s]                                                       58%|█████▊    | 7375/12776 [1:17:57<24:03,  3.74it/s] 58%|█████▊    | 7376/12776 [1:17:57<22:49,  3.94it/s]                                                       58%|█████▊    | 7376/12776 [1:17:57<22:49,  3.94it/s] 58%|█████▊    | 7377/12776 [1:17:57<21:48,  4.12it/s]                                                       58%|█████▊    | 7377/12776 [1:17:57<21:48,  4.12it/s] 58%|█████▊    | 7378/12776 [1:17:57<21:01,  4.28it/s]                                                       58%|█████▊    | 7378/12776 [1:17:57<21:01,  4.28it/s] 58%|█████▊    | 7379/12776 [1:17:58<20:25,  4.40it/s]                                                       58%|█████▊    | 7379/12776 [1:17:58<20:25,  4.40it/s] 58%|█████▊    | 7380/12776 [1:17:58<21:03,  4.27it/s]                                                       58%|█████▊    | 7380/12776 [1:17:58<21:03,  4.27it/s] 58%|█████▊    | 7381/12776 [1:17:58<20:17,  4.43it/s]                                                       58%|█████▊    | 7381/12776 [1:17:58<20:17,  4.43it/s] 58%|█████▊    | 7382/12776 [1:17:58<19:38,  4.58it/s]                                                       58%|█████▊    | 7382/12776 [1:17:58<19:38,  4.58it/s] 58%|█████▊    | 7383/12776 [1:17:58<19:05,  4.71it/s]                                                       58%|█████▊    | 7383/12776 [1:17:58<19:05,  4.71it/s] 58%|█████▊    | 7384/12776 [1:17:59<18:39,  4.82it/s]                                                       58%|█████▊    | 7384/12776 [1:17:59<18:39,  4.82it/s] 58%|█████▊    | 7385/12776 [1:17:59<21:35,  4.16it/s]                                                      {'loss': 0.6456, 'grad_norm': 1.4942262172698975, 'learning_rate': 0.0001338954056695992, 'epoch': 1.14}
+{'loss': 0.3922, 'grad_norm': 3.732208490371704, 'learning_rate': 0.00013387096774193546, 'epoch': 1.14}
+{'loss': 0.7081, 'grad_norm': 1.3866475820541382, 'learning_rate': 0.00013384652981427174, 'epoch': 1.14}
+{'loss': 0.2748, 'grad_norm': 0.8793286085128784, 'learning_rate': 0.00013382209188660802, 'epoch': 1.14}
+{'loss': 0.6008, 'grad_norm': 1.271528959274292, 'learning_rate': 0.00013379765395894427, 'epoch': 1.14}
+{'loss': 0.5822, 'grad_norm': 1.1284598112106323, 'learning_rate': 0.00013377321603128052, 'epoch': 1.14}
+{'loss': 0.4663, 'grad_norm': 1.5115302801132202, 'learning_rate': 0.0001337487781036168, 'epoch': 1.14}
+{'loss': 0.6408, 'grad_norm': 1.522741675376892, 'learning_rate': 0.00013372434017595308, 'epoch': 1.15}
+{'loss': 1.0592, 'grad_norm': 3.165752649307251, 'learning_rate': 0.00013369990224828933, 'epoch': 1.15}
+{'loss': 0.4573, 'grad_norm': 1.4550795555114746, 'learning_rate': 0.0001336754643206256, 'epoch': 1.15}
+{'loss': 0.7879, 'grad_norm': 1.3216426372528076, 'learning_rate': 0.00013365102639296186, 'epoch': 1.15}
+{'loss': 0.52, 'grad_norm': 3.195171356201172, 'learning_rate': 0.00013362658846529814, 'epoch': 1.15}
+{'loss': 0.7727, 'grad_norm': 2.3996787071228027, 'learning_rate': 0.0001336021505376344, 'epoch': 1.15}
+{'loss': 0.8789, 'grad_norm': 1.751764178276062, 'learning_rate': 0.00013357771260997066, 'epoch': 1.15}
+{'loss': 0.3855, 'grad_norm': 1.3786290884017944, 'learning_rate': 0.00013355327468230692, 'epoch': 1.15}
+{'loss': 0.7553, 'grad_norm': 1.4035483598709106, 'learning_rate': 0.0001335288367546432, 'epoch': 1.15}
+{'loss': 1.1153, 'grad_norm': 2.2080776691436768, 'learning_rate': 0.00013350439882697945, 'epoch': 1.15}
+{'loss': 0.7042, 'grad_norm': 3.5789425373077393, 'learning_rate': 0.00013347996089931572, 'epoch': 1.15}
+{'loss': 1.1949, 'grad_norm': 2.5934054851531982, 'learning_rate': 0.000133455522971652, 'epoch': 1.15}
+{'loss': 0.7631, 'grad_norm': 1.6251215934753418, 'learning_rate': 0.00013343108504398825, 'epoch': 1.15}
+{'loss': 0.8629, 'grad_norm': 2.1703507900238037, 'learning_rate': 0.0001334066471163245, 'epoch': 1.15}
+{'loss': 1.0605, 'grad_norm': 2.7989585399627686, 'learning_rate': 0.00013338220918866078, 'epoch': 1.15}
+{'loss': 0.7024, 'grad_norm': 2.1508922576904297, 'learning_rate': 0.00013335777126099706, 'epoch': 1.15}
+{'loss': 1.3555, 'grad_norm': 3.7469875812530518, 'learning_rate': 0.0001333333333333333, 'epoch': 1.15}
+{'loss': 1.0349, 'grad_norm': 2.172651767730713, 'learning_rate': 0.0001333088954056696, 'epoch': 1.15}
+{'loss': 1.2347, 'grad_norm': 3.3047091960906982, 'learning_rate': 0.00013328445747800584, 'epoch': 1.15}
+{'loss': 1.2959, 'grad_norm': 2.5102274417877197, 'learning_rate': 0.00013326001955034212, 'epoch': 1.15}
+{'loss': 0.487, 'grad_norm': 3.89743971824646, 'learning_rate': 0.0001332355816226784, 'epoch': 1.15}
+{'loss': 0.7017, 'grad_norm': 2.789970874786377, 'learning_rate': 0.00013321114369501465, 'epoch': 1.15}
+{'loss': 0.3889, 'grad_norm': 2.094162702560425, 'learning_rate': 0.0001331867057673509, 'epoch': 1.15}
+{'loss': 1.0959, 'grad_norm': 2.4462733268737793, 'learning_rate': 0.00013316226783968718, 'epoch': 1.15}
+{'loss': 0.2746, 'grad_norm': 0.5095207095146179, 'learning_rate': 0.00013313782991202346, 'epoch': 1.15}
+{'loss': 0.196, 'grad_norm': 0.5467092394828796, 'learning_rate': 0.0001331133919843597, 'epoch': 1.15}
+{'loss': 0.2072, 'grad_norm': 0.3943612277507782, 'learning_rate': 0.000133088954056696, 'epoch': 1.15}
+{'loss': 0.2318, 'grad_norm': 0.613156259059906, 'learning_rate': 0.00013306451612903224, 'epoch': 1.15}
+{'loss': 0.1638, 'grad_norm': 0.4345967173576355, 'learning_rate': 0.00013304007820136852, 'epoch': 1.15}
+{'loss': 0.2764, 'grad_norm': 0.9504840970039368, 'learning_rate': 0.0001330156402737048, 'epoch': 1.15}
+{'loss': 0.2428, 'grad_norm': 0.6306407451629639, 'learning_rate': 0.00013299120234604105, 'epoch': 1.15}
+{'loss': 0.315, 'grad_norm': 0.6337870955467224, 'learning_rate': 0.0001329667644183773, 'epoch': 1.15}
+{'loss': 0.2152, 'grad_norm': 0.47224870324134827, 'learning_rate': 0.00013294232649071358, 'epoch': 1.15}
+{'loss': 0.3225, 'grad_norm': 0.8735085725784302, 'learning_rate': 0.00013291788856304983, 'epoch': 1.15}
+{'loss': 0.1715, 'grad_norm': 0.5820073485374451, 'learning_rate': 0.0001328934506353861, 'epoch': 1.15}
+{'loss': 0.3605, 'grad_norm': 1.1216922998428345, 'learning_rate': 0.00013286901270772238, 'epoch': 1.15}
+{'loss': 0.381, 'grad_norm': 1.0515015125274658, 'learning_rate': 0.00013284457478005864, 'epoch': 1.15}
+{'loss': 0.3203, 'grad_norm': 0.727737545967102, 'learning_rate': 0.0001328201368523949, 'epoch': 1.15}
+{'loss': 0.3155, 'grad_norm': 1.1194623708724976, 'learning_rate': 0.00013279569892473117, 'epoch': 1.15}
+{'loss': 0.3207, 'grad_norm': 0.822720468044281, 'learning_rate': 0.00013277126099706744, 'epoch': 1.15}
+{'loss': 0.4093, 'grad_norm': 1.256633996963501, 'learning_rate': 0.0001327468230694037, 'epoch': 1.15}
+{'loss': 0.4382, 'grad_norm': 1.5676660537719727, 'learning_rate': 0.00013272238514173997, 'epoch': 1.15}
+{'loss': 0.5514, 'grad_norm': 1.2648556232452393, 'learning_rate': 0.00013269794721407622, 'epoch': 1.15}
+{'loss': 0.5204, 'grad_norm': 1.1596871614456177, 'learning_rate': 0.0001326735092864125, 'epoch': 1.15}
+{'loss': 0.3169, 'grad_norm': 1.1163748502731323, 'learning_rate': 0.00013264907135874878, 'epoch': 1.15}
+{'loss': 0.5897, 'grad_norm': 1.4104461669921875, 'learning_rate': 0.00013262463343108503, 'epoch': 1.15}
+{'loss': 0.5129, 'grad_norm': 0.9608981013298035, 'learning_rate': 0.00013260019550342128, 'epoch': 1.15}
+{'loss': 0.6277, 'grad_norm': 1.5088484287261963, 'learning_rate': 0.00013257575757575756, 'epoch': 1.15}
+{'loss': 0.6778, 'grad_norm': 1.8139734268188477, 'learning_rate': 0.00013255131964809384, 'epoch': 1.15}
+{'loss': 0.6887, 'grad_norm': 1.7767802476882935, 'learning_rate': 0.0001325268817204301, 'epoch': 1.15}
+{'loss': 0.3635, 'grad_norm': 1.6628837585449219, 'learning_rate': 0.00013250244379276637, 'epoch': 1.15}
+{'loss': 0.4019, 'grad_norm': 1.664116621017456, 'learning_rate': 0.00013247800586510262, 'epoch': 1.15}
+{'loss': 0.9246, 'grad_norm': 1.989309549331665, 'learning_rate': 0.0001324535679374389, 'epoch': 1.15}
+{'loss': 0.8514, 'grad_norm': 9.230467796325684, 'learning_rate': 0.00013242913000977515, 'epoch': 1.15}
+{'loss': 0.4943, 'grad_norm': 1.3691836595535278, 'learning_rate': 0.00013240469208211143, 'epoch': 1.15}
+{'loss': 0.5152, 'grad_norm': 1.0850152969360352, 'learning_rate': 0.00013238025415444768, 'epoch': 1.15}
+{'loss': 0.5313, 'grad_norm': 2.6206274032592773, 'learning_rate': 0.00013235581622678396, 'epoch': 1.15}
+{'loss': 0.5789, 'grad_norm': 1.9368460178375244, 'learning_rate': 0.0001323313782991202, 'epoch': 1.15}
+{'loss': 1.0362, 'grad_norm': 3.7106120586395264, 'learning_rate': 0.0001323069403714565, 'epoch': 1.15}
+{'loss': 0.5185, 'grad_norm': 1.4057819843292236, 'learning_rate': 0.00013228250244379277, 'epoch': 1.15}
+{'loss': 1.1545, 'grad_norm': 2.9499166011810303, 'learning_rate': 0.00013225806451612902, 'epoch': 1.15}
+{'loss': 0.9092, 'grad_norm': 2.847531318664551, 'learning_rate': 0.00013223362658846527, 'epoch': 1.15}
+{'loss': 1.0395, 'grad_norm': 2.4017632007598877, 'learning_rate': 0.00013220918866080155, 'epoch': 1.15}
+{'loss': 0.9163, 'grad_norm': 3.083125591278076, 'learning_rate': 0.00013218475073313783, 'epoch': 1.15}
+{'loss': 0.7357, 'grad_norm': 1.6642545461654663, 'learning_rate': 0.00013216031280547408, 'epoch': 1.16}
+{'loss': 1.3048, 'grad_norm': 2.6173577308654785, 'learning_rate': 0.00013213587487781036, 'epoch': 1.16}
+{'loss': 1.3639, 'grad_norm': 4.377566337585449, 'learning_rate': 0.0001321114369501466, 'epoch': 1.16}
+{'loss': 1.1259, 'grad_norm': 2.4152894020080566, 'learning_rate': 0.00013208699902248289, 'epoch': 1.16}
+{'loss': 0.9502, 'grad_norm': 2.7679460048675537, 'learning_rate': 0.00013206256109481916, 'epoch': 1.16}
+{'loss': 0.563, 'grad_norm': 1.6410295963287354, 'learning_rate': 0.00013203812316715541, 'epoch': 1.16}
+ 58%|█████▊    | 7385/12776 [1:17:59<21:35,  4.16it/s] 58%|█████▊    | 7386/12776 [1:17:59<20:09,  4.46it/s]                                                       58%|█████▊    | 7386/12776 [1:17:59<20:09,  4.46it/s] 58%|█████▊    | 7387/12776 [1:17:59<19:11,  4.68it/s]                                                       58%|█████▊    | 7387/12776 [1:17:59<19:11,  4.68it/s] 58%|█████▊    | 7388/12776 [1:18:00<33:19,  2.70it/s]                                                       58%|█████▊    | 7388/12776 [1:18:00<33:19,  2.70it/s] 58%|█████▊    | 7389/12776 [1:18:02<1:04:30,  1.39it/s]                                                         58%|█████▊    | 7389/12776 [1:18:02<1:04:30,  1.39it/s] 58%|█████▊    | 7390/12776 [1:18:03<1:14:23,  1.21it/s]                                                         58%|█████▊    | 7390/12776 [1:18:03<1:14:23,  1.21it/s] 58%|█████▊    | 7391/12776 [1:18:03<1:16:08,  1.18it/s]                                                         58%|█████▊    | 7391/12776 [1:18:03<1:16:08,  1.18it/s] 58%|█████▊    | 7392/12776 [1:18:04<1:15:26,  1.19it/s]                                                         58%|█████▊    | 7392/12776 [1:18:04<1:15:26,  1.19it/s] 58%|█████▊    | 7393/12776 [1:18:05<1:13:11,  1.23it/s]                                                         58%|█████▊    | 7393/12776 [1:18:05<1:13:11,  1.23it/s] 58%|█████▊    | 7394/12776 [1:18:06<1:10:03,  1.28it/s]                                                         58%|█████▊    | 7394/12776 [1:18:06<1:10:03,  1.28it/s] 58%|█████▊    | 7395/12776 [1:18:06<1:06:50,  1.34it/s]                                                         58%|█████▊    | 7395/12776 [1:18:06<1:06:50,  1.34it/s] 58%|█████▊    | 7396/12776 [1:18:07<1:06:17,  1.35it/s]                                                         58%|█████▊    | 7396/12776 [1:18:07<1:06:17,  1.35it/s] 58%|█████▊    | 7397/12776 [1:18:08<1:02:11,  1.44it/s]                                                         58%|█████▊    | 7397/12776 [1:18:08<1:02:11,  1.44it/s] 58%|█████▊    | 7398/12776 [1:18:08<1:00:13,  1.49it/s]                                                         58%|█████▊    | 7398/12776 [1:18:08<1:00:13,  1.49it/s] 58%|█████▊    | 7399/12776 [1:18:09<56:40,  1.58it/s]                                                         58%|█████▊    | 7399/12776 [1:18:09<56:40,  1.58it/s] 58%|█████▊    | 7400/12776 [1:18:09<55:10,  1.62it/s]                                                       58%|█████▊    | 7400/12776 [1:18:09<55:10,  1.62it/s] 58%|█████▊    | 7401/12776 [1:18:10<51:20,  1.75it/s]                                                       58%|█████▊    | 7401/12776 [1:18:10<51:20,  1.75it/s] 58%|█████▊    | 7402/12776 [1:18:11<51:47,  1.73it/s]                                                       58%|█████▊    | 7402/12776 [1:18:11<51:47,  1.73it/s] 58%|█████▊    | 7403/12776 [1:18:11<47:56,  1.87it/s]                                                       58%|█████▊    | 7403/12776 [1:18:11<47:56,  1.87it/s] 58%|█████▊    | 7404/12776 [1:18:11<46:48,  1.91it/s]                                                       58%|█████▊    | 7404/12776 [1:18:11<46:48,  1.91it/s] 58%|█████▊    | 7405/12776 [1:18:12<43:31,  2.06it/s]                                                       58%|█████▊    | 7405/12776 [1:18:12<43:31,  2.06it/s] 58%|█████▊    | 7406/12776 [1:18:12<40:49,  2.19it/s]                                                       58%|█████▊    | 7406/12776 [1:18:12<40:49,  2.19it/s] 58%|█████▊    | 7407/12776 [1:18:13<39:04,  2.29it/s]                                                       58%|█████▊    | 7407/12776 [1:18:13<39:04,  2.29it/s] 58%|█████▊    | 7408/12776 [1:18:13<36:40,  2.44it/s]                                                       58%|█████▊    | 7408/12776 [1:18:13<36:40,  2.44it/s] 58%|█████▊    | 7409/12776 [1:18:13<34:50,  2.57it/s]                                                       58%|█████▊    | 7409/12776 [1:18:13<34:50,  2.57it/s] 58%|█████▊    | 7410/12776 [1:18:14<36:18,  2.46it/s]                                                       58%|█████▊    | 7410/12776 [1:18:14<36:18,  2.46it/s] 58%|█████▊    | 7411/12776 [1:18:14<34:12,  2.61it/s]                                                       58%|█████▊    | 7411/12776 [1:18:14<34:12,  2.61it/s] 58%|█████▊    | 7412/12776 [1:18:14<32:22,  2.76it/s]                                                       58%|█████▊    | 7412/12776 [1:18:14<32:22,  2.76it/s] 58%|█████▊    | 7413/12776 [1:18:15<30:37,  2.92it/s]                                                       58%|█████▊    | 7413/12776 [1:18:15<30:37,  2.92it/s] 58%|█████▊    | 7414/12776 [1:18:15<30:26,  2.94it/s]                                                       58%|█████▊    | 7414/12776 [1:18:15<30:26,  2.94it/s] 58%|█████▊    | 7415/12776 [1:18:15<28:55,  3.09it/s]                                                       58%|█████▊    | 7415/12776 [1:18:15<28:55,  3.09it/s] 58%|█████▊    | 7416/12776 [1:18:16<27:39,  3.23it/s]                                                       58%|█████▊    | 7416/12776 [1:18:16<27:39,  3.23it/s] 58%|█████▊    | 7417/12776 [1:18:16<26:38,  3.35it/s]                                                       58%|█████▊    | 7417/12776 [1:18:16<26:38,  3.35it/s] 58%|█████▊    | 7418/12776 [1:18:16<27:13,  3.28it/s]                                                       58%|█████▊    | 7418/12776 [1:18:16<27:13,  3.28it/s] 58%|█████▊    | 7419/12776 [1:18:16<25:55,  3.44it/s]                                                       58%|█████▊    | 7419/12776 [1:18:16<25:55,  3.44it/s] 58%|█████▊    | 7420/12776 [1:18:17<24:49,  3.60it/s]                                                       58%|█████▊    | 7420/12776 [1:18:17<24:49,  3.60it/s] 58%|█████▊    | 7421/12776 [1:18:17<23:55,  3.73it/s]                                                       58%|█████▊    | 7421/12776 [1:18:17<23:55,  3.73it/s] 58%|█████▊    | 7422/12776 [1:18:17<26:07,  3.42it/s]                                                       58%|█████▊    | 7422/12776 [1:18:17<26:07,  3.42it/s] 58%|█████▊    | 7423/12776 [1:18:18<24:28,  3.65it/s]                                                       58%|█████▊    | 7423/12776 [1:18:18<24:28,  3.65it/s] 58%|█████▊    | 7424/12776 [1:18:18<23:12,  3.84it/s]                                                       58%|█████▊    | 7424/12776 [1:18:18<23:12,  3.84it/s] 58%|█████▊    | 7425/12776 [1:18:18<22:05,  4.04it/s]                                                       58%|█████▊    | 7425/12776 [1:18:18<22:05,  4.04it/s] 58%|█████▊    | 7426/12776 [1:18:18<24:31,  3.64it/s]                                                       58%|█████▊    | 7426/12776 [1:18:18<24:31,  3.64it/s] 58%|█████▊    | 7427/12776 [1:18:19<22:49,  3.91it/s]                                                       58%|█████▊    | 7427/12776 [1:18:19<22:49,  3.91it/s] 58%|█████▊    | 7428/12776 [1:18:19<21:38,  4.12it/s]                                                       58%|█████▊    | 7428/12776 [1:18:19<21:38,  4.12it/s] 58%|█████▊    | 7429/12776 [1:18:19<20:41,  4.31it/s]                                                       58%|█████▊    | 7429/12776 [1:18:19<20:41,  4.31it/s] 58%|█████▊    | 7430/12776 [1:18:19<20:00,  4.45it/s]                                                       58%|█████▊    | 7430/12776 [1:18:19<20:00,  4.45it/s] 58%|█████▊    | 7431/12776 [1:18:20<22:48,  3.90it/s]                                                       58%|█████▊    | 7431/12776 [1:18:20<22:48,  3.90it/s] 58%|█████▊    | 7432/12776 [1:18:20<21:22,  4.17it/s]                                                       58%|█████▊    | 7432/12776 [1:18:20<21:22,  4.17it/s] 58%|█████▊    | 7433/12776 [1:18:20<20:14,  4.40it/s]                                                       58%|█████▊    | 7433/12776 [1:18:20<20:14,  4.40it/s] 58%|█████▊    | 7434/12776 [1:18:20<19:25,  4.58it/s]                                                       58%|█████▊    | 7434/12776 [1:18:20<19:25,  4.58it/s] 58%|█████▊    | 7435/12776 [1:18:20<18:42,  4.76it/s]                                                       58%|█████▊    | 7435/12776 [1:18:20<18:42,  4.76it/s] 58%|█████▊    | 7436/12776 [1:18:20<18:02,  4.93it/s]                                                       58%|█████▊    | 7436/12776 [1:18:20<18:02,  4.93it/s] 58%|█████▊    | 7437/12776 [1:18:21<19:28,  4.57it/s]                                                       58%|█████▊    | 7437/12776 [1:18:21<19:28,  4.57it/s] 58%|█████▊    | 7438/12776 [1:18:21<33:27,  2.66it/s]                                                       58%|█████▊    | 7438/12776 [1:18:21<33:27,  2.66it/s] 58%|█████▊    | 7439/12776 [1:18:23<1:03:36,  1.40it/s]                                                         58%|█████▊    | 7439/12776 [1:18:23<1:03:36,  1.40it/s] 58%|█████▊    | 7440/12776 [1:18:24<1:13:36,  1.21it/s]                                                         58%|█████▊    | 7440/12776 [1:18:24<1:13:36,  1.21it/s] 58%|█████▊    | 7441/12776 [1:18:25<1:15:20,  1.18it/s]                                                         58%|█████▊    | 7441/12776 [1:18:25<1:15:20,  1.18it/s] 58%|█████▊    | 7442/12776 [1:18:26<1:14:39,  1.19it/s]                                                         58%|█████▊    | 7442/12776 [1:18:26<1:14:39,  1.19it/s] 58%|█████▊    | 7443/12776 [1:18:27<1:15:31,  1.18it/s]                                                         58%|█████▊    | 7443/12776 [1:18:27<1:15:31,  1.18it/s] 58%|█████▊    | 7444/12776 [1:18:28<1:15:50,  1.17it/s]                                                         58%|█████▊    | 7444/12776 [1:18:28<1:15:50,  1.17it/s] 58%|█████▊    | 7445/12776 [1:18:28<1:11:07,  1.25it/s]                                                         58%|█████▊    | 7445/12776 [1:18:28<1:11:07,  1.25it/s] 58%|█████▊    | 7446/12776 [1:18:29<1:09:46,  1.27it/s]                                                         58%|█████▊    | 7446/12776 [1:18:29<1:09:46,  1.27it/s] 58%|█████▊    | 7447/12776 [1:18:30<1:05:37,  1.35it/s]                                                         58%|█████▊    | 7447/12776 [1:18:30<1:05:37,  1.35it/s] 58%|█████▊    | 7448/12776 [1:18:30<1:01:53,  1.43it/s]                                                         58%|█████▊    | 7448/12776 [1:18:30<1:01:53,  1.43it/s] 58%|█████▊    | 7449/12776 [1:18:31<58:28,  1.52it/s]                                                         58%|█████▊    | 7449/12776 [1:18:31<58:28,  1.52it/s] 58%|█████▊    | 7450/12776 [1:18:31<55:37,  1.60it/s]                                                       58%|█████▊    | 7450/12776 [1:18:31<55:37,  1.60it/s] 58%|█████▊    | 7451/12776 [1:18:32<52:58,  1.68it/s]                                                       58%|█████▊    | 7451/12776 [1:18:32<52:58,  1.68it/s] 58%|█████▊    | 7452/12776 [1:18:32<52:04,  1.70it/s]                                                       58%|█████▊    | 7452/12776 [1:18:32<52:04,  1.70it/s] 58%|█████▊    | 7453/12776 [1:18:33<49:01,  1.81it/s]                                                       58%|█████▊    | 7453/12776 [1:18:33<49:01,  1.81it/s] 58%|█████▊    | 7454/12776 [1:18:33<48:49,  1.82it/s]                                                       58%|█████▊    | 7454/12776 [1:18:33<48:49,  1.82it/s] 58%|█████▊    | 7455/12776 [1:18:34<45:43,  1.94it/s]                                                       58%|█████▊    | 7455/12776 [1:18:34<45:43,  1.94it/s] 58%|█████▊    | 7456/12776 [1:18:34<46:25,  1.91it/s]                                                       58%|█████▊    | 7456/12776 [1:18:34<46:25,  1.91it/s] 58%|█████▊    | 7457/12776 [1:18:35<43:07,  2.06it/s]                                                       58%|█████▊    | 7457/12776 [1:18:35<43:07,  2.06it/s] 58%|█████▊    | 7458/12776 [1:18:35<40:27,  2.19it/s]                                                       58%|█████▊    | 7458/12776 [1:18:35<40:27,  2.19it/s] 58%|█████▊    | 7459/12776 [1:18:36<38:57,  2.27it/s]                                                       58%|█████▊    | 7459/12776 [1:18:36<38:57,  2.27it/s] 58%|█████▊    | 7460/12776 [1:18:36<36:40,  2.42it/s]                                                       58%|█████▊    | 7460/12776 [1:18:36<36:40,  2.42it/s] 58%|█████▊    | 7461/12776 [1:18:36<34:55,  2.54it/s]                                                       58%|█████▊    | 7461/12776 [1:18:36<34:55,  2.54it/s] 58%|█████▊    | 7462/12776 [1:18:37<36:03,  2.46it/s]                                                      {'loss': 0.3714, 'grad_norm': 1.5366709232330322, 'learning_rate': 0.00013201368523949167, 'epoch': 1.16}
+{'loss': 0.5333, 'grad_norm': 2.130624771118164, 'learning_rate': 0.00013198924731182794, 'epoch': 1.16}
+{'loss': 0.1698, 'grad_norm': 0.982833743095398, 'learning_rate': 0.00013196480938416422, 'epoch': 1.16}
+{'loss': 1.0673, 'grad_norm': 3.6173794269561768, 'learning_rate': 0.00013194037145650047, 'epoch': 1.16}
+{'loss': 0.2432, 'grad_norm': 0.45585983991622925, 'learning_rate': 0.00013191593352883675, 'epoch': 1.16}
+{'loss': 0.2334, 'grad_norm': 0.5601482391357422, 'learning_rate': 0.000131891495601173, 'epoch': 1.16}
+{'loss': 0.2739, 'grad_norm': 0.5899141430854797, 'learning_rate': 0.00013186705767350928, 'epoch': 1.16}
+{'loss': 0.1941, 'grad_norm': 0.6862410306930542, 'learning_rate': 0.00013184261974584553, 'epoch': 1.16}
+{'loss': 0.2567, 'grad_norm': 0.8341695070266724, 'learning_rate': 0.0001318181818181818, 'epoch': 1.16}
+{'loss': 0.3397, 'grad_norm': 0.7754533290863037, 'learning_rate': 0.00013179374389051806, 'epoch': 1.16}
+{'loss': 0.3212, 'grad_norm': 1.1983238458633423, 'learning_rate': 0.00013176930596285434, 'epoch': 1.16}
+{'loss': 0.2555, 'grad_norm': 0.5611344575881958, 'learning_rate': 0.0001317448680351906, 'epoch': 1.16}
+{'loss': 0.2091, 'grad_norm': 0.4834452271461487, 'learning_rate': 0.00013172043010752687, 'epoch': 1.16}
+{'loss': 0.3318, 'grad_norm': 1.169364333152771, 'learning_rate': 0.00013169599217986315, 'epoch': 1.16}
+{'loss': 0.1914, 'grad_norm': 0.8401133418083191, 'learning_rate': 0.0001316715542521994, 'epoch': 1.16}
+{'loss': 0.4167, 'grad_norm': 1.0948725938796997, 'learning_rate': 0.00013164711632453565, 'epoch': 1.16}
+{'loss': 0.3204, 'grad_norm': 0.6502231955528259, 'learning_rate': 0.00013162267839687193, 'epoch': 1.16}
+{'loss': 0.2374, 'grad_norm': 0.8095880746841431, 'learning_rate': 0.0001315982404692082, 'epoch': 1.16}
+{'loss': 0.3654, 'grad_norm': 1.0704963207244873, 'learning_rate': 0.00013157380254154446, 'epoch': 1.16}
+{'loss': 0.3836, 'grad_norm': 1.0541001558303833, 'learning_rate': 0.00013154936461388074, 'epoch': 1.16}
+{'loss': 0.4202, 'grad_norm': 0.9816923141479492, 'learning_rate': 0.000131524926686217, 'epoch': 1.16}
+{'loss': 0.2639, 'grad_norm': 1.2711502313613892, 'learning_rate': 0.00013150048875855327, 'epoch': 1.16}
+{'loss': 0.3089, 'grad_norm': 1.2869318723678589, 'learning_rate': 0.00013147605083088955, 'epoch': 1.16}
+{'loss': 0.3307, 'grad_norm': 1.1554820537567139, 'learning_rate': 0.0001314516129032258, 'epoch': 1.16}
+{'loss': 0.8516, 'grad_norm': 1.7759408950805664, 'learning_rate': 0.00013142717497556205, 'epoch': 1.16}
+{'loss': 0.3851, 'grad_norm': 1.0223851203918457, 'learning_rate': 0.00013140273704789833, 'epoch': 1.16}
+{'loss': 0.4526, 'grad_norm': 1.761975646018982, 'learning_rate': 0.0001313782991202346, 'epoch': 1.16}
+{'loss': 0.43, 'grad_norm': 0.7343161702156067, 'learning_rate': 0.00013135386119257086, 'epoch': 1.16}
+{'loss': 0.7205, 'grad_norm': 1.6459717750549316, 'learning_rate': 0.00013132942326490713, 'epoch': 1.16}
+{'loss': 0.5562, 'grad_norm': 1.6329326629638672, 'learning_rate': 0.00013130498533724339, 'epoch': 1.16}
+{'loss': 0.7984, 'grad_norm': 2.7363922595977783, 'learning_rate': 0.00013128054740957964, 'epoch': 1.16}
+{'loss': 0.5795, 'grad_norm': 1.381791114807129, 'learning_rate': 0.00013125610948191592, 'epoch': 1.16}
+{'loss': 0.3651, 'grad_norm': 1.932962417602539, 'learning_rate': 0.0001312316715542522, 'epoch': 1.16}
+{'loss': 0.4857, 'grad_norm': 1.733339786529541, 'learning_rate': 0.00013120723362658845, 'epoch': 1.16}
+{'loss': 0.5785, 'grad_norm': 1.2550915479660034, 'learning_rate': 0.00013118279569892472, 'epoch': 1.16}
+{'loss': 0.4028, 'grad_norm': 1.603405475616455, 'learning_rate': 0.00013115835777126097, 'epoch': 1.16}
+{'loss': 0.4517, 'grad_norm': 0.9606602191925049, 'learning_rate': 0.00013113391984359725, 'epoch': 1.16}
+{'loss': 0.6706, 'grad_norm': 2.063321590423584, 'learning_rate': 0.00013110948191593353, 'epoch': 1.16}
+{'loss': 0.7549, 'grad_norm': 2.597672462463379, 'learning_rate': 0.00013108504398826978, 'epoch': 1.16}
+{'loss': 0.9992, 'grad_norm': 3.000110149383545, 'learning_rate': 0.00013106060606060603, 'epoch': 1.16}
+{'loss': 1.199, 'grad_norm': 2.6195647716522217, 'learning_rate': 0.0001310361681329423, 'epoch': 1.16}
+{'loss': 0.9195, 'grad_norm': 3.0307538509368896, 'learning_rate': 0.0001310117302052786, 'epoch': 1.16}
+{'loss': 1.1316, 'grad_norm': 3.3168554306030273, 'learning_rate': 0.00013098729227761484, 'epoch': 1.16}
+{'loss': 1.2571, 'grad_norm': 2.105064630508423, 'learning_rate': 0.00013096285434995112, 'epoch': 1.16}
+{'loss': 0.9023, 'grad_norm': 3.6035208702087402, 'learning_rate': 0.00013093841642228737, 'epoch': 1.16}
+{'loss': 1.6656, 'grad_norm': 2.2905807495117188, 'learning_rate': 0.00013091397849462365, 'epoch': 1.16}
+{'loss': 1.6509, 'grad_norm': 2.3140320777893066, 'learning_rate': 0.00013088954056695993, 'epoch': 1.16}
+{'loss': 1.0378, 'grad_norm': 4.755015850067139, 'learning_rate': 0.00013086510263929618, 'epoch': 1.16}
+{'loss': 0.926, 'grad_norm': 1.8031630516052246, 'learning_rate': 0.00013084066471163243, 'epoch': 1.16}
+{'loss': 0.3366, 'grad_norm': 1.6042096614837646, 'learning_rate': 0.0001308162267839687, 'epoch': 1.16}
+{'loss': 0.8145, 'grad_norm': 1.8584673404693604, 'learning_rate': 0.000130791788856305, 'epoch': 1.16}
+{'loss': 0.6222, 'grad_norm': 2.789856433868408, 'learning_rate': 0.00013076735092864124, 'epoch': 1.16}
+{'loss': 0.7207, 'grad_norm': 2.3871476650238037, 'learning_rate': 0.00013074291300097752, 'epoch': 1.16}
+{'loss': 0.6828, 'grad_norm': 2.0751960277557373, 'learning_rate': 0.00013071847507331377, 'epoch': 1.16}
+{'loss': 0.183, 'grad_norm': 0.985443115234375, 'learning_rate': 0.00013069403714565002, 'epoch': 1.16}
+{'loss': 0.2147, 'grad_norm': 0.46885955333709717, 'learning_rate': 0.0001306695992179863, 'epoch': 1.16}
+{'loss': 0.2076, 'grad_norm': 0.5241273641586304, 'learning_rate': 0.00013064516129032258, 'epoch': 1.16}
+{'loss': 0.1665, 'grad_norm': 0.5375959873199463, 'learning_rate': 0.00013062072336265883, 'epoch': 1.16}
+{'loss': 0.2586, 'grad_norm': 0.7745464444160461, 'learning_rate': 0.0001305962854349951, 'epoch': 1.17}
+{'loss': 0.2376, 'grad_norm': 0.511017382144928, 'learning_rate': 0.00013057184750733136, 'epoch': 1.17}
+{'loss': 0.3418, 'grad_norm': 0.8513729572296143, 'learning_rate': 0.00013054740957966764, 'epoch': 1.17}
+{'loss': 0.3008, 'grad_norm': 0.9605493545532227, 'learning_rate': 0.00013052297165200391, 'epoch': 1.17}
+{'loss': 0.2765, 'grad_norm': 0.7531507015228271, 'learning_rate': 0.00013049853372434016, 'epoch': 1.17}
+{'loss': 0.3855, 'grad_norm': 0.8994906544685364, 'learning_rate': 0.00013047409579667642, 'epoch': 1.17}
+{'loss': 0.2462, 'grad_norm': 0.48256492614746094, 'learning_rate': 0.0001304496578690127, 'epoch': 1.17}
+{'loss': 0.3641, 'grad_norm': 1.1383583545684814, 'learning_rate': 0.00013042521994134897, 'epoch': 1.17}
+{'loss': 0.2236, 'grad_norm': 1.2914742231369019, 'learning_rate': 0.00013040078201368522, 'epoch': 1.17}
+{'loss': 0.2485, 'grad_norm': 0.5207766890525818, 'learning_rate': 0.0001303763440860215, 'epoch': 1.17}
+{'loss': 0.3618, 'grad_norm': 1.1940803527832031, 'learning_rate': 0.00013035190615835775, 'epoch': 1.17}
+{'loss': 0.7458, 'grad_norm': 2.0108392238616943, 'learning_rate': 0.00013032746823069403, 'epoch': 1.17}
+{'loss': 0.22, 'grad_norm': 1.687880277633667, 'learning_rate': 0.0001303030303030303, 'epoch': 1.17}
+{'loss': 0.3396, 'grad_norm': 2.061474084854126, 'learning_rate': 0.00013027859237536656, 'epoch': 1.17}
+{'loss': 0.3342, 'grad_norm': 0.789428174495697, 'learning_rate': 0.0001302541544477028, 'epoch': 1.17}
+{'loss': 0.4941, 'grad_norm': 1.0013548135757446, 'learning_rate': 0.0001302297165200391, 'epoch': 1.17}
+{'loss': 0.4534, 'grad_norm': 1.2764099836349487, 'learning_rate': 0.00013020527859237534, 'epoch': 1.17}
+{'loss': 0.9874, 'grad_norm': 1.8446581363677979, 'learning_rate': 0.00013018084066471162, 'epoch': 1.17}
+{'loss': 0.2257, 'grad_norm': 1.0535941123962402, 'learning_rate': 0.0001301564027370479, 'epoch': 1.17}
+ 58%|█████▊    | 7462/12776 [1:18:37<36:03,  2.46it/s] 58%|█████▊    | 7463/12776 [1:18:37<33:54,  2.61it/s]                                                       58%|█████▊    | 7463/12776 [1:18:37<33:54,  2.61it/s] 58%|█████▊    | 7464/12776 [1:18:37<31:54,  2.77it/s]                                                       58%|█████▊    | 7464/12776 [1:18:37<31:54,  2.77it/s] 58%|█████▊    | 7465/12776 [1:18:38<30:14,  2.93it/s]                                                       58%|█████▊    | 7465/12776 [1:18:38<30:14,  2.93it/s] 58%|█████▊    | 7466/12776 [1:18:38<30:49,  2.87it/s]                                                       58%|█████▊    | 7466/12776 [1:18:38<30:49,  2.87it/s] 58%|█████▊    | 7467/12776 [1:18:38<29:02,  3.05it/s]                                                       58%|█████▊    | 7467/12776 [1:18:38<29:02,  3.05it/s] 58%|█████▊    | 7468/12776 [1:18:39<27:33,  3.21it/s]                                                       58%|█████▊    | 7468/12776 [1:18:39<27:33,  3.21it/s] 58%|█████▊    | 7469/12776 [1:18:39<26:22,  3.35it/s]                                                       58%|█████▊    | 7469/12776 [1:18:39<26:22,  3.35it/s] 58%|█████▊    | 7470/12776 [1:18:39<26:54,  3.29it/s]                                                       58%|█████▊    | 7470/12776 [1:18:39<26:54,  3.29it/s] 58%|█████▊    | 7471/12776 [1:18:39<25:34,  3.46it/s]                                                       58%|█████▊    | 7471/12776 [1:18:39<25:34,  3.46it/s] 58%|█████▊    | 7472/12776 [1:18:40<24:36,  3.59it/s]                                                       58%|█████▊    | 7472/12776 [1:18:40<24:36,  3.59it/s] 58%|█████▊    | 7473/12776 [1:18:40<23:38,  3.74it/s]                                                       58%|█████▊    | 7473/12776 [1:18:40<23:38,  3.74it/s] 59%|█████▊    | 7474/12776 [1:18:40<25:16,  3.50it/s]                                                       59%|█████▊    | 7474/12776 [1:18:40<25:16,  3.50it/s] 59%|█████▊    | 7475/12776 [1:18:40<23:45,  3.72it/s]                                                       59%|█████▊    | 7475/12776 [1:18:40<23:45,  3.72it/s] 59%|█████▊    | 7476/12776 [1:18:41<22:26,  3.94it/s]                                                       59%|█████▊    | 7476/12776 [1:18:41<22:26,  3.94it/s] 59%|█████▊    | 7477/12776 [1:18:41<21:19,  4.14it/s]                                                       59%|█████▊    | 7477/12776 [1:18:41<21:19,  4.14it/s] 59%|█████▊    | 7478/12776 [1:18:41<20:22,  4.33it/s]                                                       59%|█████▊    | 7478/12776 [1:18:41<20:22,  4.33it/s] 59%|█████▊    | 7479/12776 [1:18:41<22:44,  3.88it/s]                                                       59%|█████▊    | 7479/12776 [1:18:41<22:44,  3.88it/s] 59%|█████▊    | 7480/12776 [1:18:42<21:20,  4.13it/s]                                                       59%|█████▊    | 7480/12776 [1:18:42<21:20,  4.13it/s] 59%|█████▊    | 7481/12776 [1:18:42<20:13,  4.37it/s]                                                       59%|█████▊    | 7481/12776 [1:18:42<20:13,  4.37it/s] 59%|█████▊    | 7482/12776 [1:18:42<19:20,  4.56it/s]                                                       59%|█████▊    | 7482/12776 [1:18:42<19:20,  4.56it/s] 59%|█████▊    | 7483/12776 [1:18:42<18:44,  4.71it/s]                                                       59%|█████▊    | 7483/12776 [1:18:42<18:44,  4.71it/s] 59%|█████▊    | 7484/12776 [1:18:43<20:49,  4.23it/s]                                                       59%|█████▊    | 7484/12776 [1:18:43<20:49,  4.23it/s] 59%|█████▊    | 7485/12776 [1:18:43<19:37,  4.49it/s]                                                       59%|█████▊    | 7485/12776 [1:18:43<19:37,  4.49it/s] 59%|█████▊    | 7486/12776 [1:18:43<18:39,  4.73it/s]                                                       59%|█████▊    | 7486/12776 [1:18:43<18:39,  4.73it/s] 59%|█████▊    | 7487/12776 [1:18:43<17:58,  4.91it/s]                                                       59%|█████▊    | 7487/12776 [1:18:43<17:58,  4.91it/s] 59%|█████▊    | 7488/12776 [1:18:44<31:19,  2.81it/s]                                                       59%|█████▊    | 7488/12776 [1:18:44<31:19,  2.81it/s] 59%|█████▊    | 7489/12776 [1:18:45<1:01:38,  1.43it/s]                                                         59%|█████▊    | 7489/12776 [1:18:45<1:01:38,  1.43it/s] 59%|█████▊    | 7490/12776 [1:18:46<1:13:54,  1.19it/s]                                                         59%|█████▊    | 7490/12776 [1:18:46<1:13:54,  1.19it/s] 59%|█████▊    | 7491/12776 [1:18:47<1:16:55,  1.14it/s]                                                         59%|█████▊    | 7491/12776 [1:18:47<1:16:55,  1.14it/s] 59%|█████▊    | 7492/12776 [1:18:48<1:15:54,  1.16it/s]                                                         59%|█████▊    | 7492/12776 [1:18:48<1:15:54,  1.16it/s] 59%|█████▊    | 7493/12776 [1:18:49<1:13:32,  1.20it/s]                                                         59%|█████▊    | 7493/12776 [1:18:49<1:13:32,  1.20it/s] 59%|█████▊    | 7494/12776 [1:18:50<1:11:03,  1.24it/s]                                                         59%|█████▊    | 7494/12776 [1:18:50<1:11:03,  1.24it/s] 59%|█████▊    | 7495/12776 [1:18:50<1:09:30,  1.27it/s]                                                         59%|█████▊    | 7495/12776 [1:18:50<1:09:30,  1.27it/s] 59%|█████▊    | 7496/12776 [1:18:51<1:06:05,  1.33it/s]                                                         59%|█████▊    | 7496/12776 [1:18:51<1:06:05,  1.33it/s] 59%|█████▊    | 7497/12776 [1:18:52<1:06:06,  1.33it/s]                                                         59%|█████▊    | 7497/12776 [1:18:52<1:06:06,  1.33it/s] 59%|█████▊    | 7498/12776 [1:18:53<1:02:18,  1.41it/s]                                                         59%|█████▊    | 7498/12776 [1:18:53<1:02:18,  1.41it/s] 59%|█████▊    | 7499/12776 [1:18:53<58:59,  1.49it/s]                                                         59%|█████▊    | 7499/12776 [1:18:53<58:59,  1.49it/s] 59%|█████▊    | 7500/12776 [1:18:54<55:16,  1.59it/s]                                                       59%|█████▊    | 7500/12776 [1:18:54<55:16,  1.59it/s] 59%|█████▊    | 7501/12776 [1:18:54<54:27,  1.61it/s]                                                       59%|█████▊    | 7501/12776 [1:18:54<54:27,  1.61it/s] 59%|█████▊    | 7502/12776 [1:18:55<50:30,  1.74it/s]                                                       59%|█████▊    | 7502/12776 [1:18:55<50:30,  1.74it/s] 59%|█████▊    | 7503/12776 [1:18:55<49:49,  1.76it/s]                                                       59%|█████▊    | 7503/12776 [1:18:55<49:49,  1.76it/s] 59%|█████▊    | 7504/12776 [1:18:56<46:26,  1.89it/s]                                                       59%|█████▊    | 7504/12776 [1:18:56<46:26,  1.89it/s] 59%|█████▊    | 7505/12776 [1:18:56<46:35,  1.89it/s]                                                       59%|█████▊    | 7505/12776 [1:18:56<46:35,  1.89it/s] 59%|█████▉    | 7506/12776 [1:18:57<43:26,  2.02it/s]                                                       59%|█████▉    | 7506/12776 [1:18:57<43:26,  2.02it/s] 59%|█████▉    | 7507/12776 [1:18:57<40:55,  2.15it/s]                                                       59%|█████▉    | 7507/12776 [1:18:57<40:55,  2.15it/s] 59%|█████▉    | 7508/12776 [1:18:58<41:13,  2.13it/s]                                                       59%|█████▉    | 7508/12776 [1:18:58<41:13,  2.13it/s] 59%|█████▉    | 7509/12776 [1:18:58<38:39,  2.27it/s]                                                       59%|█████▉    | 7509/12776 [1:18:58<38:39,  2.27it/s] 59%|█████▉    | 7510/12776 [1:18:58<36:20,  2.41it/s]                                                       59%|█████▉    | 7510/12776 [1:18:58<36:20,  2.41it/s] 59%|█████▉    | 7511/12776 [1:18:59<36:08,  2.43it/s]                                                       59%|█████▉    | 7511/12776 [1:18:59<36:08,  2.43it/s] 59%|█████▉    | 7512/12776 [1:18:59<34:09,  2.57it/s]                                                       59%|█████▉    | 7512/12776 [1:18:59<34:09,  2.57it/s] 59%|█████▉    | 7513/12776 [1:18:59<32:33,  2.69it/s]                                                       59%|█████▉    | 7513/12776 [1:18:59<32:33,  2.69it/s] 59%|█████▉    | 7514/12776 [1:19:00<33:38,  2.61it/s]                                                       59%|█████▉    | 7514/12776 [1:19:00<33:38,  2.61it/s] 59%|█████▉    | 7515/12776 [1:19:00<31:32,  2.78it/s]                                                       59%|█████▉    | 7515/12776 [1:19:00<31:32,  2.78it/s] 59%|█████▉    | 7516/12776 [1:19:00<29:48,  2.94it/s]                                                       59%|█████▉    | 7516/12776 [1:19:00<29:48,  2.94it/s] 59%|█████▉    | 7517/12776 [1:19:01<31:15,  2.80it/s]                                                       59%|█████▉    | 7517/12776 [1:19:01<31:15,  2.80it/s] 59%|█████▉    | 7518/12776 [1:19:01<29:03,  3.02it/s]                                                       59%|█████▉    | 7518/12776 [1:19:01<29:03,  3.02it/s] 59%|█████▉    | 7519/12776 [1:19:01<27:20,  3.20it/s]                                                       59%|█████▉    | 7519/12776 [1:19:01<27:20,  3.20it/s] 59%|█████▉    | 7520/12776 [1:19:02<26:00,  3.37it/s]                                                       59%|█████▉    | 7520/12776 [1:19:02<26:00,  3.37it/s] 59%|█████▉    | 7521/12776 [1:19:02<27:04,  3.24it/s]                                                       59%|█████▉    | 7521/12776 [1:19:02<27:04,  3.24it/s] 59%|█████▉    | 7522/12776 [1:19:02<25:30,  3.43it/s]                                                       59%|█████▉    | 7522/12776 [1:19:02<25:30,  3.43it/s] 59%|█████▉    | 7523/12776 [1:19:02<24:17,  3.61it/s]                                                       59%|█████▉    | 7523/12776 [1:19:02<24:17,  3.61it/s] 59%|█████▉    | 7524/12776 [1:19:03<23:19,  3.75it/s]                                                       59%|█████▉    | 7524/12776 [1:19:03<23:19,  3.75it/s] 59%|█████▉    | 7525/12776 [1:19:03<22:29,  3.89it/s]                                                       59%|█████▉    | 7525/12776 [1:19:03<22:29,  3.89it/s] 59%|█████▉    | 7526/12776 [1:19:03<23:23,  3.74it/s]                                                       59%|█████▉    | 7526/12776 [1:19:03<23:23,  3.74it/s] 59%|█████▉    | 7527/12776 [1:19:03<22:06,  3.96it/s]                                                       59%|█████▉    | 7527/12776 [1:19:03<22:06,  3.96it/s] 59%|█████▉    | 7528/12776 [1:19:04<21:04,  4.15it/s]                                                       59%|█████▉    | 7528/12776 [1:19:04<21:04,  4.15it/s] 59%|█████▉    | 7529/12776 [1:19:04<20:14,  4.32it/s]                                                       59%|█████▉    | 7529/12776 [1:19:04<20:14,  4.32it/s] 59%|█████▉    | 7530/12776 [1:19:04<19:40,  4.44it/s]                                                       59%|█████▉    | 7530/12776 [1:19:04<19:40,  4.44it/s] 59%|█████▉    | 7531/12776 [1:19:04<21:43,  4.02it/s]                                                       59%|█████▉    | 7531/12776 [1:19:04<21:43,  4.02it/s] 59%|█████▉    | 7532/12776 [1:19:04<20:27,  4.27it/s]                                                       59%|█████▉    | 7532/12776 [1:19:04<20:27,  4.27it/s] 59%|█████▉    | 7533/12776 [1:19:05<19:32,  4.47it/s]                                                       59%|█████▉    | 7533/12776 [1:19:05<19:32,  4.47it/s] 59%|█████▉    | 7534/12776 [1:19:05<18:45,  4.66it/s]                                                       59%|█████▉    | 7534/12776 [1:19:05<18:45,  4.66it/s] 59%|█████▉    | 7535/12776 [1:19:05<18:11,  4.80it/s]                                                       59%|█████▉    | 7535/12776 [1:19:05<18:11,  4.80it/s] 59%|█████▉    | 7536/12776 [1:19:05<19:19,  4.52it/s]                                                       59%|█████▉    | 7536/12776 [1:19:05<19:19,  4.52it/s] 59%|█████▉    | 7537/12776 [1:19:05<18:25,  4.74it/s]                                                       59%|█████▉    | 7537/12776 [1:19:05<18:25,  4.74it/s] 59%|█████▉    | 7538/12776 [1:19:06<32:55,  2.65it/s]                                                       59%|█████▉    | 7538/12776 [1:19:06<32:55,  2.65it/s] 59%|█████▉    | 7539/12776 [1:19:08<1:02:38,  1.39it/s]                                                        {'loss': 0.9646, 'grad_norm': 2.2027220726013184, 'learning_rate': 0.00013013196480938415, 'epoch': 1.17}
+{'loss': 0.5224, 'grad_norm': 1.3150688409805298, 'learning_rate': 0.0001301075268817204, 'epoch': 1.17}
+{'loss': 0.5533, 'grad_norm': 1.6204612255096436, 'learning_rate': 0.00013008308895405668, 'epoch': 1.17}
+{'loss': 1.438, 'grad_norm': 7.107356548309326, 'learning_rate': 0.00013005865102639296, 'epoch': 1.17}
+{'loss': 0.5204, 'grad_norm': 2.1435375213623047, 'learning_rate': 0.0001300342130987292, 'epoch': 1.17}
+{'loss': 0.8244, 'grad_norm': 2.206997871398926, 'learning_rate': 0.0001300097751710655, 'epoch': 1.17}
+{'loss': 0.5618, 'grad_norm': 2.2694783210754395, 'learning_rate': 0.00012998533724340174, 'epoch': 1.17}
+{'loss': 1.1019, 'grad_norm': 2.4387333393096924, 'learning_rate': 0.00012996089931573802, 'epoch': 1.17}
+{'loss': 0.4208, 'grad_norm': 1.3861206769943237, 'learning_rate': 0.0001299364613880743, 'epoch': 1.17}
+{'loss': 0.6122, 'grad_norm': 1.4845746755599976, 'learning_rate': 0.00012991202346041055, 'epoch': 1.17}
+{'loss': 0.7805, 'grad_norm': 3.262490749359131, 'learning_rate': 0.0001298875855327468, 'epoch': 1.17}
+{'loss': 0.5199, 'grad_norm': 2.262986421585083, 'learning_rate': 0.00012986314760508308, 'epoch': 1.17}
+{'loss': 0.8374, 'grad_norm': 1.8733512163162231, 'learning_rate': 0.00012983870967741936, 'epoch': 1.17}
+{'loss': 0.7151, 'grad_norm': 1.8209384679794312, 'learning_rate': 0.0001298142717497556, 'epoch': 1.17}
+{'loss': 0.9015, 'grad_norm': 1.9471930265426636, 'learning_rate': 0.00012978983382209188, 'epoch': 1.17}
+{'loss': 0.5801, 'grad_norm': 2.2029271125793457, 'learning_rate': 0.00012976539589442814, 'epoch': 1.17}
+{'loss': 1.2584, 'grad_norm': 2.9882936477661133, 'learning_rate': 0.00012974095796676441, 'epoch': 1.17}
+{'loss': 0.8488, 'grad_norm': 2.2298552989959717, 'learning_rate': 0.0001297165200391007, 'epoch': 1.17}
+{'loss': 0.9486, 'grad_norm': 2.1507680416107178, 'learning_rate': 0.00012969208211143694, 'epoch': 1.17}
+{'loss': 1.1718, 'grad_norm': 2.448796033859253, 'learning_rate': 0.0001296676441837732, 'epoch': 1.17}
+{'loss': 1.2157, 'grad_norm': 1.807196021080017, 'learning_rate': 0.00012964320625610947, 'epoch': 1.17}
+{'loss': 1.6794, 'grad_norm': 1.9326443672180176, 'learning_rate': 0.00012961876832844572, 'epoch': 1.17}
+{'loss': 1.0942, 'grad_norm': 2.4775540828704834, 'learning_rate': 0.000129594330400782, 'epoch': 1.17}
+{'loss': 0.9137, 'grad_norm': 3.0577571392059326, 'learning_rate': 0.00012956989247311828, 'epoch': 1.17}
+{'loss': 1.0002, 'grad_norm': 1.4351956844329834, 'learning_rate': 0.00012954545454545453, 'epoch': 1.17}
+{'loss': 0.6121, 'grad_norm': 2.7734296321868896, 'learning_rate': 0.00012952101661779078, 'epoch': 1.17}
+{'loss': 0.6366, 'grad_norm': 3.1361334323883057, 'learning_rate': 0.00012949657869012706, 'epoch': 1.17}
+{'loss': 0.2406, 'grad_norm': 0.458670049905777, 'learning_rate': 0.00012947214076246334, 'epoch': 1.17}
+{'loss': 0.1887, 'grad_norm': 0.4541315734386444, 'learning_rate': 0.0001294477028347996, 'epoch': 1.17}
+{'loss': 0.6726, 'grad_norm': 1.0763859748840332, 'learning_rate': 0.00012942326490713587, 'epoch': 1.17}
+{'loss': 0.2505, 'grad_norm': 0.6392644643783569, 'learning_rate': 0.00012939882697947212, 'epoch': 1.17}
+{'loss': 0.2844, 'grad_norm': 0.6221238374710083, 'learning_rate': 0.0001293743890518084, 'epoch': 1.17}
+{'loss': 0.2483, 'grad_norm': 0.6850442886352539, 'learning_rate': 0.00012934995112414468, 'epoch': 1.17}
+{'loss': 0.2442, 'grad_norm': 0.808659553527832, 'learning_rate': 0.00012932551319648093, 'epoch': 1.17}
+{'loss': 0.3581, 'grad_norm': 0.7983716130256653, 'learning_rate': 0.00012930107526881718, 'epoch': 1.17}
+{'loss': 0.4542, 'grad_norm': 0.5840036273002625, 'learning_rate': 0.00012927663734115346, 'epoch': 1.17}
+{'loss': 0.238, 'grad_norm': 0.5525220036506653, 'learning_rate': 0.00012925219941348974, 'epoch': 1.17}
+{'loss': 0.4005, 'grad_norm': 1.0121021270751953, 'learning_rate': 0.000129227761485826, 'epoch': 1.17}
+{'loss': 0.3244, 'grad_norm': 2.506730556488037, 'learning_rate': 0.00012920332355816227, 'epoch': 1.17}
+{'loss': 0.2825, 'grad_norm': 0.6981834173202515, 'learning_rate': 0.00012917888563049852, 'epoch': 1.17}
+{'loss': 0.2646, 'grad_norm': 1.0318002700805664, 'learning_rate': 0.0001291544477028348, 'epoch': 1.17}
+{'loss': 0.3837, 'grad_norm': 1.0581806898117065, 'learning_rate': 0.00012913000977517108, 'epoch': 1.17}
+{'loss': 0.3138, 'grad_norm': 1.221384048461914, 'learning_rate': 0.00012910557184750733, 'epoch': 1.17}
+{'loss': 0.241, 'grad_norm': 0.9835034608840942, 'learning_rate': 0.00012908113391984358, 'epoch': 1.17}
+{'loss': 0.2297, 'grad_norm': 0.9940537810325623, 'learning_rate': 0.00012905669599217986, 'epoch': 1.18}
+{'loss': 0.4561, 'grad_norm': 0.7861664891242981, 'learning_rate': 0.0001290322580645161, 'epoch': 1.18}
+{'loss': 0.6284, 'grad_norm': 1.2852720022201538, 'learning_rate': 0.00012900782013685239, 'epoch': 1.18}
+{'loss': 0.5136, 'grad_norm': 1.645680546760559, 'learning_rate': 0.00012898338220918866, 'epoch': 1.18}
+{'loss': 0.4755, 'grad_norm': 1.9410187005996704, 'learning_rate': 0.00012895894428152492, 'epoch': 1.18}
+{'loss': 0.3838, 'grad_norm': 0.807033360004425, 'learning_rate': 0.00012893450635386117, 'epoch': 1.18}
+{'loss': 0.3806, 'grad_norm': 3.5870234966278076, 'learning_rate': 0.00012891006842619744, 'epoch': 1.18}
+{'loss': 0.443, 'grad_norm': 1.55286705493927, 'learning_rate': 0.00012888563049853372, 'epoch': 1.18}
+{'loss': 0.8595, 'grad_norm': 1.928011417388916, 'learning_rate': 0.00012886119257086997, 'epoch': 1.18}
+{'loss': 0.503, 'grad_norm': 1.085798978805542, 'learning_rate': 0.00012883675464320625, 'epoch': 1.18}
+{'loss': 0.4457, 'grad_norm': 1.993884801864624, 'learning_rate': 0.0001288123167155425, 'epoch': 1.18}
+{'loss': 0.6351, 'grad_norm': 1.3057135343551636, 'learning_rate': 0.00012878787878787878, 'epoch': 1.18}
+{'loss': 0.8021, 'grad_norm': 2.9822328090667725, 'learning_rate': 0.00012876344086021506, 'epoch': 1.18}
+{'loss': 0.5327, 'grad_norm': 1.463084101676941, 'learning_rate': 0.0001287390029325513, 'epoch': 1.18}
+{'loss': 0.4669, 'grad_norm': 1.026001214981079, 'learning_rate': 0.00012871456500488756, 'epoch': 1.18}
+{'loss': 0.4046, 'grad_norm': 0.9538169503211975, 'learning_rate': 0.00012869012707722384, 'epoch': 1.18}
+{'loss': 0.5993, 'grad_norm': 1.1642636060714722, 'learning_rate': 0.00012866568914956012, 'epoch': 1.18}
+{'loss': 0.7352, 'grad_norm': 1.9186804294586182, 'learning_rate': 0.00012864125122189637, 'epoch': 1.18}
+{'loss': 0.8638, 'grad_norm': 2.1181206703186035, 'learning_rate': 0.00012861681329423265, 'epoch': 1.18}
+{'loss': 0.9885, 'grad_norm': 2.4149346351623535, 'learning_rate': 0.0001285923753665689, 'epoch': 1.18}
+{'loss': 0.8638, 'grad_norm': 1.8316506147384644, 'learning_rate': 0.00012856793743890518, 'epoch': 1.18}
+{'loss': 0.8473, 'grad_norm': 1.9527243375778198, 'learning_rate': 0.00012854349951124143, 'epoch': 1.18}
+{'loss': 1.134, 'grad_norm': 2.2546331882476807, 'learning_rate': 0.0001285190615835777, 'epoch': 1.18}
+{'loss': 1.0515, 'grad_norm': 1.679229497909546, 'learning_rate': 0.00012849462365591396, 'epoch': 1.18}
+{'loss': 1.3671, 'grad_norm': 7.857295036315918, 'learning_rate': 0.00012847018572825024, 'epoch': 1.18}
+{'loss': 1.3619, 'grad_norm': 2.6357522010803223, 'learning_rate': 0.0001284457478005865, 'epoch': 1.18}
+{'loss': 1.5904, 'grad_norm': 4.2092084884643555, 'learning_rate': 0.00012842130987292277, 'epoch': 1.18}
+{'loss': 1.5906, 'grad_norm': 2.7293174266815186, 'learning_rate': 0.00012839687194525905, 'epoch': 1.18}
+{'loss': 1.0352, 'grad_norm': 1.7189364433288574, 'learning_rate': 0.0001283724340175953, 'epoch': 1.18}
+{'loss': 0.4995, 'grad_norm': 3.9607129096984863, 'learning_rate': 0.00012834799608993155, 'epoch': 1.18}
+{'loss': 0.8067, 'grad_norm': 3.564509391784668, 'learning_rate': 0.00012832355816226783, 'epoch': 1.18}
+{'loss': 0.8116, 'grad_norm': 2.912269115447998, 'learning_rate': 0.0001282991202346041, 'epoch': 1.18}
+{'loss': 0.7333, 'grad_norm': 1.9242192506790161, 'learning_rate': 0.00012827468230694036, 'epoch': 1.18}
+ 59%|█████▉    | 7539/12776 [1:19:08<1:02:38,  1.39it/s] 59%|█████▉    | 7540/12776 [1:19:09<1:09:49,  1.25it/s]                                                         59%|█████▉    | 7540/12776 [1:19:09<1:09:49,  1.25it/s] 59%|█████▉    | 7541/12776 [1:19:10<1:11:27,  1.22it/s]                                                         59%|█████▉    | 7541/12776 [1:19:10<1:11:27,  1.22it/s] 59%|█████▉    | 7542/12776 [1:19:10<1:11:16,  1.22it/s]                                                         59%|█████▉    | 7542/12776 [1:19:10<1:11:16,  1.22it/s] 59%|█████▉    | 7543/12776 [1:19:11<1:08:36,  1.27it/s]                                                         59%|█████▉    | 7543/12776 [1:19:11<1:08:36,  1.27it/s] 59%|█████▉    | 7544/12776 [1:19:12<1:05:44,  1.33it/s]                                                         59%|█████▉    | 7544/12776 [1:19:12<1:05:44,  1.33it/s] 59%|█████▉    | 7545/12776 [1:19:13<1:03:45,  1.37it/s]                                                         59%|█████▉    | 7545/12776 [1:19:13<1:03:45,  1.37it/s] 59%|█████▉    | 7546/12776 [1:19:13<1:00:51,  1.43it/s]                                                         59%|█████▉    | 7546/12776 [1:19:13<1:00:51,  1.43it/s] 59%|█████▉    | 7547/12776 [1:19:14<57:56,  1.50it/s]                                                         59%|█████▉    | 7547/12776 [1:19:14<57:56,  1.50it/s] 59%|█████▉    | 7548/12776 [1:19:14<55:11,  1.58it/s]                                                       59%|█████▉    | 7548/12776 [1:19:14<55:11,  1.58it/s] 59%|█████▉    | 7549/12776 [1:19:15<53:37,  1.62it/s]                                                       59%|█████▉    | 7549/12776 [1:19:15<53:37,  1.62it/s] 59%|█████▉    | 7550/12776 [1:19:15<51:15,  1.70it/s]                                                       59%|█████▉    | 7550/12776 [1:19:15<51:15,  1.70it/s] 59%|█████▉    | 7551/12776 [1:19:16<51:33,  1.69it/s]                                                       59%|█████▉    | 7551/12776 [1:19:16<51:33,  1.69it/s] 59%|█████▉    | 7552/12776 [1:19:16<48:23,  1.80it/s]                                                       59%|█████▉    | 7552/12776 [1:19:16<48:23,  1.80it/s] 59%|█████▉    | 7553/12776 [1:19:17<47:50,  1.82it/s]                                                       59%|█████▉    | 7553/12776 [1:19:17<47:50,  1.82it/s] 59%|█████▉    | 7554/12776 [1:19:17<44:46,  1.94it/s]                                                       59%|█████▉    | 7554/12776 [1:19:17<44:46,  1.94it/s] 59%|█████▉    | 7555/12776 [1:19:18<45:38,  1.91it/s]                                                       59%|█████▉    | 7555/12776 [1:19:18<45:38,  1.91it/s] 59%|█████▉    | 7556/12776 [1:19:18<42:36,  2.04it/s]                                                       59%|█████▉    | 7556/12776 [1:19:18<42:36,  2.04it/s] 59%|█████▉    | 7557/12776 [1:19:19<40:14,  2.16it/s]                                                       59%|█████▉    | 7557/12776 [1:19:19<40:14,  2.16it/s] 59%|█████▉    | 7558/12776 [1:19:19<40:50,  2.13it/s]                                                       59%|█████▉    | 7558/12776 [1:19:19<40:50,  2.13it/s] 59%|█████▉    | 7559/12776 [1:19:20<38:09,  2.28it/s]                                                       59%|█████▉    | 7559/12776 [1:19:20<38:09,  2.28it/s] 59%|█████▉    | 7560/12776 [1:19:20<36:04,  2.41it/s]                                                       59%|█████▉    | 7560/12776 [1:19:20<36:04,  2.41it/s] 59%|█████▉    | 7561/12776 [1:19:20<36:54,  2.36it/s]                                                       59%|█████▉    | 7561/12776 [1:19:20<36:54,  2.36it/s] 59%|█████▉    | 7562/12776 [1:19:21<34:37,  2.51it/s]                                                       59%|█████▉    | 7562/12776 [1:19:21<34:37,  2.51it/s] 59%|█████▉    | 7563/12776 [1:19:21<32:49,  2.65it/s]                                                       59%|█████▉    | 7563/12776 [1:19:21<32:49,  2.65it/s] 59%|█████▉    | 7564/12776 [1:19:21<32:59,  2.63it/s]                                                       59%|█████▉    | 7564/12776 [1:19:21<32:59,  2.63it/s] 59%|█████▉    | 7565/12776 [1:19:22<30:51,  2.81it/s]                                                       59%|█████▉    | 7565/12776 [1:19:22<30:51,  2.81it/s] 59%|█████▉    | 7566/12776 [1:19:22<29:09,  2.98it/s]                                                       59%|█████▉    | 7566/12776 [1:19:22<29:09,  2.98it/s] 59%|█████▉    | 7567/12776 [1:19:22<30:18,  2.87it/s]                                                       59%|█████▉    | 7567/12776 [1:19:22<30:18,  2.87it/s] 59%|█████▉    | 7568/12776 [1:19:23<28:17,  3.07it/s]                                                       59%|█████▉    | 7568/12776 [1:19:23<28:17,  3.07it/s] 59%|█████▉    | 7569/12776 [1:19:23<26:42,  3.25it/s]                                                       59%|█████▉    | 7569/12776 [1:19:23<26:42,  3.25it/s] 59%|█████▉    | 7570/12776 [1:19:23<25:26,  3.41it/s]                                                       59%|█████▉    | 7570/12776 [1:19:23<25:26,  3.41it/s] 59%|█████▉    | 7571/12776 [1:19:24<26:19,  3.29it/s]                                                       59%|█████▉    | 7571/12776 [1:19:24<26:19,  3.29it/s] 59%|█████▉    | 7572/12776 [1:19:24<24:50,  3.49it/s]                                                       59%|█████▉    | 7572/12776 [1:19:24<24:50,  3.49it/s] 59%|█████▉    | 7573/12776 [1:19:24<23:40,  3.66it/s]                                                       59%|█████▉    | 7573/12776 [1:19:24<23:40,  3.66it/s] 59%|█████▉    | 7574/12776 [1:19:24<22:44,  3.81it/s]                                                       59%|█████▉    | 7574/12776 [1:19:24<22:44,  3.81it/s] 59%|█████▉    | 7575/12776 [1:19:25<21:52,  3.96it/s]                                                       59%|█████▉    | 7575/12776 [1:19:25<21:52,  3.96it/s] 59%|█████▉    | 7576/12776 [1:19:25<23:15,  3.73it/s]                                                       59%|█████▉    | 7576/12776 [1:19:25<23:15,  3.73it/s] 59%|█████▉    | 7577/12776 [1:19:25<21:54,  3.96it/s]                                                       59%|█████▉    | 7577/12776 [1:19:25<21:54,  3.96it/s] 59%|█████▉    | 7578/12776 [1:19:25<20:44,  4.18it/s]                                                       59%|█████▉    | 7578/12776 [1:19:25<20:44,  4.18it/s] 59%|█████▉    | 7579/12776 [1:19:25<19:57,  4.34it/s]                                                       59%|█████▉    | 7579/12776 [1:19:25<19:57,  4.34it/s] 59%|█████▉    | 7580/12776 [1:19:26<19:20,  4.48it/s]                                                       59%|█████▉    | 7580/12776 [1:19:26<19:20,  4.48it/s] 59%|█████▉    | 7581/12776 [1:19:26<21:27,  4.04it/s]                                                       59%|█████▉    | 7581/12776 [1:19:26<21:27,  4.04it/s] 59%|█████▉    | 7582/12776 [1:19:26<20:14,  4.28it/s]                                                       59%|█████▉    | 7582/12776 [1:19:26<20:14,  4.28it/s] 59%|█████▉    | 7583/12776 [1:19:26<19:18,  4.48it/s]                                                       59%|█████▉    | 7583/12776 [1:19:26<19:18,  4.48it/s] 59%|█████▉    | 7584/12776 [1:19:27<18:46,  4.61it/s]                                                       59%|█████▉    | 7584/12776 [1:19:27<18:46,  4.61it/s] 59%|█████▉    | 7585/12776 [1:19:27<18:07,  4.77it/s]                                                       59%|█████▉    | 7585/12776 [1:19:27<18:07,  4.77it/s] 59%|█████▉    | 7586/12776 [1:19:27<20:09,  4.29it/s]                                                       59%|█████▉    | 7586/12776 [1:19:27<20:09,  4.29it/s] 59%|█████▉    | 7587/12776 [1:19:27<18:58,  4.56it/s]                                                       59%|█████▉    | 7587/12776 [1:19:27<18:58,  4.56it/s] 59%|█████▉    | 7588/12776 [1:19:28<34:44,  2.49it/s]                                                       59%|█████▉    | 7588/12776 [1:19:28<34:44,  2.49it/s] 59%|█████▉    | 7589/12776 [1:19:30<1:03:59,  1.35it/s]                                                         59%|█████▉    | 7589/12776 [1:19:30<1:03:59,  1.35it/s] 59%|█████▉    | 7590/12776 [1:19:31<1:09:49,  1.24it/s]                                                         59%|█████▉    | 7590/12776 [1:19:31<1:09:49,  1.24it/s] 59%|█████▉    | 7591/12776 [1:19:32<1:14:45,  1.16it/s]                                                         59%|█████▉    | 7591/12776 [1:19:32<1:14:45,  1.16it/s] 59%|█████▉    | 7592/12776 [1:19:32<1:13:04,  1.18it/s]                                                         59%|█████▉    | 7592/12776 [1:19:32<1:13:04,  1.18it/s] 59%|█████▉    | 7593/12776 [1:19:33<1:11:37,  1.21it/s]                                                         59%|█████▉    | 7593/12776 [1:19:33<1:11:37,  1.21it/s] 59%|█████▉    | 7594/12776 [1:19:34<1:09:17,  1.25it/s]                                                         59%|█████▉    | 7594/12776 [1:19:34<1:09:17,  1.25it/s] 59%|█████▉    | 7595/12776 [1:19:35<1:05:35,  1.32it/s]                                                         59%|█████▉    | 7595/12776 [1:19:35<1:05:35,  1.32it/s] 59%|█████▉    | 7596/12776 [1:19:35<1:04:37,  1.34it/s]                                                         59%|█████▉    | 7596/12776 [1:19:35<1:04:37,  1.34it/s] 59%|█████▉    | 7597/12776 [1:19:36<1:00:47,  1.42it/s]                                                         59%|█████▉    | 7597/12776 [1:19:36<1:00:47,  1.42it/s] 59%|█████▉    | 7598/12776 [1:19:37<58:38,  1.47it/s]                                                         59%|█████▉    | 7598/12776 [1:19:37<58:38,  1.47it/s] 59%|█████▉    | 7599/12776 [1:19:37<55:15,  1.56it/s]                                                       59%|█████▉    | 7599/12776 [1:19:37<55:15,  1.56it/s] 59%|█████▉    | 7600/12776 [1:19:38<53:35,  1.61it/s]                                                       59%|█████▉    | 7600/12776 [1:19:38<53:35,  1.61it/s]Saving model checkpoint to ./checkpoint-7600
+Configuration saved in ./checkpoint-7600/config.json
+Model weights saved in ./checkpoint-7600/model.safetensors
+Feature extractor saved in ./checkpoint-7600/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-7600/tokenizer_config.json
+Special tokens file saved in ./checkpoint-7600/special_tokens_map.json
+added tokens file saved in ./checkpoint-7600/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-6400] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 59%|█████▉    | 7601/12776 [1:19:43<3:06:24,  2.16s/it]                                                         59%|█████▉    | 7601/12776 [1:19:43<3:06:24,  2.16s/it] 60%|█████▉    | 7602/12776 [1:19:44<2:21:33,  1.64s/it]                                                         60%|█████▉    | 7602/12776 [1:19:44<2:21:33,  1.64s/it] 60%|█████▉    | 7603/12776 [1:19:44<1:51:18,  1.29s/it]                                                         60%|█████▉    | 7603/12776 [1:19:44<1:51:18,  1.29s/it] 60%|█████▉    | 7604/12776 [1:19:45<1:29:02,  1.03s/it]                                                         60%|█████▉    | 7604/12776 [1:19:45<1:29:02,  1.03s/it] 60%|█████▉    | 7605/12776 [1:19:45<1:14:37,  1.15it/s]                                                         60%|█████▉    | 7605/12776 [1:19:45<1:14:37,  1.15it/s] 60%|█████▉    | 7606/12776 [1:19:46<1:01:40,  1.40it/s]                                                         60%|█████▉    | 7606/12776 [1:19:46<1:01:40,  1.40it/s] 60%|█████▉    | 7607/12776 [1:19:46<52:18,  1.65it/s]                                                         60%|█████▉    | 7607/12776 [1:19:46<52:18,  1.65it/s] 60%|█████▉    | 7608/12776 [1:19:46<48:23,  1.78it/s]                                                       60%|█████▉    | 7608/12776 [1:19:46<48:23,  1.78it/s] 60%|█████▉    | 7609/12776 [1:19:47<42:10,  2.04it/s]                                                       60%|█████▉    | 7609/12776 [1:19:47<42:10,  2.04it/s] 60%|█████▉    | 7610/12776 [1:19:47<37:39,  2.29it/s]                                                       60%|█████▉    | 7610/12776 [1:19:47<37:39,  2.29it/s] 60%|█████▉    | 7611/12776 [1:19:47<34:10,  2.52it/s]                                                       60%|█████▉    | 7611/12776 [1:19:47<34:10,  2.52it/s] 60%|█████▉    | 7612/12776 [1:19:48<33:59,  2.53it/s]                                                       60%|█████▉    | 7612/12776 [1:19:48<33:59,  2.53it/s] 60%|█████▉    | 7613/12776 [1:19:48<30:58,  2.78it/s]                                                       60%|█████▉    | 7613/12776 [1:19:48<30:58,  2.78it/s] 60%|█████▉    | 7614/12776 [1:19:48<28:38,  3.00it/s]                                                       60%|█████▉    | 7614/12776 [1:19:48<28:38,  3.00it/s] 60%|█████▉    | 7615/12776 [1:19:49<26:48,  3.21it/s]                                                       60%|█████▉    | 7615/12776 [1:19:49<26:48,  3.21it/s] 60%|█████▉    | 7616/12776 [1:19:49<26:16,  3.27it/s]                                                      {'loss': 0.1944, 'grad_norm': 0.3993604779243469, 'learning_rate': 0.00012825024437927664, 'epoch': 1.18}
+{'loss': 0.1609, 'grad_norm': 0.4738137722015381, 'learning_rate': 0.00012822580645161289, 'epoch': 1.18}
+{'loss': 0.2162, 'grad_norm': 0.41736552119255066, 'learning_rate': 0.00012820136852394916, 'epoch': 1.18}
+{'loss': 0.221, 'grad_norm': 0.5973852276802063, 'learning_rate': 0.00012817693059628544, 'epoch': 1.18}
+{'loss': 0.1776, 'grad_norm': 0.42182105779647827, 'learning_rate': 0.0001281524926686217, 'epoch': 1.18}
+{'loss': 0.2, 'grad_norm': 1.2317596673965454, 'learning_rate': 0.00012812805474095795, 'epoch': 1.18}
+{'loss': 0.2841, 'grad_norm': 0.7421743869781494, 'learning_rate': 0.00012810361681329422, 'epoch': 1.18}
+{'loss': 0.2966, 'grad_norm': 0.7382256388664246, 'learning_rate': 0.0001280791788856305, 'epoch': 1.18}
+{'loss': 0.2194, 'grad_norm': 0.6662998199462891, 'learning_rate': 0.00012805474095796675, 'epoch': 1.18}
+{'loss': 0.3308, 'grad_norm': 0.5631325840950012, 'learning_rate': 0.000128030303030303, 'epoch': 1.18}
+{'loss': 0.2426, 'grad_norm': 0.8521823883056641, 'learning_rate': 0.00012800586510263928, 'epoch': 1.18}
+{'loss': 0.2999, 'grad_norm': 0.6082648634910583, 'learning_rate': 0.00012798142717497556, 'epoch': 1.18}
+{'loss': 0.5316, 'grad_norm': 1.2765244245529175, 'learning_rate': 0.0001279569892473118, 'epoch': 1.18}
+{'loss': 0.5345, 'grad_norm': 1.1634889841079712, 'learning_rate': 0.0001279325513196481, 'epoch': 1.18}
+{'loss': 0.3129, 'grad_norm': 1.5110682249069214, 'learning_rate': 0.00012790811339198434, 'epoch': 1.18}
+{'loss': 0.2753, 'grad_norm': 0.8883544206619263, 'learning_rate': 0.0001278836754643206, 'epoch': 1.18}
+{'loss': 0.312, 'grad_norm': 1.2819592952728271, 'learning_rate': 0.00012785923753665687, 'epoch': 1.18}
+{'loss': 0.5772, 'grad_norm': 1.9334875345230103, 'learning_rate': 0.00012783479960899315, 'epoch': 1.18}
+{'loss': 0.4345, 'grad_norm': 1.7190932035446167, 'learning_rate': 0.0001278103616813294, 'epoch': 1.18}
+{'loss': 0.4088, 'grad_norm': 1.4696842432022095, 'learning_rate': 0.00012778592375366568, 'epoch': 1.18}
+{'loss': 0.3475, 'grad_norm': 0.6554337739944458, 'learning_rate': 0.00012776148582600193, 'epoch': 1.18}
+{'loss': 0.3618, 'grad_norm': 0.9007388353347778, 'learning_rate': 0.0001277370478983382, 'epoch': 1.18}
+{'loss': 0.511, 'grad_norm': 2.4028732776641846, 'learning_rate': 0.0001277126099706745, 'epoch': 1.18}
+{'loss': 0.4563, 'grad_norm': 1.1445221900939941, 'learning_rate': 0.00012768817204301074, 'epoch': 1.18}
+{'loss': 0.3692, 'grad_norm': 1.3807297945022583, 'learning_rate': 0.000127663734115347, 'epoch': 1.18}
+{'loss': 0.7251, 'grad_norm': 1.8694292306900024, 'learning_rate': 0.00012763929618768327, 'epoch': 1.18}
+{'loss': 0.6297, 'grad_norm': 2.2322914600372314, 'learning_rate': 0.00012761485826001955, 'epoch': 1.18}
+{'loss': 0.4646, 'grad_norm': 1.5895439386367798, 'learning_rate': 0.0001275904203323558, 'epoch': 1.18}
+{'loss': 0.4732, 'grad_norm': 1.2254077196121216, 'learning_rate': 0.00012756598240469208, 'epoch': 1.18}
+{'loss': 0.4586, 'grad_norm': 2.8192131519317627, 'learning_rate': 0.00012754154447702833, 'epoch': 1.18}
+{'loss': 0.5003, 'grad_norm': 1.3876596689224243, 'learning_rate': 0.0001275171065493646, 'epoch': 1.18}
+{'loss': 1.1688, 'grad_norm': 3.00423264503479, 'learning_rate': 0.00012749266862170088, 'epoch': 1.19}
+{'loss': 0.6379, 'grad_norm': 2.323437213897705, 'learning_rate': 0.00012746823069403714, 'epoch': 1.19}
+{'loss': 0.8426, 'grad_norm': 1.7432076930999756, 'learning_rate': 0.0001274437927663734, 'epoch': 1.19}
+{'loss': 0.9046, 'grad_norm': 3.104215621948242, 'learning_rate': 0.00012741935483870967, 'epoch': 1.19}
+{'loss': 0.7857, 'grad_norm': 1.629520058631897, 'learning_rate': 0.00012739491691104592, 'epoch': 1.19}
+{'loss': 0.7239, 'grad_norm': 1.4630589485168457, 'learning_rate': 0.0001273704789833822, 'epoch': 1.19}
+{'loss': 0.8086, 'grad_norm': 1.9049055576324463, 'learning_rate': 0.00012734604105571847, 'epoch': 1.19}
+{'loss': 1.3258, 'grad_norm': 2.1848504543304443, 'learning_rate': 0.00012732160312805472, 'epoch': 1.19}
+{'loss': 0.7483, 'grad_norm': 3.515286922454834, 'learning_rate': 0.00012729716520039098, 'epoch': 1.19}
+{'loss': 0.9094, 'grad_norm': 2.0993247032165527, 'learning_rate': 0.00012727272727272725, 'epoch': 1.19}
+{'loss': 0.8514, 'grad_norm': 2.1793994903564453, 'learning_rate': 0.00012724828934506353, 'epoch': 1.19}
+{'loss': 0.89, 'grad_norm': 2.337986469268799, 'learning_rate': 0.00012722385141739978, 'epoch': 1.19}
+{'loss': 0.8589, 'grad_norm': 1.6112979650497437, 'learning_rate': 0.00012719941348973606, 'epoch': 1.19}
+{'loss': 1.0901, 'grad_norm': 3.4834136962890625, 'learning_rate': 0.0001271749755620723, 'epoch': 1.19}
+{'loss': 1.0381, 'grad_norm': 2.0947000980377197, 'learning_rate': 0.0001271505376344086, 'epoch': 1.19}
+{'loss': 0.812, 'grad_norm': 3.770942211151123, 'learning_rate': 0.00012712609970674487, 'epoch': 1.19}
+{'loss': 0.4694, 'grad_norm': 3.0660061836242676, 'learning_rate': 0.00012710166177908112, 'epoch': 1.19}
+{'loss': 0.5581, 'grad_norm': 2.4887964725494385, 'learning_rate': 0.00012707722385141737, 'epoch': 1.19}
+{'loss': 0.3082, 'grad_norm': 1.3472387790679932, 'learning_rate': 0.00012705278592375365, 'epoch': 1.19}
+{'loss': 0.2746, 'grad_norm': 0.712585985660553, 'learning_rate': 0.00012702834799608993, 'epoch': 1.19}
+{'loss': 0.2139, 'grad_norm': 0.5410864353179932, 'learning_rate': 0.00012700391006842618, 'epoch': 1.19}
+{'loss': 0.1462, 'grad_norm': 0.34346261620521545, 'learning_rate': 0.00012697947214076246, 'epoch': 1.19}
+{'loss': 0.2067, 'grad_norm': 0.6002309918403625, 'learning_rate': 0.0001269550342130987, 'epoch': 1.19}
+{'loss': 0.186, 'grad_norm': 0.6561062335968018, 'learning_rate': 0.000126930596285435, 'epoch': 1.19}
+{'loss': 0.233, 'grad_norm': 0.6932880878448486, 'learning_rate': 0.00012690615835777127, 'epoch': 1.19}
+{'loss': 0.409, 'grad_norm': 1.0030970573425293, 'learning_rate': 0.00012688172043010752, 'epoch': 1.19}
+{'loss': 0.1951, 'grad_norm': 0.5043971538543701, 'learning_rate': 0.00012685728250244377, 'epoch': 1.19}
+{'loss': 0.2255, 'grad_norm': 0.5543043613433838, 'learning_rate': 0.00012683284457478005, 'epoch': 1.19}
+{'loss': 0.324, 'grad_norm': 0.7690470814704895, 'learning_rate': 0.0001268084066471163, 'epoch': 1.19}
+{'loss': 0.3889, 'grad_norm': 0.9051692485809326, 'learning_rate': 0.00012678396871945258, 'epoch': 1.19}
+{'loss': 0.279, 'grad_norm': 1.1886712312698364, 'learning_rate': 0.00012675953079178886, 'epoch': 1.19}
+{'loss': 0.2286, 'grad_norm': 0.8935725688934326, 'learning_rate': 0.0001267350928641251, 'epoch': 1.19}
+{'loss': 0.2867, 'grad_norm': 0.8272036910057068, 'learning_rate': 0.00012671065493646136, 'epoch': 1.19}
+{'loss': 0.1458, 'grad_norm': 0.7367708086967468, 'learning_rate': 0.00012668621700879764, 'epoch': 1.19}
+{'loss': 0.2471, 'grad_norm': 0.7323951721191406, 'learning_rate': 0.00012666177908113391, 'epoch': 1.19}
+{'loss': 0.8892, 'grad_norm': 2.0071024894714355, 'learning_rate': 0.00012663734115347017, 'epoch': 1.19}
+{'loss': 0.5891, 'grad_norm': 2.2433035373687744, 'learning_rate': 0.00012661290322580644, 'epoch': 1.19}
+{'loss': 0.2786, 'grad_norm': 2.180143356323242, 'learning_rate': 0.0001265884652981427, 'epoch': 1.19}
+{'loss': 0.4072, 'grad_norm': 1.3844753503799438, 'learning_rate': 0.00012656402737047897, 'epoch': 1.19}
+{'loss': 0.3149, 'grad_norm': 1.1671643257141113, 'learning_rate': 0.00012653958944281525, 'epoch': 1.19}
+{'loss': 0.4545, 'grad_norm': 1.9654929637908936, 'learning_rate': 0.0001265151515151515, 'epoch': 1.19}
+{'loss': 0.6904, 'grad_norm': 1.6887761354446411, 'learning_rate': 0.00012649071358748775, 'epoch': 1.19}
+{'loss': 0.4628, 'grad_norm': 1.5617834329605103, 'learning_rate': 0.00012646627565982403, 'epoch': 1.19}
+{'loss': 0.9632, 'grad_norm': 4.053790092468262, 'learning_rate': 0.0001264418377321603, 'epoch': 1.19}
+{'loss': 0.3495, 'grad_norm': 1.2538058757781982, 'learning_rate': 0.00012641739980449656, 'epoch': 1.19}
+{'loss': 0.8079, 'grad_norm': 2.1369194984436035, 'learning_rate': 0.00012639296187683284, 'epoch': 1.19}
+ 60%|█████▉    | 7616/12776 [1:19:49<26:16,  3.27it/s] 60%|█████▉    | 7617/12776 [1:19:49<24:42,  3.48it/s]                                                       60%|█████▉    | 7617/12776 [1:19:49<24:42,  3.48it/s] 60%|█████▉    | 7618/12776 [1:19:49<23:30,  3.66it/s]                                                       60%|█████▉    | 7618/12776 [1:19:49<23:30,  3.66it/s] 60%|█████▉    | 7619/12776 [1:19:50<22:23,  3.84it/s]                                                       60%|█████▉    | 7619/12776 [1:19:50<22:23,  3.84it/s] 60%|█████▉    | 7620/12776 [1:19:50<23:24,  3.67it/s]                                                       60%|█████▉    | 7620/12776 [1:19:50<23:24,  3.67it/s] 60%|█████▉    | 7621/12776 [1:19:50<22:10,  3.88it/s]                                                       60%|█████▉    | 7621/12776 [1:19:50<22:10,  3.88it/s] 60%|█████▉    | 7622/12776 [1:19:50<21:07,  4.07it/s]                                                       60%|█████▉    | 7622/12776 [1:19:50<21:07,  4.07it/s] 60%|█████▉    | 7623/12776 [1:19:50<20:13,  4.24it/s]                                                       60%|█████▉    | 7623/12776 [1:19:50<20:13,  4.24it/s] 60%|█████▉    | 7624/12776 [1:19:51<19:29,  4.40it/s]                                                       60%|█████▉    | 7624/12776 [1:19:51<19:29,  4.40it/s] 60%|█████▉    | 7625/12776 [1:19:51<20:26,  4.20it/s]                                                       60%|█████▉    | 7625/12776 [1:19:51<20:26,  4.20it/s] 60%|█████▉    | 7626/12776 [1:19:51<19:20,  4.44it/s]                                                       60%|█████▉    | 7626/12776 [1:19:51<19:20,  4.44it/s] 60%|█████▉    | 7627/12776 [1:19:51<18:28,  4.65it/s]                                                       60%|█████▉    | 7627/12776 [1:19:51<18:28,  4.65it/s] 60%|█████▉    | 7628/12776 [1:19:52<17:50,  4.81it/s]                                                       60%|█████▉    | 7628/12776 [1:19:52<17:50,  4.81it/s] 60%|█████▉    | 7629/12776 [1:19:52<17:21,  4.94it/s]                                                       60%|█████▉    | 7629/12776 [1:19:52<17:21,  4.94it/s] 60%|█████▉    | 7630/12776 [1:19:52<16:52,  5.08it/s]                                                       60%|█████▉    | 7630/12776 [1:19:52<16:52,  5.08it/s] 60%|█████▉    | 7631/12776 [1:19:52<19:04,  4.50it/s]                                                       60%|█████▉    | 7631/12776 [1:19:52<19:04,  4.50it/s] 60%|█████▉    | 7632/12776 [1:19:52<17:55,  4.78it/s]                                                       60%|█████▉    | 7632/12776 [1:19:52<17:55,  4.78it/s] 60%|█████▉    | 7633/12776 [1:19:53<17:03,  5.03it/s]                                                       60%|█████▉    | 7633/12776 [1:19:53<17:03,  5.03it/s] 60%|█████▉    | 7634/12776 [1:19:53<16:25,  5.22it/s]                                                       60%|█████▉    | 7634/12776 [1:19:53<16:25,  5.22it/s] 60%|█████▉    | 7635/12776 [1:19:53<15:53,  5.39it/s]                                                       60%|█████▉    | 7635/12776 [1:19:53<15:53,  5.39it/s] 60%|█████▉    | 7636/12776 [1:19:53<15:31,  5.52it/s]                                                       60%|█████▉    | 7636/12776 [1:19:53<15:31,  5.52it/s] 60%|█████▉    | 7637/12776 [1:19:53<17:23,  4.93it/s]                                                       60%|█████▉    | 7637/12776 [1:19:53<17:23,  4.93it/s] 60%|█████▉    | 7638/12776 [1:19:54<29:50,  2.87it/s]                                                       60%|█████▉    | 7638/12776 [1:19:54<29:50,  2.87it/s] 60%|█████▉    | 7639/12776 [1:19:55<51:56,  1.65it/s]                                                       60%|█████▉    | 7639/12776 [1:19:55<51:56,  1.65it/s] 60%|█████▉    | 7640/12776 [1:19:56<59:44,  1.43it/s]                                                       60%|█████▉    | 7640/12776 [1:19:56<59:44,  1.43it/s] 60%|█████▉    | 7641/12776 [1:19:57<1:03:49,  1.34it/s]                                                         60%|█████▉    | 7641/12776 [1:19:57<1:03:49,  1.34it/s] 60%|█████▉    | 7642/12776 [1:19:58<1:05:47,  1.30it/s]                                                         60%|█████▉    | 7642/12776 [1:19:58<1:05:47,  1.30it/s] 60%|█████▉    | 7643/12776 [1:19:59<1:04:34,  1.32it/s]                                                         60%|█████▉    | 7643/12776 [1:19:59<1:04:34,  1.32it/s] 60%|█████▉    | 7644/12776 [1:19:59<1:03:40,  1.34it/s]                                                         60%|█████▉    | 7644/12776 [1:19:59<1:03:40,  1.34it/s] 60%|█████▉    | 7645/12776 [1:20:00<1:00:39,  1.41it/s]                                                         60%|█████▉    | 7645/12776 [1:20:00<1:00:39,  1.41it/s] 60%|█████▉    | 7646/12776 [1:20:00<57:50,  1.48it/s]                                                         60%|█████▉    | 7646/12776 [1:20:00<57:50,  1.48it/s] 60%|█████▉    | 7647/12776 [1:20:01<54:47,  1.56it/s]                                                       60%|█████▉    | 7647/12776 [1:20:01<54:47,  1.56it/s] 60%|█████▉    | 7648/12776 [1:20:02<52:38,  1.62it/s]                                                       60%|█████▉    | 7648/12776 [1:20:02<52:38,  1.62it/s] 60%|█████▉    | 7649/12776 [1:20:02<49:49,  1.71it/s]                                                       60%|█████▉    | 7649/12776 [1:20:02<49:49,  1.71it/s] 60%|█████▉    | 7650/12776 [1:20:03<48:08,  1.77it/s]                                                       60%|█████▉    | 7650/12776 [1:20:03<48:08,  1.77it/s] 60%|█████▉    | 7651/12776 [1:20:03<45:25,  1.88it/s]                                                       60%|█████▉    | 7651/12776 [1:20:03<45:25,  1.88it/s] 60%|█████▉    | 7652/12776 [1:20:04<44:21,  1.92it/s]                                                       60%|█████▉    | 7652/12776 [1:20:04<44:21,  1.92it/s] 60%|█████▉    | 7653/12776 [1:20:04<41:30,  2.06it/s]                                                       60%|█████▉    | 7653/12776 [1:20:04<41:30,  2.06it/s] 60%|█████▉    | 7654/12776 [1:20:04<39:09,  2.18it/s]                                                       60%|█████▉    | 7654/12776 [1:20:04<39:09,  2.18it/s] 60%|█████▉    | 7655/12776 [1:20:05<40:59,  2.08it/s]                                                       60%|█████▉    | 7655/12776 [1:20:05<40:59,  2.08it/s] 60%|█████▉    | 7656/12776 [1:20:05<37:54,  2.25it/s]                                                       60%|█████▉    | 7656/12776 [1:20:05<37:54,  2.25it/s] 60%|█████▉    | 7657/12776 [1:20:06<35:33,  2.40it/s]                                                       60%|█████▉    | 7657/12776 [1:20:06<35:33,  2.40it/s] 60%|█████▉    | 7658/12776 [1:20:06<35:28,  2.40it/s]                                                       60%|█████▉    | 7658/12776 [1:20:06<35:28,  2.40it/s] 60%|█████▉    | 7659/12776 [1:20:06<33:11,  2.57it/s]                                                       60%|█████▉    | 7659/12776 [1:20:06<33:11,  2.57it/s] 60%|█████▉    | 7660/12776 [1:20:07<31:18,  2.72it/s]                                                       60%|█████▉    | 7660/12776 [1:20:07<31:18,  2.72it/s] 60%|█████▉    | 7661/12776 [1:20:07<30:52,  2.76it/s]                                                       60%|█████▉    | 7661/12776 [1:20:07<30:52,  2.76it/s] 60%|█████▉    | 7662/12776 [1:20:07<29:10,  2.92it/s]                                                       60%|█████▉    | 7662/12776 [1:20:07<29:10,  2.92it/s] 60%|█████▉    | 7663/12776 [1:20:08<27:37,  3.08it/s]                                                       60%|█████▉    | 7663/12776 [1:20:08<27:37,  3.08it/s] 60%|█████▉    | 7664/12776 [1:20:08<26:16,  3.24it/s]                                                       60%|█████▉    | 7664/12776 [1:20:08<26:16,  3.24it/s] 60%|█████▉    | 7665/12776 [1:20:08<25:41,  3.32it/s]                                                       60%|█████▉    | 7665/12776 [1:20:08<25:41,  3.32it/s] 60%|██████    | 7666/12776 [1:20:08<25:02,  3.40it/s]                                                       60%|██████    | 7666/12776 [1:20:08<25:02,  3.40it/s] 60%|██████    | 7667/12776 [1:20:09<24:19,  3.50it/s]                                                       60%|██████    | 7667/12776 [1:20:09<24:19,  3.50it/s] 60%|██████    | 7668/12776 [1:20:09<23:22,  3.64it/s]                                                       60%|██████    | 7668/12776 [1:20:09<23:22,  3.64it/s] 60%|██████    | 7669/12776 [1:20:09<24:46,  3.43it/s]                                                       60%|██████    | 7669/12776 [1:20:09<24:46,  3.43it/s] 60%|██████    | 7670/12776 [1:20:10<23:38,  3.60it/s]                                                       60%|██████    | 7670/12776 [1:20:10<23:38,  3.60it/s] 60%|██████    | 7671/12776 [1:20:10<22:41,  3.75it/s]                                                       60%|██████    | 7671/12776 [1:20:10<22:41,  3.75it/s] 60%|██████    | 7672/12776 [1:20:10<21:53,  3.89it/s]                                                       60%|██████    | 7672/12776 [1:20:10<21:53,  3.89it/s] 60%|██████    | 7673/12776 [1:20:10<21:11,  4.01it/s]                                                       60%|██████    | 7673/12776 [1:20:10<21:11,  4.01it/s] 60%|██████    | 7674/12776 [1:20:11<22:47,  3.73it/s]                                                       60%|██████    | 7674/12776 [1:20:11<22:47,  3.73it/s] 60%|██████    | 7675/12776 [1:20:11<21:27,  3.96it/s]                                                       60%|██████    | 7675/12776 [1:20:11<21:27,  3.96it/s] 60%|██████    | 7676/12776 [1:20:11<20:24,  4.16it/s]                                                       60%|██████    | 7676/12776 [1:20:11<20:24,  4.16it/s] 60%|██████    | 7677/12776 [1:20:11<19:40,  4.32it/s]                                                       60%|██████    | 7677/12776 [1:20:11<19:40,  4.32it/s] 60%|██████    | 7678/12776 [1:20:11<19:01,  4.47it/s]                                                       60%|██████    | 7678/12776 [1:20:11<19:01,  4.47it/s] 60%|██████    | 7679/12776 [1:20:12<20:42,  4.10it/s]                                                       60%|██████    | 7679/12776 [1:20:12<20:42,  4.10it/s] 60%|██████    | 7680/12776 [1:20:12<19:35,  4.33it/s]                                                       60%|██████    | 7680/12776 [1:20:12<19:35,  4.33it/s] 60%|██████    | 7681/12776 [1:20:12<18:46,  4.52it/s]                                                       60%|██████    | 7681/12776 [1:20:12<18:46,  4.52it/s] 60%|██████    | 7682/12776 [1:20:12<18:08,  4.68it/s]                                                       60%|██████    | 7682/12776 [1:20:12<18:08,  4.68it/s] 60%|██████    | 7683/12776 [1:20:12<17:40,  4.80it/s]                                                       60%|██████    | 7683/12776 [1:20:12<17:40,  4.80it/s] 60%|██████    | 7684/12776 [1:20:13<20:27,  4.15it/s]                                                       60%|██████    | 7684/12776 [1:20:13<20:27,  4.15it/s] 60%|██████    | 7685/12776 [1:20:13<19:04,  4.45it/s]                                                       60%|██████    | 7685/12776 [1:20:13<19:04,  4.45it/s] 60%|██████    | 7686/12776 [1:20:13<17:26,  4.86it/s]                                                       60%|██████    | 7686/12776 [1:20:13<17:26,  4.86it/s] 60%|██████    | 7687/12776 [1:20:13<16:16,  5.21it/s]                                                       60%|██████    | 7687/12776 [1:20:13<16:16,  5.21it/s] 60%|██████    | 7688/12776 [1:20:14<29:02,  2.92it/s]                                                       60%|██████    | 7688/12776 [1:20:14<29:02,  2.92it/s] 60%|██████    | 7689/12776 [1:20:15<55:18,  1.53it/s]                                                       60%|██████    | 7689/12776 [1:20:15<55:18,  1.53it/s] 60%|██████    | 7690/12776 [1:20:16<1:02:36,  1.35it/s]                                                         60%|██████    | 7690/12776 [1:20:16<1:02:36,  1.35it/s] 60%|██████    | 7691/12776 [1:20:17<1:05:14,  1.30it/s]                                                         60%|██████    | 7691/12776 [1:20:17<1:05:14,  1.30it/s] 60%|██████    | 7692/12776 [1:20:18<1:05:25,  1.30it/s]                                                         60%|██████    | 7692/12776 [1:20:18<1:05:25,  1.30it/s] 60%|██████    | 7693/12776 [1:20:19<1:04:08,  1.32it/s]                                                         60%|██████    | 7693/12776 [1:20:19<1:04:08,  1.32it/s] 60%|██████    | 7694/12776 [1:20:19<1:02:10,  1.36it/s]                                                        {'loss': 0.7607, 'grad_norm': 5.438715934753418, 'learning_rate': 0.0001263685239491691, 'epoch': 1.19}
+{'loss': 0.5004, 'grad_norm': 4.414690971374512, 'learning_rate': 0.00012634408602150537, 'epoch': 1.19}
+{'loss': 0.7119, 'grad_norm': 3.980987310409546, 'learning_rate': 0.00012631964809384162, 'epoch': 1.19}
+{'loss': 0.5477, 'grad_norm': 2.606283664703369, 'learning_rate': 0.0001262952101661779, 'epoch': 1.19}
+{'loss': 0.4294, 'grad_norm': 1.2772033214569092, 'learning_rate': 0.00012627077223851415, 'epoch': 1.19}
+{'loss': 0.9099, 'grad_norm': 2.65388822555542, 'learning_rate': 0.00012624633431085043, 'epoch': 1.19}
+{'loss': 1.0008, 'grad_norm': 2.885216236114502, 'learning_rate': 0.00012622189638318668, 'epoch': 1.19}
+{'loss': 0.4982, 'grad_norm': 1.257573127746582, 'learning_rate': 0.00012619745845552296, 'epoch': 1.19}
+{'loss': 1.2771, 'grad_norm': 6.712513446807861, 'learning_rate': 0.00012617302052785924, 'epoch': 1.19}
+{'loss': 0.7371, 'grad_norm': 2.6554524898529053, 'learning_rate': 0.0001261485826001955, 'epoch': 1.19}
+{'loss': 1.43, 'grad_norm': 2.5295774936676025, 'learning_rate': 0.00012612414467253174, 'epoch': 1.19}
+{'loss': 1.3552, 'grad_norm': 3.8119919300079346, 'learning_rate': 0.00012609970674486802, 'epoch': 1.19}
+{'loss': 1.7003, 'grad_norm': 3.1920182704925537, 'learning_rate': 0.0001260752688172043, 'epoch': 1.19}
+{'loss': 0.8041, 'grad_norm': 3.4198391437530518, 'learning_rate': 0.00012605083088954055, 'epoch': 1.19}
+{'loss': 0.9902, 'grad_norm': 2.1979782581329346, 'learning_rate': 0.00012602639296187683, 'epoch': 1.19}
+{'loss': 1.122, 'grad_norm': 2.4883174896240234, 'learning_rate': 0.00012600195503421308, 'epoch': 1.19}
+{'loss': 0.9887, 'grad_norm': 2.1847050189971924, 'learning_rate': 0.00012597751710654936, 'epoch': 1.19}
+{'loss': 1.2614, 'grad_norm': 2.9857428073883057, 'learning_rate': 0.00012595307917888563, 'epoch': 1.19}
+{'loss': 1.1563, 'grad_norm': 3.8710601329803467, 'learning_rate': 0.00012592864125122189, 'epoch': 1.2}
+{'loss': 0.281, 'grad_norm': 1.628251314163208, 'learning_rate': 0.00012590420332355814, 'epoch': 1.2}
+{'loss': 0.8694, 'grad_norm': 2.4681708812713623, 'learning_rate': 0.00012587976539589442, 'epoch': 1.2}
+{'loss': 0.6151, 'grad_norm': 1.361156940460205, 'learning_rate': 0.0001258553274682307, 'epoch': 1.2}
+{'loss': 0.4191, 'grad_norm': 2.5139424800872803, 'learning_rate': 0.00012583088954056695, 'epoch': 1.2}
+{'loss': 0.266, 'grad_norm': 0.4481685161590576, 'learning_rate': 0.00012580645161290322, 'epoch': 1.2}
+{'loss': 0.3183, 'grad_norm': 0.5807831287384033, 'learning_rate': 0.00012578201368523947, 'epoch': 1.2}
+{'loss': 0.203, 'grad_norm': 0.433891236782074, 'learning_rate': 0.00012575757575757575, 'epoch': 1.2}
+{'loss': 0.2185, 'grad_norm': 0.3798646628856659, 'learning_rate': 0.000125733137829912, 'epoch': 1.2}
+{'loss': 0.2084, 'grad_norm': 0.5597085952758789, 'learning_rate': 0.00012570869990224828, 'epoch': 1.2}
+{'loss': 0.3176, 'grad_norm': 0.8188731670379639, 'learning_rate': 0.00012568426197458453, 'epoch': 1.2}
+{'loss': 0.2583, 'grad_norm': 0.44461318850517273, 'learning_rate': 0.0001256598240469208, 'epoch': 1.2}
+{'loss': 0.2882, 'grad_norm': 0.6256333589553833, 'learning_rate': 0.00012563538611925706, 'epoch': 1.2}
+{'loss': 0.2987, 'grad_norm': 0.5847500562667847, 'learning_rate': 0.00012561094819159334, 'epoch': 1.2}
+{'loss': 0.2004, 'grad_norm': 0.4308290183544159, 'learning_rate': 0.00012558651026392962, 'epoch': 1.2}
+{'loss': 0.2055, 'grad_norm': 0.6642153859138489, 'learning_rate': 0.00012556207233626587, 'epoch': 1.2}
+{'loss': 0.5965, 'grad_norm': 1.537541151046753, 'learning_rate': 0.00012553763440860212, 'epoch': 1.2}
+{'loss': 0.2567, 'grad_norm': 0.6113720536231995, 'learning_rate': 0.0001255131964809384, 'epoch': 1.2}
+{'loss': 0.3958, 'grad_norm': 1.1813939809799194, 'learning_rate': 0.00012548875855327468, 'epoch': 1.2}
+{'loss': 0.4031, 'grad_norm': 1.415676474571228, 'learning_rate': 0.00012546432062561093, 'epoch': 1.2}
+{'loss': 0.39, 'grad_norm': 2.827056884765625, 'learning_rate': 0.0001254398826979472, 'epoch': 1.2}
+{'loss': 0.4073, 'grad_norm': 1.0784145593643188, 'learning_rate': 0.00012541544477028346, 'epoch': 1.2}
+{'loss': 0.3409, 'grad_norm': 1.3074734210968018, 'learning_rate': 0.00012539100684261974, 'epoch': 1.2}
+{'loss': 0.7839, 'grad_norm': 5.250173091888428, 'learning_rate': 0.00012536656891495602, 'epoch': 1.2}
+{'loss': 0.4488, 'grad_norm': 0.9771109223365784, 'learning_rate': 0.00012534213098729227, 'epoch': 1.2}
+{'loss': 0.5873, 'grad_norm': 2.0896804332733154, 'learning_rate': 0.00012531769305962852, 'epoch': 1.2}
+{'loss': 0.3258, 'grad_norm': 1.340659260749817, 'learning_rate': 0.0001252932551319648, 'epoch': 1.2}
+{'loss': 0.434, 'grad_norm': 2.5258636474609375, 'learning_rate': 0.00012526881720430108, 'epoch': 1.2}
+{'loss': 0.4309, 'grad_norm': 0.8065431714057922, 'learning_rate': 0.00012524437927663733, 'epoch': 1.2}
+{'loss': 0.8626, 'grad_norm': 2.020216464996338, 'learning_rate': 0.0001252199413489736, 'epoch': 1.2}
+{'loss': 0.4851, 'grad_norm': 1.528834581375122, 'learning_rate': 0.00012519550342130986, 'epoch': 1.2}
+{'loss': 0.4344, 'grad_norm': 1.1906168460845947, 'learning_rate': 0.00012517106549364614, 'epoch': 1.2}
+{'loss': 0.9714, 'grad_norm': 1.7082940340042114, 'learning_rate': 0.0001251466275659824, 'epoch': 1.2}
+{'loss': 0.4113, 'grad_norm': 1.5816830396652222, 'learning_rate': 0.00012512218963831867, 'epoch': 1.2}
+{'loss': 0.9074, 'grad_norm': 1.9569361209869385, 'learning_rate': 0.00012509775171065492, 'epoch': 1.2}
+{'loss': 0.6514, 'grad_norm': 2.3706414699554443, 'learning_rate': 0.0001250733137829912, 'epoch': 1.2}
+{'loss': 0.6741, 'grad_norm': 2.0549864768981934, 'learning_rate': 0.00012504887585532745, 'epoch': 1.2}
+{'loss': 0.7198, 'grad_norm': 2.067312002182007, 'learning_rate': 0.00012502443792766372, 'epoch': 1.2}
+{'loss': 0.9955, 'grad_norm': 2.430595874786377, 'learning_rate': 0.000125, 'epoch': 1.2}
+{'loss': 0.8613, 'grad_norm': 1.7246347665786743, 'learning_rate': 0.00012497556207233625, 'epoch': 1.2}
+{'loss': 0.8444, 'grad_norm': 3.665044069290161, 'learning_rate': 0.0001249511241446725, 'epoch': 1.2}
+{'loss': 0.8613, 'grad_norm': 3.734281539916992, 'learning_rate': 0.00012492668621700878, 'epoch': 1.2}
+{'loss': 1.4506, 'grad_norm': 2.5038020610809326, 'learning_rate': 0.00012490224828934506, 'epoch': 1.2}
+{'loss': 0.982, 'grad_norm': 2.5142405033111572, 'learning_rate': 0.0001248778103616813, 'epoch': 1.2}
+{'loss': 0.9359, 'grad_norm': 4.540086269378662, 'learning_rate': 0.0001248533724340176, 'epoch': 1.2}
+{'loss': 0.6585, 'grad_norm': 1.838870644569397, 'learning_rate': 0.00012482893450635384, 'epoch': 1.2}
+{'loss': 1.318, 'grad_norm': 3.0553319454193115, 'learning_rate': 0.00012480449657869012, 'epoch': 1.2}
+{'loss': 1.5615, 'grad_norm': 3.3202075958251953, 'learning_rate': 0.0001247800586510264, 'epoch': 1.2}
+{'loss': 1.1045, 'grad_norm': 5.532286643981934, 'learning_rate': 0.00012475562072336265, 'epoch': 1.2}
+{'loss': 0.7781, 'grad_norm': 2.526785135269165, 'learning_rate': 0.0001247311827956989, 'epoch': 1.2}
+{'loss': 0.9252, 'grad_norm': 5.095958709716797, 'learning_rate': 0.00012470674486803518, 'epoch': 1.2}
+{'loss': 0.793, 'grad_norm': 2.2262609004974365, 'learning_rate': 0.00012468230694037146, 'epoch': 1.2}
+{'loss': 1.4444, 'grad_norm': 4.54476261138916, 'learning_rate': 0.0001246578690127077, 'epoch': 1.2}
+{'loss': 1.1172, 'grad_norm': 3.232273578643799, 'learning_rate': 0.000124633431085044, 'epoch': 1.2}
+{'loss': 0.6069, 'grad_norm': 2.0127787590026855, 'learning_rate': 0.00012460899315738024, 'epoch': 1.2}
+{'loss': 0.2224, 'grad_norm': 0.4151703417301178, 'learning_rate': 0.0001245845552297165, 'epoch': 1.2}
+{'loss': 0.2122, 'grad_norm': 0.4115196168422699, 'learning_rate': 0.00012456011730205277, 'epoch': 1.2}
+{'loss': 0.2164, 'grad_norm': 0.49626508355140686, 'learning_rate': 0.00012453567937438905, 'epoch': 1.2}
+{'loss': 0.1638, 'grad_norm': 0.6475088000297546, 'learning_rate': 0.0001245112414467253, 'epoch': 1.2}
+{'loss': 0.1996, 'grad_norm': 0.5242400765419006, 'learning_rate': 0.00012448680351906158, 'epoch': 1.2}
+ 60%|██████    | 7694/12776 [1:20:19<1:02:10,  1.36it/s] 60%|██████    | 7695/12776 [1:20:20<1:00:46,  1.39it/s]                                                         60%|██████    | 7695/12776 [1:20:20<1:00:46,  1.39it/s] 60%|██████    | 7696/12776 [1:20:21<58:13,  1.45it/s]                                                         60%|██████    | 7696/12776 [1:20:21<58:13,  1.45it/s] 60%|██████    | 7697/12776 [1:20:21<57:14,  1.48it/s]                                                       60%|██████    | 7697/12776 [1:20:21<57:14,  1.48it/s] 60%|██████    | 7698/12776 [1:20:22<54:56,  1.54it/s]                                                       60%|██████    | 7698/12776 [1:20:22<54:56,  1.54it/s] 60%|██████    | 7699/12776 [1:20:22<52:53,  1.60it/s]                                                       60%|██████    | 7699/12776 [1:20:22<52:53,  1.60it/s] 60%|██████    | 7700/12776 [1:20:23<50:43,  1.67it/s]                                                       60%|██████    | 7700/12776 [1:20:23<50:43,  1.67it/s] 60%|██████    | 7701/12776 [1:20:23<48:32,  1.74it/s]                                                       60%|██████    | 7701/12776 [1:20:23<48:32,  1.74it/s] 60%|██████    | 7702/12776 [1:20:24<46:28,  1.82it/s]                                                       60%|██████    | 7702/12776 [1:20:24<46:28,  1.82it/s] 60%|██████    | 7703/12776 [1:20:24<45:41,  1.85it/s]                                                       60%|██████    | 7703/12776 [1:20:24<45:41,  1.85it/s] 60%|██████    | 7704/12776 [1:20:25<43:03,  1.96it/s]                                                       60%|██████    | 7704/12776 [1:20:25<43:03,  1.96it/s] 60%|██████    | 7705/12776 [1:20:25<42:44,  1.98it/s]                                                       60%|██████    | 7705/12776 [1:20:25<42:44,  1.98it/s] 60%|██████    | 7706/12776 [1:20:26<40:14,  2.10it/s]                                                       60%|██████    | 7706/12776 [1:20:26<40:14,  2.10it/s] 60%|██████    | 7707/12776 [1:20:26<38:06,  2.22it/s]                                                       60%|██████    | 7707/12776 [1:20:26<38:06,  2.22it/s] 60%|██████    | 7708/12776 [1:20:27<40:04,  2.11it/s]                                                       60%|██████    | 7708/12776 [1:20:27<40:04,  2.11it/s] 60%|██████    | 7709/12776 [1:20:27<36:59,  2.28it/s]                                                       60%|██████    | 7709/12776 [1:20:27<36:59,  2.28it/s] 60%|██████    | 7710/12776 [1:20:27<35:00,  2.41it/s]                                                       60%|██████    | 7710/12776 [1:20:27<35:00,  2.41it/s] 60%|██████    | 7711/12776 [1:20:28<35:18,  2.39it/s]                                                       60%|██████    | 7711/12776 [1:20:28<35:18,  2.39it/s] 60%|██████    | 7712/12776 [1:20:28<33:08,  2.55it/s]                                                       60%|██████    | 7712/12776 [1:20:28<33:08,  2.55it/s] 60%|██████    | 7713/12776 [1:20:29<31:16,  2.70it/s]                                                       60%|██████    | 7713/12776 [1:20:29<31:16,  2.70it/s] 60%|██████    | 7714/12776 [1:20:29<30:25,  2.77it/s]                                                       60%|██████    | 7714/12776 [1:20:29<30:25,  2.77it/s] 60%|██████    | 7715/12776 [1:20:29<28:50,  2.92it/s]                                                       60%|██████    | 7715/12776 [1:20:29<28:50,  2.92it/s] 60%|██████    | 7716/12776 [1:20:29<27:29,  3.07it/s]                                                       60%|██████    | 7716/12776 [1:20:29<27:29,  3.07it/s] 60%|██████    | 7717/12776 [1:20:30<26:31,  3.18it/s]                                                       60%|██████    | 7717/12776 [1:20:30<26:31,  3.18it/s] 60%|██████    | 7718/12776 [1:20:30<27:58,  3.01it/s]                                                       60%|██████    | 7718/12776 [1:20:30<27:58,  3.01it/s] 60%|██████    | 7719/12776 [1:20:30<25:57,  3.25it/s]                                                       60%|██████    | 7719/12776 [1:20:30<25:57,  3.25it/s] 60%|██████    | 7720/12776 [1:20:31<24:36,  3.43it/s]                                                       60%|██████    | 7720/12776 [1:20:31<24:36,  3.43it/s] 60%|██████    | 7721/12776 [1:20:31<23:32,  3.58it/s]                                                       60%|██████    | 7721/12776 [1:20:31<23:32,  3.58it/s] 60%|██████    | 7722/12776 [1:20:31<25:05,  3.36it/s]                                                       60%|██████    | 7722/12776 [1:20:31<25:05,  3.36it/s] 60%|██████    | 7723/12776 [1:20:31<23:37,  3.56it/s]                                                       60%|██████    | 7723/12776 [1:20:31<23:37,  3.56it/s] 60%|██████    | 7724/12776 [1:20:32<22:23,  3.76it/s]                                                       60%|██████    | 7724/12776 [1:20:32<22:23,  3.76it/s] 60%|██████    | 7725/12776 [1:20:32<21:21,  3.94it/s]                                                       60%|██████    | 7725/12776 [1:20:32<21:21,  3.94it/s] 60%|██████    | 7726/12776 [1:20:32<22:47,  3.69it/s]                                                       60%|██████    | 7726/12776 [1:20:32<22:47,  3.69it/s] 60%|██████    | 7727/12776 [1:20:32<21:25,  3.93it/s]                                                       60%|██████    | 7727/12776 [1:20:32<21:25,  3.93it/s] 60%|██████    | 7728/12776 [1:20:33<20:17,  4.15it/s]                                                       60%|██████    | 7728/12776 [1:20:33<20:17,  4.15it/s] 60%|██████    | 7729/12776 [1:20:33<19:30,  4.31it/s]                                                       60%|██████    | 7729/12776 [1:20:33<19:30,  4.31it/s] 61%|██████    | 7730/12776 [1:20:33<18:48,  4.47it/s]                                                       61%|██████    | 7730/12776 [1:20:33<18:48,  4.47it/s] 61%|██████    | 7731/12776 [1:20:33<20:53,  4.02it/s]                                                       61%|██████    | 7731/12776 [1:20:33<20:53,  4.02it/s] 61%|██████    | 7732/12776 [1:20:34<19:38,  4.28it/s]                                                       61%|██████    | 7732/12776 [1:20:34<19:38,  4.28it/s] 61%|██████    | 7733/12776 [1:20:34<18:42,  4.49it/s]                                                       61%|██████    | 7733/12776 [1:20:34<18:42,  4.49it/s] 61%|██████    | 7734/12776 [1:20:34<17:57,  4.68it/s]                                                       61%|██████    | 7734/12776 [1:20:34<17:57,  4.68it/s] 61%|██████    | 7735/12776 [1:20:34<17:34,  4.78it/s]                                                       61%|██████    | 7735/12776 [1:20:34<17:34,  4.78it/s] 61%|██████    | 7736/12776 [1:20:34<16:56,  4.96it/s]                                                       61%|██████    | 7736/12776 [1:20:34<16:56,  4.96it/s] 61%|██████    | 7737/12776 [1:20:35<19:27,  4.32it/s]                                                       61%|██████    | 7737/12776 [1:20:35<19:27,  4.32it/s] 61%|██████    | 7738/12776 [1:20:35<30:52,  2.72it/s]                                                       61%|██████    | 7738/12776 [1:20:35<30:52,  2.72it/s] 61%|██████    | 7739/12776 [1:20:37<54:24,  1.54it/s]                                                       61%|██████    | 7739/12776 [1:20:37<54:24,  1.54it/s] 61%|██████    | 7740/12776 [1:20:38<1:02:24,  1.34it/s]                                                         61%|██████    | 7740/12776 [1:20:38<1:02:24,  1.34it/s] 61%|██████    | 7741/12776 [1:20:39<1:06:24,  1.26it/s]                                                         61%|██████    | 7741/12776 [1:20:39<1:06:24,  1.26it/s] 61%|██████    | 7742/12776 [1:20:39<1:10:05,  1.20it/s]                                                         61%|██████    | 7742/12776 [1:20:39<1:10:05,  1.20it/s] 61%|██████    | 7743/12776 [1:20:40<1:09:59,  1.20it/s]                                                         61%|██████    | 7743/12776 [1:20:40<1:09:59,  1.20it/s] 61%|██████    | 7744/12776 [1:20:41<1:07:07,  1.25it/s]                                                         61%|██████    | 7744/12776 [1:20:41<1:07:07,  1.25it/s] 61%|██████    | 7745/12776 [1:20:42<1:04:34,  1.30it/s]                                                         61%|██████    | 7745/12776 [1:20:42<1:04:34,  1.30it/s] 61%|██████    | 7746/12776 [1:20:42<1:01:23,  1.37it/s]                                                         61%|██████    | 7746/12776 [1:20:42<1:01:23,  1.37it/s] 61%|██████    | 7747/12776 [1:20:43<58:14,  1.44it/s]                                                         61%|██████    | 7747/12776 [1:20:43<58:14,  1.44it/s] 61%|██████    | 7748/12776 [1:20:44<54:59,  1.52it/s]                                                       61%|██████    | 7748/12776 [1:20:44<54:59,  1.52it/s] 61%|██████    | 7749/12776 [1:20:44<52:07,  1.61it/s]                                                       61%|██████    | 7749/12776 [1:20:44<52:07,  1.61it/s] 61%|██████    | 7750/12776 [1:20:45<49:23,  1.70it/s]                                                       61%|██████    | 7750/12776 [1:20:45<49:23,  1.70it/s] 61%|██████    | 7751/12776 [1:20:45<49:08,  1.70it/s]                                                       61%|██████    | 7751/12776 [1:20:45<49:08,  1.70it/s] 61%|██████    | 7752/12776 [1:20:46<46:18,  1.81it/s]                                                       61%|██████    | 7752/12776 [1:20:46<46:18,  1.81it/s] 61%|██████    | 7753/12776 [1:20:46<46:09,  1.81it/s]                                                       61%|██████    | 7753/12776 [1:20:46<46:09,  1.81it/s] 61%|██████    | 7754/12776 [1:20:47<43:21,  1.93it/s]                                                       61%|██████    | 7754/12776 [1:20:47<43:21,  1.93it/s] 61%|██████    | 7755/12776 [1:20:47<43:32,  1.92it/s]                                                       61%|██████    | 7755/12776 [1:20:47<43:32,  1.92it/s] 61%|██████    | 7756/12776 [1:20:48<40:37,  2.06it/s]                                                       61%|██████    | 7756/12776 [1:20:48<40:37,  2.06it/s] 61%|██████    | 7757/12776 [1:20:48<38:10,  2.19it/s]                                                       61%|██████    | 7757/12776 [1:20:48<38:10,  2.19it/s] 61%|██████    | 7758/12776 [1:20:48<36:03,  2.32it/s]                                                       61%|██████    | 7758/12776 [1:20:48<36:03,  2.32it/s] 61%|██████    | 7759/12776 [1:20:49<34:05,  2.45it/s]                                                       61%|██████    | 7759/12776 [1:20:49<34:05,  2.45it/s] 61%|██████    | 7760/12776 [1:20:49<32:41,  2.56it/s]                                                       61%|██████    | 7760/12776 [1:20:49<32:41,  2.56it/s] 61%|██████    | 7761/12776 [1:20:49<33:04,  2.53it/s]                                                       61%|██████    | 7761/12776 [1:20:49<33:04,  2.53it/s] 61%|██████    | 7762/12776 [1:20:50<31:21,  2.66it/s]                                                       61%|██████    | 7762/12776 [1:20:50<31:21,  2.66it/s] 61%|██████    | 7763/12776 [1:20:50<29:44,  2.81it/s]                                                       61%|██████    | 7763/12776 [1:20:50<29:44,  2.81it/s] 61%|██████    | 7764/12776 [1:20:50<28:26,  2.94it/s]                                                       61%|██████    | 7764/12776 [1:20:50<28:26,  2.94it/s] 61%|██████    | 7765/12776 [1:20:51<28:16,  2.95it/s]                                                       61%|██████    | 7765/12776 [1:20:51<28:16,  2.95it/s] 61%|██████    | 7766/12776 [1:20:51<27:02,  3.09it/s]                                                       61%|██████    | 7766/12776 [1:20:51<27:02,  3.09it/s] 61%|██████    | 7767/12776 [1:20:51<25:57,  3.22it/s]                                                       61%|██████    | 7767/12776 [1:20:51<25:57,  3.22it/s] 61%|██████    | 7768/12776 [1:20:52<25:01,  3.34it/s]                                                       61%|██████    | 7768/12776 [1:20:52<25:01,  3.34it/s] 61%|██████    | 7769/12776 [1:20:52<24:32,  3.40it/s]                                                       61%|██████    | 7769/12776 [1:20:52<24:32,  3.40it/s] 61%|██████    | 7770/12776 [1:20:52<23:35,  3.54it/s]                                                       61%|██████    | 7770/12776 [1:20:52<23:35,  3.54it/s] 61%|██████    | 7771/12776 [1:20:52<22:52,  3.65it/s]                                                      {'loss': 0.2789, 'grad_norm': 0.5632511973381042, 'learning_rate': 0.00012446236559139783, 'epoch': 1.2}
+{'loss': 0.251, 'grad_norm': 0.5035088658332825, 'learning_rate': 0.0001244379276637341, 'epoch': 1.2}
+{'loss': 0.2104, 'grad_norm': 0.42443281412124634, 'learning_rate': 0.00012441348973607038, 'epoch': 1.2}
+{'loss': 0.4027, 'grad_norm': 4.329174518585205, 'learning_rate': 0.00012438905180840664, 'epoch': 1.2}
+{'loss': 0.3523, 'grad_norm': 0.7891638875007629, 'learning_rate': 0.0001243646138807429, 'epoch': 1.21}
+{'loss': 0.3149, 'grad_norm': 0.9422610998153687, 'learning_rate': 0.00012434017595307917, 'epoch': 1.21}
+{'loss': 0.3606, 'grad_norm': 1.680687427520752, 'learning_rate': 0.00012431573802541544, 'epoch': 1.21}
+{'loss': 0.2848, 'grad_norm': 1.3018105030059814, 'learning_rate': 0.0001242913000977517, 'epoch': 1.21}
+{'loss': 0.2945, 'grad_norm': 2.048955202102661, 'learning_rate': 0.00012426686217008797, 'epoch': 1.21}
+{'loss': 0.4641, 'grad_norm': 1.392334222793579, 'learning_rate': 0.00012424242424242422, 'epoch': 1.21}
+{'loss': 0.4634, 'grad_norm': 1.0475993156433105, 'learning_rate': 0.0001242179863147605, 'epoch': 1.21}
+{'loss': 0.5416, 'grad_norm': 1.2346774339675903, 'learning_rate': 0.00012419354838709678, 'epoch': 1.21}
+{'loss': 0.2803, 'grad_norm': 1.159487009048462, 'learning_rate': 0.00012416911045943303, 'epoch': 1.21}
+{'loss': 0.2959, 'grad_norm': 1.400073528289795, 'learning_rate': 0.00012414467253176928, 'epoch': 1.21}
+{'loss': 0.2158, 'grad_norm': 0.6044692397117615, 'learning_rate': 0.00012412023460410556, 'epoch': 1.21}
+{'loss': 0.6542, 'grad_norm': 1.8610830307006836, 'learning_rate': 0.00012409579667644184, 'epoch': 1.21}
+{'loss': 0.4033, 'grad_norm': 1.094446063041687, 'learning_rate': 0.0001240713587487781, 'epoch': 1.21}
+{'loss': 0.4836, 'grad_norm': 4.96532678604126, 'learning_rate': 0.00012404692082111437, 'epoch': 1.21}
+{'loss': 0.4183, 'grad_norm': 1.7114042043685913, 'learning_rate': 0.00012402248289345062, 'epoch': 1.21}
+{'loss': 0.4931, 'grad_norm': 1.576427936553955, 'learning_rate': 0.00012399804496578687, 'epoch': 1.21}
+{'loss': 0.3988, 'grad_norm': 1.6242084503173828, 'learning_rate': 0.00012397360703812315, 'epoch': 1.21}
+{'loss': 1.0594, 'grad_norm': 6.9016499519348145, 'learning_rate': 0.00012394916911045943, 'epoch': 1.21}
+{'loss': 0.5999, 'grad_norm': 1.4505754709243774, 'learning_rate': 0.00012392473118279568, 'epoch': 1.21}
+{'loss': 0.4765, 'grad_norm': 1.9598108530044556, 'learning_rate': 0.00012390029325513196, 'epoch': 1.21}
+{'loss': 0.5586, 'grad_norm': 1.3153584003448486, 'learning_rate': 0.0001238758553274682, 'epoch': 1.21}
+{'loss': 0.4036, 'grad_norm': 1.5149145126342773, 'learning_rate': 0.0001238514173998045, 'epoch': 1.21}
+{'loss': 0.8044, 'grad_norm': 2.801050901412964, 'learning_rate': 0.00012382697947214077, 'epoch': 1.21}
+{'loss': 0.7702, 'grad_norm': 2.656320095062256, 'learning_rate': 0.00012380254154447702, 'epoch': 1.21}
+{'loss': 0.8827, 'grad_norm': 3.1893441677093506, 'learning_rate': 0.00012377810361681327, 'epoch': 1.21}
+{'loss': 1.0204, 'grad_norm': 4.1985063552856445, 'learning_rate': 0.00012375366568914955, 'epoch': 1.21}
+{'loss': 0.9891, 'grad_norm': 2.559026002883911, 'learning_rate': 0.00012372922776148583, 'epoch': 1.21}
+{'loss': 1.0455, 'grad_norm': 1.904913067817688, 'learning_rate': 0.00012370478983382208, 'epoch': 1.21}
+{'loss': 1.2292, 'grad_norm': 3.1764469146728516, 'learning_rate': 0.00012368035190615836, 'epoch': 1.21}
+{'loss': 0.7425, 'grad_norm': 1.6916859149932861, 'learning_rate': 0.0001236559139784946, 'epoch': 1.21}
+{'loss': 0.9403, 'grad_norm': 1.8501296043395996, 'learning_rate': 0.00012363147605083089, 'epoch': 1.21}
+{'loss': 0.6932, 'grad_norm': 2.4505608081817627, 'learning_rate': 0.00012360703812316716, 'epoch': 1.21}
+{'loss': 0.9876, 'grad_norm': 2.6362080574035645, 'learning_rate': 0.00012358260019550342, 'epoch': 1.21}
+{'loss': 1.1326, 'grad_norm': 3.168712854385376, 'learning_rate': 0.00012355816226783967, 'epoch': 1.21}
+{'loss': 0.9867, 'grad_norm': 1.8203378915786743, 'learning_rate': 0.00012353372434017594, 'epoch': 1.21}
+{'loss': 1.2667, 'grad_norm': 1.6555746793746948, 'learning_rate': 0.0001235092864125122, 'epoch': 1.21}
+{'loss': 0.3442, 'grad_norm': 1.094726324081421, 'learning_rate': 0.00012348484848484847, 'epoch': 1.21}
+{'loss': 0.5379, 'grad_norm': 2.5407097339630127, 'learning_rate': 0.00012346041055718475, 'epoch': 1.21}
+{'loss': 0.1715, 'grad_norm': 1.3805251121520996, 'learning_rate': 0.000123435972629521, 'epoch': 1.21}
+{'loss': 0.7459, 'grad_norm': 3.8366167545318604, 'learning_rate': 0.00012341153470185726, 'epoch': 1.21}
+{'loss': 0.9848, 'grad_norm': 2.2765462398529053, 'learning_rate': 0.00012338709677419353, 'epoch': 1.21}
+{'loss': 0.2307, 'grad_norm': 0.41554513573646545, 'learning_rate': 0.0001233626588465298, 'epoch': 1.21}
+{'loss': 0.1887, 'grad_norm': 0.5446311235427856, 'learning_rate': 0.00012333822091886606, 'epoch': 1.21}
+{'loss': 0.2161, 'grad_norm': 0.42583855986595154, 'learning_rate': 0.00012331378299120234, 'epoch': 1.21}
+{'loss': 0.2454, 'grad_norm': 0.40142467617988586, 'learning_rate': 0.0001232893450635386, 'epoch': 1.21}
+{'loss': 0.2714, 'grad_norm': 0.6083351969718933, 'learning_rate': 0.00012326490713587487, 'epoch': 1.21}
+{'loss': 0.3219, 'grad_norm': 0.5035018920898438, 'learning_rate': 0.00012324046920821115, 'epoch': 1.21}
+{'loss': 0.2845, 'grad_norm': 0.8749467730522156, 'learning_rate': 0.0001232160312805474, 'epoch': 1.21}
+{'loss': 0.2443, 'grad_norm': 0.8860273957252502, 'learning_rate': 0.00012319159335288365, 'epoch': 1.21}
+{'loss': 0.3225, 'grad_norm': 0.9518566727638245, 'learning_rate': 0.00012316715542521993, 'epoch': 1.21}
+{'loss': 0.2441, 'grad_norm': 0.7895660996437073, 'learning_rate': 0.0001231427174975562, 'epoch': 1.21}
+{'loss': 0.3121, 'grad_norm': 0.9704890847206116, 'learning_rate': 0.00012311827956989246, 'epoch': 1.21}
+{'loss': 0.302, 'grad_norm': 0.8872887492179871, 'learning_rate': 0.00012309384164222874, 'epoch': 1.21}
+{'loss': 0.2676, 'grad_norm': 0.5776874423027039, 'learning_rate': 0.000123069403714565, 'epoch': 1.21}
+{'loss': 0.3352, 'grad_norm': 0.9891446232795715, 'learning_rate': 0.00012304496578690127, 'epoch': 1.21}
+{'loss': 0.3109, 'grad_norm': 1.1194006204605103, 'learning_rate': 0.00012302052785923755, 'epoch': 1.21}
+{'loss': 0.3955, 'grad_norm': 1.1497498750686646, 'learning_rate': 0.0001229960899315738, 'epoch': 1.21}
+{'loss': 0.8381, 'grad_norm': 2.879631519317627, 'learning_rate': 0.00012297165200391005, 'epoch': 1.21}
+{'loss': 0.5012, 'grad_norm': 2.2817280292510986, 'learning_rate': 0.00012294721407624633, 'epoch': 1.21}
+{'loss': 0.4131, 'grad_norm': 2.1993658542633057, 'learning_rate': 0.00012292277614858258, 'epoch': 1.21}
+{'loss': 0.3529, 'grad_norm': 0.96397864818573, 'learning_rate': 0.00012289833822091886, 'epoch': 1.21}
+{'loss': 0.3834, 'grad_norm': 1.8400605916976929, 'learning_rate': 0.00012287390029325514, 'epoch': 1.21}
+{'loss': 0.6518, 'grad_norm': 2.0213780403137207, 'learning_rate': 0.00012284946236559139, 'epoch': 1.21}
+{'loss': 0.5768, 'grad_norm': 1.1717514991760254, 'learning_rate': 0.00012282502443792764, 'epoch': 1.21}
+{'loss': 0.6203, 'grad_norm': 1.5490142107009888, 'learning_rate': 0.00012280058651026392, 'epoch': 1.22}
+{'loss': 0.8325, 'grad_norm': 2.197330951690674, 'learning_rate': 0.0001227761485826002, 'epoch': 1.22}
+{'loss': 0.449, 'grad_norm': 1.72406804561615, 'learning_rate': 0.00012275171065493645, 'epoch': 1.22}
+{'loss': 0.7901, 'grad_norm': 2.904944658279419, 'learning_rate': 0.00012272727272727272, 'epoch': 1.22}
+{'loss': 0.8409, 'grad_norm': 1.9773554801940918, 'learning_rate': 0.00012270283479960898, 'epoch': 1.22}
+{'loss': 1.4507, 'grad_norm': 6.0343337059021, 'learning_rate': 0.00012267839687194525, 'epoch': 1.22}
+{'loss': 0.9013, 'grad_norm': 2.157370090484619, 'learning_rate': 0.00012265395894428153, 'epoch': 1.22}
+{'loss': 0.5475, 'grad_norm': 2.05031418800354, 'learning_rate': 0.00012262952101661778, 'epoch': 1.22}
+{'loss': 1.015, 'grad_norm': 2.67240834236145, 'learning_rate': 0.00012260508308895403, 'epoch': 1.22}
+ 61%|██████    | 7771/12776 [1:20:52<22:52,  3.65it/s] 61%|██████    | 7772/12776 [1:20:53<22:14,  3.75it/s]                                                       61%|██████    | 7772/12776 [1:20:53<22:14,  3.75it/s] 61%|██████    | 7773/12776 [1:20:53<24:16,  3.43it/s]                                                       61%|██████    | 7773/12776 [1:20:53<24:16,  3.43it/s] 61%|██████    | 7774/12776 [1:20:53<22:55,  3.64it/s]                                                       61%|██████    | 7774/12776 [1:20:53<22:55,  3.64it/s] 61%|██████    | 7775/12776 [1:20:53<21:53,  3.81it/s]                                                       61%|██████    | 7775/12776 [1:20:53<21:53,  3.81it/s] 61%|██████    | 7776/12776 [1:20:54<21:05,  3.95it/s]                                                       61%|██████    | 7776/12776 [1:20:54<21:05,  3.95it/s] 61%|██████    | 7777/12776 [1:20:54<22:42,  3.67it/s]                                                       61%|██████    | 7777/12776 [1:20:54<22:42,  3.67it/s] 61%|██████    | 7778/12776 [1:20:54<21:11,  3.93it/s]                                                       61%|██████    | 7778/12776 [1:20:54<21:11,  3.93it/s] 61%|██████    | 7779/12776 [1:20:54<20:02,  4.16it/s]                                                       61%|██████    | 7779/12776 [1:20:54<20:02,  4.16it/s] 61%|██████    | 7780/12776 [1:20:55<19:15,  4.32it/s]                                                       61%|██████    | 7780/12776 [1:20:55<19:15,  4.32it/s] 61%|██████    | 7781/12776 [1:20:55<18:35,  4.48it/s]                                                       61%|██████    | 7781/12776 [1:20:55<18:35,  4.48it/s] 61%|██████    | 7782/12776 [1:20:55<20:47,  4.00it/s]                                                       61%|██████    | 7782/12776 [1:20:55<20:47,  4.00it/s] 61%|██████    | 7783/12776 [1:20:55<19:30,  4.26it/s]                                                       61%|██████    | 7783/12776 [1:20:55<19:30,  4.26it/s] 61%|██████    | 7784/12776 [1:20:55<18:31,  4.49it/s]                                                       61%|██████    | 7784/12776 [1:20:55<18:31,  4.49it/s] 61%|██████    | 7785/12776 [1:20:56<17:44,  4.69it/s]                                                       61%|██████    | 7785/12776 [1:20:56<17:44,  4.69it/s] 61%|██████    | 7786/12776 [1:20:56<17:10,  4.84it/s]                                                       61%|██████    | 7786/12776 [1:20:56<17:10,  4.84it/s] 61%|██████    | 7787/12776 [1:20:56<16:40,  4.99it/s]                                                       61%|██████    | 7787/12776 [1:20:56<16:40,  4.99it/s] 61%|██████    | 7788/12776 [1:20:57<27:53,  2.98it/s]                                                       61%|██████    | 7788/12776 [1:20:57<27:53,  2.98it/s] 61%|██████    | 7789/12776 [1:20:58<51:36,  1.61it/s]                                                       61%|██████    | 7789/12776 [1:20:58<51:36,  1.61it/s] 61%|██████    | 7790/12776 [1:20:59<1:00:02,  1.38it/s]                                                         61%|██████    | 7790/12776 [1:20:59<1:00:02,  1.38it/s] 61%|██████    | 7791/12776 [1:21:00<1:02:56,  1.32it/s]                                                         61%|██████    | 7791/12776 [1:21:00<1:02:56,  1.32it/s] 61%|██████    | 7792/12776 [1:21:01<1:04:00,  1.30it/s]                                                         61%|██████    | 7792/12776 [1:21:01<1:04:00,  1.30it/s] 61%|█��████    | 7793/12776 [1:21:01<1:03:10,  1.31it/s]                                                         61%|██████    | 7793/12776 [1:21:01<1:03:10,  1.31it/s] 61%|██████    | 7794/12776 [1:21:02<1:01:42,  1.35it/s]                                                         61%|██████    | 7794/12776 [1:21:02<1:01:42,  1.35it/s] 61%|██████    | 7795/12776 [1:21:03<59:48,  1.39it/s]                                                         61%|██████    | 7795/12776 [1:21:03<59:48,  1.39it/s] 61%|██████    | 7796/12776 [1:21:03<1:01:26,  1.35it/s]                                                         61%|██████    | 7796/12776 [1:21:03<1:01:26,  1.35it/s] 61%|██████    | 7797/12776 [1:21:04<57:55,  1.43it/s]                                                         61%|██████    | 7797/12776 [1:21:04<57:55,  1.43it/s] 61%|██████    | 7798/12776 [1:21:05<55:32,  1.49it/s]                                                       61%|██████    | 7798/12776 [1:21:05<55:32,  1.49it/s] 61%|██████    | 7799/12776 [1:21:05<52:17,  1.59it/s]                                                       61%|██████    | 7799/12776 [1:21:05<52:17,  1.59it/s] 61%|██████    | 7800/12776 [1:21:06<50:45,  1.63it/s]                                                       61%|██████    | 7800/12776 [1:21:06<50:45,  1.63it/s] 61%|██████    | 7801/12776 [1:21:06<47:44,  1.74it/s]                                                       61%|██████    | 7801/12776 [1:21:06<47:44,  1.74it/s] 61%|██████    | 7802/12776 [1:21:07<46:33,  1.78it/s]                                                       61%|██████    | 7802/12776 [1:21:07<46:33,  1.78it/s] 61%|██████    | 7803/12776 [1:21:07<43:39,  1.90it/s]                                                       61%|██████    | 7803/12776 [1:21:07<43:39,  1.90it/s] 61%|██████    | 7804/12776 [1:21:08<43:13,  1.92it/s]                                                       61%|██████    | 7804/12776 [1:21:08<43:13,  1.92it/s] 61%|██████    | 7805/12776 [1:21:08<40:25,  2.05it/s]                                                       61%|██████    | 7805/12776 [1:21:08<40:25,  2.05it/s] 61%|██████    | 7806/12776 [1:21:09<37:50,  2.19it/s]                                                       61%|██████    | 7806/12776 [1:21:09<37:50,  2.19it/s] 61%|██████    | 7807/12776 [1:21:09<35:27,  2.34it/s]                                                       61%|██████    | 7807/12776 [1:21:09<35:27,  2.34it/s] 61%|██████    | 7808/12776 [1:21:09<33:28,  2.47it/s]                                                       61%|██████    | 7808/12776 [1:21:09<33:28,  2.47it/s] 61%|██████    | 7809/12776 [1:21:10<31:48,  2.60it/s]                                                       61%|██████    | 7809/12776 [1:21:10<31:48,  2.60it/s] 61%|██████    | 7810/12776 [1:21:10<33:08,  2.50it/s]                                                       61%|██████    | 7810/12776 [1:21:10<33:08,  2.50it/s] 61%|██████    | 7811/12776 [1:21:10<31:03,  2.66it/s]                                                       61%|██████    | 7811/12776 [1:21:10<31:03,  2.66it/s] 61%|██████    | 7812/12776 [1:21:11<29:13,  2.83it/s]                                                       61%|██████    | 7812/12776 [1:21:11<29:13,  2.83it/s] 61%|██████    | 7813/12776 [1:21:11<27:43,  2.98it/s]                                                       61%|██████    | 7813/12776 [1:21:11<27:43,  2.98it/s] 61%|██████    | 7814/12776 [1:21:11<28:23,  2.91it/s]                                                       61%|██████    | 7814/12776 [1:21:11<28:23,  2.91it/s] 61%|██████    | 7815/12776 [1:21:12<26:49,  3.08it/s]                                                       61%|██████    | 7815/12776 [1:21:12<26:49,  3.08it/s] 61%|██████    | 7816/12776 [1:21:12<25:33,  3.23it/s]                                                       61%|██████    | 7816/12776 [1:21:12<25:33,  3.23it/s] 61%|██████    | 7817/12776 [1:21:12<24:27,  3.38it/s]                                                       61%|██████    | 7817/12776 [1:21:12<24:27,  3.38it/s] 61%|██████    | 7818/12776 [1:21:12<25:40,  3.22it/s]                                                       61%|██████    | 7818/12776 [1:21:12<25:40,  3.22it/s] 61%|██████    | 7819/12776 [1:21:13<24:10,  3.42it/s]                                                       61%|██████    | 7819/12776 [1:21:13<24:10,  3.42it/s] 61%|██████    | 7820/12776 [1:21:13<22:58,  3.60it/s]                                                       61%|██████    | 7820/12776 [1:21:13<22:58,  3.60it/s] 61%|██████    | 7821/12776 [1:21:13<22:02,  3.75it/s]                                                       61%|██████    | 7821/12776 [1:21:13<22:02,  3.75it/s] 61%|██████    | 7822/12776 [1:21:14<24:13,  3.41it/s]                                                       61%|██████    | 7822/12776 [1:21:14<24:13,  3.41it/s] 61%|██████    | 7823/12776 [1:21:14<22:29,  3.67it/s]                                                       61%|██████    | 7823/12776 [1:21:14<22:29,  3.67it/s] 61%|██████    | 7824/12776 [1:21:14<21:09,  3.90it/s]                                                       61%|██████    | 7824/12776 [1:21:14<21:09,  3.90it/s] 61%|██████    | 7825/12776 [1:21:14<20:03,  4.11it/s]                                                       61%|██████    | 7825/12776 [1:21:14<20:03,  4.11it/s] 61%|██████▏   | 7826/12776 [1:21:14<19:14,  4.29it/s]                                                       61%|██████▏   | 7826/12776 [1:21:14<19:14,  4.29it/s] 61%|██████▏   | 7827/12776 [1:21:15<20:41,  3.98it/s]                                                       61%|██████▏   | 7827/12776 [1:21:15<20:41,  3.98it/s] 61%|██████▏   | 7828/12776 [1:21:15<19:34,  4.21it/s]                                                       61%|██████▏   | 7828/12776 [1:21:15<19:34,  4.21it/s] 61%|██████▏   | 7829/12776 [1:21:15<18:45,  4.40it/s]                                                       61%|██████▏   | 7829/12776 [1:21:15<18:45,  4.40it/s] 61%|██████▏   | 7830/12776 [1:21:15<18:01,  4.57it/s]                                                       61%|██████▏   | 7830/12776 [1:21:15<18:01,  4.57it/s] 61%|██████▏   | 7831/12776 [1:21:16<17:30,  4.71it/s]                                                       61%|██████▏   | 7831/12776 [1:21:16<17:30,  4.71it/s] 61%|██████▏   | 7832/12776 [1:21:16<19:15,  4.28it/s]                                                       61%|██████▏   | 7832/12776 [1:21:16<19:15,  4.28it/s] 61%|██████▏   | 7833/12776 [1:21:16<18:29,  4.46it/s]                                                       61%|██████▏   | 7833/12776 [1:21:16<18:29,  4.46it/s] 61%|██████▏   | 7834/12776 [1:21:16<17:41,  4.66it/s]                                                       61%|██████▏   | 7834/12776 [1:21:16<17:41,  4.66it/s] 61%|██████▏   | 7835/12776 [1:21:16<17:08,  4.80it/s]                                                       61%|██████▏   | 7835/12776 [1:21:16<17:08,  4.80it/s] 61%|██████▏   | 7836/12776 [1:21:17<16:35,  4.96it/s]                                                       61%|██████▏   | 7836/12776 [1:21:17<16:35,  4.96it/s] 61%|██████▏   | 7837/12776 [1:21:17<17:51,  4.61it/s]                                                       61%|██████▏   | 7837/12776 [1:21:17<17:51,  4.61it/s] 61%|██████▏   | 7838/12776 [1:21:18<29:13,  2.82it/s]                                                       61%|██████▏   | 7838/12776 [1:21:18<29:13,  2.82it/s] 61%|██████▏   | 7839/12776 [1:21:19<56:12,  1.46it/s]                                                       61%|██████▏   | 7839/12776 [1:21:19<56:12,  1.46it/s] 61%|██████▏   | 7840/12776 [1:21:20<1:01:56,  1.33it/s]                                                         61%|██████▏   | 7840/12776 [1:21:20<1:01:56,  1.33it/s] 61%|██████▏   | 7841/12776 [1:21:21<1:04:17,  1.28it/s]                                                         61%|██████▏   | 7841/12776 [1:21:21<1:04:17,  1.28it/s] 61%|██████▏   | 7842/12776 [1:21:21<1:03:32,  1.29it/s]                                                         61%|██████▏   | 7842/12776 [1:21:21<1:03:32,  1.29it/s] 61%|██████▏   | 7843/12776 [1:21:22<1:02:28,  1.32it/s]                                                         61%|██████▏   | 7843/12776 [1:21:22<1:02:28,  1.32it/s] 61%|██████▏   | 7844/12776 [1:21:23<1:01:15,  1.34it/s]                                                         61%|██████▏   | 7844/12776 [1:21:23<1:01:15,  1.34it/s] 61%|██████▏   | 7845/12776 [1:21:24<1:01:03,  1.35it/s]                                                         61%|██████▏   | 7845/12776 [1:21:24<1:01:03,  1.35it/s] 61%|██████▏   | 7846/12776 [1:21:24<57:50,  1.42it/s]                                                         61%|██████▏   | 7846/12776 [1:21:24<57:50,  1.42it/s] 61%|██████▏   | 7847/12776 [1:21:25<54:28,  1.51it/s]                                                       61%|██████▏   | 7847/12776 [1:21:25<54:28,  1.51it/s] 61%|██████▏   | 7848/12776 [1:21:25<51:49,  1.58it/s]                                                      {'loss': 1.03, 'grad_norm': 4.982744216918945, 'learning_rate': 0.0001225806451612903, 'epoch': 1.22}
+{'loss': 0.9372, 'grad_norm': 2.2275023460388184, 'learning_rate': 0.0001225562072336266, 'epoch': 1.22}
+{'loss': 0.4652, 'grad_norm': 1.2302268743515015, 'learning_rate': 0.00012253176930596284, 'epoch': 1.22}
+{'loss': 0.8191, 'grad_norm': 2.873462200164795, 'learning_rate': 0.00012250733137829912, 'epoch': 1.22}
+{'loss': 0.885, 'grad_norm': 2.2558248043060303, 'learning_rate': 0.00012248289345063537, 'epoch': 1.22}
+{'loss': 0.7907, 'grad_norm': 2.1223719120025635, 'learning_rate': 0.00012245845552297165, 'epoch': 1.22}
+{'loss': 0.9959, 'grad_norm': 2.519007682800293, 'learning_rate': 0.0001224340175953079, 'epoch': 1.22}
+{'loss': 0.7831, 'grad_norm': 2.0962483882904053, 'learning_rate': 0.00012240957966764418, 'epoch': 1.22}
+{'loss': 0.5034, 'grad_norm': 2.0779409408569336, 'learning_rate': 0.00012238514173998043, 'epoch': 1.22}
+{'loss': 0.5842, 'grad_norm': 1.5359145402908325, 'learning_rate': 0.0001223607038123167, 'epoch': 1.22}
+{'loss': 1.1284, 'grad_norm': 4.660392761230469, 'learning_rate': 0.00012233626588465296, 'epoch': 1.22}
+{'loss': 0.5375, 'grad_norm': 2.8180553913116455, 'learning_rate': 0.00012231182795698924, 'epoch': 1.22}
+{'loss': 1.0843, 'grad_norm': 2.4774529933929443, 'learning_rate': 0.00012228739002932552, 'epoch': 1.22}
+{'loss': 1.9087, 'grad_norm': 4.466963768005371, 'learning_rate': 0.00012226295210166177, 'epoch': 1.22}
+{'loss': 0.9215, 'grad_norm': 3.7716944217681885, 'learning_rate': 0.00012223851417399802, 'epoch': 1.22}
+{'loss': 0.4391, 'grad_norm': 1.5880441665649414, 'learning_rate': 0.0001222140762463343, 'epoch': 1.22}
+{'loss': 0.2122, 'grad_norm': 1.5129741430282593, 'learning_rate': 0.00012218963831867058, 'epoch': 1.22}
+{'loss': 0.9607, 'grad_norm': 2.9316365718841553, 'learning_rate': 0.00012216520039100683, 'epoch': 1.22}
+{'loss': 0.2167, 'grad_norm': 0.47794780135154724, 'learning_rate': 0.0001221407624633431, 'epoch': 1.22}
+{'loss': 0.2389, 'grad_norm': 0.9191654324531555, 'learning_rate': 0.00012211632453567936, 'epoch': 1.22}
+{'loss': 0.1648, 'grad_norm': 0.43953564763069153, 'learning_rate': 0.00012209188660801564, 'epoch': 1.22}
+{'loss': 0.2088, 'grad_norm': 0.49689897894859314, 'learning_rate': 0.0001220674486803519, 'epoch': 1.22}
+{'loss': 0.2505, 'grad_norm': 0.5962256193161011, 'learning_rate': 0.00012204301075268817, 'epoch': 1.22}
+{'loss': 0.2664, 'grad_norm': 0.6489814519882202, 'learning_rate': 0.00012201857282502443, 'epoch': 1.22}
+{'loss': 0.2782, 'grad_norm': 0.46907347440719604, 'learning_rate': 0.0001219941348973607, 'epoch': 1.22}
+{'loss': 0.1701, 'grad_norm': 0.5828919410705566, 'learning_rate': 0.00012196969696969696, 'epoch': 1.22}
+{'loss': 0.2443, 'grad_norm': 0.8440871238708496, 'learning_rate': 0.00012194525904203322, 'epoch': 1.22}
+{'loss': 0.193, 'grad_norm': 0.6275129914283752, 'learning_rate': 0.0001219208211143695, 'epoch': 1.22}
+{'loss': 0.2657, 'grad_norm': 0.6567816138267517, 'learning_rate': 0.00012189638318670575, 'epoch': 1.22}
+{'loss': 0.3339, 'grad_norm': 0.7370538711547852, 'learning_rate': 0.00012187194525904202, 'epoch': 1.22}
+{'loss': 0.2557, 'grad_norm': 0.6013770699501038, 'learning_rate': 0.00012184750733137828, 'epoch': 1.22}
+{'loss': 0.7065, 'grad_norm': 2.4509990215301514, 'learning_rate': 0.00012182306940371456, 'epoch': 1.22}
+{'loss': 0.1996, 'grad_norm': 0.5421009659767151, 'learning_rate': 0.00012179863147605081, 'epoch': 1.22}
+{'loss': 0.2315, 'grad_norm': 1.131841778755188, 'learning_rate': 0.00012177419354838708, 'epoch': 1.22}
+{'loss': 0.4474, 'grad_norm': 1.3324615955352783, 'learning_rate': 0.00012174975562072336, 'epoch': 1.22}
+{'loss': 0.4004, 'grad_norm': 0.9257820248603821, 'learning_rate': 0.00012172531769305962, 'epoch': 1.22}
+{'loss': 0.8599, 'grad_norm': 2.541618585586548, 'learning_rate': 0.00012170087976539587, 'epoch': 1.22}
+{'loss': 0.3127, 'grad_norm': 1.269856333732605, 'learning_rate': 0.00012167644183773215, 'epoch': 1.22}
+{'loss': 0.3841, 'grad_norm': 1.0655205249786377, 'learning_rate': 0.00012165200391006842, 'epoch': 1.22}
+{'loss': 0.472, 'grad_norm': 1.21709406375885, 'learning_rate': 0.00012162756598240467, 'epoch': 1.22}
+{'loss': 0.4834, 'grad_norm': 1.1578439474105835, 'learning_rate': 0.00012160312805474095, 'epoch': 1.22}
+{'loss': 0.462, 'grad_norm': 1.1873836517333984, 'learning_rate': 0.00012157869012707721, 'epoch': 1.22}
+{'loss': 0.7357, 'grad_norm': 1.573827862739563, 'learning_rate': 0.00012155425219941347, 'epoch': 1.22}
+{'loss': 0.8596, 'grad_norm': 2.879338502883911, 'learning_rate': 0.00012152981427174975, 'epoch': 1.22}
+{'loss': 0.2993, 'grad_norm': 1.6422525644302368, 'learning_rate': 0.000121505376344086, 'epoch': 1.22}
+{'loss': 0.3512, 'grad_norm': 1.4364542961120605, 'learning_rate': 0.00012148093841642227, 'epoch': 1.22}
+{'loss': 0.4175, 'grad_norm': 2.8140549659729004, 'learning_rate': 0.00012145650048875855, 'epoch': 1.22}
+{'loss': 0.3645, 'grad_norm': 1.8453247547149658, 'learning_rate': 0.00012143206256109481, 'epoch': 1.22}
+{'loss': 0.8623, 'grad_norm': 5.20889139175415, 'learning_rate': 0.00012140762463343106, 'epoch': 1.22}
+{'loss': 0.7932, 'grad_norm': 2.917222261428833, 'learning_rate': 0.00012138318670576734, 'epoch': 1.22}
+{'loss': 1.2039, 'grad_norm': 3.251565933227539, 'learning_rate': 0.00012135874877810361, 'epoch': 1.22}
+{'loss': 0.6937, 'grad_norm': 2.457594871520996, 'learning_rate': 0.00012133431085043986, 'epoch': 1.22}
+{'loss': 0.5691, 'grad_norm': 1.1662980318069458, 'learning_rate': 0.00012130987292277614, 'epoch': 1.22}
+{'loss': 1.1899, 'grad_norm': 4.571430206298828, 'learning_rate': 0.0001212854349951124, 'epoch': 1.22}
+{'loss': 0.8965, 'grad_norm': 3.9596447944641113, 'learning_rate': 0.00012126099706744867, 'epoch': 1.22}
+{'loss': 1.2272, 'grad_norm': 3.642796039581299, 'learning_rate': 0.00012123655913978494, 'epoch': 1.23}
+{'loss': 1.4294, 'grad_norm': 2.4847941398620605, 'learning_rate': 0.0001212121212121212, 'epoch': 1.23}
+{'loss': 1.0586, 'grad_norm': 5.294003963470459, 'learning_rate': 0.00012118768328445746, 'epoch': 1.23}
+{'loss': 1.0392, 'grad_norm': 4.286756992340088, 'learning_rate': 0.00012116324535679374, 'epoch': 1.23}
+{'loss': 0.8807, 'grad_norm': 3.1308891773223877, 'learning_rate': 0.00012113880742913, 'epoch': 1.23}
+{'loss': 1.1662, 'grad_norm': 4.852659702301025, 'learning_rate': 0.00012111436950146625, 'epoch': 1.23}
+{'loss': 1.4974, 'grad_norm': 4.063992500305176, 'learning_rate': 0.00012108993157380253, 'epoch': 1.23}
+{'loss': 1.2586, 'grad_norm': 3.3738880157470703, 'learning_rate': 0.0001210654936461388, 'epoch': 1.23}
+{'loss': 0.4073, 'grad_norm': 2.590223550796509, 'learning_rate': 0.00012104105571847505, 'epoch': 1.23}
+{'loss': 0.8075, 'grad_norm': 2.9616923332214355, 'learning_rate': 0.00012101661779081133, 'epoch': 1.23}
+{'loss': 0.8251, 'grad_norm': 3.8399484157562256, 'learning_rate': 0.00012099217986314759, 'epoch': 1.23}
+{'loss': 0.8066, 'grad_norm': 2.232974052429199, 'learning_rate': 0.00012096774193548386, 'epoch': 1.23}
+{'loss': 0.7184, 'grad_norm': 2.5134973526000977, 'learning_rate': 0.00012094330400782014, 'epoch': 1.23}
+{'loss': 0.2581, 'grad_norm': 0.7725585699081421, 'learning_rate': 0.00012091886608015639, 'epoch': 1.23}
+{'loss': 0.21, 'grad_norm': 0.4113602936267853, 'learning_rate': 0.00012089442815249265, 'epoch': 1.23}
+{'loss': 0.2286, 'grad_norm': 0.737152636051178, 'learning_rate': 0.00012086999022482893, 'epoch': 1.23}
+{'loss': 0.197, 'grad_norm': 0.8976015448570251, 'learning_rate': 0.00012084555229716518, 'epoch': 1.23}
+{'loss': 0.2104, 'grad_norm': 0.47844988107681274, 'learning_rate': 0.00012082111436950145, 'epoch': 1.23}
+{'loss': 0.2007, 'grad_norm': 0.49797824025154114, 'learning_rate': 0.00012079667644183772, 'epoch': 1.23}
+{'loss': 0.2887, 'grad_norm': 0.6173357963562012, 'learning_rate': 0.00012077223851417399, 'epoch': 1.23}
+{'loss': 0.1982, 'grad_norm': 0.406225323677063, 'learning_rate': 0.00012074780058651024, 'epoch': 1.23}
+{'loss': 0.3479, 'grad_norm': 0.7921119332313538, 'learning_rate': 0.00012072336265884652, 'epoch': 1.23}
+ 61%|██████▏   | 7848/12776 [1:21:25<51:49,  1.58it/s] 61%|██████▏   | 7849/12776 [1:21:26<50:47,  1.62it/s]                                                       61%|██████▏   | 7849/12776 [1:21:26<50:47,  1.62it/s] 61%|██████▏   | 7850/12776 [1:21:27<48:22,  1.70it/s]                                                       61%|██████▏   | 7850/12776 [1:21:27<48:22,  1.70it/s] 61%|██████▏   | 7851/12776 [1:21:27<49:58,  1.64it/s]                                                       61%|██████▏   | 7851/12776 [1:21:27<49:58,  1.64it/s] 61%|██████▏   | 7852/12776 [1:21:28<46:08,  1.78it/s]                                                       61%|██████▏   | 7852/12776 [1:21:28<46:08,  1.78it/s] 61%|██████▏   | 7853/12776 [1:21:28<43:04,  1.91it/s]                                                       61%|██████▏   | 7853/12776 [1:21:28<43:04,  1.91it/s] 61%|██████▏   | 7854/12776 [1:21:29<42:25,  1.93it/s]                                                       61%|██████▏   | 7854/12776 [1:21:29<42:25,  1.93it/s] 61%|██████▏   | 7855/12776 [1:21:29<39:36,  2.07it/s]                                                       61%|██████▏   | 7855/12776 [1:21:29<39:36,  2.07it/s] 61%|██████▏   | 7856/12776 [1:21:29<38:55,  2.11it/s]                                                       61%|██████▏   | 7856/12776 [1:21:29<38:55,  2.11it/s] 61%|██████▏   | 7857/12776 [1:21:30<36:13,  2.26it/s]                                                       61%|██████▏   | 7857/12776 [1:21:30<36:13,  2.26it/s] 62%|██████▏   | 7858/12776 [1:21:30<33:52,  2.42it/s]                                                       62%|██████▏   | 7858/12776 [1:21:30<33:52,  2.42it/s] 62%|██████▏   | 7859/12776 [1:21:31<33:38,  2.44it/s]                                                       62%|██████▏   | 7859/12776 [1:21:31<33:38,  2.44it/s] 62%|██████▏   | 7860/12776 [1:21:31<31:40,  2.59it/s]                                                       62%|██████▏   | 7860/12776 [1:21:31<31:40,  2.59it/s] 62%|██████▏   | 7861/12776 [1:21:31<29:56,  2.74it/s]                                                       62%|██████▏   | 7861/12776 [1:21:31<29:56,  2.74it/s] 62%|██████▏   | 7862/12776 [1:21:31<28:30,  2.87it/s]                                                       62%|██████▏   | 7862/12776 [1:21:31<28:30,  2.87it/s] 62%|██████▏   | 7863/12776 [1:21:32<28:14,  2.90it/s]                                                       62%|██████▏   | 7863/12776 [1:21:32<28:14,  2.90it/s] 62%|██████▏   | 7864/12776 [1:21:32<26:49,  3.05it/s]                                                       62%|██████▏   | 7864/12776 [1:21:32<26:49,  3.05it/s] 62%|██████▏   | 7865/12776 [1:21:32<25:41,  3.19it/s]                                                       62%|██████▏   | 7865/12776 [1:21:32<25:41,  3.19it/s] 62%|██████▏   | 7866/12776 [1:21:33<24:42,  3.31it/s]                                                       62%|██████▏   | 7866/12776 [1:21:33<24:42,  3.31it/s] 62%|██████▏   | 7867/12776 [1:21:33<24:39,  3.32it/s]                                                       62%|██████▏   | 7867/12776 [1:21:33<24:39,  3.32it/s] 62%|██████▏   | 7868/12776 [1:21:33<23:39,  3.46it/s]                                                       62%|██████▏   | 7868/12776 [1:21:33<23:39,  3.46it/s] 62%|██████▏   | 7869/12776 [1:21:33<22:48,  3.59it/s]                                                       62%|██████▏   | 7869/12776 [1:21:33<22:48,  3.59it/s] 62%|██████▏   | 7870/12776 [1:21:34<22:01,  3.71it/s]                                                       62%|██████▏   | 7870/12776 [1:21:34<22:01,  3.71it/s] 62%|██████▏   | 7871/12776 [1:21:34<24:15,  3.37it/s]                                                       62%|██████▏   | 7871/12776 [1:21:34<24:15,  3.37it/s] 62%|██████▏   | 7872/12776 [1:21:34<22:54,  3.57it/s]                                                       62%|██████▏   | 7872/12776 [1:21:34<22:54,  3.57it/s] 62%|██████▏   | 7873/12776 [1:21:35<21:48,  3.75it/s]                                                       62%|██████▏   | 7873/12776 [1:21:35<21:48,  3.75it/s] 62%|██████▏   | 7874/12776 [1:21:35<20:56,  3.90it/s]                                                       62%|██████▏   | 7874/12776 [1:21:35<20:56,  3.90it/s] 62%|██████▏   | 7875/12776 [1:21:35<21:49,  3.74it/s]                                                       62%|██████▏   | 7875/12776 [1:21:35<21:49,  3.74it/s] 62%|██████▏   | 7876/12776 [1:21:35<20:36,  3.96it/s]                                                       62%|██████▏   | 7876/12776 [1:21:35<20:36,  3.96it/s] 62%|██████▏   | 7877/12776 [1:21:36<19:40,  4.15it/s]                                                       62%|██████▏   | 7877/12776 [1:21:36<19:40,  4.15it/s] 62%|██████▏   | 7878/12776 [1:21:36<18:55,  4.31it/s]                                                       62%|██████▏   | 7878/12776 [1:21:36<18:55,  4.31it/s] 62%|██████▏   | 7879/12776 [1:21:36<18:23,  4.44it/s]                                                       62%|██████▏   | 7879/12776 [1:21:36<18:23,  4.44it/s] 62%|██████▏   | 7880/12776 [1:21:36<19:56,  4.09it/s]                                                       62%|██████▏   | 7880/12776 [1:21:36<19:56,  4.09it/s] 62%|██████▏   | 7881/12776 [1:21:36<19:14,  4.24it/s]                                                       62%|██████▏   | 7881/12776 [1:21:36<19:14,  4.24it/s] 62%|██████▏   | 7882/12776 [1:21:37<18:24,  4.43it/s]                                                       62%|██████▏   | 7882/12776 [1:21:37<18:24,  4.43it/s] 62%|██████▏   | 7883/12776 [1:21:37<17:44,  4.60it/s]                                                       62%|██████▏   | 7883/12776 [1:21:37<17:44,  4.60it/s] 62%|██████▏   | 7884/12776 [1:21:37<17:14,  4.73it/s]                                                       62%|██████▏   | 7884/12776 [1:21:37<17:14,  4.73it/s] 62%|██████▏   | 7885/12776 [1:21:37<19:17,  4.23it/s]                                                       62%|██████▏   | 7885/12776 [1:21:37<19:17,  4.23it/s] 62%|██████▏   | 7886/12776 [1:21:38<18:02,  4.52it/s]                                                       62%|██████▏   | 7886/12776 [1:21:38<18:02,  4.52it/s] 62%|██████▏   | 7887/12776 [1:21:38<17:11,  4.74it/s]                                                       62%|██████▏   | 7887/12776 [1:21:38<17:11,  4.74it/s] 62%|██████▏   | 7888/12776 [1:21:39<33:00,  2.47it/s]                                                       62%|██████▏   | 7888/12776 [1:21:39<33:00,  2.47it/s] 62%|██████▏   | 7889/12776 [1:21:40<59:55,  1.36it/s]                                                       62%|██████▏   | 7889/12776 [1:21:40<59:55,  1.36it/s] 62%|██████▏   | 7890/12776 [1:21:41<1:08:45,  1.18it/s]                                                         62%|██████▏   | 7890/12776 [1:21:41<1:08:45,  1.18it/s] 62%|██████▏   | 7891/12776 [1:21:42<1:08:41,  1.19it/s]                                                         62%|██████▏   | 7891/12776 [1:21:42<1:08:41,  1.19it/s] 62%|██████▏   | 7892/12776 [1:21:43<1:07:06,  1.21it/s]                                                         62%|██████▏   | 7892/12776 [1:21:43<1:07:06,  1.21it/s] 62%|██████▏   | 7893/12776 [1:21:44<1:04:31,  1.26it/s]                                                         62%|██████▏   | 7893/12776 [1:21:44<1:04:31,  1.26it/s] 62%|██████▏   | 7894/12776 [1:21:44<1:02:25,  1.30it/s]                                                         62%|██████▏   | 7894/12776 [1:21:44<1:02:25,  1.30it/s] 62%|██████▏   | 7895/12776 [1:21:45<59:29,  1.37it/s]                                                         62%|██████▏   | 7895/12776 [1:21:45<59:29,  1.37it/s] 62%|██████▏   | 7896/12776 [1:21:46<1:00:19,  1.35it/s]                                                         62%|██████▏   | 7896/12776 [1:21:46<1:00:19,  1.35it/s] 62%|██████▏   | 7897/12776 [1:21:46<56:35,  1.44it/s]                                                         62%|██████▏   | 7897/12776 [1:21:46<56:35,  1.44it/s] 62%|██████▏   | 7898/12776 [1:21:47<54:57,  1.48it/s]                                                       62%|██████▏   | 7898/12776 [1:21:47<54:57,  1.48it/s] 62%|██████▏   | 7899/12776 [1:21:47<51:40,  1.57it/s]                                                       62%|██████▏   | 7899/12776 [1:21:47<51:40,  1.57it/s] 62%|██████▏   | 7900/12776 [1:21:48<50:30,  1.61it/s]                                                       62%|██████▏   | 7900/12776 [1:21:48<50:30,  1.61it/s] 62%|██████▏   | 7901/12776 [1:21:48<47:23,  1.71it/s]                                                       62%|██████▏   | 7901/12776 [1:21:48<47:23,  1.71it/s] 62%|██████▏   | 7902/12776 [1:21:49<46:14,  1.76it/s]                                                       62%|██████▏   | 7902/12776 [1:21:49<46:14,  1.76it/s] 62%|██████▏   | 7903/12776 [1:21:49<43:03,  1.89it/s]                                                       62%|██████▏   | 7903/12776 [1:21:49<43:03,  1.89it/s] 62%|██████▏   | 7904/12776 [1:21:50<42:02,  1.93it/s]                                                       62%|██████▏   | 7904/12776 [1:21:50<42:02,  1.93it/s] 62%|██████▏   | 7905/12776 [1:21:50<39:29,  2.06it/s]                                                       62%|██████▏   | 7905/12776 [1:21:50<39:29,  2.06it/s] 62%|██████▏   | 7906/12776 [1:21:51<37:31,  2.16it/s]                                                       62%|██████▏   | 7906/12776 [1:21:51<37:31,  2.16it/s] 62%|██████▏   | 7907/12776 [1:21:51<38:13,  2.12it/s]                                                       62%|██████▏   | 7907/12776 [1:21:51<38:13,  2.12it/s] 62%|██████▏   | 7908/12776 [1:21:52<35:25,  2.29it/s]                                                       62%|██████▏   | 7908/12776 [1:21:52<35:25,  2.29it/s] 62%|██████▏   | 7909/12776 [1:21:52<33:10,  2.44it/s]                                                       62%|██████▏   | 7909/12776 [1:21:52<33:10,  2.44it/s] 62%|██████▏   | 7910/12776 [1:21:52<33:19,  2.43it/s]                                                       62%|██████▏   | 7910/12776 [1:21:52<33:19,  2.43it/s] 62%|██████▏   | 7911/12776 [1:21:53<31:17,  2.59it/s]                                                       62%|██████▏   | 7911/12776 [1:21:53<31:17,  2.59it/s] 62%|██████▏   | 7912/12776 [1:21:53<29:37,  2.74it/s]                                                       62%|██████▏   | 7912/12776 [1:21:53<29:37,  2.74it/s] 62%|██████▏   | 7913/12776 [1:21:53<29:10,  2.78it/s]                                                       62%|██████▏   | 7913/12776 [1:21:53<29:10,  2.78it/s] 62%|██████▏   | 7914/12776 [1:21:54<27:26,  2.95it/s]                                                       62%|██████▏   | 7914/12776 [1:21:54<27:26,  2.95it/s] 62%|██████▏   | 7915/12776 [1:21:54<25:54,  3.13it/s]                                                       62%|██████▏   | 7915/12776 [1:21:54<25:54,  3.13it/s] 62%|██████▏   | 7916/12776 [1:21:54<24:43,  3.28it/s]                                                       62%|██████▏   | 7916/12776 [1:21:54<24:43,  3.28it/s] 62%|██████▏   | 7917/12776 [1:21:55<24:50,  3.26it/s]                                                       62%|██████▏   | 7917/12776 [1:21:55<24:50,  3.26it/s] 62%|██████▏   | 7918/12776 [1:21:55<23:26,  3.45it/s]                                                       62%|██████▏   | 7918/12776 [1:21:55<23:26,  3.45it/s] 62%|██████▏   | 7919/12776 [1:21:55<22:24,  3.61it/s]                                                       62%|██████▏   | 7919/12776 [1:21:55<22:24,  3.61it/s] 62%|██████▏   | 7920/12776 [1:21:55<21:34,  3.75it/s]                                                       62%|██████▏   | 7920/12776 [1:21:55<21:34,  3.75it/s] 62%|██████▏   | 7921/12776 [1:21:55<20:45,  3.90it/s]                                                       62%|██████▏   | 7921/12776 [1:21:55<20:45,  3.90it/s] 62%|██████▏   | 7922/12776 [1:21:56<21:37,  3.74it/s]                                                       62%|██████▏   | 7922/12776 [1:21:56<21:37,  3.74it/s] 62%|██████▏   | 7923/12776 [1:21:56<20:36,  3.92it/s]                                                       62%|██████▏   | 7923/12776 [1:21:56<20:36,  3.92it/s] 62%|██████▏   | 7924/12776 [1:21:56<19:49,  4.08it/s]                                                       62%|██████▏   | 7924/12776 [1:21:56<19:49,  4.08it/s] 62%|██████▏   | 7925/12776 [1:21:56<19:03,  4.24it/s]                                                      {'loss': 0.3086, 'grad_norm': 1.66916024684906, 'learning_rate': 0.00012069892473118278, 'epoch': 1.23}
+{'loss': 0.3681, 'grad_norm': 0.6646075248718262, 'learning_rate': 0.00012067448680351905, 'epoch': 1.23}
+{'loss': 0.6128, 'grad_norm': 1.726301670074463, 'learning_rate': 0.00012065004887585533, 'epoch': 1.23}
+{'loss': 0.3387, 'grad_norm': 0.8818532824516296, 'learning_rate': 0.00012062561094819158, 'epoch': 1.23}
+{'loss': 0.2012, 'grad_norm': 0.7736045718193054, 'learning_rate': 0.00012060117302052784, 'epoch': 1.23}
+{'loss': 0.3497, 'grad_norm': 0.7957243919372559, 'learning_rate': 0.00012057673509286412, 'epoch': 1.23}
+{'loss': 0.3375, 'grad_norm': 1.0294607877731323, 'learning_rate': 0.00012055229716520037, 'epoch': 1.23}
+{'loss': 0.4712, 'grad_norm': 3.3992855548858643, 'learning_rate': 0.00012052785923753664, 'epoch': 1.23}
+{'loss': 0.208, 'grad_norm': 2.0325255393981934, 'learning_rate': 0.00012050342130987292, 'epoch': 1.23}
+{'loss': 0.3885, 'grad_norm': 1.33167564868927, 'learning_rate': 0.00012047898338220918, 'epoch': 1.23}
+{'loss': 0.3926, 'grad_norm': 2.1962788105010986, 'learning_rate': 0.00012045454545454543, 'epoch': 1.23}
+{'loss': 0.5303, 'grad_norm': 1.1187773942947388, 'learning_rate': 0.00012043010752688171, 'epoch': 1.23}
+{'loss': 0.373, 'grad_norm': 1.26564621925354, 'learning_rate': 0.00012040566959921797, 'epoch': 1.23}
+{'loss': 0.36, 'grad_norm': 1.3402997255325317, 'learning_rate': 0.00012038123167155424, 'epoch': 1.23}
+{'loss': 0.5391, 'grad_norm': 1.738292932510376, 'learning_rate': 0.00012035679374389052, 'epoch': 1.23}
+{'loss': 0.619, 'grad_norm': 2.344097137451172, 'learning_rate': 0.00012033235581622677, 'epoch': 1.23}
+{'loss': 0.3952, 'grad_norm': 1.4428097009658813, 'learning_rate': 0.00012030791788856303, 'epoch': 1.23}
+{'loss': 0.578, 'grad_norm': 2.1929662227630615, 'learning_rate': 0.00012028347996089931, 'epoch': 1.23}
+{'loss': 0.8856, 'grad_norm': 1.8685600757598877, 'learning_rate': 0.00012025904203323556, 'epoch': 1.23}
+{'loss': 0.4865, 'grad_norm': 2.1994404792785645, 'learning_rate': 0.00012023460410557183, 'epoch': 1.23}
+{'loss': 1.14, 'grad_norm': 8.819302558898926, 'learning_rate': 0.00012021016617790811, 'epoch': 1.23}
+{'loss': 0.8263, 'grad_norm': 2.020968437194824, 'learning_rate': 0.00012018572825024437, 'epoch': 1.23}
+{'loss': 0.5087, 'grad_norm': 1.5616576671600342, 'learning_rate': 0.00012016129032258062, 'epoch': 1.23}
+{'loss': 0.6024, 'grad_norm': 1.2136770486831665, 'learning_rate': 0.0001201368523949169, 'epoch': 1.23}
+{'loss': 0.585, 'grad_norm': 1.3263853788375854, 'learning_rate': 0.00012011241446725317, 'epoch': 1.23}
+{'loss': 0.9524, 'grad_norm': 3.187013864517212, 'learning_rate': 0.00012008797653958943, 'epoch': 1.23}
+{'loss': 0.8727, 'grad_norm': 4.589766979217529, 'learning_rate': 0.00012006353861192571, 'epoch': 1.23}
+{'loss': 0.5955, 'grad_norm': 1.6198341846466064, 'learning_rate': 0.00012003910068426196, 'epoch': 1.23}
+{'loss': 0.9213, 'grad_norm': 2.6360957622528076, 'learning_rate': 0.00012001466275659823, 'epoch': 1.23}
+{'loss': 0.9638, 'grad_norm': 2.5727317333221436, 'learning_rate': 0.0001199902248289345, 'epoch': 1.23}
+{'loss': 1.1984, 'grad_norm': 3.1968390941619873, 'learning_rate': 0.00011996578690127075, 'epoch': 1.23}
+{'loss': 0.7227, 'grad_norm': 5.276712417602539, 'learning_rate': 0.00011994134897360702, 'epoch': 1.23}
+{'loss': 1.0927, 'grad_norm': 2.5714964866638184, 'learning_rate': 0.0001199169110459433, 'epoch': 1.23}
+{'loss': 0.8852, 'grad_norm': 2.9321186542510986, 'learning_rate': 0.00011989247311827956, 'epoch': 1.23}
+{'loss': 1.2498, 'grad_norm': 3.1245834827423096, 'learning_rate': 0.00011986803519061581, 'epoch': 1.23}
+{'loss': 1.1943, 'grad_norm': 3.5027828216552734, 'learning_rate': 0.00011984359726295209, 'epoch': 1.23}
+{'loss': 1.0542, 'grad_norm': 2.125910520553589, 'learning_rate': 0.00011981915933528836, 'epoch': 1.23}
+{'loss': 0.5816, 'grad_norm': 1.8235660791397095, 'learning_rate': 0.00011979472140762462, 'epoch': 1.23}
+{'loss': 1.4135, 'grad_norm': 3.679835796356201, 'learning_rate': 0.00011977028347996089, 'epoch': 1.23}
+{'loss': 1.178, 'grad_norm': 6.588844299316406, 'learning_rate': 0.00011974584555229715, 'epoch': 1.23}
+{'loss': 1.2308, 'grad_norm': 2.3670153617858887, 'learning_rate': 0.00011972140762463342, 'epoch': 1.23}
+{'loss': 0.282, 'grad_norm': 0.49170297384262085, 'learning_rate': 0.0001196969696969697, 'epoch': 1.23}
+{'loss': 0.2686, 'grad_norm': 0.5877807140350342, 'learning_rate': 0.00011967253176930595, 'epoch': 1.24}
+{'loss': 0.2191, 'grad_norm': 0.6519326567649841, 'learning_rate': 0.00011964809384164221, 'epoch': 1.24}
+{'loss': 0.2397, 'grad_norm': 0.7324579954147339, 'learning_rate': 0.00011962365591397849, 'epoch': 1.24}
+{'loss': 0.159, 'grad_norm': 0.5463114976882935, 'learning_rate': 0.00011959921798631475, 'epoch': 1.24}
+{'loss': 0.2038, 'grad_norm': 0.6586346626281738, 'learning_rate': 0.000119574780058651, 'epoch': 1.24}
+{'loss': 0.2114, 'grad_norm': 0.4430273175239563, 'learning_rate': 0.00011955034213098728, 'epoch': 1.24}
+{'loss': 0.2591, 'grad_norm': 0.7554447054862976, 'learning_rate': 0.00011952590420332355, 'epoch': 1.24}
+{'loss': 0.2403, 'grad_norm': 0.531594455242157, 'learning_rate': 0.00011950146627565981, 'epoch': 1.24}
+{'loss': 0.2436, 'grad_norm': 0.8028531670570374, 'learning_rate': 0.00011947702834799608, 'epoch': 1.24}
+{'loss': 0.3462, 'grad_norm': 0.8752454519271851, 'learning_rate': 0.00011945259042033234, 'epoch': 1.24}
+{'loss': 0.621, 'grad_norm': 1.4788440465927124, 'learning_rate': 0.00011942815249266861, 'epoch': 1.24}
+{'loss': 0.6322, 'grad_norm': 1.0307644605636597, 'learning_rate': 0.00011940371456500489, 'epoch': 1.24}
+{'loss': 0.3076, 'grad_norm': 0.6076186299324036, 'learning_rate': 0.00011937927663734114, 'epoch': 1.24}
+{'loss': 0.2975, 'grad_norm': 1.0006450414657593, 'learning_rate': 0.0001193548387096774, 'epoch': 1.24}
+{'loss': 0.3799, 'grad_norm': 1.008905053138733, 'learning_rate': 0.00011933040078201368, 'epoch': 1.24}
+{'loss': 0.9816, 'grad_norm': 3.882817506790161, 'learning_rate': 0.00011930596285434995, 'epoch': 1.24}
+{'loss': 0.4128, 'grad_norm': 1.3156336545944214, 'learning_rate': 0.0001192815249266862, 'epoch': 1.24}
+{'loss': 0.5488, 'grad_norm': 1.1643537282943726, 'learning_rate': 0.00011925708699902247, 'epoch': 1.24}
+{'loss': 0.595, 'grad_norm': 1.2683813571929932, 'learning_rate': 0.00011923264907135874, 'epoch': 1.24}
+{'loss': 0.3329, 'grad_norm': 1.6245311498641968, 'learning_rate': 0.000119208211143695, 'epoch': 1.24}
+{'loss': 0.6928, 'grad_norm': 1.5488073825836182, 'learning_rate': 0.00011918377321603127, 'epoch': 1.24}
+{'loss': 0.3563, 'grad_norm': 1.1169195175170898, 'learning_rate': 0.00011915933528836753, 'epoch': 1.24}
+{'loss': 0.7176, 'grad_norm': 1.5224010944366455, 'learning_rate': 0.0001191348973607038, 'epoch': 1.24}
+{'loss': 0.3416, 'grad_norm': 0.7106965780258179, 'learning_rate': 0.00011911045943304008, 'epoch': 1.24}
+{'loss': 0.3449, 'grad_norm': 1.0206400156021118, 'learning_rate': 0.00011908602150537633, 'epoch': 1.24}
+{'loss': 0.744, 'grad_norm': 1.643692135810852, 'learning_rate': 0.00011906158357771259, 'epoch': 1.24}
+{'loss': 0.9874, 'grad_norm': 2.5953471660614014, 'learning_rate': 0.00011903714565004887, 'epoch': 1.24}
+{'loss': 0.7063, 'grad_norm': 2.029824733734131, 'learning_rate': 0.00011901270772238514, 'epoch': 1.24}
+{'loss': 0.6537, 'grad_norm': 2.1811952590942383, 'learning_rate': 0.00011898826979472139, 'epoch': 1.24}
+{'loss': 0.9151, 'grad_norm': 2.453786611557007, 'learning_rate': 0.00011896383186705767, 'epoch': 1.24}
+{'loss': 1.1769, 'grad_norm': 2.0153915882110596, 'learning_rate': 0.00011893939393939393, 'epoch': 1.24}
+{'loss': 0.6084, 'grad_norm': 2.056471824645996, 'learning_rate': 0.0001189149560117302, 'epoch': 1.24}
+{'loss': 1.0018, 'grad_norm': 1.4912649393081665, 'learning_rate': 0.00011889051808406646, 'epoch': 1.24}
+{'loss': 1.2432, 'grad_norm': 1.68226158618927, 'learning_rate': 0.00011886608015640273, 'epoch': 1.24}
+{'loss': 0.5132, 'grad_norm': 1.0560473203659058, 'learning_rate': 0.00011884164222873899, 'epoch': 1.24}
+ 62%|██████▏   | 7925/12776 [1:21:56<19:03,  4.24it/s] 62%|██████▏   | 7926/12776 [1:21:57<21:32,  3.75it/s]                                                       62%|██████▏   | 7926/12776 [1:21:57<21:32,  3.75it/s] 62%|██████▏   | 7927/12776 [1:21:57<20:14,  3.99it/s]                                                       62%|██████▏   | 7927/12776 [1:21:57<20:14,  3.99it/s] 62%|██████▏   | 7928/12776 [1:21:57<19:13,  4.20it/s]                                                       62%|██████▏   | 7928/12776 [1:21:57<19:13,  4.20it/s] 62%|██████▏   | 7929/12776 [1:21:57<18:24,  4.39it/s]                                                       62%|██████▏   | 7929/12776 [1:21:57<18:24,  4.39it/s] 62%|██████▏   | 7930/12776 [1:21:58<17:45,  4.55it/s]                                                       62%|██████▏   | 7930/12776 [1:21:58<17:45,  4.55it/s] 62%|██████▏   | 7931/12776 [1:21:58<19:36,  4.12it/s]                                                       62%|██████▏   | 7931/12776 [1:21:58<19:36,  4.12it/s] 62%|██████▏   | 7932/12776 [1:21:58<18:30,  4.36it/s]                                                       62%|██████▏   | 7932/12776 [1:21:58<18:30,  4.36it/s] 62%|██████▏   | 7933/12776 [1:21:58<17:39,  4.57it/s]                                                       62%|██████▏   | 7933/12776 [1:21:58<17:39,  4.57it/s] 62%|██████▏   | 7934/12776 [1:21:58<17:06,  4.72it/s]                                                       62%|██████▏   | 7934/12776 [1:21:58<17:06,  4.72it/s] 62%|██████▏   | 7935/12776 [1:21:59<16:39,  4.85it/s]                                                       62%|██████▏   | 7935/12776 [1:21:59<16:39,  4.85it/s] 62%|██████▏   | 7936/12776 [1:21:59<16:18,  4.95it/s]                                                       62%|██████▏   | 7936/12776 [1:21:59<16:18,  4.95it/s] 62%|██████▏   | 7937/12776 [1:21:59<17:35,  4.58it/s]                                                       62%|██████▏   | 7937/12776 [1:21:59<17:35,  4.58it/s] 62%|██████▏   | 7938/12776 [1:22:00<29:24,  2.74it/s]                                                       62%|██████▏   | 7938/12776 [1:22:00<29:24,  2.74it/s] 62%|██████▏   | 7939/12776 [1:22:01<53:07,  1.52it/s]                                                       62%|██████▏   | 7939/12776 [1:22:01<53:07,  1.52it/s] 62%|██████▏   | 7940/12776 [1:22:02<59:51,  1.35it/s]                                                       62%|██████▏   | 7940/12776 [1:22:02<59:51,  1.35it/s] 62%|██████▏   | 7941/12776 [1:22:03<1:02:33,  1.29it/s]                                                         62%|██████▏   | 7941/12776 [1:22:03<1:02:33,  1.29it/s] 62%|██████▏   | 7942/12776 [1:22:04<1:02:33,  1.29it/s]                                                         62%|██████▏   | 7942/12776 [1:22:04<1:02:33,  1.29it/s] 62%|██████▏   | 7943/12776 [1:22:04<1:01:20,  1.31it/s]                                                         62%|██████▏   | 7943/12776 [1:22:04<1:01:20,  1.31it/s] 62%|██████▏   | 7944/12776 [1:22:05<59:04,  1.36it/s]                                                         62%|██████▏   | 7944/12776 [1:22:05<59:04,  1.36it/s] 62%|██████▏   | 7945/12776 [1:22:06<59:24,  1.36it/s]                                                       62%|██████▏   | 7945/12776 [1:22:06<59:24,  1.36it/s] 62%|██████▏   | 7946/12776 [1:22:07<56:30,  1.42it/s]                                                       62%|██████▏   | 7946/12776 [1:22:07<56:30,  1.42it/s] 62%|██████▏   | 7947/12776 [1:22:07<54:32,  1.48it/s]                                                       62%|██████▏   | 7947/12776 [1:22:07<54:32,  1.48it/s] 62%|██████▏   | 7948/12776 [1:22:08<51:36,  1.56it/s]                                                       62%|██████▏   | 7948/12776 [1:22:08<51:36,  1.56it/s] 62%|██████▏   | 7949/12776 [1:22:08<51:08,  1.57it/s]                                                       62%|██████▏   | 7949/12776 [1:22:08<51:08,  1.57it/s] 62%|██████▏   | 7950/12776 [1:22:09<48:17,  1.67it/s]                                                       62%|██████▏   | 7950/12776 [1:22:09<48:17,  1.67it/s] 62%|██████▏   | 7951/12776 [1:22:09<48:28,  1.66it/s]                                                       62%|██████▏   | 7951/12776 [1:22:09<48:28,  1.66it/s] 62%|██████▏   | 7952/12776 [1:22:10<44:45,  1.80it/s]                                                       62%|██████▏   | 7952/12776 [1:22:10<44:45,  1.80it/s] 62%|██████▏   | 7953/12776 [1:22:10<41:46,  1.92it/s]                                                       62%|██████▏   | 7953/12776 [1:22:10<41:46,  1.92it/s] 62%|██████▏   | 7954/12776 [1:22:11<41:42,  1.93it/s]                                                       62%|██████▏   | 7954/12776 [1:22:11<41:42,  1.93it/s] 62%|██████▏   | 7955/12776 [1:22:11<38:59,  2.06it/s]                                                       62%|██████▏   | 7955/12776 [1:22:11<38:59,  2.06it/s] 62%|██████▏   | 7956/12776 [1:22:12<38:27,  2.09it/s]                                                       62%|██████▏   | 7956/12776 [1:22:12<38:27,  2.09it/s] 62%|██████▏   | 7957/12776 [1:22:12<36:10,  2.22it/s]                                                       62%|██████▏   | 7957/12776 [1:22:12<36:10,  2.22it/s] 62%|██████▏   | 7958/12776 [1:22:12<34:07,  2.35it/s]                                                       62%|██████▏   | 7958/12776 [1:22:12<34:07,  2.35it/s] 62%|██████▏   | 7959/12776 [1:22:13<33:27,  2.40it/s]                                                       62%|██████▏   | 7959/12776 [1:22:13<33:27,  2.40it/s] 62%|██████▏   | 7960/12776 [1:22:13<31:45,  2.53it/s]                                                       62%|██████▏   | 7960/12776 [1:22:13<31:45,  2.53it/s] 62%|██████▏   | 7961/12776 [1:22:14<30:24,  2.64it/s]                                                       62%|██████▏   | 7961/12776 [1:22:14<30:24,  2.64it/s] 62%|██████▏   | 7962/12776 [1:22:14<31:54,  2.52it/s]                                                       62%|██████▏   | 7962/12776 [1:22:14<31:54,  2.52it/s] 62%|██████▏   | 7963/12776 [1:22:14<29:38,  2.71it/s]                                                       62%|██████▏   | 7963/12776 [1:22:14<29:38,  2.71it/s] 62%|██████▏   | 7964/12776 [1:22:15<27:54,  2.87it/s]                                                       62%|██████▏   | 7964/12776 [1:22:15<27:54,  2.87it/s] 62%|██████▏   | 7965/12776 [1:22:15<26:31,  3.02it/s]                                                       62%|██████▏   | 7965/12776 [1:22:15<26:31,  3.02it/s] 62%|██████▏   | 7966/12776 [1:22:15<27:52,  2.88it/s]                                                       62%|██████▏   | 7966/12776 [1:22:15<27:52,  2.88it/s] 62%|██████▏   | 7967/12776 [1:22:16<25:57,  3.09it/s]                                                       62%|██████▏   | 7967/12776 [1:22:16<25:57,  3.09it/s] 62%|██████▏   | 7968/12776 [1:22:16<24:27,  3.28it/s]                                                       62%|██████▏   | 7968/12776 [1:22:16<24:27,  3.28it/s] 62%|██████▏   | 7969/12776 [1:22:16<23:12,  3.45it/s]                                                       62%|██████▏   | 7969/12776 [1:22:16<23:12,  3.45it/s] 62%|██████▏   | 7970/12776 [1:22:16<24:14,  3.31it/s]                                                       62%|██████▏   | 7970/12776 [1:22:16<24:14,  3.31it/s] 62%|██████▏   | 7971/12776 [1:22:17<22:54,  3.49it/s]                                                       62%|██████▏   | 7971/12776 [1:22:17<22:54,  3.49it/s] 62%|██████▏   | 7972/12776 [1:22:17<21:54,  3.65it/s]                                                       62%|██████▏   | 7972/12776 [1:22:17<21:54,  3.65it/s] 62%|██████▏   | 7973/12776 [1:22:17<21:05,  3.80it/s]                                                       62%|██████▏   | 7973/12776 [1:22:17<21:05,  3.80it/s] 62%|██████▏   | 7974/12776 [1:22:17<21:44,  3.68it/s]                                                       62%|██████▏   | 7974/12776 [1:22:17<21:44,  3.68it/s] 62%|██████▏   | 7975/12776 [1:22:18<20:44,  3.86it/s]                                                       62%|██████▏   | 7975/12776 [1:22:18<20:44,  3.86it/s] 62%|██████▏   | 7976/12776 [1:22:18<19:53,  4.02it/s]                                                       62%|██████▏   | 7976/12776 [1:22:18<19:53,  4.02it/s] 62%|██████▏   | 7977/12776 [1:22:18<19:11,  4.17it/s]                                                       62%|██████▏   | 7977/12776 [1:22:18<19:11,  4.17it/s] 62%|██████▏   | 7978/12776 [1:22:18<18:28,  4.33it/s]                                                       62%|██████▏   | 7978/12776 [1:22:18<18:28,  4.33it/s] 62%|██████▏   | 7979/12776 [1:22:19<19:20,  4.13it/s]                                                       62%|██████▏   | 7979/12776 [1:22:19<19:20,  4.13it/s] 62%|██████▏   | 7980/12776 [1:22:19<18:32,  4.31it/s]                                                       62%|██████▏   | 7980/12776 [1:22:19<18:32,  4.31it/s] 62%|██████▏   | 7981/12776 [1:22:19<17:55,  4.46it/s]                                                       62%|██████▏   | 7981/12776 [1:22:19<17:55,  4.46it/s] 62%|██████▏   | 7982/12776 [1:22:19<17:25,  4.58it/s]                                                       62%|██████▏   | 7982/12776 [1:22:19<17:25,  4.58it/s] 62%|██████▏   | 7983/12776 [1:22:19<17:10,  4.65it/s]                                                       62%|██████▏   | 7983/12776 [1:22:19<17:10,  4.65it/s] 62%|██████▏   | 7984/12776 [1:22:20<19:08,  4.17it/s]                                                       62%|██████▏   | 7984/12776 [1:22:20<19:08,  4.17it/s] 62%|██████▎   | 7985/12776 [1:22:20<18:03,  4.42it/s]                                                       62%|██████▎   | 7985/12776 [1:22:20<18:03,  4.42it/s] 63%|██████▎   | 7986/12776 [1:22:20<17:09,  4.65it/s]                                                       63%|██████▎   | 7986/12776 [1:22:20<17:09,  4.65it/s] 63%|██████▎   | 7987/12776 [1:22:20<16:29,  4.84it/s]                                                       63%|██████▎   | 7987/12776 [1:22:20<16:29,  4.84it/s] 63%|██████▎   | 7988/12776 [1:22:21<28:04,  2.84it/s]                                                       63%|██████▎   | 7988/12776 [1:22:21<28:04,  2.84it/s] 63%|██████▎   | 7989/12776 [1:22:22<48:46,  1.64it/s]                                                       63%|██████▎   | 7989/12776 [1:22:22<48:46,  1.64it/s] 63%|██████▎   | 7990/12776 [1:22:23<59:00,  1.35it/s]                                                       63%|██████▎   | 7990/12776 [1:22:23<59:00,  1.35it/s] 63%|██████▎   | 7991/12776 [1:22:24<1:01:55,  1.29it/s]                                                         63%|██████▎   | 7991/12776 [1:22:24<1:01:55,  1.29it/s] 63%|██████▎   | 7992/12776 [1:22:25<1:02:11,  1.28it/s]                                                         63%|██████▎   | 7992/12776 [1:22:25<1:02:11,  1.28it/s] 63%|██████▎   | 7993/12776 [1:22:26<1:05:14,  1.22it/s]                                                         63%|██████▎   | 7993/12776 [1:22:26<1:05:14,  1.22it/s] 63%|██████▎   | 7994/12776 [1:22:26<1:02:02,  1.28it/s]                                                         63%|██████▎   | 7994/12776 [1:22:26<1:02:02,  1.28it/s] 63%|██████▎   | 7995/12776 [1:22:27<58:36,  1.36it/s]                                                         63%|██████▎   | 7995/12776 [1:22:27<58:36,  1.36it/s] 63%|██████▎   | 7996/12776 [1:22:28<58:18,  1.37it/s]                                                       63%|██████▎   | 7996/12776 [1:22:28<58:18,  1.37it/s] 63%|██████▎   | 7997/12776 [1:22:28<54:31,  1.46it/s]                                                       63%|██████▎   | 7997/12776 [1:22:28<54:31,  1.46it/s] 63%|██████▎   | 7998/12776 [1:22:29<52:36,  1.51it/s]                                                       63%|██████▎   | 7998/12776 [1:22:29<52:36,  1.51it/s] 63%|██████▎   | 7999/12776 [1:22:30<49:32,  1.61it/s]                                                       63%|██████▎   | 7999/12776 [1:22:30<49:32,  1.61it/s] 63%|██████▎   | 8000/12776 [1:22:30<50:06,  1.59it/s]                                                       63%|██████▎   | 8000/12776 [1:22:30<50:06,  1.59it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 1.3016, 'grad_norm': 1.9787427186965942, 'learning_rate': 0.00011881720430107527, 'epoch': 1.24}
+{'loss': 0.4644, 'grad_norm': 1.528519868850708, 'learning_rate': 0.00011879276637341152, 'epoch': 1.24}
+{'loss': 1.06, 'grad_norm': 1.8350005149841309, 'learning_rate': 0.00011876832844574778, 'epoch': 1.24}
+{'loss': 0.6492, 'grad_norm': 2.0662553310394287, 'learning_rate': 0.00011874389051808406, 'epoch': 1.24}
+{'loss': 1.7245, 'grad_norm': 2.514481544494629, 'learning_rate': 0.00011871945259042033, 'epoch': 1.24}
+{'loss': 1.2266, 'grad_norm': 1.8698464632034302, 'learning_rate': 0.00011869501466275658, 'epoch': 1.24}
+{'loss': 0.7572, 'grad_norm': 1.368855595588684, 'learning_rate': 0.00011867057673509286, 'epoch': 1.24}
+{'loss': 0.8298, 'grad_norm': 2.3699071407318115, 'learning_rate': 0.00011864613880742912, 'epoch': 1.24}
+{'loss': 0.4536, 'grad_norm': 1.3850302696228027, 'learning_rate': 0.00011862170087976539, 'epoch': 1.24}
+{'loss': 1.0689, 'grad_norm': 4.977390289306641, 'learning_rate': 0.00011859726295210165, 'epoch': 1.24}
+{'loss': 0.8295, 'grad_norm': 2.6507201194763184, 'learning_rate': 0.00011857282502443792, 'epoch': 1.24}
+{'loss': 0.767, 'grad_norm': 1.9919297695159912, 'learning_rate': 0.00011854838709677418, 'epoch': 1.24}
+{'loss': 0.7606, 'grad_norm': 2.1812164783477783, 'learning_rate': 0.00011852394916911046, 'epoch': 1.24}
+{'loss': 1.381, 'grad_norm': 4.220646381378174, 'learning_rate': 0.00011849951124144671, 'epoch': 1.24}
+{'loss': 0.2814, 'grad_norm': 0.48588910698890686, 'learning_rate': 0.00011847507331378298, 'epoch': 1.24}
+{'loss': 0.2328, 'grad_norm': 0.5703171491622925, 'learning_rate': 0.00011845063538611925, 'epoch': 1.24}
+{'loss': 0.1882, 'grad_norm': 0.4970242977142334, 'learning_rate': 0.00011842619745845552, 'epoch': 1.24}
+{'loss': 0.1734, 'grad_norm': 0.5462607741355896, 'learning_rate': 0.00011840175953079177, 'epoch': 1.24}
+{'loss': 0.2375, 'grad_norm': 0.7907721400260925, 'learning_rate': 0.00011837732160312805, 'epoch': 1.24}
+{'loss': 0.1791, 'grad_norm': 0.4387291669845581, 'learning_rate': 0.00011835288367546431, 'epoch': 1.24}
+{'loss': 0.2238, 'grad_norm': 0.5500348210334778, 'learning_rate': 0.00011832844574780058, 'epoch': 1.24}
+{'loss': 0.2178, 'grad_norm': 0.6439167261123657, 'learning_rate': 0.00011830400782013684, 'epoch': 1.24}
+{'loss': 0.2993, 'grad_norm': 0.8734773397445679, 'learning_rate': 0.00011827956989247311, 'epoch': 1.24}
+{'loss': 0.3273, 'grad_norm': 1.8835079669952393, 'learning_rate': 0.00011825513196480937, 'epoch': 1.24}
+{'loss': 0.1828, 'grad_norm': 0.723751425743103, 'learning_rate': 0.00011823069403714565, 'epoch': 1.24}
+{'loss': 0.3214, 'grad_norm': 1.6862190961837769, 'learning_rate': 0.0001182062561094819, 'epoch': 1.24}
+{'loss': 0.2662, 'grad_norm': 0.7363371849060059, 'learning_rate': 0.00011818181818181817, 'epoch': 1.24}
+{'loss': 0.3986, 'grad_norm': 1.2656689882278442, 'learning_rate': 0.00011815738025415444, 'epoch': 1.24}
+{'loss': 0.405, 'grad_norm': 0.9024664759635925, 'learning_rate': 0.00011813294232649071, 'epoch': 1.24}
+{'loss': 0.4005, 'grad_norm': 0.8185746669769287, 'learning_rate': 0.00011810850439882696, 'epoch': 1.25}
+{'loss': 0.1425, 'grad_norm': 0.8885353207588196, 'learning_rate': 0.00011808406647116324, 'epoch': 1.25}
+{'loss': 0.3497, 'grad_norm': 6.204992294311523, 'learning_rate': 0.0001180596285434995, 'epoch': 1.25}
+{'loss': 0.3987, 'grad_norm': 1.7547203302383423, 'learning_rate': 0.00011803519061583576, 'epoch': 1.25}
+{'loss': 0.2755, 'grad_norm': 1.6890156269073486, 'learning_rate': 0.00011801075268817203, 'epoch': 1.25}
+{'loss': 0.6202, 'grad_norm': 1.954634189605713, 'learning_rate': 0.0001179863147605083, 'epoch': 1.25}
+{'loss': 0.2909, 'grad_norm': 1.5542676448822021, 'learning_rate': 0.00011796187683284456, 'epoch': 1.25}
+{'loss': 0.6644, 'grad_norm': 2.495922565460205, 'learning_rate': 0.00011793743890518084, 'epoch': 1.25}
+{'loss': 0.3718, 'grad_norm': 1.6161011457443237, 'learning_rate': 0.00011791300097751709, 'epoch': 1.25}
+{'loss': 0.397, 'grad_norm': 1.5443997383117676, 'learning_rate': 0.00011788856304985336, 'epoch': 1.25}
+{'loss': 0.3962, 'grad_norm': 1.664036750793457, 'learning_rate': 0.00011786412512218964, 'epoch': 1.25}
+{'loss': 0.4728, 'grad_norm': 1.7987858057022095, 'learning_rate': 0.0001178396871945259, 'epoch': 1.25}
+{'loss': 0.5981, 'grad_norm': 2.9872450828552246, 'learning_rate': 0.00011781524926686215, 'epoch': 1.25}
+{'loss': 0.8175, 'grad_norm': 1.4099814891815186, 'learning_rate': 0.00011779081133919843, 'epoch': 1.25}
+{'loss': 0.937, 'grad_norm': 2.3960330486297607, 'learning_rate': 0.0001177663734115347, 'epoch': 1.25}
+{'loss': 0.8408, 'grad_norm': 1.7411472797393799, 'learning_rate': 0.00011774193548387095, 'epoch': 1.25}
+{'loss': 0.6916, 'grad_norm': 1.6002928018569946, 'learning_rate': 0.00011771749755620722, 'epoch': 1.25}
+{'loss': 0.6554, 'grad_norm': 2.1395416259765625, 'learning_rate': 0.00011769305962854349, 'epoch': 1.25}
+{'loss': 0.5139, 'grad_norm': 2.8727903366088867, 'learning_rate': 0.00011766862170087975, 'epoch': 1.25}
+{'loss': 0.5492, 'grad_norm': 1.354569435119629, 'learning_rate': 0.00011764418377321603, 'epoch': 1.25}
+{'loss': 0.7196, 'grad_norm': 4.999713897705078, 'learning_rate': 0.00011761974584555228, 'epoch': 1.25}
+{'loss': 1.0452, 'grad_norm': 3.1970131397247314, 'learning_rate': 0.00011759530791788855, 'epoch': 1.25}
+{'loss': 1.3657, 'grad_norm': 3.8125860691070557, 'learning_rate': 0.00011757086999022483, 'epoch': 1.25}
+{'loss': 0.9489, 'grad_norm': 2.1403331756591797, 'learning_rate': 0.00011754643206256109, 'epoch': 1.25}
+{'loss': 0.7548, 'grad_norm': 1.500003457069397, 'learning_rate': 0.00011752199413489734, 'epoch': 1.25}
+{'loss': 0.5617, 'grad_norm': 5.040395259857178, 'learning_rate': 0.00011749755620723362, 'epoch': 1.25}
+{'loss': 1.0901, 'grad_norm': 2.7591681480407715, 'learning_rate': 0.00011747311827956989, 'epoch': 1.25}
+{'loss': 0.8664, 'grad_norm': 1.9304205179214478, 'learning_rate': 0.00011744868035190614, 'epoch': 1.25}
+{'loss': 0.7079, 'grad_norm': 3.9833483695983887, 'learning_rate': 0.00011742424242424242, 'epoch': 1.25}
+{'loss': 1.1423, 'grad_norm': 2.4207115173339844, 'learning_rate': 0.00011739980449657868, 'epoch': 1.25}
+{'loss': 0.9182, 'grad_norm': 2.918473958969116, 'learning_rate': 0.00011737536656891495, 'epoch': 1.25}
+{'loss': 0.3946, 'grad_norm': 1.54889714717865, 'learning_rate': 0.00011735092864125122, 'epoch': 1.25}
+{'loss': 0.7028, 'grad_norm': 2.440833806991577, 'learning_rate': 0.00011732649071358748, 'epoch': 1.25}
+{'loss': 0.6834, 'grad_norm': 3.6599295139312744, 'learning_rate': 0.00011730205278592374, 'epoch': 1.25}
+{'loss': 0.59, 'grad_norm': 2.046987533569336, 'learning_rate': 0.00011727761485826002, 'epoch': 1.25}
+{'loss': 0.1788, 'grad_norm': 0.6683407425880432, 'learning_rate': 0.00011725317693059628, 'epoch': 1.25}
+{'loss': 0.1947, 'grad_norm': 0.6008917093276978, 'learning_rate': 0.00011722873900293253, 'epoch': 1.25}
+{'loss': 0.2844, 'grad_norm': 0.8003541827201843, 'learning_rate': 0.00011720430107526881, 'epoch': 1.25}
+{'loss': 0.3327, 'grad_norm': 0.4843752980232239, 'learning_rate': 0.00011717986314760508, 'epoch': 1.25}
+{'loss': 0.2476, 'grad_norm': 0.6065008640289307, 'learning_rate': 0.00011715542521994133, 'epoch': 1.25}
+{'loss': 0.2134, 'grad_norm': 0.4822901785373688, 'learning_rate': 0.00011713098729227761, 'epoch': 1.25}
+{'loss': 0.198, 'grad_norm': 0.6061071753501892, 'learning_rate': 0.00011710654936461387, 'epoch': 1.25}
+{'loss': 0.3114, 'grad_norm': 0.7697739005088806, 'learning_rate': 0.00011708211143695014, 'epoch': 1.25}
+{'loss': 0.5542, 'grad_norm': 0.9554542899131775, 'learning_rate': 0.00011705767350928642, 'epoch': 1.25}
+{'loss': 0.2086, 'grad_norm': 0.5352123379707336, 'learning_rate': 0.00011703323558162267, 'epoch': 1.25}
+{'loss': 0.1956, 'grad_norm': 1.2916967868804932, 'learning_rate': 0.00011700879765395893, 'epoch': 1.25}
+{'loss': 0.2089, 'grad_norm': 0.7141245603561401, 'learning_rate': 0.00011698435972629521, 'epoch': 1.25}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:04,  6.19it/s][A
+  0%|          | 3/774 [00:00<02:48,  4.59it/s][A
+  1%|          | 4/774 [00:01<03:36,  3.56it/s][A
+  1%|          | 5/774 [00:01<03:28,  3.68it/s][A
+  1%|          | 6/774 [00:01<03:37,  3.53it/s][A
+  1%|          | 7/774 [00:01<03:32,  3.61it/s][A
+  1%|          | 8/774 [00:02<03:32,  3.60it/s][A
+  1%|          | 9/774 [00:02<03:19,  3.83it/s][A
+  1%|▏         | 10/774 [00:02<03:18,  3.85it/s][A
+  1%|▏         | 11/774 [00:02<03:33,  3.58it/s][A
+  2%|▏         | 12/774 [00:03<03:18,  3.84it/s][A
+  2%|▏         | 13/774 [00:03<03:10,  3.99it/s][A
+  2%|▏         | 14/774 [00:03<03:22,  3.76it/s][A
+  2%|▏         | 15/774 [00:04<03:40,  3.45it/s][A
+  2%|▏         | 16/774 [00:04<03:37,  3.49it/s][A
+  2%|▏         | 17/774 [00:04<03:14,  3.90it/s][A
+  2%|▏         | 18/774 [00:04<03:06,  4.06it/s][A
+  2%|▏         | 19/774 [00:04<03:16,  3.84it/s][A
+  3%|▎         | 20/774 [00:05<03:13,  3.90it/s][A
+  3%|▎         | 21/774 [00:05<03:15,  3.86it/s][A
+  3%|▎         | 22/774 [00:05<03:20,  3.74it/s][A
+  3%|▎         | 23/774 [00:06<03:32,  3.53it/s][A
+  3%|▎         | 24/774 [00:06<03:31,  3.54it/s][A
+  3%|▎         | 25/774 [00:06<03:36,  3.46it/s][A
+  3%|▎         | 26/774 [00:06<03:35,  3.47it/s][A
+  3%|▎         | 27/774 [00:07<03:34,  3.48it/s][A
+  4%|▎         | 28/774 [00:07<03:43,  3.34it/s][A
+  4%|▎         | 29/774 [00:07<03:46,  3.29it/s][A
+  4%|▍         | 30/774 [00:08<03:33,  3.48it/s][A
+  4%|▍         | 31/774 [00:08<03:34,  3.46it/s][A
+  4%|▍         | 32/774 [00:08<04:09,  2.98it/s][A
+  4%|▍         | 33/774 [00:09<03:56,  3.13it/s][A
+  4%|▍         | 34/774 [00:09<03:43,  3.32it/s][A
+  5%|▍         | 35/774 [00:09<03:50,  3.20it/s][A
+  5%|▍         | 36/774 [00:10<03:51,  3.19it/s][A
+  5%|▍         | 37/774 [00:10<04:00,  3.07it/s][A
+  5%|▍         | 38/774 [00:10<03:47,  3.24it/s][A
+  5%|▌         | 39/774 [00:10<03:36,  3.40it/s][A
+  5%|▌         | 40/774 [00:11<03:38,  3.36it/s][A
+  5%|▌         | 41/774 [00:11<03:32,  3.45it/s][A
+  5%|▌         | 42/774 [00:11<03:18,  3.69it/s][A
+  6%|▌         | 43/774 [00:12<03:29,  3.48it/s][A
+  6%|▌         | 44/774 [00:12<03:40,  3.31it/s][A
+  6%|▌         | 45/774 [00:12<03:26,  3.53it/s][A
+  6%|▌         | 46/774 [00:12<03:12,  3.79it/s][A
+  6%|▌         | 47/774 [00:13<02:57,  4.10it/s][A
+  6%|▌         | 48/774 [00:13<02:57,  4.09it/s][A
+  6%|▋         | 49/774 [00:13<02:56,  4.10it/s][A
+  6%|▋         | 50/774 [00:13<02:58,  4.05it/s][A
+  7%|▋         | 51/774 [00:14<03:00,  4.02it/s][A
+  7%|▋         | 52/774 [00:14<02:58,  4.05it/s][A
+  7%|▋         | 53/774 [00:14<03:07,  3.86it/s][A
+  7%|▋         | 54/774 [00:14<03:10,  3.78it/s][A
+  7%|▋         | 55/774 [00:15<03:19,  3.60it/s][A
+  7%|▋         | 56/774 [00:15<03:19,  3.60it/s][A
+  7%|▋         | 57/774 [00:15<03:25,  3.49it/s][A
+  7%|▋         | 58/774 [00:16<03:23,  3.51it/s][A
+  8%|▊         | 59/774 [00:16<03:08,  3.80it/s][A
+  8%|▊         | 60/774 [00:16<02:54,  4.10it/s][A
+  8%|▊         | 61/774 [00:16<02:32,  4.68it/s][A
+  8%|▊         | 62/774 [00:16<02:30,  4.75it/s][A
+  8%|▊         | 63/774 [00:17<02:54,  4.08it/s][A
+  8%|▊         | 64/774 [00:17<02:45,  4.29it/s][A
+  8%|▊         | 65/774 [00:17<02:47,  4.23it/s][A
+  9%|▊         | 66/774 [00:17<02:45,  4.28it/s][A
+  9%|▊         | 67/774 [00:18<02:40,  4.42it/s][A
+  9%|▉         | 68/774 [00:18<02:36,  4.52it/s][A
+  9%|▉         | 69/774 [00:18<02:28,  4.76it/s][A
+  9%|▉         | 70/774 [00:18<02:36,  4.51it/s][A
+  9%|▉         | 71/774 [00:18<02:31,  4.65it/s][A
+  9%|▉         | 72/774 [00:19<02:42,  4.32it/s][A
+  9%|▉         | 73/774 [00:19<02:51,  4.09it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.93it/s][A
+ 10%|▉         | 75/774 [00:19<03:04,  3.80it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.87it/s][A
+ 10%|▉         | 77/774 [00:20<03:12,  3.62it/s][A
+ 10%|█         | 78/774 [00:20<02:53,  4.01it/s][A
+ 10%|█         | 79/774 [00:20<02:41,  4.30it/s][A
+ 10%|█         | 80/774 [00:21<02:38,  4.37it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.05it/s][A
+ 11%|█         | 82/774 [00:21<02:17,  5.02it/s][A
+ 11%|█         | 83/774 [00:21<02:21,  4.90it/s][A
+ 11%|█         | 84/774 [00:21<02:26,  4.69it/s][A
+ 11%|█         | 85/774 [00:22<02:35,  4.43it/s][A
+ 11%|█         | 86/774 [00:22<02:44,  4.19it/s][A
+ 11%|█         | 87/774 [00:22<02:47,  4.11it/s][A
+ 11%|█▏        | 88/774 [00:22<02:40,  4.27it/s][A
+ 11%|█▏        | 89/774 [00:23<02:32,  4.48it/s][A
+ 12%|█▏        | 90/774 [00:23<02:39,  4.29it/s][A
+ 12%|█▏        | 91/774 [00:23<02:52,  3.97it/s][A
+ 12%|█▏        | 92/774 [00:23<03:06,  3.66it/s][A
+ 12%|█▏        | 93/774 [00:24<03:00,  3.77it/s][A
+ 12%|█▏        | 94/774 [00:24<03:05,  3.67it/s][A
+ 12%|█▏        | 95/774 [00:24<03:02,  3.71it/s][A
+ 12%|█▏        | 96/774 [00:25<02:57,  3.81it/s][A
+ 13%|█▎        | 97/774 [00:25<02:43,  4.14it/s][A
+ 13%|█▎        | 98/774 [00:25<02:34,  4.36it/s][A
+ 13%|█▎        | 99/774 [00:25<02:48,  4.01it/s][A
+ 13%|█▎        | 100/774 [00:26<02:59,  3.75it/s][A
+ 13%|█▎        | 101/774 [00:26<03:04,  3.64it/s][A
+ 13%|█▎        | 102/774 [00:26<03:14,  3.45it/s][A
+ 13%|█▎        | 103/774 [00:26<03:24,  3.29it/s][A
+ 13%|█▎        | 104/774 [00:27<03:20,  3.34it/s][A
+ 14%|█▎        | 105/774 [00:27<03:19,  3.36it/s][A
+ 14%|█▎        | 106/774 [00:27<03:37,  3.07it/s][A
+ 14%|█▍        | 107/774 [00:28<03:48,  2.92it/s][A
+ 14%|█▍        | 108/774 [00:28<03:39,  3.03it/s][A
+ 14%|█▍        | 109/774 [00:29<03:44,  2.96it/s][A
+ 14%|█▍        | 110/774 [00:29<03:36,  3.07it/s][A
+ 14%|█▍        | 111/774 [00:29<03:40,  3.01it/s][A
+ 14%|█▍        | 112/774 [00:29<03:26,  3.20it/s][A
+ 15%|█▍        | 113/774 [00:30<03:35,  3.06it/s][A
+ 15%|█▍        | 114/774 [00:30<03:34,  3.08it/s][A
+ 15%|█▍        | 115/774 [00:30<03:24,  3.23it/s][A
+ 15%|█▍        | 116/774 [00:31<03:06,  3.52it/s][A
+ 15%|█▌        | 117/774 [00:31<03:11,  3.43it/s][A
+ 15%|█▌        | 118/774 [00:31<03:13,  3.40it/s][A
+ 15%|█▌        | 119/774 [00:31<03:03,  3.57it/s][A
+ 16%|█▌        | 120/774 [00:32<03:11,  3.41it/s][A
+ 16%|█▌        | 121/774 [00:32<03:05,  3.52it/s][A
+ 16%|█▌        | 122/774 [00:32<03:07,  3.47it/s][A
+ 16%|█▌        | 123/774 [00:33<02:58,  3.64it/s][A
+ 16%|█▌        | 124/774 [00:33<02:59,  3.62it/s][A
+ 16%|█▌        | 125/774 [00:33<02:59,  3.61it/s][A
+ 16%|█▋        | 126/774 [00:33<03:07,  3.45it/s][A
+ 16%|█▋        | 127/774 [00:34<03:17,  3.27it/s][A
+ 17%|█▋        | 128/774 [00:34<03:08,  3.43it/s][A
+ 17%|█▋        | 129/774 [00:34<03:09,  3.40it/s][A
+ 17%|█▋        | 130/774 [00:35<03:16,  3.28it/s][A
+ 17%|█▋        | 131/774 [00:35<03:06,  3.44it/s][A
+ 17%|█▋        | 132/774 [00:35<03:11,  3.35it/s][A
+ 17%|█▋        | 133/774 [00:36<03:06,  3.44it/s][A
+ 17%|█▋        | 134/774 [00:36<03:06,  3.44it/s][A
+ 17%|█▋        | 135/774 [00:36<03:24,  3.13it/s][A
+ 18%|█▊        | 136/774 [00:37<03:31,  3.01it/s][A
+ 18%|█▊        | 137/774 [00:37<03:30,  3.03it/s][A
+ 18%|█▊        | 138/774 [00:37<03:25,  3.10it/s][A
+ 18%|█▊        | 139/774 [00:38<03:25,  3.09it/s][A
+ 18%|█▊        | 140/774 [00:38<03:20,  3.17it/s][A
+ 18%|█▊        | 141/774 [00:38<03:12,  3.29it/s][A
+ 18%|█▊        | 142/774 [00:38<03:22,  3.13it/s][A
+ 18%|█▊        | 143/774 [00:39<03:18,  3.18it/s][A
+ 19%|█▊        | 144/774 [00:39<03:07,  3.36it/s][A
+ 19%|█▊        | 145/774 [00:39<03:00,  3.48it/s][A
+ 19%|█▉        | 146/774 [00:40<02:49,  3.70it/s][A
+ 19%|█▉        | 147/774 [00:40<02:41,  3.89it/s][A
+ 19%|█▉        | 148/774 [00:40<02:50,  3.67it/s][A
+ 19%|█▉        | 149/774 [00:40<03:03,  3.41it/s][A
+ 19%|█▉        | 150/774 [00:41<03:05,  3.36it/s][A
+ 20%|█▉        | 151/774 [00:41<02:55,  3.55it/s][A
+ 20%|█▉        | 152/774 [00:41<02:46,  3.73it/s][A
+ 20%|█▉        | 153/774 [00:41<02:54,  3.57it/s][A
+ 20%|█▉        | 154/774 [00:42<02:49,  3.66it/s][A
+ 20%|██        | 155/774 [00:42<02:46,  3.73it/s][A
+ 20%|██        | 156/774 [00:42<02:40,  3.84it/s][A
+ 20%|██        | 157/774 [00:42<02:33,  4.03it/s][A
+ 20%|██        | 158/774 [00:43<02:37,  3.91it/s][A
+ 21%|██        | 159/774 [00:43<02:39,  3.85it/s][A
+ 21%|██        | 160/774 [00:43<02:31,  4.05it/s][A
+ 21%|██        | 161/774 [00:44<02:41,  3.80it/s][A
+ 21%|██        | 162/774 [00:44<02:46,  3.68it/s][A
+ 21%|██        | 163/774 [00:44<02:45,  3.69it/s][A
+ 21%|██        | 164/774 [00:44<02:39,  3.83it/s][A
+ 21%|██▏       | 165/774 [00:45<02:37,  3.87it/s][A
+ 21%|██▏       | 166/774 [00:45<02:40,  3.78it/s][A
+ 22%|██▏       | 167/774 [00:45<02:43,  3.71it/s][A
+ 22%|██▏       | 168/774 [00:45<02:34,  3.91it/s][A
+ 22%|██▏       | 169/774 [00:46<02:27,  4.10it/s][A
+ 22%|██▏       | 170/774 [00:46<02:36,  3.87it/s][A
+ 22%|██▏       | 171/774 [00:46<02:46,  3.63it/s][A
+ 22%|██▏       | 172/774 [00:47<02:53,  3.46it/s][A
+ 22%|██▏       | 173/774 [00:47<02:50,  3.53it/s][A
+ 22%|██▏       | 174/774 [00:47<02:42,  3.68it/s][A
+ 23%|██▎       | 175/774 [00:47<02:43,  3.67it/s][A
+ 23%|██▎       | 176/774 [00:48<02:36,  3.82it/s][A
+ 23%|██▎       | 177/774 [00:48<02:50,  3.51it/s][A
+ 23%|██▎       | 178/774 [00:48<02:35,  3.84it/s][A
+ 23%|██▎       | 179/774 [00:48<02:21,  4.19it/s][A
+ 23%|██▎       | 180/774 [00:48<02:15,  4.40it/s][A
+ 23%|██▎       | 181/774 [00:49<02:19,  4.25it/s][A
+ 24%|██▎       | 182/774 [00:49<02:23,  4.14it/s][A
+ 24%|██▎       | 183/774 [00:49<02:24,  4.10it/s][A
+ 24%|██▍       | 184/774 [00:50<02:34,  3.81it/s][A
+ 24%|██▍       | 185/774 [00:50<02:43,  3.60it/s][A
+ 24%|██▍       | 186/774 [00:50<02:42,  3.62it/s][A
+ 24%|██▍       | 187/774 [00:50<02:35,  3.77it/s][A
+ 24%|██▍       | 188/774 [00:51<02:34,  3.80it/s][A
+ 24%|██▍       | 189/774 [00:51<02:32,  3.83it/s][A
+ 25%|██▍       | 190/774 [00:51<02:27,  3.97it/s][A
+ 25%|██▍       | 191/774 [00:51<02:33,  3.80it/s][A
+ 25%|██▍       | 192/774 [00:52<02:37,  3.70it/s][A
+ 25%|██▍       | 193/774 [00:52<02:39,  3.65it/s][A
+ 25%|██▌       | 194/774 [00:52<02:48,  3.45it/s][A
+ 25%|██▌       | 195/774 [00:53<02:57,  3.27it/s][A
+ 25%|██▌       | 196/774 [00:53<02:57,  3.26it/s][A
+ 25%|██▌       | 197/774 [00:53<02:53,  3.32it/s][A
+ 26%|██▌       | 198/774 [00:53<02:44,  3.51it/s][A
+ 26%|██▌       | 199/774 [00:54<02:44,  3.50it/s][A
+ 26%|██▌       | 200/774 [00:54<02:39,  3.61it/s][A
+ 26%|██▌       | 201/774 [00:54<02:35,  3.69it/s][A
+ 26%|██▌       | 202/774 [00:55<02:32,  3.75it/s][A
+ 26%|██▌       | 203/774 [00:55<02:25,  3.93it/s][A
+ 26%|██▋       | 204/774 [00:55<02:28,  3.83it/s][A
+ 26%|██▋       | 205/774 [00:55<02:39,  3.57it/s][A
+ 27%|██▋       | 206/774 [00:56<02:35,  3.66it/s][A
+ 27%|██▋       | 207/774 [00:56<02:32,  3.72it/s][A
+ 27%|██▋       | 208/774 [00:56<02:33,  3.70it/s][A
+ 27%|██▋       | 209/774 [00:56<02:31,  3.73it/s][A
+ 27%|██▋       | 210/774 [00:57<02:29,  3.78it/s][A
+ 27%|██▋       | 211/774 [00:57<02:26,  3.84it/s][A
+ 27%|██▋       | 212/774 [00:57<02:15,  4.14it/s][A
+ 28%|██▊       | 213/774 [00:57<02:00,  4.64it/s][A
+ 28%|██▊       | 214/774 [00:58<02:03,  4.53it/s][A
+ 28%|██▊       | 215/774 [00:58<02:03,  4.53it/s][A
+ 28%|██▊       | 216/774 [00:58<02:02,  4.56it/s][A
+ 28%|██▊       | 217/774 [00:58<02:05,  4.44it/s][A
+ 28%|██▊       | 218/774 [00:58<02:11,  4.22it/s][A
+ 28%|██▊       | 219/774 [00:59<02:20,  3.94it/s][A
+ 28%|██▊       | 220/774 [00:59<02:18,  3.99it/s][A
+ 29%|██▊       | 221/774 [00:59<02:26,  3.77it/s][A
+ 29%|██▊       | 222/774 [01:00<02:36,  3.52it/s][A
+ 29%|██▉       | 223/774 [01:00<02:53,  3.17it/s][A
+ 29%|██▉       | 224/774 [01:00<03:02,  3.02it/s][A
+ 29%|██▉       | 225/774 [01:01<03:12,  2.85it/s][A
+ 29%|██▉       | 226/774 [01:01<03:16,  2.79it/s][A
+ 29%|██▉       | 227/774 [01:01<03:12,  2.84it/s][A
+ 29%|██▉       | 228/774 [01:02<03:04,  2.96it/s][A
+ 30%|██▉       | 229/774 [01:02<03:19,  2.74it/s][A
+ 30%|██▉       | 230/774 [01:02<03:03,  2.96it/s][A
+ 30%|██▉       | 231/774 [01:03<03:01,  3.00it/s][A
+ 30%|██▉       | 232/774 [01:03<02:52,  3.14it/s][A
+ 30%|███       | 233/774 [01:04<03:07,  2.88it/s][A
+ 30%|███       | 234/774 [01:04<03:11,  2.82it/s][A
+ 30%|███       | 235/774 [01:04<03:10,  2.83it/s][A
+ 30%|███       | 236/774 [01:05<03:13,  2.77it/s][A
+ 31%|███       | 237/774 [01:05<03:10,  2.81it/s][A
+ 31%|███       | 238/774 [01:05<03:01,  2.96it/s][A
+ 31%|███       | 239/774 [01:06<02:59,  2.98it/s][A
+ 31%|███       | 240/774 [01:06<02:59,  2.98it/s][A
+ 31%|███       | 241/774 [01:06<03:02,  2.92it/s][A
+ 31%|███▏      | 242/774 [01:07<03:13,  2.76it/s][A
+ 31%|███▏      | 243/774 [01:07<03:21,  2.64it/s][A
+ 32%|███▏      | 244/774 [01:07<03:16,  2.69it/s][A
+ 32%|███▏      | 245/774 [01:08<03:09,  2.79it/s][A
+ 32%|███▏      | 246/774 [01:08<03:09,  2.79it/s][A
+ 32%|███▏      | 247/774 [01:09<03:35,  2.44it/s][A
+ 32%|███▏      | 248/774 [01:09<03:44,  2.34it/s][A
+ 32%|███▏      | 249/774 [01:09<03:21,  2.60it/s][A
+ 32%|███▏      | 250/774 [01:10<03:17,  2.66it/s][A
+ 32%|███▏      | 251/774 [01:10<03:16,  2.67it/s][A
+ 33%|███▎      | 252/774 [01:10<03:12,  2.71it/s][A
+ 33%|███▎      | 253/774 [01:11<03:11,  2.73it/s][A
+ 33%|███▎      | 254/774 [01:11<03:06,  2.79it/s][A
+ 33%|███▎      | 255/774 [01:12<03:02,  2.84it/s][A
+ 33%|███▎      | 256/774 [01:12<02:57,  2.92it/s][A
+ 33%|███▎      | 257/774 [01:12<02:55,  2.94it/s][A
+ 33%|███▎      | 258/774 [01:12<02:41,  3.20it/s][A
+ 33%|███▎      | 259/774 [01:13<02:23,  3.58it/s][A
+ 34%|███▎      | 260/774 [01:13<02:22,  3.62it/s][A
+ 34%|███▎      | 261/774 [01:13<02:27,  3.48it/s][A
+ 34%|███▍      | 262/774 [01:13<02:12,  3.86it/s][A
+ 34%|███▍      | 263/774 [01:14<02:05,  4.07it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.77it/s][A
+ 34%|███▍      | 265/774 [01:14<02:07,  3.98it/s][A
+ 34%|███▍      | 266/774 [01:14<02:02,  4.15it/s][A
+ 34%|███▍      | 267/774 [01:15<02:00,  4.20it/s][A
+ 35%|███▍      | 268/774 [01:15<02:07,  3.96it/s][A
+ 35%|███▍      | 269/774 [01:15<02:13,  3.78it/s][A
+ 35%|███▍      | 270/774 [01:15<02:19,  3.62it/s][A
+ 35%|███▌      | 271/774 [01:16<02:15,  3.71it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  4.01it/s][A
+ 35%|███▌      | 273/774 [01:16<02:01,  4.13it/s][A
+ 35%|███▌      | 274/774 [01:16<02:04,  4.00it/s][A
+ 36%|███▌      | 275/774 [01:17<02:01,  4.12it/s][A
+ 36%|███▌      | 276/774 [01:17<01:54,  4.35it/s][A
+ 36%|███▌      | 277/774 [01:17<01:58,  4.18it/s][A
+ 36%|███▌      | 278/774 [01:17<02:03,  4.00it/s][A
+ 36%|███▌      | 279/774 [01:18<01:57,  4.23it/s][A
+ 36%|███▌      | 280/774 [01:18<01:58,  4.18it/s][A
+ 36%|███▋      | 281/774 [01:18<02:11,  3.76it/s][A
+ 36%|███▋      | 282/774 [01:19<02:22,  3.44it/s][A
+ 37%|███▋      | 283/774 [01:19<02:21,  3.48it/s][A
+ 37%|███▋      | 284/774 [01:19<02:23,  3.42it/s][A
+ 37%|███▋      | 285/774 [01:19<02:20,  3.48it/s][A
+ 37%|███▋      | 286/774 [01:20<02:14,  3.63it/s][A
+ 37%|███▋      | 287/774 [01:20<02:25,  3.35it/s][A
+ 37%|███▋      | 288/774 [01:20<02:26,  3.31it/s][A
+ 37%|███▋      | 289/774 [01:21<02:21,  3.42it/s][A
+ 37%|███▋      | 290/774 [01:21<02:15,  3.57it/s][A
+ 38%|███▊      | 291/774 [01:21<02:12,  3.65it/s][A
+ 38%|███▊      | 292/774 [01:21<02:08,  3.76it/s][A
+ 38%|███▊      | 293/774 [01:22<01:58,  4.04it/s][A
+ 38%|███▊      | 294/774 [01:22<01:55,  4.17it/s][A
+ 38%|███▊      | 295/774 [01:22<01:52,  4.24it/s][A
+ 38%|███▊      | 296/774 [01:22<01:47,  4.45it/s][A
+ 38%|███▊      | 297/774 [01:22<01:41,  4.72it/s][A
+ 39%|███▊      | 298/774 [01:23<01:46,  4.48it/s][A
+ 39%|███▊      | 299/774 [01:23<01:50,  4.30it/s][A
+ 39%|███▉      | 300/774 [01:23<01:56,  4.06it/s][A
+ 39%|███▉      | 301/774 [01:23<01:48,  4.36it/s][A
+ 39%|███▉      | 302/774 [01:24<01:41,  4.63it/s][A
+ 39%|███▉      | 303/774 [01:24<01:38,  4.78it/s][A
+ 39%|███▉      | 304/774 [01:24<01:27,  5.39it/s][A
+ 39%|███▉      | 305/774 [01:24<01:26,  5.40it/s][A
+ 40%|███▉      | 306/774 [01:24<01:40,  4.67it/s][A
+ 40%|███▉      | 307/774 [01:25<01:44,  4.49it/s][A
+ 40%|███▉      | 308/774 [01:25<01:41,  4.60it/s][A
+ 40%|███▉      | 309/774 [01:25<01:42,  4.54it/s][A
+ 40%|████      | 310/774 [01:25<01:46,  4.35it/s][A
+ 40%|████      | 311/774 [01:25<01:44,  4.45it/s][A
+ 40%|████      | 312/774 [01:26<01:40,  4.59it/s][A
+ 40%|████      | 313/774 [01:26<01:39,  4.64it/s][A
+ 41%|████      | 314/774 [01:26<01:40,  4.56it/s][A
+ 41%|████      | 315/774 [01:26<01:49,  4.18it/s][A
+ 41%|████      | 316/774 [01:27<01:42,  4.46it/s][A
+ 41%|████      | 317/774 [01:27<01:35,  4.80it/s][A
+ 41%|████      | 318/774 [01:27<01:38,  4.61it/s][A
+ 41%|████      | 319/774 [01:27<01:40,  4.54it/s][A
+ 41%|████▏     | 320/774 [01:27<01:40,  4.53it/s][A
+ 41%|████▏     | 321/774 [01:28<01:32,  4.89it/s][A
+ 42%|████▏     | 322/774 [01:28<01:29,  5.03it/s][A
+ 42%|████▏     | 323/774 [01:28<01:22,  5.50it/s][A
+ 42%|████▏     | 324/774 [01:28<01:27,  5.13it/s][A
+ 42%|████▏     | 325/774 [01:28<01:30,  4.96it/s][A
+ 42%|████▏     | 326/774 [01:29<01:26,  5.17it/s][A
+ 42%|████▏     | 327/774 [01:29<01:29,  5.00it/s][A
+ 42%|████▏     | 328/774 [01:29<01:26,  5.15it/s][A
+ 43%|████▎     | 329/774 [01:29<01:34,  4.70it/s][A
+ 43%|████▎     | 330/774 [01:29<01:29,  4.96it/s][A
+ 43%|████▎     | 331/774 [01:30<01:21,  5.45it/s][A
+ 43%|████▎     | 332/774 [01:30<01:19,  5.58it/s][A
+ 43%|████▎     | 333/774 [01:30<01:23,  5.28it/s][A
+ 43%|████▎     | 334/774 [01:30<01:27,  5.05it/s][A
+ 43%|████▎     | 335/774 [01:30<01:28,  4.97it/s][A
+ 43%|████▎     | 336/774 [01:30<01:26,  5.08it/s][A
+ 44%|████▎     | 337/774 [01:31<01:19,  5.49it/s][A
+ 44%|████▎     | 338/774 [01:31<01:14,  5.87it/s][A
+ 44%|████▍     | 339/774 [01:31<01:09,  6.24it/s][A
+ 44%|████▍     | 340/774 [01:31<01:09,  6.23it/s][A
+ 44%|████▍     | 341/774 [01:31<01:26,  4.99it/s][A
+ 44%|████▍     | 342/774 [01:32<01:35,  4.53it/s][A
+ 44%|████▍     | 343/774 [01:32<01:36,  4.47it/s][A
+ 44%|████▍     | 344/774 [01:32<01:40,  4.29it/s][A
+ 45%|████▍     | 345/774 [01:32<01:43,  4.15it/s][A
+ 45%|████▍     | 346/774 [01:33<01:47,  3.98it/s][A
+ 45%|████▍     | 347/774 [01:33<01:43,  4.14it/s][A
+ 45%|████▍     | 348/774 [01:33<01:38,  4.33it/s][A
+ 45%|████▌     | 349/774 [01:33<01:34,  4.52it/s][A
+ 45%|████▌     | 350/774 [01:34<01:37,  4.37it/s][A
+ 45%|████▌     | 351/774 [01:34<01:37,  4.35it/s][A
+ 45%|████▌     | 352/774 [01:34<01:32,  4.54it/s][A
+ 46%|████▌     | 353/774 [01:34<01:32,  4.53it/s][A
+ 46%|████▌     | 354/774 [01:34<01:32,  4.56it/s][A
+ 46%|████▌     | 355/774 [01:35<01:37,  4.32it/s][A
+ 46%|████▌     | 356/774 [01:35<01:46,  3.92it/s][A
+ 46%|████▌     | 357/774 [01:35<02:02,  3.40it/s][A
+ 46%|████▋     | 358/774 [01:36<02:07,  3.27it/s][A
+ 46%|████▋     | 359/774 [01:36<02:06,  3.29it/s][A
+ 47%|████▋     | 360/774 [01:36<02:06,  3.29it/s][A
+ 47%|████▋     | 361/774 [01:37<02:00,  3.43it/s][A
+ 47%|████▋     | 362/774 [01:37<02:06,  3.25it/s][A
+ 47%|████▋     | 363/774 [01:37<02:05,  3.27it/s][A
+ 47%|████▋     | 364/774 [01:38<02:06,  3.24it/s][A
+ 47%|████▋     | 365/774 [01:38<02:03,  3.31it/s][A
+ 47%|████▋     | 366/774 [01:38<01:54,  3.55it/s][A
+ 47%|████▋     | 367/774 [01:38<01:50,  3.68it/s][A
+ 48%|████▊     | 368/774 [01:39<01:47,  3.78it/s][A
+ 48%|████▊     | 369/774 [01:39<01:55,  3.52it/s][A
+ 48%|████▊     | 370/774 [01:39<02:09,  3.12it/s][A
+ 48%|████▊     | 371/774 [01:40<02:01,  3.33it/s][A
+ 48%|████▊     | 372/774 [01:40<02:01,  3.32it/s][A
+ 48%|████▊     | 373/774 [01:40<01:59,  3.35it/s][A
+ 48%|████▊     | 374/774 [01:40<01:57,  3.41it/s][A
+ 48%|████▊     | 375/774 [01:41<01:56,  3.41it/s][A
+ 49%|████▊     | 376/774 [01:41<02:02,  3.24it/s][A
+ 49%|████▊     | 377/774 [01:41<02:15,  2.92it/s][A
+ 49%|████▉     | 378/774 [01:42<02:18,  2.85it/s][A
+ 49%|████▉     | 379/774 [01:42<02:08,  3.07it/s][A
+ 49%|████▉     | 380/774 [01:42<01:57,  3.35it/s][A
+ 49%|████▉     | 381/774 [01:43<01:49,  3.59it/s][A
+ 49%|████▉     | 382/774 [01:43<01:44,  3.73it/s][A
+ 49%|████▉     | 383/774 [01:43<01:42,  3.81it/s][A
+ 50%|████▉     | 384/774 [01:43<01:51,  3.49it/s][A
+ 50%|████▉     | 385/774 [01:44<02:00,  3.22it/s][A
+ 50%|████▉     | 386/774 [01:44<01:52,  3.44it/s][A
+ 50%|█████     | 387/774 [01:44<01:48,  3.57it/s][A
+ 50%|█████     | 388/774 [01:45<01:51,  3.47it/s][A
+ 50%|█████     | 389/774 [01:45<01:46,  3.62it/s][A
+ 50%|█████     | 390/774 [01:45<01:58,  3.23it/s][A
+ 51%|█████     | 391/774 [01:46<01:59,  3.20it/s][A
+ 51%|█████     | 392/774 [01:46<01:49,  3.50it/s][A
+ 51%|█████     | 393/774 [01:46<01:40,  3.80it/s][A
+ 51%|█████     | 394/774 [01:46<01:41,  3.76it/s][A
+ 51%|█████     | 395/774 [01:47<01:48,  3.50it/s][A
+ 51%|█████     | 396/774 [01:47<01:45,  3.60it/s][A
+ 51%|█████▏    | 397/774 [01:47<01:48,  3.48it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:43,  3.64it/s][A
+ 52%|█████▏    | 399/774 [01:48<01:41,  3.68it/s][A
+ 52%|█████▏    | 400/774 [01:48<01:34,  3.95it/s][A
+ 52%|█████▏    | 401/774 [01:48<01:30,  4.12it/s][A
+ 52%|█████▏    | 402/774 [01:48<01:30,  4.09it/s][A
+ 52%|█████▏    | 403/774 [01:49<01:34,  3.92it/s][A
+ 52%|█████▏    | 404/774 [01:49<01:40,  3.68it/s][A
+ 52%|█████▏    | 405/774 [01:49<01:36,  3.83it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:39,  3.70it/s][A
+ 53%|█████▎    | 407/774 [01:50<01:45,  3.47it/s][A
+ 53%|█████▎    | 408/774 [01:50<01:42,  3.58it/s][A
+ 53%|█████▎    | 409/774 [01:50<01:38,  3.69it/s][A
+ 53%|█████▎    | 410/774 [01:51<01:39,  3.64it/s][A
+ 53%|█████▎    | 411/774 [01:51<01:40,  3.63it/s][A
+ 53%|█████▎    | 412/774 [01:51<01:41,  3.58it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:38,  3.65it/s][A
+ 53%|█████▎    | 414/774 [01:52<01:35,  3.78it/s][A
+ 54%|█████▎    | 415/774 [01:52<01:24,  4.27it/s][A
+ 54%|█████▎    | 416/774 [01:52<01:25,  4.21it/s][A
+ 54%|█████▍    | 417/774 [01:52<01:23,  4.26it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.58it/s][A
+ 54%|█████▍    | 419/774 [01:53<01:31,  3.87it/s][A
+ 54%|█████▍    | 420/774 [01:53<01:36,  3.67it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:36,  3.66it/s][A
+ 55%|█████▍    | 422/774 [01:54<01:36,  3.66it/s][A
+ 55%|█████▍    | 423/774 [01:54<01:36,  3.63it/s][A
+ 55%|█████▍    | 424/774 [01:54<01:34,  3.70it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:24,  4.15it/s][A
+ 55%|█████▌    | 426/774 [01:55<01:17,  4.47it/s][A
+ 55%|█████▌    | 427/774 [01:55<01:14,  4.66it/s][A
+ 55%|█████▌    | 428/774 [01:55<01:16,  4.54it/s][A
+ 55%|█████▌    | 429/774 [01:55<01:17,  4.43it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:22,  4.17it/s][A
+ 56%|█████▌    | 431/774 [01:56<01:35,  3.60it/s][A
+ 56%|█████▌    | 432/774 [01:56<01:34,  3.62it/s][A
+ 56%|█████▌    | 433/774 [01:56<01:26,  3.93it/s][A
+ 56%|█████▌    | 434/774 [01:57<01:22,  4.14it/s][A
+ 56%|█████▌    | 435/774 [01:57<01:21,  4.16it/s][A
+ 56%|█████▋    | 436/774 [01:57<01:23,  4.05it/s][A
+ 56%|█████▋    | 437/774 [01:57<01:19,  4.21it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:15,  4.44it/s][A
+ 57%|█████▋    | 439/774 [01:58<01:19,  4.24it/s][A
+ 57%|█████▋    | 440/774 [01:58<01:23,  4.02it/s][A
+ 57%|█████▋    | 441/774 [01:58<01:27,  3.81it/s][A
+ 57%|█████▋    | 442/774 [01:59<01:28,  3.74it/s][A
+ 57%|█████▋    | 443/774 [01:59<01:26,  3.82it/s][A
+ 57%|█████▋    | 444/774 [01:59<01:25,  3.88it/s][A
+ 57%|█████▋    | 445/774 [01:59<01:25,  3.86it/s][A
+ 58%|█████▊    | 446/774 [02:00<01:22,  3.96it/s][A
+ 58%|█████▊    | 447/774 [02:00<01:20,  4.04it/s][A
+ 58%|█████▊    | 448/774 [02:00<01:13,  4.41it/s][A
+ 58%|█████▊    | 449/774 [02:00<01:15,  4.28it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:18,  4.12it/s][A
+ 58%|█████▊    | 451/774 [02:01<01:16,  4.24it/s][A
+ 58%|█████▊    | 452/774 [02:01<01:12,  4.41it/s][A
+ 59%|█████▊    | 453/774 [02:01<01:11,  4.47it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:17,  4.13it/s][A
+ 59%|█████▉    | 455/774 [02:02<01:21,  3.90it/s][A
+ 59%|█████▉    | 456/774 [02:02<01:25,  3.71it/s][A
+ 59%|█████▉    | 457/774 [02:02<01:19,  3.99it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:19,  4.00it/s][A
+ 59%|█████▉    | 459/774 [02:03<01:17,  4.06it/s][A
+ 59%|█████▉    | 460/774 [02:03<01:22,  3.79it/s][A
+ 60%|█████▉    | 461/774 [02:03<01:29,  3.51it/s][A
+ 60%|█████▉    | 462/774 [02:04<01:26,  3.61it/s][A
+ 60%|█████▉    | 463/774 [02:04<01:23,  3.71it/s][A
+ 60%|█████▉    | 464/774 [02:04<01:23,  3.70it/s][A
+ 60%|██████    | 465/774 [02:04<01:15,  4.09it/s][A
+ 60%|██████    | 466/774 [02:05<01:11,  4.29it/s][A
+ 60%|██████    | 467/774 [02:05<01:08,  4.50it/s][A
+ 60%|██████    | 468/774 [02:05<01:08,  4.45it/s][A
+ 61%|██████    | 469/774 [02:05<01:02,  4.85it/s][A
+ 61%|██████    | 470/774 [02:05<01:00,  5.03it/s][A
+ 61%|██████    | 471/774 [02:06<01:02,  4.84it/s][A
+ 61%|██████    | 472/774 [02:06<01:07,  4.49it/s][A
+ 61%|██████    | 473/774 [02:06<01:10,  4.29it/s][A
+ 61%|██████    | 474/774 [02:06<01:08,  4.37it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:09,  4.28it/s][A
+ 61%|██████▏   | 476/774 [02:07<01:17,  3.84it/s][A
+ 62%|██████▏   | 477/774 [02:07<01:32,  3.23it/s][A
+ 62%|██████▏   | 478/774 [02:08<01:33,  3.18it/s][A
+ 62%|██████▏   | 479/774 [02:08<01:30,  3.25it/s][A
+ 62%|██████▏   | 480/774 [02:08<01:27,  3.35it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:28,  3.30it/s][A
+ 62%|██████▏   | 482/774 [02:09<01:26,  3.36it/s][A
+ 62%|██████▏   | 483/774 [02:09<01:24,  3.45it/s][A
+ 63%|██████▎   | 484/774 [02:09<01:25,  3.40it/s][A
+ 63%|██████▎   | 485/774 [02:10<01:28,  3.28it/s][A
+ 63%|██████▎   | 486/774 [02:10<01:24,  3.40it/s][A
+ 63%|██████▎   | 487/774 [02:10<01:26,  3.32it/s][A
+ 63%|██████▎   | 488/774 [02:11<01:23,  3.42it/s][A
+ 63%|██████▎   | 489/774 [02:11<01:18,  3.63it/s][A
+ 63%|██████▎   | 490/774 [02:11<01:18,  3.62it/s][A
+ 63%|██████▎   | 491/774 [02:11<01:17,  3.66it/s][A
+ 64%|██████▎   | 492/774 [02:12<01:18,  3.61it/s][A
+ 64%|██████▎   | 493/774 [02:12<01:19,  3.56it/s][A
+ 64%|██████▍   | 494/774 [02:12<01:18,  3.59it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:18,  3.58it/s][A
+ 64%|██████▍   | 496/774 [02:13<01:22,  3.37it/s][A
+ 64%|██████▍   | 497/774 [02:13<01:23,  3.31it/s][A
+ 64%|██████▍   | 498/774 [02:13<01:22,  3.35it/s][A
+ 64%|██████▍   | 499/774 [02:14<01:20,  3.42it/s][A
+ 65%|██████▍   | 500/774 [02:14<01:17,  3.54it/s][A
+ 65%|██████▍   | 501/774 [02:14<01:14,  3.66it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:14,  3.67it/s][A
+ 65%|██████▍   | 503/774 [02:15<01:20,  3.37it/s][A
+ 65%|██████▌   | 504/774 [02:15<01:21,  3.30it/s][A
+ 65%|██████▌   | 505/774 [02:15<01:18,  3.41it/s][A
+ 65%|██████▌   | 506/774 [02:16<01:19,  3.38it/s][A
+ 66%|██████▌   | 507/774 [02:16<01:23,  3.19it/s][A
+ 66%|██████▌   | 508/774 [02:16<01:21,  3.27it/s][A
+ 66%|██████▌   | 509/774 [02:17<01:19,  3.32it/s][A
+ 66%|██████▌   | 510/774 [02:17<01:17,  3.40it/s][A
+ 66%|██████▌   | 511/774 [02:17<01:13,  3.58it/s][A
+ 66%|██████▌   | 512/774 [02:17<01:11,  3.68it/s][A
+ 66%|██████▋   | 513/774 [02:18<01:14,  3.52it/s][A
+ 66%|██████▋   | 514/774 [02:18<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:18<01:22,  3.15it/s][A
+ 67%|██████▋   | 516/774 [02:19<01:16,  3.38it/s][A
+ 67%|███��██▋   | 517/774 [02:19<01:10,  3.66it/s][A
+ 67%|██████▋   | 518/774 [02:19<01:07,  3.78it/s][A
+ 67%|██████▋   | 519/774 [02:19<01:10,  3.62it/s][A
+ 67%|██████▋   | 520/774 [02:20<01:09,  3.64it/s][A
+ 67%|██████▋   | 521/774 [02:20<01:07,  3.76it/s][A
+ 67%|██████▋   | 522/774 [02:20<01:04,  3.93it/s][A
+ 68%|██████▊   | 523/774 [02:20<01:02,  4.03it/s][A
+ 68%|██████▊   | 524/774 [02:21<01:06,  3.77it/s][A
+ 68%|██████▊   | 525/774 [02:21<01:07,  3.71it/s][A
+ 68%|██████▊   | 526/774 [02:21<01:09,  3.55it/s][A
+ 68%|██████▊   | 527/774 [02:22<01:11,  3.46it/s][A
+ 68%|██████▊   | 528/774 [02:22<01:10,  3.48it/s][A
+ 68%|██████▊   | 529/774 [02:22<01:06,  3.67it/s][A
+ 68%|██████▊   | 530/774 [02:22<01:05,  3.72it/s][A
+ 69%|██████▊   | 531/774 [02:23<01:05,  3.73it/s][A
+ 69%|██████▊   | 532/774 [02:23<01:03,  3.82it/s][A
+ 69%|██████▉   | 533/774 [02:23<00:59,  4.02it/s][A
+ 69%|██████▉   | 534/774 [02:23<00:56,  4.24it/s][A
+ 69%|██████▉   | 535/774 [02:24<00:59,  4.04it/s][A
+ 69%|██████▉   | 536/774 [02:24<01:01,  3.87it/s][A
+ 69%|██████▉   | 537/774 [02:24<01:02,  3.81it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.58it/s][A
+ 70%|██████▉   | 539/774 [02:25<01:05,  3.60it/s][A
+ 70%|██████▉   | 540/774 [02:25<01:04,  3.62it/s][A
+ 70%|██████▉   | 541/774 [02:25<01:02,  3.73it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.72it/s][A
+ 70%|███████   | 543/774 [02:26<01:03,  3.63it/s][A
+ 70%|███████   | 544/774 [02:26<01:03,  3.62it/s][A
+ 70%|███████   | 545/774 [02:26<01:01,  3.74it/s][A
+ 71%|███████   | 546/774 [02:27<00:57,  3.94it/s][A
+ 71%|███████   | 547/774 [02:27<00:55,  4.11it/s][A
+ 71%|███████   | 548/774 [02:27<00:54,  4.16it/s][A
+ 71%|███████   | 549/774 [02:27<00:55,  4.08it/s][A
+ 71%|███████   | 550/774 [02:28<00:58,  3.83it/s][A
+ 71%|███████   | 551/774 [02:28<01:01,  3.65it/s][A
+ 71%|███████▏  | 552/774 [02:28<01:04,  3.46it/s][A
+ 71%|███████▏  | 553/774 [02:29<01:08,  3.24it/s][A
+ 72%|███████▏  | 554/774 [02:29<01:07,  3.27it/s][A
+ 72%|███████▏  | 555/774 [02:29<01:06,  3.31it/s][A
+ 72%|███████▏  | 556/774 [02:29<01:02,  3.47it/s][A
+ 72%|███████▏  | 557/774 [02:30<01:06,  3.26it/s][A
+ 72%|███████▏  | 558/774 [02:30<01:00,  3.54it/s][A
+ 72%|███████▏  | 559/774 [02:30<00:56,  3.82it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.52it/s][A
+ 72%|███████▏  | 561/774 [02:31<00:57,  3.70it/s][A
+ 73%|███████▎  | 562/774 [02:31<00:52,  4.00it/s][A
+ 73%|███████▎  | 563/774 [02:31<00:50,  4.16it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:52,  4.03it/s][A
+ 73%|███████▎  | 565/774 [02:32<00:54,  3.87it/s][A
+ 73%|███████▎  | 566/774 [02:32<00:49,  4.16it/s][A
+ 73%|███████▎  | 567/774 [02:32<00:46,  4.50it/s][A
+ 73%|███████▎  | 568/774 [02:32<00:47,  4.31it/s][A
+ 74%|███████▎  | 569/774 [02:33<00:48,  4.25it/s][A
+ 74%|███████▎  | 570/774 [02:33<00:48,  4.23it/s][A
+ 74%|███████▍  | 571/774 [02:33<00:52,  3.88it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.72it/s][A
+ 74%|███████▍  | 573/774 [02:34<00:53,  3.73it/s][A
+ 74%|███████▍  | 574/774 [02:34<00:52,  3.84it/s][A
+ 74%|███████▍  | 575/774 [02:34<00:51,  3.83it/s][A
+ 74%|███████▍  | 576/774 [02:35<00:57,  3.46it/s][A
+ 75%|███████▍  | 577/774 [02:35<00:55,  3.56it/s][A
+ 75%|███████▍  | 578/774 [02:35<00:54,  3.63it/s][A
+ 75%|███████▍  | 579/774 [02:35<00:56,  3.47it/s][A
+ 75%|███████▍  | 580/774 [02:36<00:55,  3.49it/s][A
+ 75%|███████▌  | 581/774 [02:36<00:54,  3.52it/s][A
+ 75%|███████▌  | 582/774 [02:36<00:52,  3.63it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:50,  3.76it/s][A
+ 75%|███████▌  | 584/774 [02:37<00:50,  3.79it/s][A
+ 76%|███████▌  | 585/774 [02:37<00:52,  3.62it/s][A
+ 76%|███████▌  | 586/774 [02:37<00:52,  3.60it/s][A
+ 76%|███████▌  | 587/774 [02:38<00:51,  3.66it/s][A
+ 76%|███████▌  | 588/774 [02:38<00:49,  3.73it/s][A
+ 76%|███████▌  | 589/774 [02:38<00:48,  3.78it/s][A
+ 76%|███████▌  | 590/774 [02:38<00:45,  4.04it/s][A
+ 76%|███████▋  | 591/774 [02:39<00:46,  3.92it/s][A
+ 76%|███████▋  | 592/774 [02:39<00:49,  3.67it/s][A
+ 77%|███████▋  | 593/774 [02:39<00:50,  3.61it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.59it/s][A
+ 77%|███████▋  | 595/774 [02:40<00:54,  3.31it/s][A
+ 77%|███████▋  | 596/774 [02:40<00:56,  3.14it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.12it/s][A
+ 77%|███████▋  | 598/774 [02:41<00:58,  3.02it/s][A
+ 77%|███████▋  | 599/774 [02:41<00:58,  2.99it/s][A
+ 78%|███████▊  | 600/774 [02:42<00:58,  2.99it/s][A
+ 78%|███████▊  | 601/774 [02:42<00:58,  2.96it/s][A
+ 78%|███████▊  | 602/774 [02:42<00:58,  2.94it/s][A
+ 78%|███████▊  | 603/774 [02:43<00:57,  2.99it/s][A
+ 78%|███████▊  | 604/774 [02:43<00:57,  2.94it/s][A
+ 78%|███████▊  | 605/774 [02:43<00:56,  3.02it/s][A
+ 78%|███████▊  | 606/774 [02:44<00:57,  2.92it/s][A
+ 78%|███████▊  | 607/774 [02:44<00:56,  2.94it/s][A
+ 79%|███████▊  | 608/774 [02:44<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:45<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:45<00:55,  2.98it/s][A
+ 79%|███████▉  | 611/774 [02:45<00:59,  2.74it/s][A
+ 79%|███████▉  | 612/774 [02:46<01:01,  2.62it/s][A
+ 79%|███████▉  | 613/774 [02:46<00:57,  2.82it/s][A
+ 79%|███████▉  | 614/774 [02:46<00:55,  2.88it/s][A
+ 79%|███████▉  | 615/774 [02:47<00:52,  3.03it/s][A
+ 80%|███████▉  | 616/774 [02:47<00:51,  3.06it/s][A
+ 80%|███████▉  | 617/774 [02:47<00:50,  3.08it/s][A
+ 80%|███████▉  | 618/774 [02:48<00:48,  3.23it/s][A
+ 80%|███████▉  | 619/774 [02:48<00:45,  3.40it/s][A
+ 80%|████████  | 620/774 [02:48<00:44,  3.44it/s][A
+ 80%|████████  | 621/774 [02:48<00:41,  3.71it/s][A
+ 80%|████████  | 622/774 [02:49<00:38,  3.97it/s][A
+ 80%|████████  | 623/774 [02:49<00:38,  3.93it/s][A
+ 81%|████████  | 624/774 [02:49<00:41,  3.63it/s][A
+ 81%|████████  | 625/774 [02:49<00:41,  3.56it/s][A
+ 81%|████████  | 626/774 [02:50<00:45,  3.29it/s][A
+ 81%|████████  | 627/774 [02:50<00:46,  3.17it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.18it/s][A
+ 81%|████████▏ | 629/774 [02:51<00:44,  3.29it/s][A
+ 81%|████████▏ | 630/774 [02:51<00:41,  3.51it/s][A
+ 82%|████████▏ | 631/774 [02:51<00:38,  3.70it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:37,  3.74it/s][A
+ 82%|████████▏ | 633/774 [02:52<00:39,  3.55it/s][A
+ 82%|████████▏ | 634/774 [02:52<00:40,  3.45it/s][A
+ 82%|████████▏ | 635/774 [02:52<00:39,  3.50it/s][A
+ 82%|████████▏ | 636/774 [02:53<00:39,  3.46it/s][A
+ 82%|████████▏ | 637/774 [02:53<00:39,  3.51it/s][A
+ 82%|████████▏ | 638/774 [02:53<00:39,  3.48it/s][A
+ 83%|████████▎ | 639/774 [02:54<00:43,  3.09it/s][A
+ 83%|████████▎ | 640/774 [02:54<00:50,  2.68it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.70it/s][A
+ 83%|████████▎ | 642/774 [02:55<00:45,  2.87it/s][A
+ 83%|████████▎ | 643/774 [02:55<00:45,  2.89it/s][A
+ 83%|████████▎ | 644/774 [02:55<00:41,  3.11it/s][A
+ 83%|████████▎ | 645/774 [02:56<00:37,  3.40it/s][A
+ 83%|████████▎ | 646/774 [02:56<00:35,  3.63it/s][A
+ 84%|████████▎ | 647/774 [02:56<00:32,  3.90it/s][A
+ 84%|████████▎ | 648/774 [02:56<00:30,  4.08it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.10it/s][A
+ 84%|████████▍ | 650/774 [02:57<00:28,  4.32it/s][A
+ 84%|████████▍ | 651/774 [02:57<00:28,  4.25it/s][A
+ 84%|████████▍ | 652/774 [02:57<00:29,  4.12it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.87it/s][A
+ 84%|████████▍ | 654/774 [02:58<00:29,  4.08it/s][A
+ 85%|████████▍ | 655/774 [02:58<00:27,  4.39it/s][A
+ 85%|████████▍ | 656/774 [02:58<00:27,  4.22it/s][A
+ 85%|████████▍ | 657/774 [02:58<00:26,  4.41it/s][A
+ 85%|████████▌ | 658/774 [02:59<00:27,  4.25it/s][A
+ 85%|████████▌ | 659/774 [02:59<00:29,  3.89it/s][A
+ 85%|████████▌ | 660/774 [02:59<00:30,  3.74it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.68it/s][A
+ 86%|████████▌ | 662/774 [03:00<00:28,  3.88it/s][A
+ 86%|████████▌ | 663/774 [03:00<00:30,  3.66it/s][A
+ 86%|████████▌ | 664/774 [03:00<00:30,  3.62it/s][A
+ 86%|████████▌ | 665/774 [03:01<00:27,  3.91it/s][A
+ 86%|████████▌ | 666/774 [03:01<00:25,  4.32it/s][A
+ 86%|████████▌ | 667/774 [03:01<00:23,  4.62it/s][A
+ 86%|████████▋ | 668/774 [03:01<00:23,  4.44it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.17it/s][A
+ 87%|████████▋ | 670/774 [03:02<00:24,  4.28it/s][A
+ 87%|████████▋ | 671/774 [03:02<00:26,  3.90it/s][A
+ 87%|████████▋ | 672/774 [03:02<00:25,  4.02it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:24,  4.09it/s][A
+ 87%|████████▋ | 674/774 [03:03<00:24,  4.03it/s][A
+ 87%|████████▋ | 675/774 [03:03<00:23,  4.18it/s][A
+ 87%|████████▋ | 676/774 [03:03<00:22,  4.34it/s][A
+ 87%|████████▋ | 677/774 [03:03<00:22,  4.39it/s][A
+ 88%|████████▊ | 678/774 [03:04<00:21,  4.44it/s][A
+ 88%|████████▊ | 679/774 [03:04<00:22,  4.17it/s][A
+ 88%|████████▊ | 680/774 [03:04<00:22,  4.14it/s][A
+ 88%|████████▊ | 681/774 [03:04<00:21,  4.41it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.48it/s][A
+ 88%|████████▊ | 683/774 [03:05<00:22,  4.12it/s][A
+ 88%|████████▊ | 684/774 [03:05<00:23,  3.85it/s][A
+ 89%|████████▊ | 685/774 [03:05<00:24,  3.68it/s][A
+ 89%|████████▊ | 686/774 [03:06<00:23,  3.80it/s][A
+ 89%|████████▉ | 687/774 [03:06<00:21,  4.04it/s][A
+ 89%|████████▉ | 688/774 [03:06<00:21,  4.03it/s][A
+ 89%|████████▉ | 689/774 [03:06<00:20,  4.18it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.29it/s][A
+ 89%|████████▉ | 691/774 [03:07<00:18,  4.40it/s][A
+ 89%|████████▉ | 692/774 [03:07<00:18,  4.50it/s][A
+ 90%|████████▉ | 693/774 [03:07<00:18,  4.49it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.20it/s][A
+ 90%|████████▉ | 695/774 [03:08<00:20,  3.86it/s][A
+ 90%|████████▉ | 696/774 [03:08<00:19,  3.95it/s][A
+ 90%|█████████ | 697/774 [03:08<00:19,  3.96it/s][A
+ 90%|█████████ | 698/774 [03:08<00:17,  4.36it/s][A
+ 90%|█████████ | 699/774 [03:09<00:15,  4.70it/s][A
+ 90%|█████████ | 700/774 [03:09<00:17,  4.31it/s][A
+ 91%|█████████ | 701/774 [03:09<00:16,  4.38it/s][A
+ 91%|█████████ | 702/774 [03:09<00:16,  4.42it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.40it/s][A
+ 91%|█████████ | 704/774 [03:10<00:16,  4.25it/s][A
+ 91%|█████████ | 705/774 [03:10<00:14,  4.61it/s][A
+ 91%|█████████ | 706/774 [03:10<00:14,  4.77it/s][A
+ 91%|█████████▏| 707/774 [03:10<00:14,  4.65it/s][A
+ 91%|█████████▏| 708/774 [03:11<00:13,  4.98it/s][A
+ 92%|█████████▏| 709/774 [03:11<00:13,  4.78it/s][A
+ 92%|█████████▏| 710/774 [03:11<00:13,  4.67it/s][A
+ 92%|█████████▏| 711/774 [03:11<00:12,  4.85it/s][A
+ 92%|█████████▏| 712/774 [03:11<00:12,  5.09it/s][A
+ 92%|█████████▏| 713/774 [03:12<00:12,  4.90it/s][A
+ 92%|█████████▏| 714/774 [03:12<00:13,  4.59it/s][A
+ 92%|█████████▏| 715/774 [03:12<00:12,  4.69it/s][A
+ 93%|█████████▎| 716/774 [03:12<00:11,  5.17it/s][A
+ 93%|█████████▎| 717/774 [03:12<00:10,  5.25it/s][A
+ 93%|█████████▎| 718/774 [03:13<00:11,  4.67it/s][A
+ 93%|█████████▎| 719/774 [03:13<00:11,  4.61it/s][A
+ 93%|█████████▎| 720/774 [03:13<00:11,  4.90it/s][A
+ 93%|█████████▎| 721/774 [03:13<00:10,  5.18it/s][A
+ 93%|█████████▎| 722/774 [03:13<00:09,  5.57it/s][A
+ 93%|█████████▎| 723/774 [03:14<00:09,  5.37it/s][A
+ 94%|█████████▎| 724/774 [03:14<00:09,  5.32it/s][A
+ 94%|█████████▎| 725/774 [03:14<00:08,  5.48it/s][A
+ 94%|█████████▍| 726/774 [03:14<00:08,  5.64it/s][A
+ 94%|█████████▍| 727/774 [03:14<00:08,  5.39it/s][A
+ 94%|█████████▍| 728/774 [03:15<00:09,  4.80it/s][A
+ 94%|█████████▍| 729/774 [03:15<00:08,  5.10it/s][A
+ 94%|█████████▍| 730/774 [03:15<00:08,  5.38it/s][A
+ 94%|█████████▍| 731/774 [03:15<00:08,  5.37it/s][A
+ 95%|█████████▍| 732/774 [03:15<00:07,  5.56it/s][A
+ 95%|█████████▍| 733/774 [03:15<00:07,  5.55it/s][A
+ 95%|█████████▍| 734/774 [03:16<00:07,  5.60it/s][A
+ 95%|█████████▍| 735/774 [03:16<00:06,  5.72it/s][A
+ 95%|█████████▌| 736/774 [03:16<00:06,  5.81it/s][A
+ 95%|█████████▌| 737/774 [03:16<00:06,  5.74it/s][A
+ 95%|█████████▌| 738/774 [03:16<00:06,  5.57it/s][A
+ 95%|█████████▌| 739/774 [03:16<00:06,  5.59it/s][A
+ 96%|█████████▌| 740/774 [03:17<00:06,  5.46it/s][A
+ 96%|█████████▌| 741/774 [03:17<00:06,  5.12it/s][A
+ 96%|█████████▌| 742/774 [03:17<00:06,  5.31it/s][A
+ 96%|█████████▌| 743/774 [03:17<00:05,  5.62it/s][A
+ 96%|█████████▌| 744/774 [03:17<00:05,  5.38it/s][A
+ 96%|█████████▋| 745/774 [03:18<00:06,  4.54it/s][A
+ 96%|█████████▋| 746/774 [03:18<00:07,  3.91it/s][A
+ 97%|█████████▋| 747/774 [03:18<00:06,  4.10it/s][A
+ 97%|█████████▋| 748/774 [03:18<00:06,  4.25it/s][A
+ 97%|█████████▋| 749/774 [03:19<00:05,  4.53it/s][A
+ 97%|█████████▋| 750/774 [03:19<00:05,  4.26it/s][A
+ 97%|█████████▋| 751/774 [03:19<00:05,  4.46it/s][A
+ 97%|█████████▋| 752/774 [03:19<00:04,  4.41it/s][A
+ 97%|█████████▋| 753/774 [03:20<00:04,  4.70it/s][A
+ 97%|█████████▋| 754/774 [03:20<00:03,  5.32it/s][A
+ 98%|█████████▊| 755/774 [03:20<00:03,  5.62it/s][A
+ 98%|█████████▊| 756/774 [03:20<00:03,  5.53it/s][A
+ 98%|█████████▊| 757/774 [03:20<00:03,  5.34it/s][A
+ 98%|█████████▊| 758/774 [03:20<00:03,  5.23it/s][A
+ 98%|█████████▊| 759/774 [03:21<00:02,  5.46it/s][A
+ 98%|█████████▊| 760/774 [03:21<00:02,  5.43it/s][A
+ 98%|█████████▊| 761/774 [03:21<00:02,  5.88it/s][A
+ 98%|█████████▊| 762/774 [03:21<00:02,  5.99it/s][A
+ 99%|█████████▊| 763/774 [03:21<00:01,  6.28it/s][A
+ 99%|█████████▊| 764/774 [03:21<00:01,  6.37it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.25it/s][A
+ 99%|█████████▉| 766/774 [03:22<00:01,  5.33it/s][A
+ 99%|█████████▉| 767/774 [03:22<00:01,  5.48it/s][A
+ 99%|█████████▉| 768/774 [03:22<00:01,  5.41it/s][A
+ 99%|█████████▉| 769/774 [03:22<00:01,  4.98it/s][A
+ 99%|█████████▉| 770/774 [03:23<00:00,  4.90it/s][A
+100%|█████████▉| 771/774 [03:23<00:00,  5.21it/s][A
+100%|█████████▉| 772/774 [03:23<00:00,  4.96it/s][A
+100%|█████████▉| 773/774 [03:23<00:00,  4.82it/s][A                                                      
+                                                 [A 63%|██████▎   | 8000/12776 [1:25:57<50:06,  1.59it/s]
+100%|██████████| 774/774 [03:26<00:00,  4.82it/s][A
+                                                 [ASaving model checkpoint to ./checkpoint-8000
+Configuration saved in ./checkpoint-8000/config.json
+Model weights saved in ./checkpoint-8000/model.safetensors
+Feature extractor saved in ./checkpoint-8000/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-8000/tokenizer_config.json
+Special tokens file saved in ./checkpoint-8000/special_tokens_map.json
+added tokens file saved in ./checkpoint-8000/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-6800] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 63%|██████▎   | 8001/12776 [1:26:03<85:06:21, 64.16s/it]                                                          63%|██████▎   | 8001/12776 [1:26:03<85:06:21, 64.16s/it] 63%|██████▎   | 8002/12776 [1:26:03<59:43:52, 45.04s/it]                                                          63%|██████▎   | 8002/12776 [1:26:03<59:43:52, 45.04s/it] 63%|██████▎   | 8003/12776 [1:26:03<41:58:04, 31.65s/it]                                                          63%|██████▎   | 8003/12776 [1:26:03<41:58:04, 31.65s/it] 63%|██████▎   | 8004/12776 [1:26:04<29:33:26, 22.30s/it]                                                          63%|██████▎   | 8004/12776 [1:26:04<29:33:26, 22.30s/it] 63%|██████▎   | 8005/12776 [1:26:04<20:50:31, 15.73s/it]                                                          63%|██████▎   | 8005/12776 [1:26:04<20:50:31, 15.73s/it] 63%|██████▎   | 8006/12776 [1:26:05<14:47:32, 11.16s/it]                                                          63%|██████▎   | 8006/12776 [1:26:05<14:47:32, 11.16s/it] 63%|██████▎   | 8007/12776 [1:26:05<10:29:29,  7.92s/it]                                                          63%|██████▎   | 8007/12776 [1:26:05<10:29:29,  7.92s/it] 63%|██████▎   | 8008/12776 [1:26:05<7:28:30,  5.64s/it]                                                          63%|██████▎   | 8008/12776 [1:26:05<7:28:30,  5.64s/it] 63%|██████▎   | 8009/12776 [1:26:06<5:24:53,  4.09s/it]                                                         63%|██████▎   | 8009/12776 [1:26:06<5:24:53,  4.09s/it] 63%|██████▎   | 8010/12776 [1:26:06<3:54:56,  2.96s/it]                                                         63%|██████▎   | 8010/12776 [1:26:06<3:54:56,  2.96s/it] 63%|██████▎   | 8011/12776 [1:26:07<2:51:47,  2.16s/it]                                                         63%|██████▎   | 8011/12776 [1:26:07<2:51:47,  2.16s/it] 63%|██████▎   | 8012/12776 [1:26:07<2:07:32,  1.61s/it]                                                         63%|██████▎   | 8012/12776 [1:26:07<2:07:32,  1.61s/it] 63%|██████▎   | 8013/12776 [1:26:07<1:36:58,  1.22s/it]                                                         63%|██████▎   | 8013/12776 [1:26:07<1:36:58,  1.22s/it] 63%|██████▎   | 8014/12776 [1:26:07<1:14:29,  1.07it/s]                                                         63%|██████▎   | 8014/12776 [1:26:07<1:14:29,  1.07it/s] 63%|██████▎   | 8015/12776 [1:26:08<58:36,  1.35it/s]                                                         63%|██████▎   | 8015/12776 [1:26:08<58:36,  1.35it/s] 63%|██████▎   | 8016/12776 [1:26:08<47:19,  1.68it/s]                                                       63%|██████▎   | 8016/12776 [1:26:08<47:19,  1.68it/s] 63%|██████▎   | 8017/12776 [1:26:08<40:33,  1.96it/s]                                                       63%|██████▎   | 8017/12776 [1:26:08<40:33,  1.96it/s] 63%|██████▎   | 8018/12776 [1:26:09<34:16,  2.31it/s]                                                       63%|██████▎   | 8018/12776 [1:26:09<34:16,  2.31it/s] 63%|██████▎   | 8019/12776 [1:26:09<29:43,  2.67it/s]                                                       63%|██████▎   | 8019/12776 [1:26:09<29:43,  2.67it/s] 63%|██████▎   | 8020/12776 [1:26:09<26:18,  3.01it/s]                                                       63%|██████▎   | 8020/12776 [1:26:09<26:18,  3.01it/s] 63%|██████▎   | 8021/12776 [1:26:09<25:24,  3.12it/s]                                                       63%|██████▎   | 8021/12776 [1:26:09<25:24,  3.12it/s] 63%|██████▎   | 8022/12776 [1:26:10<23:16,  3.40it/s]                                                       63%|██████▎   | 8022/12776 [1:26:10<23:16,  3.40it/s] 63%|██████▎   | 8023/12776 [1:26:10<21:31,  3.68it/s]                                                       63%|██████▎   | 8023/12776 [1:26:10<21:31,  3.68it/s] 63%|██████▎   | 8024/12776 [1:26:10<19:59,  3.96it/s]                                                       63%|██████▎   | 8024/12776 [1:26:10<19:59,  3.96it/s] 63%|██████▎   | 8025/12776 [1:26:10<18:44,  4.22it/s]                                                       63%|██████▎   | 8025/12776 [1:26:10<18:44,  4.22it/s] 63%|██████▎   | 8026/12776 [1:26:10<19:39,  4.03it/s]                                                       63%|██████▎   | 8026/12776 [1:26:10<19:39,  4.03it/s] 63%|██████▎   | 8027/12776 [1:26:11<18:19,  4.32it/s]                                                       63%|██████▎   | 8027/12776 [1:26:11<18:19,  4.32it/s] 63%|██████▎   | 8028/12776 [1:26:11<17:18,  4.57it/s]                                                       63%|██████▎   | 8028/12776 [1:26:11<17:18,  4.57it/s] 63%|██████▎   | 8029/12776 [1:26:11<16:29,  4.80it/s]                                                       63%|██████▎   | 8029/12776 [1:26:11<16:29,  4.80it/s] 63%|██████▎   | 8030/12776 [1:26:11<15:51,  4.99it/s]                                                       63%|██████▎   | 8030/12776 [1:26:11<15:51,  4.99it/s] 63%|██████▎   | 8031/12776 [1:26:11<15:19,  5.16it/s]                                                       63%|██████▎   | 8031/12776 [1:26:11<15:19,  5.16it/s] 63%|██████▎   | 8032/12776 [1:26:12<17:51,  4.43it/s]                                                       63%|██████▎   | 8032/12776 [1:26:12<17:51,  4.43it/s] 63%|██████▎   | 8033/12776 [1:26:12<16:37,  4.75it/s]                                                       63%|██████▎   | 8033/12776 [1:26:12<16:37,  4.75it/s] 63%|██████▎   | 8034/12776 [1:26:12<15:44,  5.02it/s]                                                       63%|██████▎   | 8034/12776 [1:26:12<15:44,  5.02it/s] 63%|██████▎   | 8035/12776 [1:26:12<15:05,  5.23it/s]                                                       63%|██████▎   | 8035/12776 [1:26:12<15:05,  5.23it/s] 63%|██████▎   | 8036/12776 [1:26:12<14:26,  5.47it/s]                                                       63%|██████▎   | 8036/12776 [1:26:12<14:26,  5.47it/s] 63%|██████▎   | 8037/12776 [1:26:13<14:00,  5.64it/s]                                                       63%|██████▎   | 8037/12776 [1:26:13<14:00,  5.64it/s] 63%|██████▎   | 8038/12776 [1:26:13<25:04,  3.15it/s]                                                       63%|██████▎   | 8038/12776 [1:26:13<25:04,  3.15it/s] 63%|██████▎   | 8039/12776 [1:26:14<46:34,  1.70it/s]                                                       63%|██████▎   | 8039/12776 [1:26:14<46:34,  1.70it/s] 63%|██████▎   | 8040/12776 [1:26:15<55:39,  1.42it/s]                                                       63%|██████▎   | 8040/12776 [1:26:15<55:39,  1.42it/s] 63%|██████▎   | 8041/12776 [1:26:16<58:31,  1.35it/s]                                                       63%|██████▎   | 8041/12776 [1:26:16<58:31,  1.35it/s] 63%|██████▎   | 8042/12776 [1:26:17<58:39,  1.35it/s]                                                       63%|██████▎   | 8042/12776 [1:26:17<58:39,  1.35it/s] 63%|██████▎   | 8043/12776 [1:26:18<59:20,  1.33it/s]                                                       63%|██████▎   | 8043/12776 [1:26:18<59:20,  1.33it/s] 63%|██████▎   | 8044/12776 [1:26:18<59:12,  1.33it/s]                                                       63%|██████▎   | 8044/12776 [1:26:18<59:12,  1.33it/s] 63%|██████▎   | 8045/12776 [1:26:19<56:24,  1.40it/s]                                                       63%|██████▎   | 8045/12776 [1:26:19<56:24,  1.40it/s] 63%|██████▎   | 8046/12776 [1:26:20<56:57,  1.38it/s]                                                       63%|██████▎   | 8046/12776 [1:26:20<56:57,  1.38it/s] 63%|██████▎   | 8047/12776 [1:26:20<53:34,  1.47it/s]                                                       63%|██████▎   | 8047/12776 [1:26:20<53:34,  1.47it/s] 63%|██████▎   | 8048/12776 [1:26:21<52:14,  1.51it/s]                                                       63%|██████▎   | 8048/12776 [1:26:21<52:14,  1.51it/s] 63%|██████▎   | 8049/12776 [1:26:22<49:01,  1.61it/s]                                                       63%|██████▎   | 8049/12776 [1:26:22<49:01,  1.61it/s] 63%|██████▎   | 8050/12776 [1:26:22<48:07,  1.64it/s]                                                       63%|██████▎   | 8050/12776 [1:26:22<48:07,  1.64it/s] 63%|██████▎   | 8051/12776 [1:26:23<45:13,  1.74it/s]                                                       63%|██████▎   | 8051/12776 [1:26:23<45:13,  1.74it/s] 63%|██████▎   | 8052/12776 [1:26:23<44:47,  1.76it/s]                                                       63%|██████▎   | 8052/12776 [1:26:23<44:47,  1.76it/s] 63%|██████▎   | 8053/12776 [1:26:24<41:43,  1.89it/s]                                                       63%|██████▎   | 8053/12776 [1:26:24<41:43,  1.89it/s] 63%|██████▎   | 8054/12776 [1:26:24<41:47,  1.88it/s]                                                       63%|██████▎   | 8054/12776 [1:26:24<41:47,  1.88it/s] 63%|██████▎   | 8055/12776 [1:26:25<38:35,  2.04it/s]                                                       63%|██████▎   | 8055/12776 [1:26:25<38:35,  2.04it/s] 63%|██████▎   | 8056/12776 [1:26:25<35:50,  2.19it/s]                                                       63%|██████▎   | 8056/12776 [1:26:25<35:50,  2.19it/s] 63%|██████▎   | 8057/12776 [1:26:25<34:09,  2.30it/s]                                                       63%|██████▎   | 8057/12776 [1:26:25<34:09,  2.30it/s] 63%|██████▎   | 8058/12776 [1:26:26<31:58,  2.46it/s]                                                       63%|██████▎   | 8058/12776 [1:26:26<31:58,  2.46it/s] 63%|██████▎   | 8059/12776 [1:26:26<30:09,  2.61it/s]                                                       63%|██████▎   | 8059/12776 [1:26:26<30:09,  2.61it/s] 63%|██████▎   | 8060/12776 [1:26:26<28:44,  2.74it/s]                                                       63%|██████▎   | 8060/12776 [1:26:26<28:44,  2.74it/s] 63%|██████▎   | 8061/12776 [1:26:27<28:33,  2.75it/s]                                                       63%|██████▎   | 8061/12776 [1:26:27<28:33,  2.75it/s] 63%|██████▎   | 8062/12776 [1:26:27<27:06,  2.90it/s]                                                       63%|██████▎   | 8062/12776 [1:26:27<27:06,  2.90it/s] 63%|██████▎   | 8063/12776 [1:26:27<25:36,  3.07it/s]                                                       63%|██████▎   | 8063/12776 [1:26:27<25:36,  3.07it/s] 63%|██████▎   | 8064/12776 [1:26:28<26:08,  3.00it/s]                                                       63%|██████▎   | 8064/12776 [1:26:28<26:08,  3.00it/s] 63%|██████▎   | 8065/12776 [1:26:28<24:38,  3.19it/s]                                                       63%|██████▎   | 8065/12776 [1:26:28<24:38,  3.19it/s] 63%|██████▎   | 8066/12776 [1:26:28<23:28,  3.34it/s]                                                       63%|██████▎   | 8066/12776 [1:26:28<23:28,  3.34it/s] 63%|██████▎   | 8067/12776 [1:26:28<22:29,  3.49it/s]                                                       63%|██████▎   | 8067/12776 [1:26:28<22:29,  3.49it/s] 63%|██████▎   | 8068/12776 [1:26:29<23:51,  3.29it/s]                                                       63%|██████▎   | 8068/12776 [1:26:29<23:51,  3.29it/s] 63%|██████▎   | 8069/12776 [1:26:29<23:01,  3.41it/s]                                                       63%|██████▎   | 8069/12776 [1:26:29<23:01,  3.41it/s] 63%|██████▎   | 8070/12776 [1:26:29<22:00,  3.56it/s]                                                       63%|██████▎   | 8070/12776 [1:26:29<22:00,  3.56it/s] 63%|██████▎   | 8071/12776 [1:26:30<21:20,  3.67it/s]                                                       63%|██████▎   | 8071/12776 [1:26:30<21:20,  3.67it/s] 63%|██████▎   | 8072/12776 [1:26:30<23:01,  3.41it/s]                                                       63%|██████▎   | 8072/12776 [1:26:30<23:01,  3.41it/s] 63%|██████▎   | 8073/12776 [1:26:30<21:56,  3.57it/s]                                                       63%|██████▎   | 8073/12776 [1:26:30<21:56,  3.57it/s] 63%|██████▎   | 8074/12776 [1:26:30<20:58,  3.74it/s]                                                       63%|██████▎   | 8074/12776 [1:26:30<20:58,  3.74it/s] 63%|██████▎   | 8075/12776 [1:26:31<20:08,  3.89it/s]                                                       63%|██████▎   | 8075/12776 [1:26:31<20:08,  3.89it/s] 63%|██████▎   | 8076/12776 [1:26:31<19:25,  4.03it/s]                                                      {'eval_loss': 0.5047087669372559, 'eval_wer': 0.3239156767324734, 'eval_runtime': 206.5621, 'eval_samples_per_second': 59.948, 'eval_steps_per_second': 3.747, 'epoch': 1.25}
+{'loss': 0.4006, 'grad_norm': 0.8334164023399353, 'learning_rate': 0.00011695992179863146, 'epoch': 1.25}
+{'loss': 0.3454, 'grad_norm': 0.8113241195678711, 'learning_rate': 0.00011693548387096773, 'epoch': 1.25}
+{'loss': 0.2453, 'grad_norm': 0.6484245657920837, 'learning_rate': 0.000116911045943304, 'epoch': 1.25}
+{'loss': 0.2726, 'grad_norm': 0.7791106700897217, 'learning_rate': 0.00011688660801564027, 'epoch': 1.25}
+{'loss': 0.4168, 'grad_norm': 1.1766725778579712, 'learning_rate': 0.00011686217008797652, 'epoch': 1.25}
+{'loss': 0.3907, 'grad_norm': 1.342408537864685, 'learning_rate': 0.0001168377321603128, 'epoch': 1.25}
+{'loss': 0.3619, 'grad_norm': 1.233473300933838, 'learning_rate': 0.00011681329423264906, 'epoch': 1.25}
+{'loss': 0.4103, 'grad_norm': 1.4168598651885986, 'learning_rate': 0.00011678885630498533, 'epoch': 1.25}
+{'loss': 0.433, 'grad_norm': 1.5982956886291504, 'learning_rate': 0.0001167644183773216, 'epoch': 1.25}
+{'loss': 0.4682, 'grad_norm': 1.1311919689178467, 'learning_rate': 0.00011673998044965786, 'epoch': 1.25}
+{'loss': 0.3652, 'grad_norm': 1.4813308715820312, 'learning_rate': 0.00011671554252199412, 'epoch': 1.25}
+{'loss': 0.5578, 'grad_norm': 1.0279951095581055, 'learning_rate': 0.0001166911045943304, 'epoch': 1.25}
+{'loss': 0.3986, 'grad_norm': 1.014884352684021, 'learning_rate': 0.00011666666666666665, 'epoch': 1.25}
+{'loss': 0.5565, 'grad_norm': 2.5004680156707764, 'learning_rate': 0.00011664222873900292, 'epoch': 1.25}
+{'loss': 0.5358, 'grad_norm': 1.6078928709030151, 'learning_rate': 0.0001166177908113392, 'epoch': 1.25}
+{'loss': 0.2927, 'grad_norm': 1.8332239389419556, 'learning_rate': 0.00011659335288367546, 'epoch': 1.25}
+{'loss': 0.5876, 'grad_norm': 1.8943898677825928, 'learning_rate': 0.00011656891495601171, 'epoch': 1.26}
+{'loss': 0.4997, 'grad_norm': 1.8188652992248535, 'learning_rate': 0.00011654447702834799, 'epoch': 1.26}
+{'loss': 0.3398, 'grad_norm': 2.3401331901550293, 'learning_rate': 0.00011652003910068425, 'epoch': 1.26}
+{'loss': 0.7564, 'grad_norm': 1.9932831525802612, 'learning_rate': 0.00011649560117302052, 'epoch': 1.26}
+{'loss': 0.6872, 'grad_norm': 2.9455413818359375, 'learning_rate': 0.0001164711632453568, 'epoch': 1.26}
+{'loss': 0.6771, 'grad_norm': 1.4435310363769531, 'learning_rate': 0.00011644672531769305, 'epoch': 1.26}
+{'loss': 0.7648, 'grad_norm': 3.489488363265991, 'learning_rate': 0.00011642228739002931, 'epoch': 1.26}
+{'loss': 0.4852, 'grad_norm': 1.4743152856826782, 'learning_rate': 0.00011639784946236559, 'epoch': 1.26}
+{'loss': 0.6416, 'grad_norm': 3.376526355743408, 'learning_rate': 0.00011637341153470184, 'epoch': 1.26}
+{'loss': 1.1278, 'grad_norm': 1.8703612089157104, 'learning_rate': 0.00011634897360703811, 'epoch': 1.26}
+{'loss': 1.1146, 'grad_norm': 2.682764768600464, 'learning_rate': 0.00011632453567937439, 'epoch': 1.26}
+{'loss': 1.0425, 'grad_norm': 2.28507399559021, 'learning_rate': 0.00011630009775171065, 'epoch': 1.26}
+{'loss': 0.8179, 'grad_norm': 4.47053861618042, 'learning_rate': 0.0001162756598240469, 'epoch': 1.26}
+{'loss': 1.1578, 'grad_norm': 3.5144686698913574, 'learning_rate': 0.00011625122189638318, 'epoch': 1.26}
+{'loss': 1.1386, 'grad_norm': 3.6762499809265137, 'learning_rate': 0.00011622678396871945, 'epoch': 1.26}
+{'loss': 0.9114, 'grad_norm': 2.404191017150879, 'learning_rate': 0.00011620234604105571, 'epoch': 1.26}
+{'loss': 0.5878, 'grad_norm': 2.5697121620178223, 'learning_rate': 0.00011617790811339199, 'epoch': 1.26}
+{'loss': 0.6517, 'grad_norm': 2.2942793369293213, 'learning_rate': 0.00011615347018572824, 'epoch': 1.26}
+{'loss': 0.5267, 'grad_norm': 1.6695375442504883, 'learning_rate': 0.0001161290322580645, 'epoch': 1.26}
+{'loss': 0.5404, 'grad_norm': 1.7396913766860962, 'learning_rate': 0.00011610459433040078, 'epoch': 1.26}
+{'loss': 0.7826, 'grad_norm': 2.4069020748138428, 'learning_rate': 0.00011608015640273703, 'epoch': 1.26}
+{'loss': 0.7537, 'grad_norm': 2.687602996826172, 'learning_rate': 0.0001160557184750733, 'epoch': 1.26}
+{'loss': 0.3249, 'grad_norm': 0.9946150779724121, 'learning_rate': 0.00011603128054740958, 'epoch': 1.26}
+{'loss': 0.364, 'grad_norm': 0.7107114791870117, 'learning_rate': 0.00011600684261974584, 'epoch': 1.26}
+{'loss': 0.247, 'grad_norm': 1.1093772649765015, 'learning_rate': 0.0001159824046920821, 'epoch': 1.26}
+{'loss': 0.209, 'grad_norm': 0.6720324754714966, 'learning_rate': 0.00011595796676441837, 'epoch': 1.26}
+{'loss': 0.2063, 'grad_norm': 0.6645235419273376, 'learning_rate': 0.00011593352883675464, 'epoch': 1.26}
+{'loss': 0.2311, 'grad_norm': 0.49136972427368164, 'learning_rate': 0.0001159090909090909, 'epoch': 1.26}
+{'loss': 0.2226, 'grad_norm': 1.026901364326477, 'learning_rate': 0.00011588465298142717, 'epoch': 1.26}
+{'loss': 0.2241, 'grad_norm': 0.6816409826278687, 'learning_rate': 0.00011586021505376343, 'epoch': 1.26}
+{'loss': 0.2395, 'grad_norm': 0.4918442666530609, 'learning_rate': 0.0001158357771260997, 'epoch': 1.26}
+{'loss': 0.2683, 'grad_norm': 0.6494612097740173, 'learning_rate': 0.00011581133919843597, 'epoch': 1.26}
+{'loss': 0.363, 'grad_norm': 0.875194787979126, 'learning_rate': 0.00011578690127077223, 'epoch': 1.26}
+{'loss': 0.2729, 'grad_norm': 0.8091587424278259, 'learning_rate': 0.00011576246334310849, 'epoch': 1.26}
+{'loss': 0.2604, 'grad_norm': 0.5551338791847229, 'learning_rate': 0.00011573802541544477, 'epoch': 1.26}
+{'loss': 0.2483, 'grad_norm': 0.5007133483886719, 'learning_rate': 0.00011571358748778103, 'epoch': 1.26}
+{'loss': 0.175, 'grad_norm': 0.773539662361145, 'learning_rate': 0.00011568914956011728, 'epoch': 1.26}
+{'loss': 0.6749, 'grad_norm': 1.5341578722000122, 'learning_rate': 0.00011566471163245356, 'epoch': 1.26}
+{'loss': 0.2875, 'grad_norm': 0.6845079660415649, 'learning_rate': 0.00011564027370478983, 'epoch': 1.26}
+{'loss': 0.2767, 'grad_norm': 1.1050750017166138, 'learning_rate': 0.00011561583577712609, 'epoch': 1.26}
+{'loss': 0.3132, 'grad_norm': 0.9473204612731934, 'learning_rate': 0.00011559139784946234, 'epoch': 1.26}
+{'loss': 0.4922, 'grad_norm': 2.2619035243988037, 'learning_rate': 0.00011556695992179862, 'epoch': 1.26}
+{'loss': 0.9514, 'grad_norm': 2.7712318897247314, 'learning_rate': 0.00011554252199413489, 'epoch': 1.26}
+{'loss': 0.6567, 'grad_norm': 1.5762767791748047, 'learning_rate': 0.00011551808406647114, 'epoch': 1.26}
+{'loss': 0.3136, 'grad_norm': 0.8980954885482788, 'learning_rate': 0.00011549364613880742, 'epoch': 1.26}
+{'loss': 0.8192, 'grad_norm': 5.263506889343262, 'learning_rate': 0.00011546920821114368, 'epoch': 1.26}
+{'loss': 0.3902, 'grad_norm': 1.3415788412094116, 'learning_rate': 0.00011544477028347995, 'epoch': 1.26}
+{'loss': 0.7186, 'grad_norm': 1.4701817035675049, 'learning_rate': 0.00011542033235581622, 'epoch': 1.26}
+{'loss': 0.5271, 'grad_norm': 1.1412750482559204, 'learning_rate': 0.00011539589442815248, 'epoch': 1.26}
+{'loss': 0.7291, 'grad_norm': 1.3679181337356567, 'learning_rate': 0.00011537145650048874, 'epoch': 1.26}
+{'loss': 0.9745, 'grad_norm': 2.3653368949890137, 'learning_rate': 0.00011534701857282502, 'epoch': 1.26}
+{'loss': 0.8932, 'grad_norm': 3.5362980365753174, 'learning_rate': 0.00011532258064516128, 'epoch': 1.26}
+{'loss': 0.5735, 'grad_norm': 1.438018560409546, 'learning_rate': 0.00011529814271749753, 'epoch': 1.26}
+{'loss': 0.4908, 'grad_norm': 1.334162950515747, 'learning_rate': 0.00011527370478983381, 'epoch': 1.26}
+{'loss': 0.3825, 'grad_norm': 1.1316155195236206, 'learning_rate': 0.00011524926686217008, 'epoch': 1.26}
+{'loss': 1.0631, 'grad_norm': 3.4603209495544434, 'learning_rate': 0.00011522482893450633, 'epoch': 1.26}
+{'loss': 0.6785, 'grad_norm': 3.4349160194396973, 'learning_rate': 0.00011520039100684261, 'epoch': 1.26}
+{'loss': 0.6231, 'grad_norm': 2.098545789718628, 'learning_rate': 0.00011517595307917887, 'epoch': 1.26}
+{'loss': 0.4584, 'grad_norm': 1.599894642829895, 'learning_rate': 0.00011515151515151514, 'epoch': 1.26}
+{'loss': 0.7141, 'grad_norm': 2.5117294788360596, 'learning_rate': 0.00011512707722385142, 'epoch': 1.26}
+ 63%|██████▎   | 8076/12776 [1:26:31<19:25,  4.03it/s] 63%|██████▎   | 8077/12776 [1:26:31<20:59,  3.73it/s]                                                       63%|██████▎   | 8077/12776 [1:26:31<20:59,  3.73it/s] 63%|██████▎   | 8078/12776 [1:26:31<19:30,  4.01it/s]                                                       63%|██████▎   | 8078/12776 [1:26:31<19:30,  4.01it/s] 63%|██████▎   | 8079/12776 [1:26:32<17:59,  4.35it/s]                                                       63%|██████▎   | 8079/12776 [1:26:32<17:59,  4.35it/s] 63%|██████▎   | 8080/12776 [1:26:32<16:59,  4.61it/s]                                                       63%|██████▎   | 8080/12776 [1:26:32<16:59,  4.61it/s] 63%|██████▎   | 8081/12776 [1:26:32<16:03,  4.87it/s]                                                       63%|██████▎   | 8081/12776 [1:26:32<16:03,  4.87it/s] 63%|██████▎   | 8082/12776 [1:26:32<17:31,  4.46it/s]                                                       63%|██████▎   | 8082/12776 [1:26:32<17:31,  4.46it/s] 63%|██████▎   | 8083/12776 [1:26:32<16:19,  4.79it/s]                                                       63%|██████▎   | 8083/12776 [1:26:32<16:19,  4.79it/s] 63%|██████▎   | 8084/12776 [1:26:33<15:26,  5.06it/s]                                                       63%|██████▎   | 8084/12776 [1:26:33<15:26,  5.06it/s] 63%|██████▎   | 8085/12776 [1:26:33<14:46,  5.29it/s]                                                       63%|██████▎   | 8085/12776 [1:26:33<14:46,  5.29it/s] 63%|██████▎   | 8086/12776 [1:26:33<14:12,  5.50it/s]                                                       63%|██████▎   | 8086/12776 [1:26:33<14:12,  5.50it/s] 63%|██████▎   | 8087/12776 [1:26:33<13:41,  5.71it/s]                                                       63%|██████▎   | 8087/12776 [1:26:33<13:41,  5.71it/s] 63%|██████▎   | 8088/12776 [1:26:34<25:11,  3.10it/s]                                                       63%|██████▎   | 8088/12776 [1:26:34<25:11,  3.10it/s] 63%|██████▎   | 8089/12776 [1:26:35<47:49,  1.63it/s]                                                       63%|██████▎   | 8089/12776 [1:26:35<47:49,  1.63it/s] 63%|██████▎   | 8090/12776 [1:26:36<57:56,  1.35it/s]                                                       63%|██████▎   | 8090/12776 [1:26:36<57:56,  1.35it/s] 63%|██████▎   | 8091/12776 [1:26:37<59:47,  1.31it/s]                                                       63%|██████▎   | 8091/12776 [1:26:37<59:47,  1.31it/s] 63%|██████▎   | 8092/12776 [1:26:38<59:41,  1.31it/s]                                                       63%|██████▎   | 8092/12776 [1:26:38<59:41,  1.31it/s] 63%|██████▎   | 8093/12776 [1:26:38<58:43,  1.33it/s]                                                       63%|██████▎   | 8093/12776 [1:26:38<58:43,  1.33it/s] 63%|██████▎   | 8094/12776 [1:26:39<58:11,  1.34it/s]                                                       63%|██████▎   | 8094/12776 [1:26:39<58:11,  1.34it/s] 63%|██████▎   | 8095/12776 [1:26:40<55:48,  1.40it/s]                                                       63%|██████▎   | 8095/12776 [1:26:40<55:48,  1.40it/s] 63%|██████▎   | 8096/12776 [1:26:40<56:32,  1.38it/s]                                                       63%|██████▎   | 8096/12776 [1:26:40<56:32,  1.38it/s] 63%|██████▎   | 8097/12776 [1:26:41<53:56,  1.45it/s]                                                       63%|██████▎   | 8097/12776 [1:26:41<53:56,  1.45it/s] 63%|██████▎   | 8098/12776 [1:26:42<52:24,  1.49it/s]                                                       63%|██████▎   | 8098/12776 [1:26:42<52:24,  1.49it/s] 63%|██████▎   | 8099/12776 [1:26:42<49:26,  1.58it/s]                                                       63%|██████▎   | 8099/12776 [1:26:42<49:26,  1.58it/s] 63%|██████▎   | 8100/12776 [1:26:43<48:24,  1.61it/s]                                                       63%|██████▎   | 8100/12776 [1:26:43<48:24,  1.61it/s] 63%|██████▎   | 8101/12776 [1:26:43<45:42,  1.70it/s]                                                       63%|██████▎   | 8101/12776 [1:26:43<45:42,  1.70it/s] 63%|██████▎   | 8102/12776 [1:26:44<44:51,  1.74it/s]                                                       63%|██████▎   | 8102/12776 [1:26:44<44:51,  1.74it/s] 63%|██████▎   | 8103/12776 [1:26:44<41:54,  1.86it/s]                                                       63%|██████▎   | 8103/12776 [1:26:44<41:54,  1.86it/s] 63%|██████▎   | 8104/12776 [1:26:45<41:11,  1.89it/s]                                                       63%|██████▎   | 8104/12776 [1:26:45<41:11,  1.89it/s] 63%|██████▎   | 8105/12776 [1:26:45<38:24,  2.03it/s]                                                       63%|██████▎   | 8105/12776 [1:26:45<38:24,  2.03it/s] 63%|██████▎   | 8106/12776 [1:26:46<36:02,  2.16it/s]                                                       63%|██████▎   | 8106/12776 [1:26:46<36:02,  2.16it/s] 63%|██████▎   | 8107/12776 [1:26:46<36:50,  2.11it/s]                                                       63%|██████▎   | 8107/12776 [1:26:46<36:50,  2.11it/s] 63%|██████▎   | 8108/12776 [1:26:46<34:09,  2.28it/s]                                                       63%|██████▎   | 8108/12776 [1:26:46<34:09,  2.28it/s] 63%|██████▎   | 8109/12776 [1:26:47<32:07,  2.42it/s]                                                       63%|██████▎   | 8109/12776 [1:26:47<32:07,  2.42it/s] 63%|██████▎   | 8110/12776 [1:26:47<31:39,  2.46it/s]                                                       63%|██████▎   | 8110/12776 [1:26:47<31:39,  2.46it/s] 63%|██████▎   | 8111/12776 [1:26:48<29:52,  2.60it/s]                                                       63%|██████▎   | 8111/12776 [1:26:48<29:52,  2.60it/s] 63%|██████▎   | 8112/12776 [1:26:48<28:21,  2.74it/s]                                                       63%|██████▎   | 8112/12776 [1:26:48<28:21,  2.74it/s] 64%|██████▎   | 8113/12776 [1:26:48<27:52,  2.79it/s]                                                       64%|██████▎   | 8113/12776 [1:26:48<27:52,  2.79it/s] 64%|██████▎   | 8114/12776 [1:26:49<26:23,  2.94it/s]                                                       64%|██████▎   | 8114/12776 [1:26:49<26:23,  2.94it/s] 64%|██████▎   | 8115/12776 [1:26:49<25:06,  3.09it/s]                                                       64%|██████▎   | 8115/12776 [1:26:49<25:06,  3.09it/s] 64%|██████▎   | 8116/12776 [1:26:49<24:00,  3.23it/s]                                                       64%|██████▎   | 8116/12776 [1:26:49<24:00,  3.23it/s] 64%|██████▎   | 8117/12776 [1:26:49<24:14,  3.20it/s]                                                       64%|██████▎   | 8117/12776 [1:26:49<24:14,  3.20it/s] 64%|██████▎   | 8118/12776 [1:26:50<23:07,  3.36it/s]                                                       64%|██████▎   | 8118/12776 [1:26:50<23:07,  3.36it/s] 64%|██████▎   | 8119/12776 [1:26:50<22:15,  3.49it/s]                                                       64%|██████▎   | 8119/12776 [1:26:50<22:15,  3.49it/s] 64%|██████▎   | 8120/12776 [1:26:50<21:26,  3.62it/s]                                                       64%|██████▎   | 8120/12776 [1:26:50<21:26,  3.62it/s] 64%|██████▎   | 8121/12776 [1:26:51<23:15,  3.34it/s]                                                       64%|██████▎   | 8121/12776 [1:26:51<23:15,  3.34it/s] 64%|██████▎   | 8122/12776 [1:26:51<21:50,  3.55it/s]                                                       64%|██████▎   | 8122/12776 [1:26:51<21:50,  3.55it/s] 64%|██████▎   | 8123/12776 [1:26:51<20:55,  3.71it/s]                                                       64%|██████▎   | 8123/12776 [1:26:51<20:55,  3.71it/s] 64%|██████▎   | 8124/12776 [1:26:51<19:53,  3.90it/s]                                                       64%|██████▎   | 8124/12776 [1:26:51<19:53,  3.90it/s] 64%|██████▎   | 8125/12776 [1:26:51<18:58,  4.08it/s]                                                       64%|██████▎   | 8125/12776 [1:26:51<18:58,  4.08it/s] 64%|██████▎   | 8126/12776 [1:26:52<20:57,  3.70it/s]                                                       64%|██████▎   | 8126/12776 [1:26:52<20:57,  3.70it/s] 64%|██████▎   | 8127/12776 [1:26:52<19:34,  3.96it/s]                                                       64%|██████▎   | 8127/12776 [1:26:52<19:34,  3.96it/s] 64%|██████▎   | 8128/12776 [1:26:52<18:31,  4.18it/s]                                                       64%|██████▎   | 8128/12776 [1:26:52<18:31,  4.18it/s] 64%|██████▎   | 8129/12776 [1:26:52<17:47,  4.35it/s]                                                       64%|██████▎   | 8129/12776 [1:26:52<17:47,  4.35it/s] 64%|██████▎   | 8130/12776 [1:26:53<17:12,  4.50it/s]                                                       64%|██████▎   | 8130/12776 [1:26:53<17:12,  4.50it/s] 64%|██████▎   | 8131/12776 [1:26:53<19:09,  4.04it/s]                                                       64%|██████▎   | 8131/12776 [1:26:53<19:09,  4.04it/s] 64%|██████▎   | 8132/12776 [1:26:53<18:02,  4.29it/s]                                                       64%|██████▎   | 8132/12776 [1:26:53<18:02,  4.29it/s] 64%|██████▎   | 8133/12776 [1:26:53<17:14,  4.49it/s]                                                       64%|██████▎   | 8133/12776 [1:26:53<17:14,  4.49it/s] 64%|██████▎   | 8134/12776 [1:26:54<16:37,  4.66it/s]                                                       64%|██████▎   | 8134/12776 [1:26:54<16:37,  4.66it/s] 64%|██████▎   | 8135/12776 [1:26:54<16:08,  4.79it/s]                                                       64%|██████▎   | 8135/12776 [1:26:54<16:08,  4.79it/s] 64%|██████▎   | 8136/12776 [1:26:54<17:58,  4.30it/s]                                                       64%|██████▎   | 8136/12776 [1:26:54<17:58,  4.30it/s] 64%|██████▎   | 8137/12776 [1:26:54<16:55,  4.57it/s]                                                       64%|██████▎   | 8137/12776 [1:26:54<16:55,  4.57it/s] 64%|██████▎   | 8138/12776 [1:26:55<28:24,  2.72it/s]                                                       64%|██████▎   | 8138/12776 [1:26:55<28:24,  2.72it/s] 64%|██████▎   | 8139/12776 [1:26:56<51:45,  1.49it/s]                                                       64%|██████▎   | 8139/12776 [1:26:56<51:45,  1.49it/s] 64%|██████▎   | 8140/12776 [1:26:57<1:00:13,  1.28it/s]                                                         64%|██████▎   | 8140/12776 [1:26:57<1:00:13,  1.28it/s] 64%|██████▎   | 8141/12776 [1:26:58<1:02:02,  1.25it/s]                                                         64%|██████▎   | 8141/12776 [1:26:58<1:02:02,  1.25it/s] 64%|██████▎   | 8142/12776 [1:26:59<1:01:28,  1.26it/s]                                                         64%|██████▎   | 8142/12776 [1:26:59<1:01:28,  1.26it/s] 64%|██████▎   | 8143/12776 [1:27:00<59:57,  1.29it/s]                                                         64%|██████▎   | 8143/12776 [1:27:00<59:57,  1.29it/s] 64%|██████▎   | 8144/12776 [1:27:00<58:24,  1.32it/s]                                                       64%|██████▎   | 8144/12776 [1:27:00<58:24,  1.32it/s] 64%|██████▍   | 8145/12776 [1:27:01<58:05,  1.33it/s]                                                       64%|██████▍   | 8145/12776 [1:27:01<58:05,  1.33it/s] 64%|██████▍   | 8146/12776 [1:27:02<58:52,  1.31it/s]                                                       64%|██████▍   | 8146/12776 [1:27:02<58:52,  1.31it/s] 64%|██████▍   | 8147/12776 [1:27:03<55:43,  1.38it/s]                                                       64%|██████▍   | 8147/12776 [1:27:03<55:43,  1.38it/s] 64%|██████▍   | 8148/12776 [1:27:03<51:56,  1.48it/s]                                                       64%|██████▍   | 8148/12776 [1:27:03<51:56,  1.48it/s] 64%|██████▍   | 8149/12776 [1:27:04<49:04,  1.57it/s]                                                       64%|██████▍   | 8149/12776 [1:27:04<49:04,  1.57it/s] 64%|██████▍   | 8150/12776 [1:27:04<48:21,  1.59it/s]                                                       64%|██████▍   | 8150/12776 [1:27:04<48:21,  1.59it/s] 64%|██████▍   | 8151/12776 [1:27:05<45:31,  1.69it/s]                                                       64%|██████▍   | 8151/12776 [1:27:05<45:31,  1.69it/s] 64%|██████▍   | 8152/12776 [1:27:05<42:39,  1.81it/s]                                                       64%|██████▍   | 8152/12776 [1:27:05<42:39,  1.81it/s] 64%|██████▍   | 8153/12776 [1:27:06<40:40,  1.89it/s]                                                       64%|██████▍   | 8153/12776 [1:27:06<40:40,  1.89it/s] 64%|██████▍   | 8154/12776 [1:27:06<38:44,  1.99it/s]                                                      {'loss': 1.4734, 'grad_norm': 3.738943576812744, 'learning_rate': 0.00011510263929618767, 'epoch': 1.26}
+{'loss': 0.6161, 'grad_norm': 2.5964126586914062, 'learning_rate': 0.00011507820136852393, 'epoch': 1.26}
+{'loss': 1.5048, 'grad_norm': 3.135176420211792, 'learning_rate': 0.00011505376344086021, 'epoch': 1.26}
+{'loss': 1.0791, 'grad_norm': 2.939114570617676, 'learning_rate': 0.00011502932551319647, 'epoch': 1.26}
+{'loss': 2.0609, 'grad_norm': 2.810537815093994, 'learning_rate': 0.00011500488758553273, 'epoch': 1.27}
+{'loss': 0.956, 'grad_norm': 3.116837978363037, 'learning_rate': 0.000114980449657869, 'epoch': 1.27}
+{'loss': 1.196, 'grad_norm': 2.0274899005889893, 'learning_rate': 0.00011495601173020527, 'epoch': 1.27}
+{'loss': 1.0615, 'grad_norm': 2.4018070697784424, 'learning_rate': 0.00011493157380254152, 'epoch': 1.27}
+{'loss': 0.4063, 'grad_norm': 3.216063976287842, 'learning_rate': 0.0001149071358748778, 'epoch': 1.27}
+{'loss': 0.4202, 'grad_norm': 1.749650001525879, 'learning_rate': 0.00011488269794721406, 'epoch': 1.27}
+{'loss': 0.8565, 'grad_norm': 3.246669292449951, 'learning_rate': 0.00011485826001955033, 'epoch': 1.27}
+{'loss': 0.634, 'grad_norm': 1.4922568798065186, 'learning_rate': 0.00011483382209188661, 'epoch': 1.27}
+{'loss': 0.2766, 'grad_norm': 0.5845685601234436, 'learning_rate': 0.00011480938416422286, 'epoch': 1.27}
+{'loss': 0.2173, 'grad_norm': 0.3810208737850189, 'learning_rate': 0.00011478494623655912, 'epoch': 1.27}
+{'loss': 0.2469, 'grad_norm': 0.5567788481712341, 'learning_rate': 0.0001147605083088954, 'epoch': 1.27}
+{'loss': 0.2548, 'grad_norm': 0.5596190690994263, 'learning_rate': 0.00011473607038123167, 'epoch': 1.27}
+{'loss': 0.2395, 'grad_norm': 0.5146242380142212, 'learning_rate': 0.00011471163245356792, 'epoch': 1.27}
+{'loss': 0.3531, 'grad_norm': 0.6236430406570435, 'learning_rate': 0.0001146871945259042, 'epoch': 1.27}
+{'loss': 0.3415, 'grad_norm': 0.6425842046737671, 'learning_rate': 0.00011466275659824046, 'epoch': 1.27}
+{'loss': 0.2194, 'grad_norm': 0.39682844281196594, 'learning_rate': 0.00011463831867057671, 'epoch': 1.27}
+{'loss': 0.3667, 'grad_norm': 2.4693057537078857, 'learning_rate': 0.00011461388074291299, 'epoch': 1.27}
+{'loss': 0.3198, 'grad_norm': 0.5568252801895142, 'learning_rate': 0.00011458944281524925, 'epoch': 1.27}
+{'loss': 0.3658, 'grad_norm': 0.7685470581054688, 'learning_rate': 0.00011456500488758552, 'epoch': 1.27}
+{'loss': 0.2678, 'grad_norm': 1.0343455076217651, 'learning_rate': 0.0001145405669599218, 'epoch': 1.27}
+{'loss': 0.6284, 'grad_norm': 2.833596706390381, 'learning_rate': 0.00011451612903225805, 'epoch': 1.27}
+{'loss': 0.2395, 'grad_norm': 0.7539536952972412, 'learning_rate': 0.00011449169110459431, 'epoch': 1.27}
+{'loss': 0.2926, 'grad_norm': 1.2098023891448975, 'learning_rate': 0.00011446725317693059, 'epoch': 1.27}
+{'loss': 0.4996, 'grad_norm': 1.0733698606491089, 'learning_rate': 0.00011444281524926686, 'epoch': 1.27}
+{'loss': 0.5157, 'grad_norm': 1.2377598285675049, 'learning_rate': 0.00011441837732160311, 'epoch': 1.27}
+{'loss': 0.3019, 'grad_norm': 0.8723770976066589, 'learning_rate': 0.00011439393939393939, 'epoch': 1.27}
+{'loss': 0.5106, 'grad_norm': 1.6629414558410645, 'learning_rate': 0.00011436950146627565, 'epoch': 1.27}
+{'loss': 0.4231, 'grad_norm': 1.0047154426574707, 'learning_rate': 0.0001143450635386119, 'epoch': 1.27}
+{'loss': 0.6328, 'grad_norm': 1.7095948457717896, 'learning_rate': 0.00011432062561094818, 'epoch': 1.27}
+{'loss': 0.535, 'grad_norm': 1.4480046033859253, 'learning_rate': 0.00011429618768328445, 'epoch': 1.27}
+{'loss': 0.2345, 'grad_norm': 0.7682520151138306, 'learning_rate': 0.00011427174975562071, 'epoch': 1.27}
+{'loss': 0.7018, 'grad_norm': 1.8508092164993286, 'learning_rate': 0.00011424731182795699, 'epoch': 1.27}
+{'loss': 0.5689, 'grad_norm': 1.5471041202545166, 'learning_rate': 0.00011422287390029324, 'epoch': 1.27}
+{'loss': 0.5738, 'grad_norm': 1.868023157119751, 'learning_rate': 0.0001141984359726295, 'epoch': 1.27}
+{'loss': 0.5171, 'grad_norm': 5.055905818939209, 'learning_rate': 0.00011417399804496578, 'epoch': 1.27}
+{'loss': 0.6595, 'grad_norm': 1.255279779434204, 'learning_rate': 0.00011414956011730203, 'epoch': 1.27}
+{'loss': 0.5065, 'grad_norm': 2.3022544384002686, 'learning_rate': 0.0001141251221896383, 'epoch': 1.27}
+{'loss': 1.1003, 'grad_norm': 2.671542167663574, 'learning_rate': 0.00011410068426197458, 'epoch': 1.27}
+{'loss': 0.6959, 'grad_norm': 2.1620357036590576, 'learning_rate': 0.00011407624633431084, 'epoch': 1.27}
+{'loss': 0.9013, 'grad_norm': 2.031935214996338, 'learning_rate': 0.0001140518084066471, 'epoch': 1.27}
+{'loss': 0.5662, 'grad_norm': 2.194514751434326, 'learning_rate': 0.00011402737047898337, 'epoch': 1.27}
+{'loss': 0.5597, 'grad_norm': 1.3475483655929565, 'learning_rate': 0.00011400293255131964, 'epoch': 1.27}
+{'loss': 0.9418, 'grad_norm': 4.436418533325195, 'learning_rate': 0.0001139784946236559, 'epoch': 1.27}
+{'loss': 1.1798, 'grad_norm': 5.266567230224609, 'learning_rate': 0.00011395405669599218, 'epoch': 1.27}
+{'loss': 0.9634, 'grad_norm': 3.8170948028564453, 'learning_rate': 0.00011392961876832843, 'epoch': 1.27}
+{'loss': 1.2201, 'grad_norm': 2.858017921447754, 'learning_rate': 0.0001139051808406647, 'epoch': 1.27}
+{'loss': 0.9677, 'grad_norm': 3.335280418395996, 'learning_rate': 0.00011388074291300097, 'epoch': 1.27}
+{'loss': 1.3408, 'grad_norm': 2.3671767711639404, 'learning_rate': 0.00011385630498533723, 'epoch': 1.27}
+{'loss': 0.6781, 'grad_norm': 1.9784801006317139, 'learning_rate': 0.00011383186705767349, 'epoch': 1.27}
+{'loss': 1.1412, 'grad_norm': 2.1394803524017334, 'learning_rate': 0.00011380742913000977, 'epoch': 1.27}
+{'loss': 1.4167, 'grad_norm': 4.030261039733887, 'learning_rate': 0.00011378299120234603, 'epoch': 1.27}
+{'loss': 0.9049, 'grad_norm': 2.028381824493408, 'learning_rate': 0.00011375855327468229, 'epoch': 1.27}
+{'loss': 1.1694, 'grad_norm': 3.869459629058838, 'learning_rate': 0.00011373411534701856, 'epoch': 1.27}
+{'loss': 0.9433, 'grad_norm': 2.090665340423584, 'learning_rate': 0.00011370967741935483, 'epoch': 1.27}
+{'loss': 1.0372, 'grad_norm': 3.8290891647338867, 'learning_rate': 0.00011368523949169109, 'epoch': 1.27}
+{'loss': 0.7292, 'grad_norm': 1.362321138381958, 'learning_rate': 0.00011366080156402737, 'epoch': 1.27}
+{'loss': 1.0427, 'grad_norm': 2.492807149887085, 'learning_rate': 0.00011363636363636362, 'epoch': 1.27}
+{'loss': 0.6872, 'grad_norm': 2.186556339263916, 'learning_rate': 0.00011361192570869989, 'epoch': 1.27}
+{'loss': 0.2192, 'grad_norm': 0.4808526933193207, 'learning_rate': 0.00011358748778103617, 'epoch': 1.27}
+{'loss': 0.4221, 'grad_norm': 0.7778594493865967, 'learning_rate': 0.00011356304985337242, 'epoch': 1.27}
+{'loss': 0.2751, 'grad_norm': 0.5027886629104614, 'learning_rate': 0.00011353861192570868, 'epoch': 1.27}
+{'loss': 0.2394, 'grad_norm': 0.5371960997581482, 'learning_rate': 0.00011351417399804496, 'epoch': 1.27}
+{'loss': 0.2188, 'grad_norm': 0.42323970794677734, 'learning_rate': 0.00011348973607038123, 'epoch': 1.27}
+{'loss': 0.2642, 'grad_norm': 0.8892855644226074, 'learning_rate': 0.00011346529814271748, 'epoch': 1.27}
+{'loss': 0.204, 'grad_norm': 0.6918807625770569, 'learning_rate': 0.00011344086021505375, 'epoch': 1.28}
+{'loss': 0.2568, 'grad_norm': 0.6264479756355286, 'learning_rate': 0.00011341642228739002, 'epoch': 1.28}
+{'loss': 0.244, 'grad_norm': 0.6950996518135071, 'learning_rate': 0.00011339198435972628, 'epoch': 1.28}
+{'loss': 0.2628, 'grad_norm': 0.6539076566696167, 'learning_rate': 0.00011336754643206256, 'epoch': 1.28}
+{'loss': 0.2214, 'grad_norm': 0.7629345655441284, 'learning_rate': 0.00011334310850439881, 'epoch': 1.28}
+{'loss': 0.2655, 'grad_norm': 0.7511631846427917, 'learning_rate': 0.00011331867057673508, 'epoch': 1.28}
+{'loss': 0.3632, 'grad_norm': 3.1061666011810303, 'learning_rate': 0.00011329423264907136, 'epoch': 1.28}
+{'loss': 0.2821, 'grad_norm': 0.8534873127937317, 'learning_rate': 0.00011326979472140761, 'epoch': 1.28}
+{'loss': 0.2113, 'grad_norm': 0.879711389541626, 'learning_rate': 0.00011324535679374387, 'epoch': 1.28}
+ 64%|██████▍   | 8154/12776 [1:27:06<38:44,  1.99it/s] 64%|██████▍   | 8155/12776 [1:27:07<37:37,  2.05it/s]                                                       64%|██████▍   | 8155/12776 [1:27:07<37:37,  2.05it/s] 64%|██████▍   | 8156/12776 [1:27:07<35:25,  2.17it/s]                                                       64%|██████▍   | 8156/12776 [1:27:07<35:25,  2.17it/s] 64%|██████▍   | 8157/12776 [1:27:07<33:49,  2.28it/s]                                                       64%|██████▍   | 8157/12776 [1:27:07<33:49,  2.28it/s] 64%|██████▍   | 8158/12776 [1:27:08<31:58,  2.41it/s]                                                       64%|██████▍   | 8158/12776 [1:27:08<31:58,  2.41it/s] 64%|██████▍   | 8159/12776 [1:27:08<30:22,  2.53it/s]                                                       64%|██████▍   | 8159/12776 [1:27:08<30:22,  2.53it/s] 64%|██████▍   | 8160/12776 [1:27:08<29:02,  2.65it/s]                                                       64%|██████▍   | 8160/12776 [1:27:08<29:02,  2.65it/s] 64%|██████▍   | 8161/12776 [1:27:09<30:57,  2.48it/s]                                                       64%|██████▍   | 8161/12776 [1:27:09<30:57,  2.48it/s] 64%|██████▍   | 8162/12776 [1:27:09<29:06,  2.64it/s]                                                       64%|██████▍   | 8162/12776 [1:27:09<29:06,  2.64it/s] 64%|██████▍   | 8163/12776 [1:27:10<27:19,  2.81it/s]                                                       64%|██████▍   | 8163/12776 [1:27:10<27:19,  2.81it/s] 64%|██████▍   | 8164/12776 [1:27:10<25:55,  2.97it/s]                                                       64%|██████▍   | 8164/12776 [1:27:10<25:55,  2.97it/s] 64%|██████▍   | 8165/12776 [1:27:10<26:38,  2.88it/s]                                                       64%|██████▍   | 8165/12776 [1:27:10<26:38,  2.88it/s] 64%|██████▍   | 8166/12776 [1:27:10<25:01,  3.07it/s]                                                       64%|██████▍   | 8166/12776 [1:27:10<25:01,  3.07it/s] 64%|██████▍   | 8167/12776 [1:27:11<23:42,  3.24it/s]                                                       64%|██████▍   | 8167/12776 [1:27:11<23:42,  3.24it/s] 64%|██████▍   | 8168/12776 [1:27:11<22:42,  3.38it/s]                                                       64%|██████▍   | 8168/12776 [1:27:11<22:42,  3.38it/s] 64%|██████▍   | 8169/12776 [1:27:11<24:12,  3.17it/s]                                                       64%|██████▍   | 8169/12776 [1:27:11<24:12,  3.17it/s] 64%|██████▍   | 8170/12776 [1:27:12<22:44,  3.38it/s]                                                       64%|██████▍   | 8170/12776 [1:27:12<22:44,  3.38it/s] 64%|██████▍   | 8171/12776 [1:27:12<21:34,  3.56it/s]                                                       64%|██████▍   | 8171/12776 [1:27:12<21:34,  3.56it/s] 64%|██████▍   | 8172/12776 [1:27:12<20:51,  3.68it/s]                                                       64%|██████▍   | 8172/12776 [1:27:12<20:51,  3.68it/s] 64%|██████▍   | 8173/12776 [1:27:12<22:43,  3.38it/s]                                                       64%|██████▍   | 8173/12776 [1:27:12<22:43,  3.38it/s] 64%|██████▍   | 8174/12776 [1:27:13<21:27,  3.57it/s]                                                       64%|██████▍   | 8174/12776 [1:27:13<21:27,  3.57it/s] 64%|██████▍   | 8175/12776 [1:27:13<20:37,  3.72it/s]                                                       64%|██████▍   | 8175/12776 [1:27:13<20:37,  3.72it/s] 64%|██████▍   | 8176/12776 [1:27:13<19:30,  3.93it/s]                                                       64%|██████▍   | 8176/12776 [1:27:13<19:30,  3.93it/s] 64%|██████▍   | 8177/12776 [1:27:13<20:47,  3.69it/s]                                                       64%|██████▍   | 8177/12776 [1:27:13<20:47,  3.69it/s] 64%|██████▍   | 8178/12776 [1:27:14<19:22,  3.95it/s]                                                       64%|██████▍   | 8178/12776 [1:27:14<19:22,  3.95it/s] 64%|██████▍   | 8179/12776 [1:27:14<18:56,  4.05it/s]                                                       64%|██████▍   | 8179/12776 [1:27:14<18:56,  4.05it/s] 64%|██████▍   | 8180/12776 [1:27:14<18:00,  4.25it/s]                                                       64%|██████▍   | 8180/12776 [1:27:14<18:00,  4.25it/s] 64%|██████▍   | 8181/12776 [1:27:14<17:18,  4.43it/s]                                                       64%|██████▍   | 8181/12776 [1:27:14<17:18,  4.43it/s] 64%|██████▍   | 8182/12776 [1:27:15<18:52,  4.06it/s]                                                       64%|██████▍   | 8182/12776 [1:27:15<18:52,  4.06it/s] 64%|██████▍   | 8183/12776 [1:27:15<17:45,  4.31it/s]                                                       64%|██████▍   | 8183/12776 [1:27:15<17:45,  4.31it/s] 64%|██████▍   | 8184/12776 [1:27:15<16:59,  4.51it/s]                                                       64%|██████▍   | 8184/12776 [1:27:15<16:59,  4.51it/s] 64%|██████▍   | 8185/12776 [1:27:15<16:20,  4.68it/s]                                                       64%|██████▍   | 8185/12776 [1:27:15<16:20,  4.68it/s] 64%|██████▍   | 8186/12776 [1:27:15<15:46,  4.85it/s]                                                       64%|██████▍   | 8186/12776 [1:27:15<15:46,  4.85it/s] 64%|██████▍   | 8187/12776 [1:27:16<15:19,  4.99it/s]                                                       64%|██████▍   | 8187/12776 [1:27:16<15:19,  4.99it/s] 64%|██████▍   | 8188/12776 [1:27:16<27:14,  2.81it/s]                                                       64%|██████▍   | 8188/12776 [1:27:16<27:14,  2.81it/s] 64%|██████▍   | 8189/12776 [1:27:18<52:44,  1.45it/s]                                                       64%|██████▍   | 8189/12776 [1:27:18<52:44,  1.45it/s] 64%|██████▍   | 8190/12776 [1:27:19<58:15,  1.31it/s]                                                       64%|██████▍   | 8190/12776 [1:27:19<58:15,  1.31it/s] 64%|██████▍   | 8191/12776 [1:27:20<1:02:36,  1.22it/s]                                                         64%|██████▍   | 8191/12776 [1:27:20<1:02:36,  1.22it/s] 64%|██████▍   | 8192/12776 [1:27:20<1:01:45,  1.24it/s]                                                         64%|██████▍   | 8192/12776 [1:27:20<1:01:45,  1.24it/s] 64%|██████▍   | 8193/12776 [1:27:21<59:40,  1.28it/s]                                                         64%|██████▍   | 8193/12776 [1:27:21<59:40,  1.28it/s] 64%|██████▍   | 8194/12776 [1:27:22<57:55,  1.32it/s]                                                       64%|██████▍   | 8194/12776 [1:27:22<57:55,  1.32it/s] 64%|██████▍   | 8195/12776 [1:27:23<57:09,  1.34it/s]                                                       64%|██████▍   | 8195/12776 [1:27:23<57:09,  1.34it/s] 64%|██████▍   | 8196/12776 [1:27:23<54:25,  1.40it/s]                                                       64%|██████▍   | 8196/12776 [1:27:23<54:25,  1.40it/s] 64%|██████▍   | 8197/12776 [1:27:24<51:55,  1.47it/s]                                                       64%|██████▍   | 8197/12776 [1:27:24<51:55,  1.47it/s] 64%|██████▍   | 8198/12776 [1:27:24<49:01,  1.56it/s]                                                       64%|██████▍   | 8198/12776 [1:27:24<49:01,  1.56it/s] 64%|██████▍   | 8199/12776 [1:27:25<47:11,  1.62it/s]                                                       64%|██████▍   | 8199/12776 [1:27:25<47:11,  1.62it/s] 64%|██████▍   | 8200/12776 [1:27:25<44:57,  1.70it/s]                                                       64%|██████▍   | 8200/12776 [1:27:25<44:57,  1.70it/s] 64%|██████▍   | 8201/12776 [1:27:26<45:31,  1.68it/s]                                                       64%|██████▍   | 8201/12776 [1:27:26<45:31,  1.68it/s] 64%|██████▍   | 8202/12776 [1:27:27<42:15,  1.80it/s]                                                       64%|██████▍   | 8202/12776 [1:27:27<42:15,  1.80it/s] 64%|██████▍   | 8203/12776 [1:27:27<39:42,  1.92it/s]                                                       64%|██████▍   | 8203/12776 [1:27:27<39:42,  1.92it/s] 64%|██████▍   | 8204/12776 [1:27:27<39:21,  1.94it/s]                                                       64%|██████▍   | 8204/12776 [1:27:27<39:21,  1.94it/s] 64%|██████▍   | 8205/12776 [1:27:28<37:04,  2.06it/s]                                                       64%|██████▍   | 8205/12776 [1:27:28<37:04,  2.06it/s] 64%|██████▍   | 8206/12776 [1:27:28<36:14,  2.10it/s]                                                       64%|██████▍   | 8206/12776 [1:27:28<36:14,  2.10it/s] 64%|██████▍   | 8207/12776 [1:27:29<34:21,  2.22it/s]                                                       64%|█████���▍   | 8207/12776 [1:27:29<34:21,  2.22it/s] 64%|██████▍   | 8208/12776 [1:27:29<32:42,  2.33it/s]                                                       64%|██████▍   | 8208/12776 [1:27:29<32:42,  2.33it/s] 64%|██████▍   | 8209/12776 [1:27:30<31:51,  2.39it/s]                                                       64%|██████▍   | 8209/12776 [1:27:30<31:51,  2.39it/s] 64%|██████▍   | 8210/12776 [1:27:30<30:12,  2.52it/s]                                                       64%|██████▍   | 8210/12776 [1:27:30<30:12,  2.52it/s] 64%|██████▍   | 8211/12776 [1:27:30<28:53,  2.63it/s]                                                       64%|██████▍   | 8211/12776 [1:27:30<28:53,  2.63it/s] 64%|██████▍   | 8212/12776 [1:27:31<30:41,  2.48it/s]                                                       64%|██████▍   | 8212/12776 [1:27:31<30:41,  2.48it/s] 64%|██████▍   | 8213/12776 [1:27:31<28:26,  2.67it/s]                                                       64%|██████▍   | 8213/12776 [1:27:31<28:26,  2.67it/s] 64%|██████▍   | 8214/12776 [1:27:31<26:43,  2.84it/s]                                                       64%|██████▍   | 8214/12776 [1:27:31<26:43,  2.84it/s] 64%|██████▍   | 8215/12776 [1:27:32<25:21,  3.00it/s]                                                       64%|██████▍   | 8215/12776 [1:27:32<25:21,  3.00it/s] 64%|██████▍   | 8216/12776 [1:27:32<26:15,  2.89it/s]                                                       64%|██████▍   | 8216/12776 [1:27:32<26:15,  2.89it/s] 64%|██████▍   | 8217/12776 [1:27:32<24:40,  3.08it/s]                                                       64%|██████▍   | 8217/12776 [1:27:32<24:40,  3.08it/s] 64%|██████▍   | 8218/12776 [1:27:32<23:18,  3.26it/s]                                                       64%|██████▍   | 8218/12776 [1:27:32<23:18,  3.26it/s] 64%|██████▍   | 8219/12776 [1:27:33<22:17,  3.41it/s]                                                       64%|██████▍   | 8219/12776 [1:27:33<22:17,  3.41it/s] 64%|██████▍   | 8220/12776 [1:27:33<23:12,  3.27it/s]                                                       64%|██████▍   | 8220/12776 [1:27:33<23:12,  3.27it/s] 64%|██████▍   | 8221/12776 [1:27:33<21:46,  3.49it/s]                                                       64%|██████▍   | 8221/12776 [1:27:33<21:46,  3.49it/s] 64%|██████▍   | 8222/12776 [1:27:34<20:37,  3.68it/s]                                                       64%|██████▍   | 8222/12776 [1:27:34<20:37,  3.68it/s] 64%|██████▍   | 8223/12776 [1:27:34<19:44,  3.84it/s]                                                       64%|██████▍   | 8223/12776 [1:27:34<19:44,  3.84it/s] 64%|██████▍   | 8224/12776 [1:27:34<20:24,  3.72it/s]                                                       64%|██████▍   | 8224/12776 [1:27:34<20:24,  3.72it/s] 64%|██████▍   | 8225/12776 [1:27:34<19:13,  3.94it/s]                                                       64%|██████▍   | 8225/12776 [1:27:34<19:13,  3.94it/s] 64%|██████▍   | 8226/12776 [1:27:34<18:17,  4.15it/s]                                                       64%|██████▍   | 8226/12776 [1:27:34<18:17,  4.15it/s] 64%|██████▍   | 8227/12776 [1:27:35<17:39,  4.30it/s]                                                       64%|██████▍   | 8227/12776 [1:27:35<17:39,  4.30it/s] 64%|██████▍   | 8228/12776 [1:27:35<17:04,  4.44it/s]                                                       64%|██████▍   | 8228/12776 [1:27:35<17:04,  4.44it/s] 64%|██████▍   | 8229/12776 [1:27:35<18:39,  4.06it/s]                                                       64%|██████▍   | 8229/12776 [1:27:35<18:39,  4.06it/s] 64%|██████▍   | 8230/12776 [1:27:35<17:38,  4.29it/s]                                                       64%|██████▍   | 8230/12776 [1:27:35<17:38,  4.29it/s] 64%|██████▍   | 8231/12776 [1:27:36<16:53,  4.48it/s]                                                      {'loss': 0.471, 'grad_norm': 1.0535318851470947, 'learning_rate': 0.00011322091886608015, 'epoch': 1.28}
+{'loss': 0.7819, 'grad_norm': 3.926427125930786, 'learning_rate': 0.00011319648093841642, 'epoch': 1.28}
+{'loss': 0.2567, 'grad_norm': 0.8711987137794495, 'learning_rate': 0.00011317204301075267, 'epoch': 1.28}
+{'loss': 0.5032, 'grad_norm': 1.3189749717712402, 'learning_rate': 0.00011314760508308895, 'epoch': 1.28}
+{'loss': 0.4017, 'grad_norm': 2.5916378498077393, 'learning_rate': 0.00011312316715542521, 'epoch': 1.28}
+{'loss': 0.5681, 'grad_norm': 2.361572504043579, 'learning_rate': 0.00011309872922776148, 'epoch': 1.28}
+{'loss': 0.8003, 'grad_norm': 3.2034995555877686, 'learning_rate': 0.00011307429130009774, 'epoch': 1.28}
+{'loss': 0.3008, 'grad_norm': 0.9315476417541504, 'learning_rate': 0.000113049853372434, 'epoch': 1.28}
+{'loss': 0.6728, 'grad_norm': 1.9534274339675903, 'learning_rate': 0.00011302541544477027, 'epoch': 1.28}
+{'loss': 0.4329, 'grad_norm': 1.1545753479003906, 'learning_rate': 0.00011300097751710655, 'epoch': 1.28}
+{'loss': 0.7543, 'grad_norm': 3.3171403408050537, 'learning_rate': 0.0001129765395894428, 'epoch': 1.28}
+{'loss': 0.7833, 'grad_norm': 1.9916654825210571, 'learning_rate': 0.00011295210166177906, 'epoch': 1.28}
+{'loss': 0.5321, 'grad_norm': 1.3301039934158325, 'learning_rate': 0.00011292766373411534, 'epoch': 1.28}
+{'loss': 0.7389, 'grad_norm': 1.8553197383880615, 'learning_rate': 0.00011290322580645161, 'epoch': 1.28}
+{'loss': 0.7611, 'grad_norm': 3.052807092666626, 'learning_rate': 0.00011287878787878786, 'epoch': 1.28}
+{'loss': 1.0834, 'grad_norm': 3.283430814743042, 'learning_rate': 0.00011285434995112414, 'epoch': 1.28}
+{'loss': 0.747, 'grad_norm': 1.6933223009109497, 'learning_rate': 0.0001128299120234604, 'epoch': 1.28}
+{'loss': 0.7694, 'grad_norm': 2.0889194011688232, 'learning_rate': 0.00011280547409579667, 'epoch': 1.28}
+{'loss': 0.7245, 'grad_norm': 1.8516852855682373, 'learning_rate': 0.00011278103616813293, 'epoch': 1.28}
+{'loss': 1.0895, 'grad_norm': 2.9009687900543213, 'learning_rate': 0.0001127565982404692, 'epoch': 1.28}
+{'loss': 0.9004, 'grad_norm': 2.16554856300354, 'learning_rate': 0.00011273216031280546, 'epoch': 1.28}
+{'loss': 0.9605, 'grad_norm': 1.9977914094924927, 'learning_rate': 0.00011270772238514174, 'epoch': 1.28}
+{'loss': 0.9607, 'grad_norm': 2.656644344329834, 'learning_rate': 0.00011268328445747799, 'epoch': 1.28}
+{'loss': 0.866, 'grad_norm': 2.0683369636535645, 'learning_rate': 0.00011265884652981426, 'epoch': 1.28}
+{'loss': 1.2594, 'grad_norm': 3.113068103790283, 'learning_rate': 0.00011263440860215053, 'epoch': 1.28}
+{'loss': 1.3643, 'grad_norm': 2.857494831085205, 'learning_rate': 0.0001126099706744868, 'epoch': 1.28}
+{'loss': 1.0681, 'grad_norm': 2.787754535675049, 'learning_rate': 0.00011258553274682305, 'epoch': 1.28}
+{'loss': 0.9953, 'grad_norm': 2.6079978942871094, 'learning_rate': 0.00011256109481915933, 'epoch': 1.28}
+{'loss': 0.7956, 'grad_norm': 1.9221597909927368, 'learning_rate': 0.00011253665689149559, 'epoch': 1.28}
+{'loss': 0.6788, 'grad_norm': 3.6456334590911865, 'learning_rate': 0.00011251221896383186, 'epoch': 1.28}
+{'loss': 0.808, 'grad_norm': 4.128708839416504, 'learning_rate': 0.00011248778103616812, 'epoch': 1.28}
+{'loss': 0.5298, 'grad_norm': 1.7657461166381836, 'learning_rate': 0.00011246334310850439, 'epoch': 1.28}
+{'loss': 0.5269, 'grad_norm': 2.3742294311523438, 'learning_rate': 0.00011243890518084065, 'epoch': 1.28}
+{'loss': 0.8815, 'grad_norm': 2.828449249267578, 'learning_rate': 0.00011241446725317693, 'epoch': 1.28}
+{'loss': 0.553, 'grad_norm': 1.5230042934417725, 'learning_rate': 0.00011239002932551318, 'epoch': 1.28}
+{'loss': 0.2289, 'grad_norm': 0.6481024622917175, 'learning_rate': 0.00011236559139784945, 'epoch': 1.28}
+{'loss': 0.8718, 'grad_norm': 2.089646577835083, 'learning_rate': 0.00011234115347018572, 'epoch': 1.28}
+{'loss': 0.1615, 'grad_norm': 0.42590638995170593, 'learning_rate': 0.00011231671554252199, 'epoch': 1.28}
+{'loss': 0.164, 'grad_norm': 0.3601533770561218, 'learning_rate': 0.00011229227761485824, 'epoch': 1.28}
+{'loss': 0.3217, 'grad_norm': 0.6453453898429871, 'learning_rate': 0.00011226783968719452, 'epoch': 1.28}
+{'loss': 0.2657, 'grad_norm': 0.7831723093986511, 'learning_rate': 0.00011224340175953078, 'epoch': 1.28}
+{'loss': 0.278, 'grad_norm': 0.6272794008255005, 'learning_rate': 0.00011221896383186705, 'epoch': 1.28}
+{'loss': 0.3111, 'grad_norm': 0.6463831067085266, 'learning_rate': 0.00011219452590420331, 'epoch': 1.28}
+{'loss': 0.2155, 'grad_norm': 0.7817267775535583, 'learning_rate': 0.00011217008797653958, 'epoch': 1.28}
+{'loss': 0.2571, 'grad_norm': 0.6414055824279785, 'learning_rate': 0.00011214565004887584, 'epoch': 1.28}
+{'loss': 0.2441, 'grad_norm': 1.242760181427002, 'learning_rate': 0.00011212121212121212, 'epoch': 1.28}
+{'loss': 0.206, 'grad_norm': 0.43037867546081543, 'learning_rate': 0.00011209677419354837, 'epoch': 1.28}
+{'loss': 0.2067, 'grad_norm': 0.5707399845123291, 'learning_rate': 0.00011207233626588464, 'epoch': 1.28}
+{'loss': 0.3798, 'grad_norm': 1.2979316711425781, 'learning_rate': 0.00011204789833822092, 'epoch': 1.28}
+{'loss': 0.162, 'grad_norm': 0.5706567764282227, 'learning_rate': 0.00011202346041055718, 'epoch': 1.28}
+{'loss': 0.5603, 'grad_norm': 1.7582261562347412, 'learning_rate': 0.00011199902248289343, 'epoch': 1.28}
+{'loss': 0.4609, 'grad_norm': 2.591653823852539, 'learning_rate': 0.00011197458455522971, 'epoch': 1.28}
+{'loss': 0.3353, 'grad_norm': 0.9483798146247864, 'learning_rate': 0.00011195014662756598, 'epoch': 1.28}
+{'loss': 0.3398, 'grad_norm': 1.1728459596633911, 'learning_rate': 0.00011192570869990224, 'epoch': 1.28}
+{'loss': 0.3597, 'grad_norm': 0.952708899974823, 'learning_rate': 0.0001119012707722385, 'epoch': 1.28}
+{'loss': 0.6664, 'grad_norm': 1.8662762641906738, 'learning_rate': 0.00011187683284457477, 'epoch': 1.29}
+{'loss': 0.8306, 'grad_norm': 1.1493111848831177, 'learning_rate': 0.00011185239491691103, 'epoch': 1.29}
+{'loss': 0.4859, 'grad_norm': 1.7580465078353882, 'learning_rate': 0.00011182795698924731, 'epoch': 1.29}
+{'loss': 0.2457, 'grad_norm': 0.9127596020698547, 'learning_rate': 0.00011180351906158356, 'epoch': 1.29}
+{'loss': 0.4663, 'grad_norm': 1.9063149690628052, 'learning_rate': 0.00011177908113391983, 'epoch': 1.29}
+{'loss': 0.3457, 'grad_norm': 1.1802544593811035, 'learning_rate': 0.00011175464320625611, 'epoch': 1.29}
+{'loss': 0.6875, 'grad_norm': 2.804399013519287, 'learning_rate': 0.00011173020527859237, 'epoch': 1.29}
+{'loss': 0.5672, 'grad_norm': 1.3575066328048706, 'learning_rate': 0.00011170576735092862, 'epoch': 1.29}
+{'loss': 0.731, 'grad_norm': 1.3274577856063843, 'learning_rate': 0.0001116813294232649, 'epoch': 1.29}
+{'loss': 0.6624, 'grad_norm': 1.7587782144546509, 'learning_rate': 0.00011165689149560117, 'epoch': 1.29}
+{'loss': 0.5267, 'grad_norm': 1.3552849292755127, 'learning_rate': 0.00011163245356793742, 'epoch': 1.29}
+{'loss': 0.8283, 'grad_norm': 1.7056719064712524, 'learning_rate': 0.0001116080156402737, 'epoch': 1.29}
+{'loss': 0.7403, 'grad_norm': 2.418220281600952, 'learning_rate': 0.00011158357771260996, 'epoch': 1.29}
+{'loss': 0.4315, 'grad_norm': 1.3776272535324097, 'learning_rate': 0.00011155913978494623, 'epoch': 1.29}
+{'loss': 0.4908, 'grad_norm': 1.3834933042526245, 'learning_rate': 0.0001115347018572825, 'epoch': 1.29}
+{'loss': 0.8942, 'grad_norm': 1.5205992460250854, 'learning_rate': 0.00011151026392961876, 'epoch': 1.29}
+{'loss': 1.2169, 'grad_norm': 3.1065549850463867, 'learning_rate': 0.00011148582600195502, 'epoch': 1.29}
+{'loss': 1.0125, 'grad_norm': 3.65362811088562, 'learning_rate': 0.0001114613880742913, 'epoch': 1.29}
+{'loss': 1.1199, 'grad_norm': 1.9371777772903442, 'learning_rate': 0.00011143695014662756, 'epoch': 1.29}
+{'loss': 0.8579, 'grad_norm': 3.518394947052002, 'learning_rate': 0.00011141251221896381, 'epoch': 1.29}
+{'loss': 0.6714, 'grad_norm': 2.363788604736328, 'learning_rate': 0.00011138807429130009, 'epoch': 1.29}
+{'loss': 0.9438, 'grad_norm': 2.800300359725952, 'learning_rate': 0.00011136363636363636, 'epoch': 1.29}
+ 64%|██████▍   | 8231/12776 [1:27:36<16:53,  4.48it/s] 64%|██████▍   | 8232/12776 [1:27:36<16:20,  4.63it/s]                                                       64%|██████▍   | 8232/12776 [1:27:36<16:20,  4.63it/s] 64%|██████▍   | 8233/12776 [1:27:36<15:53,  4.77it/s]                                                       64%|██████▍   | 8233/12776 [1:27:36<15:53,  4.77it/s] 64%|██████▍   | 8234/12776 [1:27:36<15:31,  4.88it/s]                                                       64%|██████▍   | 8234/12776 [1:27:36<15:31,  4.88it/s] 64%|██████▍   | 8235/12776 [1:27:36<16:22,  4.62it/s]                                                       64%|██████▍   | 8235/12776 [1:27:36<16:22,  4.62it/s] 64%|██████▍   | 8236/12776 [1:27:37<15:41,  4.82it/s]                                                       64%|██████▍   | 8236/12776 [1:27:37<15:41,  4.82it/s] 64%|██████▍   | 8237/12776 [1:27:37<15:15,  4.96it/s]                                                       64%|██████▍   | 8237/12776 [1:27:37<15:15,  4.96it/s] 64%|██████▍   | 8238/12776 [1:27:38<27:09,  2.79it/s]                                                       64%|██████▍   | 8238/12776 [1:27:38<27:09,  2.79it/s] 64%|██████▍   | 8239/12776 [1:27:39<53:01,  1.43it/s]                                                       64%|██████▍   | 8239/12776 [1:27:39<53:01,  1.43it/s] 64%|██████▍   | 8240/12776 [1:27:40<1:01:08,  1.24it/s]                                                         64%|██████▍   | 8240/12776 [1:27:40<1:01:08,  1.24it/s] 65%|██████▍   | 8241/12776 [1:27:41<1:02:57,  1.20it/s]                                                         65%|██████▍   | 8241/12776 [1:27:41<1:02:57,  1.20it/s] 65%|██████▍   | 8242/12776 [1:27:42<1:02:45,  1.20it/s]                                                         65%|██████▍   | 8242/12776 [1:27:42<1:02:45,  1.20it/s] 65%|██████▍   | 8243/12776 [1:27:43<1:00:57,  1.24it/s]                                                         65%|██████▍   | 8243/12776 [1:27:43<1:00:57,  1.24it/s] 65%|██████▍   | 8244/12776 [1:27:43<58:45,  1.29it/s]                                                         65%|██████▍   | 8244/12776 [1:27:43<58:45,  1.29it/s] 65%|██████▍   | 8245/12776 [1:27:44<56:26,  1.34it/s]                                                       65%|██████▍   | 8245/12776 [1:27:44<56:26,  1.34it/s] 65%|██████▍   | 8246/12776 [1:27:45<55:34,  1.36it/s]                                                       65%|██████▍   | 8246/12776 [1:27:45<55:34,  1.36it/s] 65%|██████▍   | 8247/12776 [1:27:45<52:51,  1.43it/s]                                                       65%|██████▍   | 8247/12776 [1:27:45<52:51,  1.43it/s] 65%|██████▍   | 8248/12776 [1:27:46<50:35,  1.49it/s]                                                       65%|██████▍   | 8248/12776 [1:27:46<50:35,  1.49it/s] 65%|██████▍   | 8249/12776 [1:27:46<47:55,  1.57it/s]                                                       65%|██████▍   | 8249/12776 [1:27:46<47:55,  1.57it/s] 65%|██████▍   | 8250/12776 [1:27:47<46:34,  1.62it/s]                                                       65%|██████▍   | 8250/12776 [1:27:47<46:34,  1.62it/s] 65%|██████▍   | 8251/12776 [1:27:48<43:57,  1.72it/s]                                                       65%|██████▍   | 8251/12776 [1:27:48<43:57,  1.72it/s] 65%|██████▍   | 8252/12776 [1:27:48<41:55,  1.80it/s]                                                       65%|██████▍   | 8252/12776 [1:27:48<41:55,  1.80it/s] 65%|██████▍   | 8253/12776 [1:27:48<39:30,  1.91it/s]                                                       65%|██████▍   | 8253/12776 [1:27:48<39:30,  1.91it/s] 65%|██████▍   | 8254/12776 [1:27:49<39:32,  1.91it/s]                                                       65%|██████▍   | 8254/12776 [1:27:49<39:32,  1.91it/s] 65%|██████▍   | 8255/12776 [1:27:49<36:58,  2.04it/s]                                                       65%|██████▍   | 8255/12776 [1:27:49<36:58,  2.04it/s] 65%|██████▍   | 8256/12776 [1:27:50<34:50,  2.16it/s]                                                       65%|██████▍   | 8256/12776 [1:27:50<34:50,  2.16it/s] 65%|██████▍   | 8257/12776 [1:27:50<35:29,  2.12it/s]                                                       65%|██████▍   | 8257/12776 [1:27:50<35:29,  2.12it/s] 65%|██████▍   | 8258/12776 [1:27:51<32:49,  2.29it/s]                                                       65%|██████▍   | 8258/12776 [1:27:51<32:49,  2.29it/s] 65%|██████▍   | 8259/12776 [1:27:51<30:49,  2.44it/s]                                                       65%|██████▍   | 8259/12776 [1:27:51<30:49,  2.44it/s] 65%|██████▍   | 8260/12776 [1:27:51<30:51,  2.44it/s]                                                       65%|██████▍   | 8260/12776 [1:27:51<30:51,  2.44it/s] 65%|██████▍   | 8261/12776 [1:27:52<29:10,  2.58it/s]                                                       65%|██████▍   | 8261/12776 [1:27:52<29:10,  2.58it/s] 65%|██████▍   | 8262/12776 [1:27:52<27:45,  2.71it/s]                                                       65%|██████▍   | 8262/12776 [1:27:52<27:45,  2.71it/s] 65%|██████▍   | 8263/12776 [1:27:52<26:54,  2.80it/s]                                                       65%|██████▍   | 8263/12776 [1:27:52<26:54,  2.80it/s] 65%|██████▍   | 8264/12776 [1:27:53<25:37,  2.93it/s]                                                       65%|██████▍   | 8264/12776 [1:27:53<25:37,  2.93it/s] 65%|██████▍   | 8265/12776 [1:27:53<24:35,  3.06it/s]                                                       65%|██████▍   | 8265/12776 [1:27:53<24:35,  3.06it/s] 65%|██████▍   | 8266/12776 [1:27:53<23:38,  3.18it/s]                                                       65%|██████▍   | 8266/12776 [1:27:53<23:38,  3.18it/s] 65%|██████▍   | 8267/12776 [1:27:54<25:15,  2.97it/s]                                                       65%|██████▍   | 8267/12776 [1:27:54<25:15,  2.97it/s] 65%|██████▍   | 8268/12776 [1:27:54<23:43,  3.17it/s]                                                       65%|██████▍   | 8268/12776 [1:27:54<23:43,  3.17it/s] 65%|██████▍   | 8269/12776 [1:27:54<22:31,  3.33it/s]                                                       65%|██████▍   | 8269/12776 [1:27:54<22:31,  3.33it/s] 65%|██████▍   | 8270/12776 [1:27:54<21:24,  3.51it/s]                                                       65%|██████▍   | 8270/12776 [1:27:54<21:24,  3.51it/s] 65%|██████▍   | 8271/12776 [1:27:55<22:53,  3.28it/s]                                                       65%|██████▍   | 8271/12776 [1:27:55<22:53,  3.28it/s] 65%|██████▍   | 8272/12776 [1:27:55<21:25,  3.50it/s]                                                       65%|██████▍   | 8272/12776 [1:27:55<21:25,  3.50it/s] 65%|██████▍   | 8273/12776 [1:27:55<20:16,  3.70it/s]                                                       65%|██████▍   | 8273/12776 [1:27:55<20:16,  3.70it/s] 65%|██████▍   | 8274/12776 [1:27:55<19:18,  3.89it/s]                                                       65%|██████▍   | 8274/12776 [1:27:55<19:18,  3.89it/s] 65%|██████▍   | 8275/12776 [1:27:56<19:56,  3.76it/s]                                                       65%|██████▍   | 8275/12776 [1:27:56<19:56,  3.76it/s] 65%|██████▍   | 8276/12776 [1:27:56<18:43,  4.00it/s]                                                       65%|██████▍   | 8276/12776 [1:27:56<18:43,  4.00it/s] 65%|██████▍   | 8277/12776 [1:27:56<17:51,  4.20it/s]                                                       65%|██████▍   | 8277/12776 [1:27:56<17:51,  4.20it/s] 65%|██████▍   | 8278/12776 [1:27:56<17:16,  4.34it/s]                                                       65%|██████▍   | 8278/12776 [1:27:56<17:16,  4.34it/s] 65%|██████▍   | 8279/12776 [1:27:57<16:45,  4.47it/s]                                                       65%|██████▍   | 8279/12776 [1:27:57<16:45,  4.47it/s] 65%|██████▍   | 8280/12776 [1:27:57<17:53,  4.19it/s]                                                       65%|██████▍   | 8280/12776 [1:27:57<17:53,  4.19it/s] 65%|██████▍   | 8281/12776 [1:27:57<17:00,  4.40it/s]                                                       65%|██████▍   | 8281/12776 [1:27:57<17:00,  4.40it/s] 65%|██████▍   | 8282/12776 [1:27:57<16:23,  4.57it/s]                                                       65%|██████▍   | 8282/12776 [1:27:57<16:23,  4.57it/s] 65%|██████▍   | 8283/12776 [1:27:57<15:55,  4.70it/s]                                                       65%|██████▍   | 8283/12776 [1:27:57<15:55,  4.70it/s] 65%|██████▍   | 8284/12776 [1:27:58<15:30,  4.83it/s]                                                       65%|██████▍   | 8284/12776 [1:27:58<15:30,  4.83it/s] 65%|██████▍   | 8285/12776 [1:27:58<15:07,  4.95it/s]                                                       65%|██████▍   | 8285/12776 [1:27:58<15:07,  4.95it/s] 65%|██████▍   | 8286/12776 [1:27:58<16:37,  4.50it/s]                                                       65%|██████▍   | 8286/12776 [1:27:58<16:37,  4.50it/s] 65%|██████▍   | 8287/12776 [1:27:58<15:44,  4.75it/s]                                                       65%|██████▍   | 8287/12776 [1:27:58<15:44,  4.75it/s] 65%|██████▍   | 8288/12776 [1:27:59<29:05,  2.57it/s]                                                       65%|██████▍   | 8288/12776 [1:27:59<29:05,  2.57it/s] 65%|██████▍   | 8289/12776 [1:28:01<52:29,  1.42it/s]                                                       65%|██████▍   | 8289/12776 [1:28:01<52:29,  1.42it/s] 65%|██████▍   | 8290/12776 [1:28:02<58:19,  1.28it/s]                                                       65%|██████▍   | 8290/12776 [1:28:02<58:19,  1.28it/s] 65%|██████▍   | 8291/12776 [1:28:02<59:47,  1.25it/s]                                                       65%|██████▍   | 8291/12776 [1:28:02<59:47,  1.25it/s] 65%|██████▍   | 8292/12776 [1:28:03<59:05,  1.26it/s]                                                       65%|██████▍   | 8292/12776 [1:28:03<59:05,  1.26it/s] 65%|██████▍   | 8293/12776 [1:28:04<1:00:50,  1.23it/s]                                                         65%|██████▍   | 8293/12776 [1:28:04<1:00:50,  1.23it/s] 65%|██████▍   | 8294/12776 [1:28:05<57:39,  1.30it/s]                                                         65%|██████▍   | 8294/12776 [1:28:05<57:39,  1.30it/s] 65%|██████▍   | 8295/12776 [1:28:05<54:46,  1.36it/s]                                                       65%|██████▍   | 8295/12776 [1:28:05<54:46,  1.36it/s] 65%|██████▍   | 8296/12776 [1:28:06<53:45,  1.39it/s]                                                       65%|██████▍   | 8296/12776 [1:28:06<53:45,  1.39it/s] 65%|██████▍   | 8297/12776 [1:28:07<50:22,  1.48it/s]                                                       65%|██████▍   | 8297/12776 [1:28:07<50:22,  1.48it/s] 65%|██████▍   | 8298/12776 [1:28:07<48:32,  1.54it/s]                                                       65%|██████▍   | 8298/12776 [1:28:07<48:32,  1.54it/s] 65%|██████▍   | 8299/12776 [1:28:08<45:34,  1.64it/s]                                                       65%|██████▍   | 8299/12776 [1:28:08<45:34,  1.64it/s] 65%|██████▍   | 8300/12776 [1:28:08<45:24,  1.64it/s]                                                       65%|██████▍   | 8300/12776 [1:28:08<45:24,  1.64it/s] 65%|██████▍   | 8301/12776 [1:28:09<42:18,  1.76it/s]                                                       65%|██████▍   | 8301/12776 [1:28:09<42:18,  1.76it/s] 65%|██████▍   | 8302/12776 [1:28:09<39:26,  1.89it/s]                                                       65%|██████▍   | 8302/12776 [1:28:09<39:26,  1.89it/s] 65%|██████▍   | 8303/12776 [1:28:10<38:25,  1.94it/s]                                                       65%|██████▍   | 8303/12776 [1:28:10<38:25,  1.94it/s] 65%|██████▍   | 8304/12776 [1:28:10<36:12,  2.06it/s]                                                       65%|██████▍   | 8304/12776 [1:28:10<36:12,  2.06it/s] 65%|██████▌   | 8305/12776 [1:28:11<36:00,  2.07it/s]                                                       65%|██████▌   | 8305/12776 [1:28:11<36:00,  2.07it/s] 65%|██████▌   | 8306/12776 [1:28:11<33:40,  2.21it/s]                                                       65%|██████▌   | 8306/12776 [1:28:11<33:40,  2.21it/s] 65%|██████▌   | 8307/12776 [1:28:11<31:49,  2.34it/s]                                                       65%|██████▌   | 8307/12776 [1:28:11<31:49,  2.34it/s] 65%|██████▌   | 8308/12776 [1:28:12<31:09,  2.39it/s]                                                       65%|██████▌   | 8308/12776 [1:28:12<31:09,  2.39it/s] 65%|██████▌   | 8309/12776 [1:28:12<29:42,  2.51it/s]                                                      {'loss': 1.5732, 'grad_norm': 2.6638309955596924, 'learning_rate': 0.00011133919843597261, 'epoch': 1.29}
+{'loss': 0.8746, 'grad_norm': 1.8096959590911865, 'learning_rate': 0.00011131476050830889, 'epoch': 1.29}
+{'loss': 0.7608, 'grad_norm': 2.5477139949798584, 'learning_rate': 0.00011129032258064515, 'epoch': 1.29}
+{'loss': 0.746, 'grad_norm': 5.085765361785889, 'learning_rate': 0.00011126588465298142, 'epoch': 1.29}
+{'loss': 0.3566, 'grad_norm': 2.019724130630493, 'learning_rate': 0.0001112414467253177, 'epoch': 1.29}
+{'loss': 0.8774, 'grad_norm': 6.809519290924072, 'learning_rate': 0.00011121700879765395, 'epoch': 1.29}
+{'loss': 0.3884, 'grad_norm': 4.453648090362549, 'learning_rate': 0.00011119257086999021, 'epoch': 1.29}
+{'loss': 1.12, 'grad_norm': 2.842747449874878, 'learning_rate': 0.00011116813294232649, 'epoch': 1.29}
+{'loss': 0.2758, 'grad_norm': 0.7022001147270203, 'learning_rate': 0.00011114369501466275, 'epoch': 1.29}
+{'loss': 0.2534, 'grad_norm': 0.49633824825286865, 'learning_rate': 0.000111119257086999, 'epoch': 1.29}
+{'loss': 0.2632, 'grad_norm': 0.6422197818756104, 'learning_rate': 0.00011109481915933528, 'epoch': 1.29}
+{'loss': 0.4564, 'grad_norm': 1.4231148958206177, 'learning_rate': 0.00011107038123167155, 'epoch': 1.29}
+{'loss': 0.2579, 'grad_norm': 0.8207333087921143, 'learning_rate': 0.0001110459433040078, 'epoch': 1.29}
+{'loss': 0.3219, 'grad_norm': 0.8601287603378296, 'learning_rate': 0.00011102150537634408, 'epoch': 1.29}
+{'loss': 0.3404, 'grad_norm': 0.8240009546279907, 'learning_rate': 0.00011099706744868034, 'epoch': 1.29}
+{'loss': 0.2921, 'grad_norm': 0.8633431792259216, 'learning_rate': 0.00011097262952101661, 'epoch': 1.29}
+{'loss': 0.4439, 'grad_norm': 1.7753422260284424, 'learning_rate': 0.00011094819159335289, 'epoch': 1.29}
+{'loss': 0.317, 'grad_norm': 1.0025125741958618, 'learning_rate': 0.00011092375366568914, 'epoch': 1.29}
+{'loss': 0.3887, 'grad_norm': 1.528899073600769, 'learning_rate': 0.0001108993157380254, 'epoch': 1.29}
+{'loss': 0.3759, 'grad_norm': 0.7538073062896729, 'learning_rate': 0.00011087487781036168, 'epoch': 1.29}
+{'loss': 0.2921, 'grad_norm': 0.7805300951004028, 'learning_rate': 0.00011085043988269795, 'epoch': 1.29}
+{'loss': 0.4664, 'grad_norm': 1.1637154817581177, 'learning_rate': 0.0001108260019550342, 'epoch': 1.29}
+{'loss': 0.3022, 'grad_norm': 0.8898148536682129, 'learning_rate': 0.00011080156402737048, 'epoch': 1.29}
+{'loss': 0.4828, 'grad_norm': 1.7075831890106201, 'learning_rate': 0.00011077712609970674, 'epoch': 1.29}
+{'loss': 0.3136, 'grad_norm': 1.2227857112884521, 'learning_rate': 0.00011075268817204299, 'epoch': 1.29}
+{'loss': 0.209, 'grad_norm': 1.10588800907135, 'learning_rate': 0.00011072825024437927, 'epoch': 1.29}
+{'loss': 0.6039, 'grad_norm': 1.42483389377594, 'learning_rate': 0.00011070381231671553, 'epoch': 1.29}
+{'loss': 1.3905, 'grad_norm': 4.606034755706787, 'learning_rate': 0.0001106793743890518, 'epoch': 1.29}
+{'loss': 0.5845, 'grad_norm': 1.765781283378601, 'learning_rate': 0.00011065493646138808, 'epoch': 1.29}
+{'loss': 0.339, 'grad_norm': 1.1531269550323486, 'learning_rate': 0.00011063049853372433, 'epoch': 1.29}
+{'loss': 0.5406, 'grad_norm': 2.018428087234497, 'learning_rate': 0.0001106060606060606, 'epoch': 1.29}
+{'loss': 0.3733, 'grad_norm': 1.3222421407699585, 'learning_rate': 0.00011058162267839687, 'epoch': 1.29}
+{'loss': 0.5531, 'grad_norm': 1.9479771852493286, 'learning_rate': 0.00011055718475073312, 'epoch': 1.29}
+{'loss': 0.456, 'grad_norm': 0.9404929876327515, 'learning_rate': 0.00011053274682306939, 'epoch': 1.29}
+{'loss': 0.6238, 'grad_norm': 2.357956886291504, 'learning_rate': 0.00011050830889540567, 'epoch': 1.29}
+{'loss': 0.565, 'grad_norm': 1.7230677604675293, 'learning_rate': 0.00011048387096774193, 'epoch': 1.29}
+{'loss': 0.9608, 'grad_norm': 4.3762526512146, 'learning_rate': 0.00011045943304007818, 'epoch': 1.29}
+{'loss': 0.7233, 'grad_norm': 3.0639162063598633, 'learning_rate': 0.00011043499511241446, 'epoch': 1.29}
+{'loss': 0.7643, 'grad_norm': 1.96674644947052, 'learning_rate': 0.00011041055718475073, 'epoch': 1.29}
+{'loss': 0.6065, 'grad_norm': 4.862483978271484, 'learning_rate': 0.00011038611925708699, 'epoch': 1.29}
+{'loss': 0.8087, 'grad_norm': 2.0144357681274414, 'learning_rate': 0.00011036168132942327, 'epoch': 1.29}
+{'loss': 1.2482, 'grad_norm': 3.6097030639648438, 'learning_rate': 0.00011033724340175952, 'epoch': 1.29}
+{'loss': 0.5976, 'grad_norm': 1.5838818550109863, 'learning_rate': 0.00011031280547409578, 'epoch': 1.3}
+{'loss': 0.7551, 'grad_norm': 2.3606481552124023, 'learning_rate': 0.00011028836754643206, 'epoch': 1.3}
+{'loss': 0.6356, 'grad_norm': 1.6453297138214111, 'learning_rate': 0.00011026392961876831, 'epoch': 1.3}
+{'loss': 0.8155, 'grad_norm': 2.8421103954315186, 'learning_rate': 0.00011023949169110458, 'epoch': 1.3}
+{'loss': 0.813, 'grad_norm': 2.4521589279174805, 'learning_rate': 0.00011021505376344086, 'epoch': 1.3}
+{'loss': 0.981, 'grad_norm': 2.380380868911743, 'learning_rate': 0.00011019061583577712, 'epoch': 1.3}
+{'loss': 1.051, 'grad_norm': 2.3057072162628174, 'learning_rate': 0.00011016617790811337, 'epoch': 1.3}
+{'loss': 1.2693, 'grad_norm': 2.9905688762664795, 'learning_rate': 0.00011014173998044965, 'epoch': 1.3}
+{'loss': 1.336, 'grad_norm': 9.889979362487793, 'learning_rate': 0.00011011730205278592, 'epoch': 1.3}
+{'loss': 1.5213, 'grad_norm': 2.7968623638153076, 'learning_rate': 0.00011009286412512218, 'epoch': 1.3}
+{'loss': 0.7344, 'grad_norm': 2.403857469558716, 'learning_rate': 0.00011006842619745846, 'epoch': 1.3}
+{'loss': 0.5804, 'grad_norm': 1.8447998762130737, 'learning_rate': 0.00011004398826979471, 'epoch': 1.3}
+{'loss': 0.6722, 'grad_norm': 1.6647576093673706, 'learning_rate': 0.00011001955034213098, 'epoch': 1.3}
+{'loss': 0.8471, 'grad_norm': 3.1341192722320557, 'learning_rate': 0.00010999511241446725, 'epoch': 1.3}
+{'loss': 0.7715, 'grad_norm': 2.227797746658325, 'learning_rate': 0.0001099706744868035, 'epoch': 1.3}
+{'loss': 0.7457, 'grad_norm': 1.991193413734436, 'learning_rate': 0.00010994623655913977, 'epoch': 1.3}
+{'loss': 0.2327, 'grad_norm': 0.5508354902267456, 'learning_rate': 0.00010992179863147605, 'epoch': 1.3}
+{'loss': 0.1992, 'grad_norm': 0.48610156774520874, 'learning_rate': 0.00010989736070381231, 'epoch': 1.3}
+{'loss': 0.2157, 'grad_norm': 0.6914536356925964, 'learning_rate': 0.00010987292277614856, 'epoch': 1.3}
+{'loss': 0.2804, 'grad_norm': 0.6526252627372742, 'learning_rate': 0.00010984848484848484, 'epoch': 1.3}
+{'loss': 0.2843, 'grad_norm': 0.7797197103500366, 'learning_rate': 0.00010982404692082111, 'epoch': 1.3}
+{'loss': 0.4454, 'grad_norm': 1.6517852544784546, 'learning_rate': 0.00010979960899315737, 'epoch': 1.3}
+{'loss': 0.1868, 'grad_norm': 0.5789437294006348, 'learning_rate': 0.00010977517106549365, 'epoch': 1.3}
+{'loss': 0.3895, 'grad_norm': 1.5858449935913086, 'learning_rate': 0.0001097507331378299, 'epoch': 1.3}
+{'loss': 0.2817, 'grad_norm': 0.5253259539604187, 'learning_rate': 0.00010972629521016617, 'epoch': 1.3}
+{'loss': 0.279, 'grad_norm': 0.7501121163368225, 'learning_rate': 0.00010970185728250245, 'epoch': 1.3}
+{'loss': 0.2974, 'grad_norm': 0.6583108305931091, 'learning_rate': 0.0001096774193548387, 'epoch': 1.3}
+{'loss': 0.1879, 'grad_norm': 0.7893651723861694, 'learning_rate': 0.00010965298142717496, 'epoch': 1.3}
+{'loss': 0.2392, 'grad_norm': 0.6830184459686279, 'learning_rate': 0.00010962854349951124, 'epoch': 1.3}
+{'loss': 0.3198, 'grad_norm': 0.9842543601989746, 'learning_rate': 0.0001096041055718475, 'epoch': 1.3}
+{'loss': 0.3716, 'grad_norm': 1.0459553003311157, 'learning_rate': 0.00010957966764418376, 'epoch': 1.3}
+{'loss': 0.2841, 'grad_norm': 0.7785957455635071, 'learning_rate': 0.00010955522971652003, 'epoch': 1.3}
+{'loss': 0.2479, 'grad_norm': 1.5206092596054077, 'learning_rate': 0.0001095307917888563, 'epoch': 1.3}
+{'loss': 0.2994, 'grad_norm': 1.0379140377044678, 'learning_rate': 0.00010950635386119256, 'epoch': 1.3}
+{'loss': 0.6375, 'grad_norm': 2.3565833568573, 'learning_rate': 0.00010948191593352884, 'epoch': 1.3}
+{'loss': 0.653, 'grad_norm': 1.7786818742752075, 'learning_rate': 0.0001094574780058651, 'epoch': 1.3}
+ 65%|██████▌   | 8309/12776 [1:28:12<29:42,  2.51it/s] 65%|██████▌   | 8310/12776 [1:28:12<28:17,  2.63it/s]                                                       65%|██████▌   | 8310/12776 [1:28:12<28:17,  2.63it/s] 65%|██████▌   | 8311/12776 [1:28:13<29:26,  2.53it/s]                                                       65%|██████▌   | 8311/12776 [1:28:13<29:26,  2.53it/s] 65%|██████▌   | 8312/12776 [1:28:13<27:41,  2.69it/s]                                                       65%|██████▌   | 8312/12776 [1:28:13<27:41,  2.69it/s] 65%|██████▌   | 8313/12776 [1:28:13<26:11,  2.84it/s]                                                       65%|██████▌   | 8313/12776 [1:28:13<26:11,  2.84it/s] 65%|██████▌   | 8314/12776 [1:28:14<25:01,  2.97it/s]                                                       65%|██████▌   | 8314/12776 [1:28:14<25:01,  2.97it/s] 65%|██████▌   | 8315/12776 [1:28:14<25:35,  2.90it/s]                                                       65%|██████▌   | 8315/12776 [1:28:14<25:35,  2.90it/s] 65%|██████▌   | 8316/12776 [1:28:14<24:19,  3.06it/s]                                                       65%|██████▌   | 8316/12776 [1:28:14<24:19,  3.06it/s] 65%|██████▌   | 8317/12776 [1:28:15<23:10,  3.21it/s]                                                       65%|██████▌   | 8317/12776 [1:28:15<23:10,  3.21it/s] 65%|██████▌   | 8318/12776 [1:28:15<22:11,  3.35it/s]                                                       65%|██████▌   | 8318/12776 [1:28:15<22:11,  3.35it/s] 65%|██████▌   | 8319/12776 [1:28:15<22:29,  3.30it/s]                                                       65%|██████▌   | 8319/12776 [1:28:15<22:29,  3.30it/s] 65%|██████▌   | 8320/12776 [1:28:16<21:21,  3.48it/s]                                                       65%|██████▌   | 8320/12776 [1:28:16<21:21,  3.48it/s] 65%|██████▌   | 8321/12776 [1:28:16<20:30,  3.62it/s]                                                       65%|██████▌   | 8321/12776 [1:28:16<20:30,  3.62it/s] 65%|██████▌   | 8322/12776 [1:28:16<19:47,  3.75it/s]                                                       65%|██████▌   | 8322/12776 [1:28:16<19:47,  3.75it/s] 65%|██████▌   | 8323/12776 [1:28:16<21:41,  3.42it/s]                                                       65%|██████▌   | 8323/12776 [1:28:16<21:41,  3.42it/s] 65%|██████▌   | 8324/12776 [1:28:17<20:19,  3.65it/s]                                                       65%|██████▌   | 8324/12776 [1:28:17<20:19,  3.65it/s] 65%|██████▌   | 8325/12776 [1:28:17<19:17,  3.84it/s]                                                       65%|██████▌   | 8325/12776 [1:28:17<19:17,  3.84it/s] 65%|██████▌   | 8326/12776 [1:28:17<18:22,  4.04it/s]                                                       65%|██████▌   | 8326/12776 [1:28:17<18:22,  4.04it/s] 65%|██████▌   | 8327/12776 [1:28:17<19:44,  3.76it/s]                                                       65%|██████▌   | 8327/12776 [1:28:17<19:44,  3.76it/s] 65%|██████▌   | 8328/12776 [1:28:18<18:28,  4.01it/s]                                                       65%|██████▌   | 8328/12776 [1:28:18<18:28,  4.01it/s] 65%|██████▌   | 8329/12776 [1:28:18<17:37,  4.20it/s]                                                       65%|██████▌   | 8329/12776 [1:28:18<17:37,  4.20it/s] 65%|██████▌   | 8330/12776 [1:28:18<16:52,  4.39it/s]                                                       65%|██████▌   | 8330/12776 [1:28:18<16:52,  4.39it/s] 65%|██████▌   | 8331/12776 [1:28:18<16:19,  4.54it/s]                                                       65%|██████▌   | 8331/12776 [1:28:18<16:19,  4.54it/s] 65%|██████▌   | 8332/12776 [1:28:19<18:11,  4.07it/s]                                                       65%|██████▌   | 8332/12776 [1:28:19<18:11,  4.07it/s] 65%|██████▌   | 8333/12776 [1:28:19<17:06,  4.33it/s]                                                       65%|██████▌   | 8333/12776 [1:28:19<17:06,  4.33it/s] 65%|██████▌   | 8334/12776 [1:28:19<16:21,  4.53it/s]                                                       65%|██████▌   | 8334/12776 [1:28:19<16:21,  4.53it/s] 65%|██████▌   | 8335/12776 [1:28:19<15:44,  4.70it/s]                                                       65%|██████▌   | 8335/12776 [1:28:19<15:44,  4.70it/s] 65%|██████▌   | 8336/12776 [1:28:19<15:16,  4.84it/s]                                                       65%|██████▌   | 8336/12776 [1:28:19<15:16,  4.84it/s] 65%|██████▌   | 8337/12776 [1:28:19<14:47,  5.00it/s]                                                       65%|██████▌   | 8337/12776 [1:28:19<14:47,  5.00it/s] 65%|██████▌   | 8338/12776 [1:28:20<26:06,  2.83it/s]                                                       65%|██████▌   | 8338/12776 [1:28:20<26:06,  2.83it/s] 65%|██████▌   | 8339/12776 [1:28:22<54:06,  1.37it/s]                                                       65%|██████▌   | 8339/12776 [1:28:22<54:06,  1.37it/s] 65%|██████▌   | 8340/12776 [1:28:23<58:13,  1.27it/s]                                                       65%|██████▌   | 8340/12776 [1:28:23<58:13,  1.27it/s] 65%|██████▌   | 8341/12776 [1:28:24<59:13,  1.25it/s]                                                       65%|██████▌   | 8341/12776 [1:28:24<59:13,  1.25it/s] 65%|██████▌   | 8342/12776 [1:28:24<58:02,  1.27it/s]                                                       65%|██████▌   | 8342/12776 [1:28:24<58:02,  1.27it/s] 65%|██████▌   | 8343/12776 [1:28:25<56:31,  1.31it/s]                                                       65%|██████▌   | 8343/12776 [1:28:25<56:31,  1.31it/s] 65%|██████▌   | 8344/12776 [1:28:26<56:15,  1.31it/s]                                                       65%|██████▌   | 8344/12776 [1:28:26<56:15,  1.31it/s] 65%|██████▌   | 8345/12776 [1:28:27<56:27,  1.31it/s]                                                       65%|██████▌   | 8345/12776 [1:28:27<56:27,  1.31it/s] 65%|██████▌   | 8346/12776 [1:28:27<53:14,  1.39it/s]                                                       65%|██████▌   | 8346/12776 [1:28:27<53:14,  1.39it/s] 65%|██████▌   | 8347/12776 [1:28:28<49:57,  1.48it/s]                                                       65%|██████▌   | 8347/12776 [1:28:28<49:57,  1.48it/s] 65%|██████▌   | 8348/12776 [1:28:28<47:13,  1.56it/s]                                                       65%|██████▌   | 8348/12776 [1:28:28<47:13,  1.56it/s] 65%|██████▌   | 8349/12776 [1:28:29<46:27,  1.59it/s]                                                       65%|██████▌   | 8349/12776 [1:28:29<46:27,  1.59it/s] 65%|██████▌   | 8350/12776 [1:28:29<43:44,  1.69it/s]                                                       65%|██████▌   | 8350/12776 [1:28:29<43:44,  1.69it/s] 65%|██████▌   | 8351/12776 [1:28:30<41:09,  1.79it/s]                                                       65%|██████▌   | 8351/12776 [1:28:30<41:09,  1.79it/s] 65%|██████▌   | 8352/12776 [1:28:30<39:51,  1.85it/s]                                                       65%|██████▌   | 8352/12776 [1:28:30<39:51,  1.85it/s] 65%|██████▌   | 8353/12776 [1:28:31<37:39,  1.96it/s]                                                       65%|██████▌   | 8353/12776 [1:28:31<37:39,  1.96it/s] 65%|██████▌   | 8354/12776 [1:28:31<36:08,  2.04it/s]                                                       65%|██████▌   | 8354/12776 [1:28:31<36:08,  2.04it/s] 65%|██████▌   | 8355/12776 [1:28:32<34:24,  2.14it/s]                                                       65%|██████▌   | 8355/12776 [1:28:32<34:24,  2.14it/s] 65%|██████▌   | 8356/12776 [1:28:32<32:55,  2.24it/s]                                                       65%|██████▌   | 8356/12776 [1:28:32<32:55,  2.24it/s] 65%|██████▌   | 8357/12776 [1:28:33<33:52,  2.17it/s]                                                       65%|██████▌   | 8357/12776 [1:28:33<33:52,  2.17it/s] 65%|██████▌   | 8358/12776 [1:28:33<31:48,  2.32it/s]                                                       65%|██████▌   | 8358/12776 [1:28:33<31:48,  2.32it/s] 65%|██████▌   | 8359/12776 [1:28:33<29:59,  2.45it/s]                                                       65%|██████▌   | 8359/12776 [1:28:33<29:59,  2.45it/s] 65%|██████▌   | 8360/12776 [1:28:34<29:49,  2.47it/s]                                                       65%|██████▌   | 8360/12776 [1:28:34<29:49,  2.47it/s] 65%|██████▌   | 8361/12776 [1:28:34<28:15,  2.60it/s]                                                       65%|██████▌   | 8361/12776 [1:28:34<28:15,  2.60it/s] 65%|██████▌   | 8362/12776 [1:28:34<27:04,  2.72it/s]                                                       65%|██████▌   | 8362/12776 [1:28:34<27:04,  2.72it/s] 65%|██████▌   | 8363/12776 [1:28:35<28:42,  2.56it/s]                                                       65%|██████▌   | 8363/12776 [1:28:35<28:42,  2.56it/s] 65%|██████▌   | 8364/12776 [1:28:35<26:46,  2.75it/s]                                                       65%|██████▌   | 8364/12776 [1:28:35<26:46,  2.75it/s] 65%|██████▌   | 8365/12776 [1:28:35<25:22,  2.90it/s]                                                       65%|██████▌   | 8365/12776 [1:28:35<25:22,  2.90it/s] 65%|██████▌   | 8366/12776 [1:28:36<26:25,  2.78it/s]                                                       65%|██████▌   | 8366/12776 [1:28:36<26:25,  2.78it/s] 65%|██████▌   | 8367/12776 [1:28:36<24:41,  2.98it/s]                                                       65%|██████▌   | 8367/12776 [1:28:36<24:41,  2.98it/s] 65%|██████▌   | 8368/12776 [1:28:36<23:24,  3.14it/s]                                                       65%|██████▌   | 8368/12776 [1:28:36<23:24,  3.14it/s] 66%|██████▌   | 8369/12776 [1:28:37<22:20,  3.29it/s]                                                       66%|██████▌   | 8369/12776 [1:28:37<22:20,  3.29it/s] 66%|██████▌   | 8370/12776 [1:28:37<22:09,  3.31it/s]                                                       66%|██████▌   | 8370/12776 [1:28:37<22:09,  3.31it/s] 66%|██████▌   | 8371/12776 [1:28:37<21:08,  3.47it/s]                                                       66%|██████▌   | 8371/12776 [1:28:37<21:08,  3.47it/s] 66%|██████▌   | 8372/12776 [1:28:37<20:17,  3.62it/s]                                                       66%|██████▌   | 8372/12776 [1:28:37<20:17,  3.62it/s] 66%|██████▌   | 8373/12776 [1:28:38<19:34,  3.75it/s]                                                       66%|██████▌   | 8373/12776 [1:28:38<19:34,  3.75it/s] 66%|██████▌   | 8374/12776 [1:28:38<18:59,  3.86it/s]                                                       66%|██████▌   | 8374/12776 [1:28:38<18:59,  3.86it/s] 66%|██████▌   | 8375/12776 [1:28:38<18:54,  3.88it/s]                                                       66%|██████▌   | 8375/12776 [1:28:38<18:54,  3.88it/s] 66%|██████▌   | 8376/12776 [1:28:38<18:05,  4.05it/s]                                                       66%|██████▌   | 8376/12776 [1:28:38<18:05,  4.05it/s] 66%|██████▌   | 8377/12776 [1:28:39<17:21,  4.22it/s]                                                       66%|██████▌   | 8377/12776 [1:28:39<17:21,  4.22it/s] 66%|██████▌   | 8378/12776 [1:28:39<16:44,  4.38it/s]                                                       66%|██████▌   | 8378/12776 [1:28:39<16:44,  4.38it/s] 66%|██████▌   | 8379/12776 [1:28:39<16:20,  4.48it/s]                                                       66%|██████▌   | 8379/12776 [1:28:39<16:20,  4.48it/s] 66%|██████▌   | 8380/12776 [1:28:39<17:15,  4.24it/s]                                                       66%|██████▌   | 8380/12776 [1:28:39<17:15,  4.24it/s] 66%|██████▌   | 8381/12776 [1:28:39<16:33,  4.42it/s]                                                       66%|██████▌   | 8381/12776 [1:28:39<16:33,  4.42it/s] 66%|██████▌   | 8382/12776 [1:28:40<15:57,  4.59it/s]                                                       66%|██████▌   | 8382/12776 [1:28:40<15:57,  4.59it/s] 66%|██████▌   | 8383/12776 [1:28:40<15:28,  4.73it/s]                                                       66%|██████▌   | 8383/12776 [1:28:40<15:28,  4.73it/s] 66%|██████▌   | 8384/12776 [1:28:40<15:03,  4.86it/s]                                                       66%|██████▌   | 8384/12776 [1:28:40<15:03,  4.86it/s] 66%|██████▌   | 8385/12776 [1:28:40<17:02,  4.29it/s]                                                       66%|██████▌   | 8385/12776 [1:28:40<17:02,  4.29it/s] 66%|██████▌   | 8386/12776 [1:28:41<16:01,  4.56it/s]                                                       66%|██████▌   | 8386/12776 [1:28:41<16:01,  4.56it/s] 66%|██████▌   | 8387/12776 [1:28:41<15:17,  4.78it/s]                                                      {'loss': 0.3824, 'grad_norm': 1.0531212091445923, 'learning_rate': 0.00010943304007820136, 'epoch': 1.3}
+{'loss': 0.6215, 'grad_norm': 1.700603723526001, 'learning_rate': 0.00010940860215053764, 'epoch': 1.3}
+{'loss': 0.4508, 'grad_norm': 1.701421856880188, 'learning_rate': 0.00010938416422287389, 'epoch': 1.3}
+{'loss': 0.5921, 'grad_norm': 1.6104190349578857, 'learning_rate': 0.00010935972629521015, 'epoch': 1.3}
+{'loss': 0.3565, 'grad_norm': 0.7885456681251526, 'learning_rate': 0.00010933528836754642, 'epoch': 1.3}
+{'loss': 0.6124, 'grad_norm': 1.718679666519165, 'learning_rate': 0.0001093108504398827, 'epoch': 1.3}
+{'loss': 0.8574, 'grad_norm': 4.118200302124023, 'learning_rate': 0.00010928641251221895, 'epoch': 1.3}
+{'loss': 0.5231, 'grad_norm': 1.17429518699646, 'learning_rate': 0.00010926197458455521, 'epoch': 1.3}
+{'loss': 0.3513, 'grad_norm': 3.1723527908325195, 'learning_rate': 0.00010923753665689149, 'epoch': 1.3}
+{'loss': 0.5292, 'grad_norm': 2.3258113861083984, 'learning_rate': 0.00010921309872922775, 'epoch': 1.3}
+{'loss': 0.2251, 'grad_norm': 3.5120906829833984, 'learning_rate': 0.000109188660801564, 'epoch': 1.3}
+{'loss': 0.4773, 'grad_norm': 1.4291130304336548, 'learning_rate': 0.00010916422287390028, 'epoch': 1.3}
+{'loss': 0.7817, 'grad_norm': 3.090890884399414, 'learning_rate': 0.00010913978494623655, 'epoch': 1.3}
+{'loss': 0.8859, 'grad_norm': 3.119511842727661, 'learning_rate': 0.00010911534701857281, 'epoch': 1.3}
+{'loss': 0.2614, 'grad_norm': 1.6605695486068726, 'learning_rate': 0.00010909090909090908, 'epoch': 1.3}
+{'loss': 0.7882, 'grad_norm': 3.2734925746917725, 'learning_rate': 0.00010906647116324534, 'epoch': 1.3}
+{'loss': 0.5519, 'grad_norm': 1.581061601638794, 'learning_rate': 0.00010904203323558161, 'epoch': 1.3}
+{'loss': 1.1315, 'grad_norm': 1.6887787580490112, 'learning_rate': 0.00010901759530791789, 'epoch': 1.3}
+{'loss': 1.2345, 'grad_norm': 5.0255937576293945, 'learning_rate': 0.00010899315738025414, 'epoch': 1.3}
+{'loss': 1.2284, 'grad_norm': 3.7134711742401123, 'learning_rate': 0.0001089687194525904, 'epoch': 1.3}
+{'loss': 0.588, 'grad_norm': 2.470165252685547, 'learning_rate': 0.00010894428152492668, 'epoch': 1.3}
+{'loss': 0.8248, 'grad_norm': 2.310760021209717, 'learning_rate': 0.00010891984359726295, 'epoch': 1.3}
+{'loss': 1.843, 'grad_norm': 4.917720794677734, 'learning_rate': 0.0001088954056695992, 'epoch': 1.3}
+{'loss': 1.4091, 'grad_norm': 3.521848440170288, 'learning_rate': 0.00010887096774193548, 'epoch': 1.3}
+{'loss': 1.0417, 'grad_norm': 1.4144970178604126, 'learning_rate': 0.00010884652981427174, 'epoch': 1.3}
+{'loss': 0.7129, 'grad_norm': 2.08368182182312, 'learning_rate': 0.00010882209188660799, 'epoch': 1.3}
+{'loss': 1.2135, 'grad_norm': 2.0993666648864746, 'learning_rate': 0.00010879765395894427, 'epoch': 1.3}
+{'loss': 0.8379, 'grad_norm': 4.826048374176025, 'learning_rate': 0.00010877321603128053, 'epoch': 1.3}
+{'loss': 0.603, 'grad_norm': 2.809587001800537, 'learning_rate': 0.0001087487781036168, 'epoch': 1.31}
+{'loss': 1.2882, 'grad_norm': 5.488523483276367, 'learning_rate': 0.00010872434017595308, 'epoch': 1.31}
+{'loss': 0.3117, 'grad_norm': 1.1666176319122314, 'learning_rate': 0.00010869990224828933, 'epoch': 1.31}
+{'loss': 0.2904, 'grad_norm': 0.6575944423675537, 'learning_rate': 0.0001086754643206256, 'epoch': 1.31}
+{'loss': 0.202, 'grad_norm': 0.6943413615226746, 'learning_rate': 0.00010865102639296187, 'epoch': 1.31}
+{'loss': 0.2215, 'grad_norm': 0.5365055799484253, 'learning_rate': 0.00010862658846529814, 'epoch': 1.31}
+{'loss': 0.2398, 'grad_norm': 0.6395205855369568, 'learning_rate': 0.00010860215053763439, 'epoch': 1.31}
+{'loss': 0.2191, 'grad_norm': 0.6453742980957031, 'learning_rate': 0.00010857771260997067, 'epoch': 1.31}
+{'loss': 0.1847, 'grad_norm': 0.4960818290710449, 'learning_rate': 0.00010855327468230693, 'epoch': 1.31}
+{'loss': 0.2645, 'grad_norm': 0.8823752403259277, 'learning_rate': 0.00010852883675464318, 'epoch': 1.31}
+{'loss': 0.1919, 'grad_norm': 0.6216477751731873, 'learning_rate': 0.00010850439882697946, 'epoch': 1.31}
+{'loss': 0.2694, 'grad_norm': 0.8468354344367981, 'learning_rate': 0.00010847996089931573, 'epoch': 1.31}
+{'loss': 0.3727, 'grad_norm': 1.344497799873352, 'learning_rate': 0.00010845552297165199, 'epoch': 1.31}
+{'loss': 0.3696, 'grad_norm': 1.8707456588745117, 'learning_rate': 0.00010843108504398827, 'epoch': 1.31}
+{'loss': 0.2772, 'grad_norm': 1.084493637084961, 'learning_rate': 0.00010840664711632452, 'epoch': 1.31}
+{'loss': 0.2514, 'grad_norm': 0.9700431823730469, 'learning_rate': 0.00010838220918866079, 'epoch': 1.31}
+{'loss': 0.2902, 'grad_norm': 1.6982876062393188, 'learning_rate': 0.00010835777126099706, 'epoch': 1.31}
+{'loss': 0.3883, 'grad_norm': 1.004989504814148, 'learning_rate': 0.00010833333333333333, 'epoch': 1.31}
+{'loss': 0.5643, 'grad_norm': 1.6661001443862915, 'learning_rate': 0.00010830889540566958, 'epoch': 1.31}
+{'loss': 0.5259, 'grad_norm': 1.4825246334075928, 'learning_rate': 0.00010828445747800586, 'epoch': 1.31}
+{'loss': 0.2891, 'grad_norm': 1.141077995300293, 'learning_rate': 0.00010826001955034212, 'epoch': 1.31}
+{'loss': 0.7122, 'grad_norm': 2.092963695526123, 'learning_rate': 0.00010823558162267837, 'epoch': 1.31}
+{'loss': 0.3163, 'grad_norm': 0.9098227024078369, 'learning_rate': 0.00010821114369501465, 'epoch': 1.31}
+{'loss': 0.6714, 'grad_norm': 2.1590492725372314, 'learning_rate': 0.00010818670576735092, 'epoch': 1.31}
+{'loss': 0.5353, 'grad_norm': 1.1368892192840576, 'learning_rate': 0.00010816226783968718, 'epoch': 1.31}
+{'loss': 0.4366, 'grad_norm': 1.050818681716919, 'learning_rate': 0.00010813782991202346, 'epoch': 1.31}
+{'loss': 0.377, 'grad_norm': 0.9604326486587524, 'learning_rate': 0.00010811339198435971, 'epoch': 1.31}
+{'loss': 0.709, 'grad_norm': 2.909843683242798, 'learning_rate': 0.00010808895405669598, 'epoch': 1.31}
+{'loss': 0.5225, 'grad_norm': 2.03232741355896, 'learning_rate': 0.00010806451612903225, 'epoch': 1.31}
+{'loss': 0.7947, 'grad_norm': 1.5411139726638794, 'learning_rate': 0.00010804007820136852, 'epoch': 1.31}
+{'loss': 0.5839, 'grad_norm': 1.4965941905975342, 'learning_rate': 0.00010801564027370477, 'epoch': 1.31}
+{'loss': 0.6892, 'grad_norm': 2.01226806640625, 'learning_rate': 0.00010799120234604105, 'epoch': 1.31}
+{'loss': 0.437, 'grad_norm': 4.478280067443848, 'learning_rate': 0.00010796676441837731, 'epoch': 1.31}
+{'loss': 0.5926, 'grad_norm': 2.566091299057007, 'learning_rate': 0.00010794232649071357, 'epoch': 1.31}
+{'loss': 0.6444, 'grad_norm': 2.0691118240356445, 'learning_rate': 0.00010791788856304984, 'epoch': 1.31}
+{'loss': 0.8449, 'grad_norm': 2.3462331295013428, 'learning_rate': 0.00010789345063538611, 'epoch': 1.31}
+{'loss': 0.6608, 'grad_norm': 1.7684667110443115, 'learning_rate': 0.00010786901270772237, 'epoch': 1.31}
+{'loss': 1.0546, 'grad_norm': 2.054476022720337, 'learning_rate': 0.00010784457478005865, 'epoch': 1.31}
+{'loss': 0.9144, 'grad_norm': 2.8694870471954346, 'learning_rate': 0.0001078201368523949, 'epoch': 1.31}
+{'loss': 0.5151, 'grad_norm': 1.2306618690490723, 'learning_rate': 0.00010779569892473117, 'epoch': 1.31}
+{'loss': 0.8821, 'grad_norm': 1.6525115966796875, 'learning_rate': 0.00010777126099706745, 'epoch': 1.31}
+{'loss': 1.6076, 'grad_norm': 3.0999720096588135, 'learning_rate': 0.0001077468230694037, 'epoch': 1.31}
+{'loss': 0.8059, 'grad_norm': 1.6865127086639404, 'learning_rate': 0.00010772238514173996, 'epoch': 1.31}
+{'loss': 0.5467, 'grad_norm': 2.4270031452178955, 'learning_rate': 0.00010769794721407624, 'epoch': 1.31}
+{'loss': 0.532, 'grad_norm': 2.8954577445983887, 'learning_rate': 0.0001076735092864125, 'epoch': 1.31}
+{'loss': 1.7656, 'grad_norm': 4.275869846343994, 'learning_rate': 0.00010764907135874876, 'epoch': 1.31}
+{'loss': 1.1036, 'grad_norm': 1.7701374292373657, 'learning_rate': 0.00010762463343108503, 'epoch': 1.31}
+{'loss': 0.4755, 'grad_norm': 2.1359615325927734, 'learning_rate': 0.0001076001955034213, 'epoch': 1.31}
+{'loss': 0.349, 'grad_norm': 1.1913822889328003, 'learning_rate': 0.00010757575757575756, 'epoch': 1.31}
+{'loss': 0.4667, 'grad_norm': 1.3649957180023193, 'learning_rate': 0.00010755131964809384, 'epoch': 1.31}
+ 66%|██████▌   | 8387/12776 [1:28:41<15:17,  4.78it/s] 66%|██████▌   | 8388/12776 [1:28:41<27:20,  2.67it/s]                                                       66%|██████▌   | 8388/12776 [1:28:41<27:20,  2.67it/s] 66%|██████▌   | 8389/12776 [1:28:43<49:18,  1.48it/s]                                                       66%|██████▌   | 8389/12776 [1:28:43<49:18,  1.48it/s] 66%|██████▌   | 8390/12776 [1:28:44<56:14,  1.30it/s]                                                       66%|██████▌   | 8390/12776 [1:28:44<56:14,  1.30it/s] 66%|██████▌   | 8391/12776 [1:28:45<1:01:13,  1.19it/s]                                                         66%|██████▌   | 8391/12776 [1:28:45<1:01:13,  1.19it/s] 66%|██████▌   | 8392/12776 [1:28:46<1:00:59,  1.20it/s]                                                         66%|██████▌   | 8392/12776 [1:28:46<1:00:59,  1.20it/s] 66%|██████▌   | 8393/12776 [1:28:46<59:38,  1.22it/s]                                                         66%|██████▌   | 8393/12776 [1:28:46<59:38,  1.22it/s] 66%|██████▌   | 8394/12776 [1:28:47<58:24,  1.25it/s]                                                       66%|██████▌   | 8394/12776 [1:28:47<58:24,  1.25it/s] 66%|██████▌   | 8395/12776 [1:28:48<55:49,  1.31it/s]                                                       66%|██████▌   | 8395/12776 [1:28:48<55:49,  1.31it/s] 66%|██████▌   | 8396/12776 [1:28:49<55:24,  1.32it/s]                                                       66%|██████▌   | 8396/12776 [1:28:49<55:24,  1.32it/s] 66%|██████▌   | 8397/12776 [1:28:49<52:09,  1.40it/s]                                                       66%|██████▌   | 8397/12776 [1:28:49<52:09,  1.40it/s] 66%|██████▌   | 8398/12776 [1:28:50<50:00,  1.46it/s]                                                       66%|██████▌   | 8398/12776 [1:28:50<50:00,  1.46it/s] 66%|██████▌   | 8399/12776 [1:28:50<47:01,  1.55it/s]                                                       66%|██████▌   | 8399/12776 [1:28:50<47:01,  1.55it/s] 66%|██████▌   | 8400/12776 [1:28:51<44:40,  1.63it/s]                                                       66%|██████▌   | 8400/12776 [1:28:51<44:40,  1.63it/s]Saving model checkpoint to ./checkpoint-8400
+Configuration saved in ./checkpoint-8400/config.json
+Model weights saved in ./checkpoint-8400/model.safetensors
+Feature extractor saved in ./checkpoint-8400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-8400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-8400/special_tokens_map.json
+added tokens file saved in ./checkpoint-8400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-7200] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 66%|██████▌   | 8401/12776 [1:28:57<2:37:15,  2.16s/it]                                                         66%|██████▌   | 8401/12776 [1:28:57<2:37:15,  2.16s/it] 66%|██████▌   | 8402/12776 [1:28:57<2:00:12,  1.65s/it]                                                         66%|██████▌   | 8402/12776 [1:28:57<2:00:12,  1.65s/it] 66%|██████▌   | 8403/12776 [1:28:58<1:35:24,  1.31s/it]                                                         66%|██████▌   | 8403/12776 [1:28:58<1:35:24,  1.31s/it] 66%|██████▌   | 8404/12776 [1:28:58<1:16:16,  1.05s/it]                                                         66%|██████▌   | 8404/12776 [1:28:58<1:16:16,  1.05s/it] 66%|██████▌   | 8405/12776 [1:28:59<1:05:34,  1.11it/s]                                                         66%|██████▌   | 8405/12776 [1:28:59<1:05:34,  1.11it/s] 66%|██████▌   | 8406/12776 [1:28:59<54:03,  1.35it/s]                                                         66%|██████▌   | 8406/12776 [1:28:59<54:03,  1.35it/s] 66%|██████▌   | 8407/12776 [1:28:59<45:50,  1.59it/s]                                                       66%|██████▌   | 8407/12776 [1:28:59<45:50,  1.59it/s] 66%|██████▌   | 8408/12776 [1:29:00<41:32,  1.75it/s]                                                       66%|██████▌   | 8408/12776 [1:29:00<41:32,  1.75it/s] 66%|██████▌   | 8409/12776 [1:29:00<36:31,  1.99it/s]                                                       66%|██████▌   | 8409/12776 [1:29:00<36:31,  1.99it/s] 66%|██████▌   | 8410/12776 [1:29:01<32:44,  2.22it/s]                                                       66%|██████▌   | 8410/12776 [1:29:01<32:44,  2.22it/s] 66%|██████▌   | 8411/12776 [1:29:01<29:56,  2.43it/s]                                                       66%|██████▌   | 8411/12776 [1:29:01<29:56,  2.43it/s] 66%|██████▌   | 8412/12776 [1:29:01<28:08,  2.58it/s]                                                       66%|██████▌   | 8412/12776 [1:29:01<28:08,  2.58it/s] 66%|██████▌   | 8413/12776 [1:29:01<26:17,  2.77it/s]                                                       66%|██████▌   | 8413/12776 [1:29:01<26:17,  2.77it/s] 66%|██████▌   | 8414/12776 [1:29:02<24:43,  2.94it/s]                                                       66%|██████▌   | 8414/12776 [1:29:02<24:43,  2.94it/s] 66%|██████▌   | 8415/12776 [1:29:02<24:51,  2.92it/s]                                                       66%|██████▌   | 8415/12776 [1:29:02<24:51,  2.92it/s] 66%|██████▌   | 8416/12776 [1:29:02<23:19,  3.12it/s]                                                       66%|██████▌   | 8416/12776 [1:29:02<23:19,  3.12it/s] 66%|██████▌   | 8417/12776 [1:29:03<22:03,  3.29it/s]                                                       66%|██████▌   | 8417/12776 [1:29:03<22:03,  3.29it/s] 66%|██████▌   | 8418/12776 [1:29:03<21:04,  3.45it/s]                                                       66%|██████▌   | 8418/12776 [1:29:03<21:04,  3.45it/s] 66%|██████▌   | 8419/12776 [1:29:03<22:30,  3.23it/s]                                                       66%|██████▌   | 8419/12776 [1:29:03<22:30,  3.23it/s] 66%|██████▌   | 8420/12776 [1:29:04<21:02,  3.45it/s]                                                       66%|██████▌   | 8420/12776 [1:29:04<21:02,  3.45it/s] 66%|██████▌   | 8421/12776 [1:29:04<19:46,  3.67it/s]                                                       66%|██████▌   | 8421/12776 [1:29:04<19:46,  3.67it/s] 66%|██████▌   | 8422/12776 [1:29:04<18:38,  3.89it/s]                                                       66%|██████▌   | 8422/12776 [1:29:04<18:38,  3.89it/s] 66%|██████▌   | 8423/12776 [1:29:04<17:49,  4.07it/s]                                                       66%|██████▌   | 8423/12776 [1:29:04<17:49,  4.07it/s] 66%|██████▌   | 8424/12776 [1:29:05<19:57,  3.64it/s]                                                       66%|██████▌   | 8424/12776 [1:29:05<19:57,  3.64it/s] 66%|██████▌   | 8425/12776 [1:29:05<18:29,  3.92it/s]                                                       66%|██████▌   | 8425/12776 [1:29:05<18:29,  3.92it/s] 66%|██████▌   | 8426/12776 [1:29:05<17:21,  4.18it/s]                                                       66%|██████▌   | 8426/12776 [1:29:05<17:21,  4.18it/s] 66%|██████▌   | 8427/12776 [1:29:05<16:24,  4.42it/s]                                                       66%|██████▌   | 8427/12776 [1:29:05<16:24,  4.42it/s] 66%|██████▌   | 8428/12776 [1:29:05<15:40,  4.62it/s]                                                       66%|██████▌   | 8428/12776 [1:29:05<15:40,  4.62it/s] 66%|██████▌   | 8429/12776 [1:29:06<18:19,  3.95it/s]                                                       66%|██████▌   | 8429/12776 [1:29:06<18:19,  3.95it/s] 66%|██████▌   | 8430/12776 [1:29:06<16:50,  4.30it/s]                                                       66%|██████▌   | 8430/12776 [1:29:06<16:50,  4.30it/s] 66%|██████▌   | 8431/12776 [1:29:06<15:42,  4.61it/s]                                                       66%|██████▌   | 8431/12776 [1:29:06<15:42,  4.61it/s] 66%|██████▌   | 8432/12776 [1:29:06<14:55,  4.85it/s]                                                       66%|██████▌   | 8432/12776 [1:29:06<14:55,  4.85it/s] 66%|██████▌   | 8433/12776 [1:29:06<14:15,  5.08it/s]                                                       66%|██████▌   | 8433/12776 [1:29:06<14:15,  5.08it/s] 66%|██████▌   | 8434/12776 [1:29:07<13:44,  5.27it/s]                                                       66%|██████▌   | 8434/12776 [1:29:07<13:44,  5.27it/s] 66%|██████▌   | 8435/12776 [1:29:07<14:30,  4.98it/s]                                                       66%|██████▌   | 8435/12776 [1:29:07<14:30,  4.98it/s] 66%|██████▌   | 8436/12776 [1:29:07<13:56,  5.19it/s]                                                       66%|██████▌   | 8436/12776 [1:29:07<13:56,  5.19it/s] 66%|██████▌   | 8437/12776 [1:29:07<13:19,  5.43it/s]                                                       66%|██████▌   | 8437/12776 [1:29:07<13:19,  5.43it/s] 66%|██████▌   | 8438/12776 [1:29:08<25:43,  2.81it/s]                                                       66%|██████▌   | 8438/12776 [1:29:08<25:43,  2.81it/s] 66%|██████▌   | 8439/12776 [1:29:09<44:13,  1.63it/s]                                                       66%|██████▌   | 8439/12776 [1:29:09<44:13,  1.63it/s] 66%|██████▌   | 8440/12776 [1:29:10<53:04,  1.36it/s]                                                       66%|██████▌   | 8440/12776 [1:29:10<53:04,  1.36it/s] 66%|██████▌   | 8441/12776 [1:29:11<54:10,  1.33it/s]                                                       66%|██████▌   | 8441/12776 [1:29:11<54:10,  1.33it/s] 66%|██████▌   | 8442/12776 [1:29:12<54:44,  1.32it/s]                                                       66%|██████▌   | 8442/12776 [1:29:12<54:44,  1.32it/s] 66%|██████▌   | 8443/12776 [1:29:12<54:33,  1.32it/s]                                                       66%|██████▌   | 8443/12776 [1:29:12<54:33,  1.32it/s] 66%|██████▌   | 8444/12776 [1:29:13<51:42,  1.40it/s]                                                       66%|██████▌   | 8444/12776 [1:29:13<51:42,  1.40it/s] 66%|██████▌   | 8445/12776 [1:29:14<48:50,  1.48it/s]                                                       66%|██████▌   | 8445/12776 [1:29:14<48:50,  1.48it/s] 66%|██████▌   | 8446/12776 [1:29:14<46:05,  1.57it/s]                                                       66%|██████▌   | 8446/12776 [1:29:14<46:05,  1.57it/s] 66%|██████▌   | 8447/12776 [1:29:15<45:16,  1.59it/s]                                                       66%|██████▌   | 8447/12776 [1:29:15<45:16,  1.59it/s] 66%|██████▌   | 8448/12776 [1:29:15<42:47,  1.69it/s]                                                       66%|██████▌   | 8448/12776 [1:29:15<42:47,  1.69it/s] 66%|██████▌   | 8449/12776 [1:29:16<40:23,  1.79it/s]                                                       66%|██████▌   | 8449/12776 [1:29:16<40:23,  1.79it/s] 66%|██████▌   | 8450/12776 [1:29:16<38:37,  1.87it/s]                                                       66%|██████▌   | 8450/12776 [1:29:16<38:37,  1.87it/s] 66%|██████▌   | 8451/12776 [1:29:17<36:24,  1.98it/s]                                                       66%|██████▌   | 8451/12776 [1:29:17<36:24,  1.98it/s] 66%|██████▌   | 8452/12776 [1:29:17<35:11,  2.05it/s]                                                       66%|██████▌   | 8452/12776 [1:29:17<35:11,  2.05it/s] 66%|██████▌   | 8453/12776 [1:29:18<33:20,  2.16it/s]                                                       66%|██████▌   | 8453/12776 [1:29:18<33:20,  2.16it/s] 66%|██████▌   | 8454/12776 [1:29:18<31:45,  2.27it/s]                                                       66%|██████▌   | 8454/12776 [1:29:18<31:45,  2.27it/s] 66%|██████▌   | 8455/12776 [1:29:18<33:04,  2.18it/s]                                                       66%|██████▌   | 8455/12776 [1:29:18<33:04,  2.18it/s] 66%|██████▌   | 8456/12776 [1:29:19<30:56,  2.33it/s]                                                       66%|██████▌   | 8456/12776 [1:29:19<30:56,  2.33it/s] 66%|██████▌   | 8457/12776 [1:29:19<29:11,  2.47it/s]                                                       66%|██████▌   | 8457/12776 [1:29:19<29:11,  2.47it/s] 66%|██████▌   | 8458/12776 [1:29:20<29:02,  2.48it/s]                                                       66%|██████▌   | 8458/12776 [1:29:20<29:02,  2.48it/s] 66%|██████▌   | 8459/12776 [1:29:20<27:12,  2.64it/s]                                                       66%|██████▌   | 8459/12776 [1:29:20<27:12,  2.64it/s] 66%|██████▌   | 8460/12776 [1:29:20<25:37,  2.81it/s]                                                       66%|██████▌   | 8460/12776 [1:29:20<25:37,  2.81it/s] 66%|██████▌   | 8461/12776 [1:29:21<25:52,  2.78it/s]                                                       66%|██████▌   | 8461/12776 [1:29:21<25:52,  2.78it/s] 66%|██████▌   | 8462/12776 [1:29:21<24:10,  2.98it/s]                                                       66%|██████▌   | 8462/12776 [1:29:21<24:10,  2.98it/s] 66%|██████▌   | 8463/12776 [1:29:21<22:50,  3.15it/s]                                                       66%|██████▌   | 8463/12776 [1:29:21<22:50,  3.15it/s] 66%|██████▌   | 8464/12776 [1:29:21<21:46,  3.30it/s]                                                      {'loss': 0.7366, 'grad_norm': 2.1878700256347656, 'learning_rate': 0.0001075268817204301, 'epoch': 1.31}
+{'loss': 1.0322, 'grad_norm': 2.3514275550842285, 'learning_rate': 0.00010750244379276636, 'epoch': 1.31}
+{'loss': 0.211, 'grad_norm': 0.49473220109939575, 'learning_rate': 0.00010747800586510264, 'epoch': 1.31}
+{'loss': 0.2018, 'grad_norm': 1.2258360385894775, 'learning_rate': 0.00010745356793743889, 'epoch': 1.31}
+{'loss': 0.2365, 'grad_norm': 0.7705320119857788, 'learning_rate': 0.00010742913000977515, 'epoch': 1.31}
+{'loss': 0.168, 'grad_norm': 0.46400439739227295, 'learning_rate': 0.00010740469208211143, 'epoch': 1.31}
+{'loss': 0.3099, 'grad_norm': 0.7878824472427368, 'learning_rate': 0.0001073802541544477, 'epoch': 1.31}
+{'loss': 0.1739, 'grad_norm': 0.9498413801193237, 'learning_rate': 0.00010735581622678395, 'epoch': 1.31}
+{'loss': 0.2746, 'grad_norm': 0.9091798663139343, 'learning_rate': 0.00010733137829912023, 'epoch': 1.31}
+{'loss': 0.2913, 'grad_norm': 0.8814572691917419, 'learning_rate': 0.00010730694037145649, 'epoch': 1.31}
+{'loss': 0.3373, 'grad_norm': 0.8799420595169067, 'learning_rate': 0.00010728250244379276, 'epoch': 1.31}
+{'loss': 0.3035, 'grad_norm': 0.6307176351547241, 'learning_rate': 0.00010725806451612903, 'epoch': 1.31}
+{'loss': 0.2801, 'grad_norm': 0.962165355682373, 'learning_rate': 0.00010723362658846529, 'epoch': 1.31}
+{'loss': 0.1575, 'grad_norm': 0.4829339385032654, 'learning_rate': 0.00010720918866080155, 'epoch': 1.31}
+{'loss': 0.3151, 'grad_norm': 0.9093053936958313, 'learning_rate': 0.00010718475073313783, 'epoch': 1.32}
+{'loss': 0.3769, 'grad_norm': 1.592719316482544, 'learning_rate': 0.00010716031280547408, 'epoch': 1.32}
+{'loss': 0.3612, 'grad_norm': 1.357019066810608, 'learning_rate': 0.00010713587487781034, 'epoch': 1.32}
+{'loss': 0.3585, 'grad_norm': 4.118210315704346, 'learning_rate': 0.00010711143695014662, 'epoch': 1.32}
+{'loss': 0.4046, 'grad_norm': 1.2734813690185547, 'learning_rate': 0.00010708699902248289, 'epoch': 1.32}
+{'loss': 0.2412, 'grad_norm': 0.6813839077949524, 'learning_rate': 0.00010706256109481914, 'epoch': 1.32}
+{'loss': 0.3266, 'grad_norm': 1.974395751953125, 'learning_rate': 0.00010703812316715542, 'epoch': 1.32}
+{'loss': 0.3412, 'grad_norm': 1.261323094367981, 'learning_rate': 0.00010701368523949168, 'epoch': 1.32}
+{'loss': 0.3782, 'grad_norm': 1.1508007049560547, 'learning_rate': 0.00010698924731182795, 'epoch': 1.32}
+{'loss': 0.5881, 'grad_norm': 1.8937280178070068, 'learning_rate': 0.00010696480938416422, 'epoch': 1.32}
+{'loss': 0.3392, 'grad_norm': 1.3329272270202637, 'learning_rate': 0.00010694037145650048, 'epoch': 1.32}
+{'loss': 0.5811, 'grad_norm': 0.9232223033905029, 'learning_rate': 0.00010691593352883674, 'epoch': 1.32}
+{'loss': 0.4383, 'grad_norm': 1.0369586944580078, 'learning_rate': 0.00010689149560117302, 'epoch': 1.32}
+{'loss': 0.554, 'grad_norm': 2.1007254123687744, 'learning_rate': 0.00010686705767350927, 'epoch': 1.32}
+{'loss': 0.4968, 'grad_norm': 1.1402606964111328, 'learning_rate': 0.00010684261974584554, 'epoch': 1.32}
+{'loss': 0.3661, 'grad_norm': 1.1033568382263184, 'learning_rate': 0.00010681818181818181, 'epoch': 1.32}
+{'loss': 0.3178, 'grad_norm': 1.5588769912719727, 'learning_rate': 0.00010679374389051808, 'epoch': 1.32}
+{'loss': 0.5225, 'grad_norm': 1.2382311820983887, 'learning_rate': 0.00010676930596285433, 'epoch': 1.32}
+{'loss': 0.7478, 'grad_norm': 2.5015268325805664, 'learning_rate': 0.00010674486803519061, 'epoch': 1.32}
+{'loss': 0.5885, 'grad_norm': 3.2753753662109375, 'learning_rate': 0.00010672043010752687, 'epoch': 1.32}
+{'loss': 0.6413, 'grad_norm': 2.250828742980957, 'learning_rate': 0.00010669599217986314, 'epoch': 1.32}
+{'loss': 0.8944, 'grad_norm': 1.6455849409103394, 'learning_rate': 0.0001066715542521994, 'epoch': 1.32}
+{'loss': 1.0937, 'grad_norm': 1.4225467443466187, 'learning_rate': 0.00010664711632453567, 'epoch': 1.32}
+{'loss': 1.155, 'grad_norm': 3.475102663040161, 'learning_rate': 0.00010662267839687193, 'epoch': 1.32}
+{'loss': 1.1532, 'grad_norm': 1.6069629192352295, 'learning_rate': 0.00010659824046920821, 'epoch': 1.32}
+{'loss': 0.9993, 'grad_norm': 2.740570306777954, 'learning_rate': 0.00010657380254154446, 'epoch': 1.32}
+{'loss': 0.342, 'grad_norm': 1.2884479761123657, 'learning_rate': 0.00010654936461388073, 'epoch': 1.32}
+{'loss': 0.8971, 'grad_norm': 5.032875061035156, 'learning_rate': 0.000106524926686217, 'epoch': 1.32}
+{'loss': 1.1503, 'grad_norm': 3.1008434295654297, 'learning_rate': 0.00010650048875855327, 'epoch': 1.32}
+{'loss': 1.0131, 'grad_norm': 3.7066233158111572, 'learning_rate': 0.00010647605083088952, 'epoch': 1.32}
+{'loss': 1.4015, 'grad_norm': 2.721331834793091, 'learning_rate': 0.0001064516129032258, 'epoch': 1.32}
+{'loss': 0.7506, 'grad_norm': 2.37866473197937, 'learning_rate': 0.00010642717497556206, 'epoch': 1.32}
+{'loss': 1.0727, 'grad_norm': 1.654268503189087, 'learning_rate': 0.00010640273704789833, 'epoch': 1.32}
+{'loss': 0.427, 'grad_norm': 0.7855532765388489, 'learning_rate': 0.0001063782991202346, 'epoch': 1.32}
+{'loss': 0.6105, 'grad_norm': 2.7681164741516113, 'learning_rate': 0.00010635386119257086, 'epoch': 1.32}
+{'loss': 0.7922, 'grad_norm': 4.213181495666504, 'learning_rate': 0.00010632942326490712, 'epoch': 1.32}
+{'loss': 1.3575, 'grad_norm': 4.588217258453369, 'learning_rate': 0.0001063049853372434, 'epoch': 1.32}
+{'loss': 0.9862, 'grad_norm': 1.3220864534378052, 'learning_rate': 0.00010628054740957965, 'epoch': 1.32}
+{'loss': 0.2151, 'grad_norm': 0.4366258680820465, 'learning_rate': 0.00010625610948191592, 'epoch': 1.32}
+{'loss': 0.1961, 'grad_norm': 0.3557334244251251, 'learning_rate': 0.0001062316715542522, 'epoch': 1.32}
+{'loss': 0.3202, 'grad_norm': 0.6270286440849304, 'learning_rate': 0.00010620723362658846, 'epoch': 1.32}
+{'loss': 0.1946, 'grad_norm': 0.5191538333892822, 'learning_rate': 0.00010618279569892471, 'epoch': 1.32}
+{'loss': 0.2023, 'grad_norm': 0.35942304134368896, 'learning_rate': 0.00010615835777126099, 'epoch': 1.32}
+{'loss': 0.2216, 'grad_norm': 0.5088504552841187, 'learning_rate': 0.00010613391984359726, 'epoch': 1.32}
+{'loss': 0.2147, 'grad_norm': 0.6313852667808533, 'learning_rate': 0.00010610948191593352, 'epoch': 1.32}
+{'loss': 0.3264, 'grad_norm': 1.02388596534729, 'learning_rate': 0.00010608504398826978, 'epoch': 1.32}
+{'loss': 0.3412, 'grad_norm': 0.7219918370246887, 'learning_rate': 0.00010606060606060605, 'epoch': 1.32}
+{'loss': 0.2682, 'grad_norm': 1.3611302375793457, 'learning_rate': 0.00010603616813294231, 'epoch': 1.32}
+{'loss': 0.2816, 'grad_norm': 0.6783427596092224, 'learning_rate': 0.00010601173020527859, 'epoch': 1.32}
+{'loss': 0.5017, 'grad_norm': 1.3119559288024902, 'learning_rate': 0.00010598729227761484, 'epoch': 1.32}
+{'loss': 0.6098, 'grad_norm': 2.026618719100952, 'learning_rate': 0.00010596285434995111, 'epoch': 1.32}
+{'loss': 0.3054, 'grad_norm': 1.1035137176513672, 'learning_rate': 0.00010593841642228739, 'epoch': 1.32}
+{'loss': 0.322, 'grad_norm': 1.0470503568649292, 'learning_rate': 0.00010591397849462365, 'epoch': 1.32}
+{'loss': 0.4344, 'grad_norm': 1.4040778875350952, 'learning_rate': 0.0001058895405669599, 'epoch': 1.32}
+{'loss': 0.4741, 'grad_norm': 1.1632486581802368, 'learning_rate': 0.00010586510263929618, 'epoch': 1.32}
+{'loss': 0.4954, 'grad_norm': 2.275049924850464, 'learning_rate': 0.00010584066471163245, 'epoch': 1.32}
+{'loss': 0.2482, 'grad_norm': 0.912585973739624, 'learning_rate': 0.00010581622678396871, 'epoch': 1.32}
+{'loss': 0.563, 'grad_norm': 2.720503568649292, 'learning_rate': 0.00010579178885630498, 'epoch': 1.32}
+{'loss': 0.3343, 'grad_norm': 1.2635301351547241, 'learning_rate': 0.00010576735092864124, 'epoch': 1.32}
+{'loss': 0.4257, 'grad_norm': 2.1451432704925537, 'learning_rate': 0.0001057429130009775, 'epoch': 1.32}
+{'loss': 0.3141, 'grad_norm': 0.8080310225486755, 'learning_rate': 0.00010571847507331378, 'epoch': 1.32}
+{'loss': 0.3324, 'grad_norm': 2.107248544692993, 'learning_rate': 0.00010569403714565004, 'epoch': 1.32}
+{'loss': 0.3303, 'grad_norm': 1.0356974601745605, 'learning_rate': 0.0001056695992179863, 'epoch': 1.32}
+ 66%|██████▌   | 8464/12776 [1:29:21<21:46,  3.30it/s] 66%|██████▋   | 8465/12776 [1:29:22<22:02,  3.26it/s]                                                       66%|██████▋   | 8465/12776 [1:29:22<22:02,  3.26it/s] 66%|██████▋   | 8466/12776 [1:29:22<21:20,  3.37it/s]                                                       66%|██████▋   | 8466/12776 [1:29:22<21:20,  3.37it/s] 66%|██████▋   | 8467/12776 [1:29:22<20:42,  3.47it/s]                                                       66%|██████▋   | 8467/12776 [1:29:22<20:42,  3.47it/s] 66%|██████▋   | 8468/12776 [1:29:22<20:07,  3.57it/s]                                                       66%|██████▋   | 8468/12776 [1:29:22<20:07,  3.57it/s] 66%|██████▋   | 8469/12776 [1:29:23<21:01,  3.41it/s]                                                       66%|██████▋   | 8469/12776 [1:29:23<21:01,  3.41it/s] 66%|██████▋   | 8470/12776 [1:29:23<20:01,  3.58it/s]                                                       66%|██████▋   | 8470/12776 [1:29:23<20:01,  3.58it/s] 66%|██████▋   | 8471/12776 [1:29:23<19:14,  3.73it/s]                                                       66%|██████▋   | 8471/12776 [1:29:23<19:14,  3.73it/s] 66%|██████▋   | 8472/12776 [1:29:24<18:37,  3.85it/s]                                                       66%|██████▋   | 8472/12776 [1:29:24<18:37,  3.85it/s] 66%|██████▋   | 8473/12776 [1:29:24<18:02,  3.97it/s]                                                       66%|██████▋   | 8473/12776 [1:29:24<18:02,  3.97it/s] 66%|██████▋   | 8474/12776 [1:29:24<19:09,  3.74it/s]                                                       66%|██████▋   | 8474/12776 [1:29:24<19:09,  3.74it/s] 66%|██████▋   | 8475/12776 [1:29:24<18:04,  3.97it/s]                                                       66%|██████▋   | 8475/12776 [1:29:24<18:04,  3.97it/s] 66%|██████▋   | 8476/12776 [1:29:25<17:13,  4.16it/s]                                                       66%|██████▋   | 8476/12776 [1:29:25<17:13,  4.16it/s] 66%|██████▋   | 8477/12776 [1:29:25<16:34,  4.32it/s]                                                       66%|██████▋   | 8477/12776 [1:29:25<16:34,  4.32it/s] 66%|██████▋   | 8478/12776 [1:29:25<16:03,  4.46it/s]                                                       66%|██████▋   | 8478/12776 [1:29:25<16:03,  4.46it/s] 66%|██████▋   | 8479/12776 [1:29:25<17:14,  4.15it/s]                                                       66%|██████▋   | 8479/12776 [1:29:25<17:14,  4.15it/s] 66%|██████▋   | 8480/12776 [1:29:25<15:57,  4.49it/s]                                                       66%|██████▋   | 8480/12776 [1:29:25<15:57,  4.49it/s] 66%|██████▋   | 8481/12776 [1:29:26<15:00,  4.77it/s]                                                       66%|██████▋   | 8481/12776 [1:29:26<15:00,  4.77it/s] 66%|██████▋   | 8482/12776 [1:29:26<14:16,  5.01it/s]                                                       66%|██████▋   | 8482/12776 [1:29:26<14:16,  5.01it/s] 66%|██████▋   | 8483/12776 [1:29:26<13:45,  5.20it/s]                                                       66%|██████▋   | 8483/12776 [1:29:26<13:45,  5.20it/s] 66%|██████▋   | 8484/12776 [1:29:26<13:19,  5.37it/s]                                                       66%|██████▋   | 8484/12776 [1:29:26<13:19,  5.37it/s] 66%|██████▋   | 8485/12776 [1:29:26<14:10,  5.05it/s]                                                       66%|██████▋   | 8485/12776 [1:29:26<14:10,  5.05it/s] 66%|██████▋   | 8486/12776 [1:29:26<13:31,  5.29it/s]                                                       66%|██████▋   | 8486/12776 [1:29:26<13:31,  5.29it/s] 66%|██████▋   | 8487/12776 [1:29:27<12:54,  5.54it/s]                                                       66%|██████▋   | 8487/12776 [1:29:27<12:54,  5.54it/s] 66%|██████▋   | 8488/12776 [1:29:27<25:38,  2.79it/s]                                                       66%|██████▋   | 8488/12776 [1:29:27<25:38,  2.79it/s] 66%|██████▋   | 8489/12776 [1:29:29<50:49,  1.41it/s]                                                       66%|██████▋   | 8489/12776 [1:29:29<50:49,  1.41it/s] 66%|██████▋   | 8490/12776 [1:29:30<58:35,  1.22it/s]                                                       66%|██████▋   | 8490/12776 [1:29:30<58:35,  1.22it/s] 66%|██████▋   | 8491/12776 [1:29:31<59:09,  1.21it/s]                                                       66%|██████▋   | 8491/12776 [1:29:31<59:09,  1.21it/s] 66%|██████▋   | 8492/12776 [1:29:32<57:53,  1.23it/s]                                                       66%|██████▋   | 8492/12776 [1:29:32<57:53,  1.23it/s] 66%|██████▋   | 8493/12776 [1:29:32<56:08,  1.27it/s]                                                       66%|██████▋   | 8493/12776 [1:29:32<56:08,  1.27it/s] 66%|██████▋   | 8494/12776 [1:29:33<56:01,  1.27it/s]                                                       66%|██████▋   | 8494/12776 [1:29:33<56:01,  1.27it/s] 66%|██████▋   | 8495/12776 [1:29:34<53:27,  1.33it/s]                                                       66%|██████▋   | 8495/12776 [1:29:34<53:27,  1.33it/s] 66%|██████▋   | 8496/12776 [1:29:35<53:15,  1.34it/s]                                                       66%|██████▋   | 8496/12776 [1:29:35<53:15,  1.34it/s] 67%|██████▋   | 8497/12776 [1:29:35<50:19,  1.42it/s]                                                       67%|██████▋   | 8497/12776 [1:29:35<50:19,  1.42it/s] 67%|██████▋   | 8498/12776 [1:29:36<48:01,  1.48it/s]                                                       67%|██████▋   | 8498/12776 [1:29:36<48:01,  1.48it/s] 67%|██████▋   | 8499/12776 [1:29:36<45:25,  1.57it/s]                                                       67%|██████▋   | 8499/12776 [1:29:36<45:25,  1.57it/s] 67%|██████▋   | 8500/12776 [1:29:37<44:08,  1.61it/s]                                                       67%|██████▋   | 8500/12776 [1:29:37<44:08,  1.61it/s] 67%|██████▋   | 8501/12776 [1:29:37<41:28,  1.72it/s]                                                       67%|██████▋   | 8501/12776 [1:29:37<41:28,  1.72it/s] 67%|██████▋   | 8502/12776 [1:29:38<40:48,  1.75it/s]                                                       67%|██████▋   | 8502/12776 [1:29:38<40:48,  1.75it/s] 67%|██████▋   | 8503/12776 [1:29:38<38:13,  1.86it/s]                                                       67%|██████▋   | 8503/12776 [1:29:38<38:13,  1.86it/s] 67%|██████▋   | 8504/12776 [1:29:39<37:19,  1.91it/s]                                                       67%|██████▋   | 8504/12776 [1:29:39<37:19,  1.91it/s] 67%|██████▋   | 8505/12776 [1:29:39<34:55,  2.04it/s]                                                       67%|██████▋   | 8505/12776 [1:29:39<34:55,  2.04it/s] 67%|██████▋   | 8506/12776 [1:29:40<33:05,  2.15it/s]                                                       67%|██████▋   | 8506/12776 [1:29:40<33:05,  2.15it/s] 67%|██████▋   | 8507/12776 [1:29:40<33:35,  2.12it/s]                                                       67%|██████▋   | 8507/12776 [1:29:40<33:35,  2.12it/s] 67%|██████▋   | 8508/12776 [1:29:41<31:43,  2.24it/s]                                                       67%|██████▋   | 8508/12776 [1:29:41<31:43,  2.24it/s] 67%|██████▋   | 8509/12776 [1:29:41<29:57,  2.37it/s]                                                       67%|██████▋   | 8509/12776 [1:29:41<29:57,  2.37it/s] 67%|██████▋   | 8510/12776 [1:29:41<29:52,  2.38it/s]                                                       67%|██████▋   | 8510/12776 [1:29:41<29:52,  2.38it/s] 67%|██████▋   | 8511/12776 [1:29:42<28:09,  2.52it/s]                                                       67%|██████▋   | 8511/12776 [1:29:42<28:09,  2.52it/s] 67%|██████▋   | 8512/12776 [1:29:42<26:48,  2.65it/s]                                                       67%|██████▋   | 8512/12776 [1:29:42<26:48,  2.65it/s] 67%|██████▋   | 8513/12776 [1:29:42<27:02,  2.63it/s]                                                       67%|██████▋   | 8513/12776 [1:29:42<27:02,  2.63it/s] 67%|██████▋   | 8514/12776 [1:29:43<25:19,  2.81it/s]                                                       67%|██████▋   | 8514/12776 [1:29:43<25:19,  2.81it/s] 67%|██████▋   | 8515/12776 [1:29:43<23:53,  2.97it/s]                                                       67%|██████▋   | 8515/12776 [1:29:43<23:53,  2.97it/s] 67%|██████▋   | 8516/12776 [1:29:43<24:56,  2.85it/s]                                                       67%|██████▋   | 8516/12776 [1:29:43<24:56,  2.85it/s] 67%|██████▋   | 8517/12776 [1:29:44<23:18,  3.04it/s]                                                       67%|██████▋   | 8517/12776 [1:29:44<23:18,  3.04it/s] 67%|██████▋   | 8518/12776 [1:29:44<22:02,  3.22it/s]                                                       67%|██████▋   | 8518/12776 [1:29:44<22:02,  3.22it/s] 67%|██████▋   | 8519/12776 [1:29:44<21:00,  3.38it/s]                                                       67%|██████▋   | 8519/12776 [1:29:44<21:00,  3.38it/s] 67%|██████▋   | 8520/12776 [1:29:45<21:33,  3.29it/s]                                                       67%|██████▋   | 8520/12776 [1:29:45<21:33,  3.29it/s] 67%|██████▋   | 8521/12776 [1:29:45<20:22,  3.48it/s]                                                       67%|██████▋   | 8521/12776 [1:29:45<20:22,  3.48it/s] 67%|██████▋   | 8522/12776 [1:29:45<19:27,  3.64it/s]                                                       67%|██████▋   | 8522/12776 [1:29:45<19:27,  3.64it/s] 67%|██████▋   | 8523/12776 [1:29:45<18:40,  3.79it/s]                                                       67%|██████▋   | 8523/12776 [1:29:45<18:40,  3.79it/s] 67%|██████▋   | 8524/12776 [1:29:46<18:12,  3.89it/s]                                                       67%|██████▋   | 8524/12776 [1:29:46<18:12,  3.89it/s] 67%|██████▋   | 8525/12776 [1:29:46<19:04,  3.71it/s]                                                       67%|██████▋   | 8525/12776 [1:29:46<19:04,  3.71it/s] 67%|██████▋   | 8526/12776 [1:29:46<17:55,  3.95it/s]                                                       67%|██████▋   | 8526/12776 [1:29:46<17:55,  3.95it/s] 67%|██████▋   | 8527/12776 [1:29:46<17:01,  4.16it/s]                                                       67%|██████▋   | 8527/12776 [1:29:46<17:01,  4.16it/s] 67%|██████▋   | 8528/12776 [1:29:46<16:24,  4.32it/s]                                                       67%|██████▋   | 8528/12776 [1:29:46<16:24,  4.32it/s] 67%|██████▋   | 8529/12776 [1:29:47<15:54,  4.45it/s]                                                       67%|██████▋   | 8529/12776 [1:29:47<15:54,  4.45it/s] 67%|██████▋   | 8530/12776 [1:29:47<16:52,  4.19it/s]                                                       67%|██████▋   | 8530/12776 [1:29:47<16:52,  4.19it/s] 67%|██████▋   | 8531/12776 [1:29:47<16:03,  4.41it/s]                                                       67%|██████▋   | 8531/12776 [1:29:47<16:03,  4.41it/s] 67%|██████▋   | 8532/12776 [1:29:47<15:35,  4.54it/s]                                                       67%|██████▋   | 8532/12776 [1:29:47<15:35,  4.54it/s] 67%|██████▋   | 8533/12776 [1:29:48<15:07,  4.68it/s]                                                       67%|██████▋   | 8533/12776 [1:29:48<15:07,  4.68it/s] 67%|██████▋   | 8534/12776 [1:29:48<14:45,  4.79it/s]                                                       67%|██████▋   | 8534/12776 [1:29:48<14:45,  4.79it/s] 67%|██████▋   | 8535/12776 [1:29:48<16:34,  4.27it/s]                                                       67%|██████▋   | 8535/12776 [1:29:48<16:34,  4.27it/s] 67%|██████▋   | 8536/12776 [1:29:48<15:32,  4.55it/s]                                                       67%|██████▋   | 8536/12776 [1:29:48<15:32,  4.55it/s] 67%|██████▋   | 8537/12776 [1:29:48<14:48,  4.77it/s]                                                       67%|██████▋   | 8537/12776 [1:29:48<14:48,  4.77it/s] 67%|██████▋   | 8538/12776 [1:29:49<26:22,  2.68it/s]                                                       67%|██████▋   | 8538/12776 [1:29:49<26:22,  2.68it/s] 67%|██████▋   | 8539/12776 [1:29:51<48:59,  1.44it/s]                                                       67%|██████▋   | 8539/12776 [1:29:51<48:59,  1.44it/s] 67%|██████▋   | 8540/12776 [1:29:52<55:36,  1.27it/s]                                                       67%|██████▋   | 8540/12776 [1:29:52<55:36,  1.27it/s] 67%|██████▋   | 8541/12776 [1:29:53<59:32,  1.19it/s]                                                      {'loss': 0.4119, 'grad_norm': 1.8852458000183105, 'learning_rate': 0.00010564516129032258, 'epoch': 1.32}
+{'loss': 0.4388, 'grad_norm': 1.1586437225341797, 'learning_rate': 0.00010562072336265884, 'epoch': 1.33}
+{'loss': 0.7354, 'grad_norm': 2.8293962478637695, 'learning_rate': 0.0001055962854349951, 'epoch': 1.33}
+{'loss': 0.6096, 'grad_norm': 3.1717278957366943, 'learning_rate': 0.00010557184750733137, 'epoch': 1.33}
+{'loss': 1.0067, 'grad_norm': 3.8848044872283936, 'learning_rate': 0.00010554740957966764, 'epoch': 1.33}
+{'loss': 0.5702, 'grad_norm': 1.52760648727417, 'learning_rate': 0.0001055229716520039, 'epoch': 1.33}
+{'loss': 0.8501, 'grad_norm': 1.8875513076782227, 'learning_rate': 0.00010549853372434017, 'epoch': 1.33}
+{'loss': 1.1365, 'grad_norm': 4.692626953125, 'learning_rate': 0.00010547409579667643, 'epoch': 1.33}
+{'loss': 1.165, 'grad_norm': 3.195230484008789, 'learning_rate': 0.0001054496578690127, 'epoch': 1.33}
+{'loss': 1.016, 'grad_norm': 2.567884683609009, 'learning_rate': 0.00010542521994134898, 'epoch': 1.33}
+{'loss': 1.1238, 'grad_norm': 2.327993392944336, 'learning_rate': 0.00010540078201368523, 'epoch': 1.33}
+{'loss': 1.3329, 'grad_norm': 4.008900165557861, 'learning_rate': 0.00010537634408602149, 'epoch': 1.33}
+{'loss': 1.2717, 'grad_norm': 3.4523065090179443, 'learning_rate': 0.00010535190615835777, 'epoch': 1.33}
+{'loss': 1.3055, 'grad_norm': 2.4509525299072266, 'learning_rate': 0.00010532746823069403, 'epoch': 1.33}
+{'loss': 1.0867, 'grad_norm': 2.2657418251037598, 'learning_rate': 0.00010530303030303029, 'epoch': 1.33}
+{'loss': 1.2424, 'grad_norm': 10.114274024963379, 'learning_rate': 0.00010527859237536656, 'epoch': 1.33}
+{'loss': 1.5289, 'grad_norm': 2.6086230278015137, 'learning_rate': 0.00010525415444770283, 'epoch': 1.33}
+{'loss': 0.8191, 'grad_norm': 7.500784397125244, 'learning_rate': 0.0001052297165200391, 'epoch': 1.33}
+{'loss': 0.9757, 'grad_norm': 2.1050162315368652, 'learning_rate': 0.00010520527859237536, 'epoch': 1.33}
+{'loss': 1.1642, 'grad_norm': 1.964867353439331, 'learning_rate': 0.00010518084066471162, 'epoch': 1.33}
+{'loss': 0.9598, 'grad_norm': 2.479874849319458, 'learning_rate': 0.00010515640273704789, 'epoch': 1.33}
+{'loss': 0.3375, 'grad_norm': 1.8766109943389893, 'learning_rate': 0.00010513196480938417, 'epoch': 1.33}
+{'loss': 1.2596, 'grad_norm': 2.781697988510132, 'learning_rate': 0.00010510752688172042, 'epoch': 1.33}
+{'loss': 1.0103, 'grad_norm': 4.228986740112305, 'learning_rate': 0.00010508308895405668, 'epoch': 1.33}
+{'loss': 0.83, 'grad_norm': 1.7900171279907227, 'learning_rate': 0.00010505865102639296, 'epoch': 1.33}
+{'loss': 0.2403, 'grad_norm': 0.6766242384910583, 'learning_rate': 0.00010503421309872923, 'epoch': 1.33}
+{'loss': 0.3599, 'grad_norm': 0.5489671230316162, 'learning_rate': 0.00010500977517106548, 'epoch': 1.33}
+{'loss': 0.2344, 'grad_norm': 0.7693419456481934, 'learning_rate': 0.00010498533724340176, 'epoch': 1.33}
+{'loss': 0.2692, 'grad_norm': 0.5848720073699951, 'learning_rate': 0.00010496089931573802, 'epoch': 1.33}
+{'loss': 0.1891, 'grad_norm': 0.4316113591194153, 'learning_rate': 0.00010493646138807427, 'epoch': 1.33}
+{'loss': 0.2097, 'grad_norm': 0.5180829167366028, 'learning_rate': 0.00010491202346041055, 'epoch': 1.33}
+{'loss': 0.1346, 'grad_norm': 0.31930309534072876, 'learning_rate': 0.00010488758553274681, 'epoch': 1.33}
+{'loss': 0.3029, 'grad_norm': 0.5279285907745361, 'learning_rate': 0.00010486314760508308, 'epoch': 1.33}
+{'loss': 0.319, 'grad_norm': 0.6613864898681641, 'learning_rate': 0.00010483870967741936, 'epoch': 1.33}
+{'loss': 0.2005, 'grad_norm': 0.5798127055168152, 'learning_rate': 0.00010481427174975561, 'epoch': 1.33}
+{'loss': 0.2422, 'grad_norm': 0.7351884245872498, 'learning_rate': 0.00010478983382209187, 'epoch': 1.33}
+{'loss': 0.3467, 'grad_norm': 0.8078888058662415, 'learning_rate': 0.00010476539589442815, 'epoch': 1.33}
+{'loss': 0.2722, 'grad_norm': 0.8279627561569214, 'learning_rate': 0.00010474095796676442, 'epoch': 1.33}
+{'loss': 0.4529, 'grad_norm': 1.2416898012161255, 'learning_rate': 0.00010471652003910067, 'epoch': 1.33}
+{'loss': 0.4516, 'grad_norm': 1.2886146306991577, 'learning_rate': 0.00010469208211143695, 'epoch': 1.33}
+{'loss': 0.1717, 'grad_norm': 0.4999758303165436, 'learning_rate': 0.00010466764418377321, 'epoch': 1.33}
+{'loss': 0.3384, 'grad_norm': 1.2985860109329224, 'learning_rate': 0.00010464320625610946, 'epoch': 1.33}
+{'loss': 0.4438, 'grad_norm': 2.1126205921173096, 'learning_rate': 0.00010461876832844574, 'epoch': 1.33}
+{'loss': 0.2766, 'grad_norm': 2.8582603931427, 'learning_rate': 0.000104594330400782, 'epoch': 1.33}
+{'loss': 0.3095, 'grad_norm': 1.146053433418274, 'learning_rate': 0.00010456989247311827, 'epoch': 1.33}
+{'loss': 0.5287, 'grad_norm': 1.1559828519821167, 'learning_rate': 0.00010454545454545455, 'epoch': 1.33}
+{'loss': 0.3, 'grad_norm': 1.217333436012268, 'learning_rate': 0.0001045210166177908, 'epoch': 1.33}
+{'loss': 0.6234, 'grad_norm': 1.9483610391616821, 'learning_rate': 0.00010449657869012706, 'epoch': 1.33}
+{'loss': 0.6622, 'grad_norm': 1.2724167108535767, 'learning_rate': 0.00010447214076246334, 'epoch': 1.33}
+{'loss': 0.8264, 'grad_norm': 3.2737345695495605, 'learning_rate': 0.00010444770283479961, 'epoch': 1.33}
+{'loss': 0.649, 'grad_norm': 2.594571352005005, 'learning_rate': 0.00010442326490713586, 'epoch': 1.33}
+{'loss': 0.4745, 'grad_norm': 2.1215741634368896, 'learning_rate': 0.00010439882697947214, 'epoch': 1.33}
+{'loss': 0.8717, 'grad_norm': 2.0820720195770264, 'learning_rate': 0.0001043743890518084, 'epoch': 1.33}
+{'loss': 0.6137, 'grad_norm': 1.5317957401275635, 'learning_rate': 0.00010434995112414465, 'epoch': 1.33}
+{'loss': 0.9649, 'grad_norm': 2.0376477241516113, 'learning_rate': 0.00010432551319648093, 'epoch': 1.33}
+{'loss': 0.963, 'grad_norm': 2.7941699028015137, 'learning_rate': 0.0001043010752688172, 'epoch': 1.33}
+{'loss': 0.5505, 'grad_norm': 2.3799190521240234, 'learning_rate': 0.00010427663734115346, 'epoch': 1.33}
+{'loss': 0.4311, 'grad_norm': 1.9923593997955322, 'learning_rate': 0.00010425219941348974, 'epoch': 1.33}
+{'loss': 0.8483, 'grad_norm': 1.5143426656723022, 'learning_rate': 0.00010422776148582599, 'epoch': 1.33}
+{'loss': 0.7053, 'grad_norm': 1.211293339729309, 'learning_rate': 0.00010420332355816226, 'epoch': 1.33}
+{'loss': 0.7195, 'grad_norm': 2.1427128314971924, 'learning_rate': 0.00010417888563049853, 'epoch': 1.33}
+{'loss': 1.0497, 'grad_norm': 2.3068511486053467, 'learning_rate': 0.0001041544477028348, 'epoch': 1.33}
+{'loss': 0.8406, 'grad_norm': 1.6837023496627808, 'learning_rate': 0.00010413000977517105, 'epoch': 1.33}
+{'loss': 1.1097, 'grad_norm': 2.0553340911865234, 'learning_rate': 0.00010410557184750733, 'epoch': 1.33}
+{'loss': 1.5506, 'grad_norm': 2.150421619415283, 'learning_rate': 0.0001040811339198436, 'epoch': 1.34}
+{'loss': 1.0281, 'grad_norm': 1.8495726585388184, 'learning_rate': 0.00010405669599217984, 'epoch': 1.34}
+{'loss': 1.2466, 'grad_norm': 5.749996662139893, 'learning_rate': 0.00010403225806451612, 'epoch': 1.34}
+{'loss': 0.9421, 'grad_norm': 3.338156223297119, 'learning_rate': 0.00010400782013685239, 'epoch': 1.34}
+{'loss': 0.594, 'grad_norm': 1.7294635772705078, 'learning_rate': 0.00010398338220918865, 'epoch': 1.34}
+{'loss': 0.5398, 'grad_norm': 1.9416723251342773, 'learning_rate': 0.00010395894428152493, 'epoch': 1.34}
+{'loss': 0.8419, 'grad_norm': 4.1962785720825195, 'learning_rate': 0.00010393450635386118, 'epoch': 1.34}
+{'loss': 0.3254, 'grad_norm': 1.9734654426574707, 'learning_rate': 0.00010391006842619745, 'epoch': 1.34}
+{'loss': 0.8437, 'grad_norm': 2.3827269077301025, 'learning_rate': 0.00010388563049853373, 'epoch': 1.34}
+{'loss': 0.6051, 'grad_norm': 1.408588171005249, 'learning_rate': 0.00010386119257086998, 'epoch': 1.34}
+{'loss': 0.855, 'grad_norm': 2.1959314346313477, 'learning_rate': 0.00010383675464320624, 'epoch': 1.34}
+{'loss': 0.2188, 'grad_norm': 0.5544465780258179, 'learning_rate': 0.00010381231671554252, 'epoch': 1.34}
+{'loss': 0.2731, 'grad_norm': 1.3485573530197144, 'learning_rate': 0.00010378787878787878, 'epoch': 1.34}
+ 67%|██████▋   | 8541/12776 [1:29:53<59:32,  1.19it/s] 67%|██████▋   | 8542/12776 [1:29:53<58:00,  1.22it/s]                                                       67%|██████▋   | 8542/12776 [1:29:53<58:00,  1.22it/s] 67%|██████▋   | 8543/12776 [1:29:54<57:12,  1.23it/s]                                                       67%|██████▋   | 8543/12776 [1:29:54<57:12,  1.23it/s] 67%|██████▋   | 8544/12776 [1:29:55<55:34,  1.27it/s]                                                       67%|██████▋   | 8544/12776 [1:29:55<55:34,  1.27it/s] 67%|██████▋   | 8545/12776 [1:29:55<52:33,  1.34it/s]                                                       67%|██████▋   | 8545/12776 [1:29:55<52:33,  1.34it/s] 67%|██████▋   | 8546/12776 [1:29:56<53:07,  1.33it/s]                                                       67%|██████▋   | 8546/12776 [1:29:56<53:07,  1.33it/s] 67%|██████▋   | 8547/12776 [1:29:57<49:33,  1.42it/s]                                                       67%|██████▋   | 8547/12776 [1:29:57<49:33,  1.42it/s] 67%|██████▋   | 8548/12776 [1:29:57<47:53,  1.47it/s]                                                       67%|██████▋   | 8548/12776 [1:29:57<47:53,  1.47it/s] 67%|██████▋   | 8549/12776 [1:29:58<45:05,  1.56it/s]                                                       67%|██████▋   | 8549/12776 [1:29:58<45:05,  1.56it/s] 67%|██████▋   | 8550/12776 [1:29:59<43:51,  1.61it/s]                                                       67%|██████▋   | 8550/12776 [1:29:59<43:51,  1.61it/s] 67%|██████▋   | 8551/12776 [1:29:59<41:25,  1.70it/s]                                                       67%|██████▋   | 8551/12776 [1:29:59<41:25,  1.70it/s] 67%|██████▋   | 8552/12776 [1:30:00<40:07,  1.75it/s]                                                       67%|██████▋   | 8552/12776 [1:30:00<40:07,  1.75it/s] 67%|██████▋   | 8553/12776 [1:30:00<37:40,  1.87it/s]                                                       67%|██████▋   | 8553/12776 [1:30:00<37:40,  1.87it/s] 67%|██████▋   | 8554/12776 [1:30:01<36:10,  1.94it/s]                                                       67%|██████▋   | 8554/12776 [1:30:01<36:10,  1.94it/s] 67%|██████▋   | 8555/12776 [1:30:01<34:16,  2.05it/s]                                                       67%|██████▋   | 8555/12776 [1:30:01<34:16,  2.05it/s] 67%|██████▋   | 8556/12776 [1:30:01<32:35,  2.16it/s]                                                       67%|██████▋   | 8556/12776 [1:30:01<32:35,  2.16it/s] 67%|██████▋   | 8557/12776 [1:30:02<34:15,  2.05it/s]                                                       67%|██████▋   | 8557/12776 [1:30:02<34:15,  2.05it/s] 67%|██████▋   | 8558/12776 [1:30:02<31:49,  2.21it/s]                                                       67%|██████▋   | 8558/12776 [1:30:02<31:49,  2.21it/s] 67%|██████▋   | 8559/12776 [1:30:03<29:33,  2.38it/s]                                                       67%|██████▋   | 8559/12776 [1:30:03<29:33,  2.38it/s] 67%|██████▋   | 8560/12776 [1:30:03<29:09,  2.41it/s]                                                       67%|██████▋   | 8560/12776 [1:30:03<29:09,  2.41it/s] 67%|██████▋   | 8561/12776 [1:30:03<27:17,  2.57it/s]                                                       67%|██████▋   | 8561/12776 [1:30:03<27:17,  2.57it/s] 67%|██████▋   | 8562/12776 [1:30:04<25:41,  2.73it/s]                                                       67%|██████▋   | 8562/12776 [1:30:04<25:41,  2.73it/s] 67%|██████▋   | 8563/12776 [1:30:04<25:14,  2.78it/s]                                                       67%|██████▋   | 8563/12776 [1:30:04<25:14,  2.78it/s] 67%|██████▋   | 8564/12776 [1:30:04<23:44,  2.96it/s]                                                       67%|██████▋   | 8564/12776 [1:30:04<23:44,  2.96it/s] 67%|██████▋   | 8565/12776 [1:30:05<22:32,  3.11it/s]                                                       67%|██████▋   | 8565/12776 [1:30:05<22:32,  3.11it/s] 67%|██████▋   | 8566/12776 [1:30:05<21:36,  3.25it/s]                                                       67%|██████▋   | 8566/12776 [1:30:05<21:36,  3.25it/s] 67%|██████▋   | 8567/12776 [1:30:05<21:25,  3.27it/s]                                                       67%|██████▋   | 8567/12776 [1:30:05<21:25,  3.27it/s] 67%|██████▋   | 8568/12776 [1:30:05<20:32,  3.41it/s]                                                       67%|██████▋   | 8568/12776 [1:30:05<20:32,  3.41it/s] 67%|██████▋   | 8569/12776 [1:30:06<19:44,  3.55it/s]                                                       67%|██████▋   | 8569/12776 [1:30:06<19:44,  3.55it/s] 67%|██████▋   | 8570/12776 [1:30:06<19:04,  3.68it/s]                                                       67%|██████▋   | 8570/12776 [1:30:06<19:04,  3.68it/s] 67%|██████▋   | 8571/12776 [1:30:06<21:09,  3.31it/s]                                                       67%|██████▋   | 8571/12776 [1:30:06<21:09,  3.31it/s] 67%|██████▋   | 8572/12776 [1:30:07<19:51,  3.53it/s]                                                       67%|██████▋   | 8572/12776 [1:30:07<19:51,  3.53it/s] 67%|██████▋   | 8573/12776 [1:30:07<18:50,  3.72it/s]                                                       67%|██████▋   | 8573/12776 [1:30:07<18:50,  3.72it/s] 67%|██████▋   | 8574/12776 [1:30:07<17:58,  3.90it/s]                                                       67%|██████▋   | 8574/12776 [1:30:07<17:58,  3.90it/s] 67%|██████▋   | 8575/12776 [1:30:07<17:09,  4.08it/s]                                                       67%|██████▋   | 8575/12776 [1:30:07<17:09,  4.08it/s] 67%|██████▋   | 8576/12776 [1:30:08<18:26,  3.79it/s]                                                       67%|██████▋   | 8576/12776 [1:30:08<18:26,  3.79it/s] 67%|██████▋   | 8577/12776 [1:30:08<17:17,  4.05it/s]                                                       67%|██████▋   | 8577/12776 [1:30:08<17:17,  4.05it/s] 67%|██████▋   | 8578/12776 [1:30:08<16:31,  4.24it/s]                                                       67%|██████▋   | 8578/12776 [1:30:08<16:31,  4.24it/s] 67%|██████▋   | 8579/12776 [1:30:08<15:55,  4.39it/s]                                                       67%|██████▋   | 8579/12776 [1:30:08<15:55,  4.39it/s] 67%|██████▋   | 8580/12776 [1:30:08<15:26,  4.53it/s]                                                       67%|██████▋   | 8580/12776 [1:30:08<15:26,  4.53it/s] 67%|██████▋   | 8581/12776 [1:30:09<16:42,  4.18it/s]                                                       67%|██████▋   | 8581/12776 [1:30:09<16:42,  4.18it/s] 67%|██████▋   | 8582/12776 [1:30:09<15:53,  4.40it/s]                                                       67%|██████▋   | 8582/12776 [1:30:09<15:53,  4.40it/s] 67%|██████▋   | 8583/12776 [1:30:09<15:14,  4.59it/s]                                                       67%|██████▋   | 8583/12776 [1:30:09<15:14,  4.59it/s] 67%|██████▋   | 8584/12776 [1:30:09<14:42,  4.75it/s]                                                       67%|██████▋   | 8584/12776 [1:30:09<14:42,  4.75it/s] 67%|██████▋   | 8585/12776 [1:30:09<14:20,  4.87it/s]                                                       67%|██████▋   | 8585/12776 [1:30:09<14:20,  4.87it/s] 67%|██████▋   | 8586/12776 [1:30:10<15:07,  4.62it/s]                                                       67%|██████▋   | 8586/12776 [1:30:10<15:07,  4.62it/s] 67%|██████▋   | 8587/12776 [1:30:10<14:28,  4.83it/s]                                                       67%|██████▋   | 8587/12776 [1:30:10<14:28,  4.83it/s] 67%|██████▋   | 8588/12776 [1:30:11<25:29,  2.74it/s]                                                       67%|██████▋   | 8588/12776 [1:30:11<25:29,  2.74it/s] 67%|██████▋   | 8589/12776 [1:30:12<50:44,  1.38it/s]                                                       67%|██████▋   | 8589/12776 [1:30:12<50:44,  1.38it/s] 67%|██████▋   | 8590/12776 [1:30:13<57:34,  1.21it/s]                                                       67%|██████▋   | 8590/12776 [1:30:13<57:34,  1.21it/s] 67%|██████▋   | 8591/12776 [1:30:14<58:21,  1.20it/s]                                                       67%|██████▋   | 8591/12776 [1:30:14<58:21,  1.20it/s] 67%|██████▋   | 8592/12776 [1:30:15<57:24,  1.21it/s]                                                       67%|██████▋   | 8592/12776 [1:30:15<57:24,  1.21it/s] 67%|██████▋   | 8593/12776 [1:30:16<58:38,  1.19it/s]                                                       67%|██████▋   | 8593/12776 [1:30:16<58:38,  1.19it/s] 67%|██████▋   | 8594/12776 [1:30:16<55:18,  1.26it/s]                                                       67%|██████▋   | 8594/12776 [1:30:16<55:18,  1.26it/s] 67%|██████▋   | 8595/12776 [1:30:17<52:24,  1.33it/s]                                                       67%|██████▋   | 8595/12776 [1:30:17<52:24,  1.33it/s] 67%|██████▋   | 8596/12776 [1:30:18<51:06,  1.36it/s]                                                       67%|██████▋   | 8596/12776 [1:30:18<51:06,  1.36it/s] 67%|██████▋   | 8597/12776 [1:30:18<47:47,  1.46it/s]                                                       67%|██████▋   | 8597/12776 [1:30:18<47:47,  1.46it/s] 67%|██████▋   | 8598/12776 [1:30:19<45:54,  1.52it/s]                                                       67%|██████▋   | 8598/12776 [1:30:19<45:54,  1.52it/s] 67%|██████▋   | 8599/12776 [1:30:20<43:22,  1.60it/s]                                                       67%|██████▋   | 8599/12776 [1:30:20<43:22,  1.60it/s] 67%|██████▋   | 8600/12776 [1:30:20<42:59,  1.62it/s]                                                       67%|██████▋   | 8600/12776 [1:30:20<42:59,  1.62it/s] 67%|██████▋   | 8601/12776 [1:30:21<40:08,  1.73it/s]                                                       67%|██████▋   | 8601/12776 [1:30:21<40:08,  1.73it/s] 67%|██████▋   | 8602/12776 [1:30:21<37:36,  1.85it/s]                                                       67%|██████▋   | 8602/12776 [1:30:21<37:36,  1.85it/s] 67%|██████▋   | 8603/12776 [1:30:22<36:01,  1.93it/s]                                                       67%|██████▋   | 8603/12776 [1:30:22<36:01,  1.93it/s] 67%|██████▋   | 8604/12776 [1:30:22<33:41,  2.06it/s]                                                       67%|██████▋   | 8604/12776 [1:30:22<33:41,  2.06it/s] 67%|██████▋   | 8605/12776 [1:30:22<33:17,  2.09it/s]                                                       67%|██████▋   | 8605/12776 [1:30:22<33:17,  2.09it/s] 67%|██████▋   | 8606/12776 [1:30:23<31:17,  2.22it/s]                                                       67%|██████▋   | 8606/12776 [1:30:23<31:17,  2.22it/s] 67%|██████▋   | 8607/12776 [1:30:23<29:32,  2.35it/s]                                                       67%|██████▋   | 8607/12776 [1:30:23<29:32,  2.35it/s] 67%|██████▋   | 8608/12776 [1:30:24<28:53,  2.40it/s]                                                       67%|██████▋   | 8608/12776 [1:30:24<28:53,  2.40it/s] 67%|██████▋   | 8609/12776 [1:30:24<27:24,  2.53it/s]                                                       67%|██████▋   | 8609/12776 [1:30:24<27:24,  2.53it/s] 67%|██████▋   | 8610/12776 [1:30:24<26:09,  2.65it/s]                                                       67%|██████▋   | 8610/12776 [1:30:24<26:09,  2.65it/s] 67%|██████▋   | 8611/12776 [1:30:25<27:47,  2.50it/s]                                                       67%|██████▋   | 8611/12776 [1:30:25<27:47,  2.50it/s] 67%|██████▋   | 8612/12776 [1:30:25<25:45,  2.69it/s]                                                       67%|██████▋   | 8612/12776 [1:30:25<25:45,  2.69it/s] 67%|██████▋   | 8613/12776 [1:30:25<24:15,  2.86it/s]                                                       67%|██████▋   | 8613/12776 [1:30:25<24:15,  2.86it/s] 67%|██████▋   | 8614/12776 [1:30:26<23:03,  3.01it/s]                                                       67%|██████▋   | 8614/12776 [1:30:26<23:03,  3.01it/s] 67%|██████▋   | 8615/12776 [1:30:26<24:20,  2.85it/s]                                                       67%|██████▋   | 8615/12776 [1:30:26<24:20,  2.85it/s] 67%|██████▋   | 8616/12776 [1:30:26<22:45,  3.05it/s]                                                       67%|██████▋   | 8616/12776 [1:30:26<22:45,  3.05it/s] 67%|██████▋   | 8617/12776 [1:30:27<21:34,  3.21it/s]                                                       67%|██████▋   | 8617/12776 [1:30:27<21:34,  3.21it/s] 67%|██████▋   | 8618/12776 [1:30:27<20:31,  3.38it/s]                                                      {'loss': 0.2697, 'grad_norm': 0.4418382942676544, 'learning_rate': 0.00010376344086021504, 'epoch': 1.34}
+{'loss': 0.3345, 'grad_norm': 0.8457216024398804, 'learning_rate': 0.00010373900293255131, 'epoch': 1.34}
+{'loss': 0.2579, 'grad_norm': 0.6509329080581665, 'learning_rate': 0.00010371456500488758, 'epoch': 1.34}
+{'loss': 0.3396, 'grad_norm': 0.7329632043838501, 'learning_rate': 0.00010369012707722384, 'epoch': 1.34}
+{'loss': 0.1596, 'grad_norm': 0.5292937755584717, 'learning_rate': 0.00010366568914956012, 'epoch': 1.34}
+{'loss': 0.3351, 'grad_norm': 0.620823085308075, 'learning_rate': 0.00010364125122189637, 'epoch': 1.34}
+{'loss': 0.2396, 'grad_norm': 0.5701507329940796, 'learning_rate': 0.00010361681329423264, 'epoch': 1.34}
+{'loss': 0.2507, 'grad_norm': 0.6084917783737183, 'learning_rate': 0.00010359237536656892, 'epoch': 1.34}
+{'loss': 0.305, 'grad_norm': 0.8983578085899353, 'learning_rate': 0.00010356793743890517, 'epoch': 1.34}
+{'loss': 0.3077, 'grad_norm': 0.9507225751876831, 'learning_rate': 0.00010354349951124143, 'epoch': 1.34}
+{'loss': 0.2864, 'grad_norm': 1.0135985612869263, 'learning_rate': 0.00010351906158357771, 'epoch': 1.34}
+{'loss': 0.2869, 'grad_norm': 1.2849130630493164, 'learning_rate': 0.00010349462365591398, 'epoch': 1.34}
+{'loss': 0.2438, 'grad_norm': 1.036739706993103, 'learning_rate': 0.00010347018572825023, 'epoch': 1.34}
+{'loss': 0.4037, 'grad_norm': 0.8827303051948547, 'learning_rate': 0.0001034457478005865, 'epoch': 1.34}
+{'loss': 0.4922, 'grad_norm': 0.9790602326393127, 'learning_rate': 0.00010342130987292277, 'epoch': 1.34}
+{'loss': 0.4532, 'grad_norm': 1.958410382270813, 'learning_rate': 0.00010339687194525903, 'epoch': 1.34}
+{'loss': 0.3544, 'grad_norm': 1.1825295686721802, 'learning_rate': 0.00010337243401759531, 'epoch': 1.34}
+{'loss': 0.1968, 'grad_norm': 0.9426168203353882, 'learning_rate': 0.00010334799608993156, 'epoch': 1.34}
+{'loss': 0.3628, 'grad_norm': 0.9069138765335083, 'learning_rate': 0.00010332355816226783, 'epoch': 1.34}
+{'loss': 0.1958, 'grad_norm': 1.00847589969635, 'learning_rate': 0.00010329912023460411, 'epoch': 1.34}
+{'loss': 0.3903, 'grad_norm': 2.047116994857788, 'learning_rate': 0.00010327468230694036, 'epoch': 1.34}
+{'loss': 0.6619, 'grad_norm': 2.307471513748169, 'learning_rate': 0.00010325024437927662, 'epoch': 1.34}
+{'loss': 0.7681, 'grad_norm': 2.9408271312713623, 'learning_rate': 0.0001032258064516129, 'epoch': 1.34}
+{'loss': 0.5818, 'grad_norm': 1.2898457050323486, 'learning_rate': 0.00010320136852394917, 'epoch': 1.34}
+{'loss': 0.2375, 'grad_norm': 1.0917787551879883, 'learning_rate': 0.00010317693059628542, 'epoch': 1.34}
+{'loss': 0.9421, 'grad_norm': 5.130285739898682, 'learning_rate': 0.0001031524926686217, 'epoch': 1.34}
+{'loss': 0.9799, 'grad_norm': 3.367882013320923, 'learning_rate': 0.00010312805474095796, 'epoch': 1.34}
+{'loss': 0.4556, 'grad_norm': 1.5562324523925781, 'learning_rate': 0.00010310361681329423, 'epoch': 1.34}
+{'loss': 0.7766, 'grad_norm': 2.622732400894165, 'learning_rate': 0.00010307917888563048, 'epoch': 1.34}
+{'loss': 0.9861, 'grad_norm': 6.2125630378723145, 'learning_rate': 0.00010305474095796676, 'epoch': 1.34}
+{'loss': 1.042, 'grad_norm': 2.0387191772460938, 'learning_rate': 0.00010303030303030302, 'epoch': 1.34}
+{'loss': 0.9906, 'grad_norm': 2.0245776176452637, 'learning_rate': 0.00010300586510263929, 'epoch': 1.34}
+{'loss': 0.3743, 'grad_norm': 1.5084283351898193, 'learning_rate': 0.00010298142717497555, 'epoch': 1.34}
+{'loss': 0.8406, 'grad_norm': 2.843535900115967, 'learning_rate': 0.00010295698924731181, 'epoch': 1.34}
+{'loss': 0.788, 'grad_norm': 3.4681601524353027, 'learning_rate': 0.00010293255131964808, 'epoch': 1.34}
+{'loss': 0.7895, 'grad_norm': 2.2604522705078125, 'learning_rate': 0.00010290811339198436, 'epoch': 1.34}
+{'loss': 0.685, 'grad_norm': 3.683725357055664, 'learning_rate': 0.00010288367546432061, 'epoch': 1.34}
+{'loss': 0.9342, 'grad_norm': 2.207420825958252, 'learning_rate': 0.00010285923753665687, 'epoch': 1.34}
+{'loss': 1.3186, 'grad_norm': 2.529273748397827, 'learning_rate': 0.00010283479960899315, 'epoch': 1.34}
+{'loss': 1.0946, 'grad_norm': 2.481981039047241, 'learning_rate': 0.00010281036168132942, 'epoch': 1.34}
+{'loss': 0.6584, 'grad_norm': 1.3697443008422852, 'learning_rate': 0.00010278592375366567, 'epoch': 1.34}
+{'loss': 0.5824, 'grad_norm': 2.0801591873168945, 'learning_rate': 0.00010276148582600195, 'epoch': 1.34}
+{'loss': 0.7734, 'grad_norm': 2.740255832672119, 'learning_rate': 0.00010273704789833821, 'epoch': 1.34}
+{'loss': 0.8381, 'grad_norm': 2.066756010055542, 'learning_rate': 0.00010271260997067448, 'epoch': 1.34}
+{'loss': 0.7212, 'grad_norm': 2.6519453525543213, 'learning_rate': 0.00010268817204301074, 'epoch': 1.34}
+{'loss': 0.7086, 'grad_norm': 2.464167833328247, 'learning_rate': 0.000102663734115347, 'epoch': 1.34}
+{'loss': 0.8188, 'grad_norm': 2.8775999546051025, 'learning_rate': 0.00010263929618768327, 'epoch': 1.34}
+{'loss': 0.6777, 'grad_norm': 2.8111610412597656, 'learning_rate': 0.00010261485826001955, 'epoch': 1.34}
+{'loss': 0.1694, 'grad_norm': 0.4312553405761719, 'learning_rate': 0.0001025904203323558, 'epoch': 1.34}
+{'loss': 0.1899, 'grad_norm': 0.5284144282341003, 'learning_rate': 0.00010256598240469207, 'epoch': 1.34}
+{'loss': 0.3319, 'grad_norm': 0.9777967929840088, 'learning_rate': 0.00010254154447702834, 'epoch': 1.34}
+{'loss': 0.1933, 'grad_norm': 1.211936116218567, 'learning_rate': 0.00010251710654936461, 'epoch': 1.35}
+{'loss': 0.2897, 'grad_norm': 0.5879170894622803, 'learning_rate': 0.00010249266862170086, 'epoch': 1.35}
+{'loss': 0.2997, 'grad_norm': 1.0071542263031006, 'learning_rate': 0.00010246823069403714, 'epoch': 1.35}
+{'loss': 0.2348, 'grad_norm': 0.840803325176239, 'learning_rate': 0.0001024437927663734, 'epoch': 1.35}
+{'loss': 0.1762, 'grad_norm': 0.6389973163604736, 'learning_rate': 0.00010241935483870965, 'epoch': 1.35}
+{'loss': 0.278, 'grad_norm': 0.771003007888794, 'learning_rate': 0.00010239491691104593, 'epoch': 1.35}
+{'loss': 0.3955, 'grad_norm': 1.348070740699768, 'learning_rate': 0.0001023704789833822, 'epoch': 1.35}
+{'loss': 0.3047, 'grad_norm': 0.9808512926101685, 'learning_rate': 0.00010234604105571846, 'epoch': 1.35}
+{'loss': 0.4669, 'grad_norm': 1.0971183776855469, 'learning_rate': 0.00010232160312805474, 'epoch': 1.35}
+{'loss': 0.5127, 'grad_norm': 2.088040351867676, 'learning_rate': 0.00010229716520039099, 'epoch': 1.35}
+{'loss': 0.3575, 'grad_norm': 0.7336698770523071, 'learning_rate': 0.00010227272727272726, 'epoch': 1.35}
+{'loss': 0.2192, 'grad_norm': 0.7726986408233643, 'learning_rate': 0.00010224828934506353, 'epoch': 1.35}
+{'loss': 0.3309, 'grad_norm': 1.5100910663604736, 'learning_rate': 0.0001022238514173998, 'epoch': 1.35}
+{'loss': 0.4132, 'grad_norm': 1.1917885541915894, 'learning_rate': 0.00010219941348973605, 'epoch': 1.35}
+{'loss': 0.455, 'grad_norm': 1.4236356019973755, 'learning_rate': 0.00010217497556207233, 'epoch': 1.35}
+{'loss': 0.4558, 'grad_norm': 1.214121699333191, 'learning_rate': 0.0001021505376344086, 'epoch': 1.35}
+{'loss': 0.3848, 'grad_norm': 1.2391715049743652, 'learning_rate': 0.00010212609970674485, 'epoch': 1.35}
+{'loss': 0.5066, 'grad_norm': 1.3483924865722656, 'learning_rate': 0.00010210166177908112, 'epoch': 1.35}
+{'loss': 0.373, 'grad_norm': 1.2172237634658813, 'learning_rate': 0.00010207722385141739, 'epoch': 1.35}
+{'loss': 0.3206, 'grad_norm': 0.9454227685928345, 'learning_rate': 0.00010205278592375365, 'epoch': 1.35}
+{'loss': 0.4893, 'grad_norm': 2.5474867820739746, 'learning_rate': 0.00010202834799608993, 'epoch': 1.35}
+{'loss': 0.404, 'grad_norm': 0.9093128442764282, 'learning_rate': 0.00010200391006842618, 'epoch': 1.35}
+{'loss': 0.8356, 'grad_norm': 2.522016763687134, 'learning_rate': 0.00010197947214076245, 'epoch': 1.35}
+{'loss': 0.395, 'grad_norm': 1.7662311792373657, 'learning_rate': 0.00010195503421309873, 'epoch': 1.35}
+{'loss': 0.5021, 'grad_norm': 1.433812141418457, 'learning_rate': 0.00010193059628543499, 'epoch': 1.35}
+{'loss': 0.5901, 'grad_norm': 1.4101475477218628, 'learning_rate': 0.00010190615835777124, 'epoch': 1.35}
+ 67%|██████▋   | 8618/12776 [1:30:27<20:31,  3.38it/s] 67%|██████▋   | 8619/12776 [1:30:27<21:29,  3.22it/s]                                                       67%|██████▋   | 8619/12776 [1:30:27<21:29,  3.22it/s] 67%|██████▋   | 8620/12776 [1:30:27<20:07,  3.44it/s]                                                       67%|██████▋   | 8620/12776 [1:30:27<20:07,  3.44it/s] 67%|██████▋   | 8621/12776 [1:30:28<19:06,  3.62it/s]                                                       67%|██████▋   | 8621/12776 [1:30:28<19:06,  3.62it/s] 67%|██████▋   | 8622/12776 [1:30:28<18:17,  3.78it/s]                                                       67%|██████▋   | 8622/12776 [1:30:28<18:17,  3.78it/s] 67%|██████▋   | 8623/12776 [1:30:28<18:29,  3.74it/s]                                                       67%|██████▋   | 8623/12776 [1:30:28<18:29,  3.74it/s] 68%|██████▊   | 8624/12776 [1:30:28<17:36,  3.93it/s]                                                       68%|██████▊   | 8624/12776 [1:30:28<17:36,  3.93it/s] 68%|██████▊   | 8625/12776 [1:30:29<16:54,  4.09it/s]                                                       68%|██████▊   | 8625/12776 [1:30:29<16:54,  4.09it/s] 68%|██████▊   | 8626/12776 [1:30:29<16:19,  4.24it/s]                                                       68%|██████▊   | 8626/12776 [1:30:29<16:19,  4.24it/s] 68%|██████▊   | 8627/12776 [1:30:29<15:49,  4.37it/s]                                                       68%|██████▊   | 8627/12776 [1:30:29<15:49,  4.37it/s] 68%|██████▊   | 8628/12776 [1:30:29<16:36,  4.16it/s]                                                       68%|██████▊   | 8628/12776 [1:30:29<16:36,  4.16it/s] 68%|██████▊   | 8629/12776 [1:30:29<15:59,  4.32it/s]                                                       68%|██████▊   | 8629/12776 [1:30:29<15:59,  4.32it/s] 68%|██████▊   | 8630/12776 [1:30:30<15:29,  4.46it/s]                                                       68%|██████▊   | 8630/12776 [1:30:30<15:29,  4.46it/s] 68%|██████▊   | 8631/12776 [1:30:30<15:05,  4.58it/s]                                                       68%|██████▊   | 8631/12776 [1:30:30<15:05,  4.58it/s] 68%|██████▊   | 8632/12776 [1:30:30<14:43,  4.69it/s]                                                       68%|██████▊   | 8632/12776 [1:30:30<14:43,  4.69it/s] 68%|██████▊   | 8633/12776 [1:30:30<16:42,  4.13it/s]                                                       68%|██████▊   | 8633/12776 [1:30:30<16:42,  4.13it/s] 68%|██████▊   | 8634/12776 [1:30:31<15:44,  4.39it/s]                                                       68%|██████▊   | 8634/12776 [1:30:31<15:44,  4.39it/s] 68%|██████▊   | 8635/12776 [1:30:31<15:01,  4.59it/s]                                                       68%|██████▊   | 8635/12776 [1:30:31<15:01,  4.59it/s] 68%|██████▊   | 8636/12776 [1:30:31<14:22,  4.80it/s]                                                       68%|██████▊   | 8636/12776 [1:30:31<14:22,  4.80it/s] 68%|██████▊   | 8637/12776 [1:30:31<13:51,  4.98it/s]                                                       68%|██████▊   | 8637/12776 [1:30:31<13:51,  4.98it/s] 68%|██████▊   | 8638/12776 [1:30:32<24:08,  2.86it/s]                                                       68%|██████▊   | 8638/12776 [1:30:32<24:08,  2.86it/s] 68%|██████▊   | 8639/12776 [1:30:33<45:06,  1.53it/s]                                                       68%|██████▊   | 8639/12776 [1:30:33<45:06,  1.53it/s] 68%|██████▊   | 8640/12776 [1:30:34<53:00,  1.30it/s]                                                       68%|██████▊   | 8640/12776 [1:30:34<53:00,  1.30it/s] 68%|██████▊   | 8641/12776 [1:30:35<55:38,  1.24it/s]                                                       68%|██████▊   | 8641/12776 [1:30:35<55:38,  1.24it/s] 68%|██████▊   | 8642/12776 [1:30:36<56:39,  1.22it/s]                                                       68%|██████▊   | 8642/12776 [1:30:36<56:39,  1.22it/s] 68%|██████▊   | 8643/12776 [1:30:37<56:13,  1.23it/s]                                                       68%|██████▊   | 8643/12776 [1:30:37<56:13,  1.23it/s] 68%|██████▊   | 8644/12776 [1:30:38<56:17,  1.22it/s]                                                       68%|██████▊   | 8644/12776 [1:30:38<56:17,  1.22it/s] 68%|██████▊   | 8645/12776 [1:30:38<57:03,  1.21it/s]                                                       68%|██████▊   | 8645/12776 [1:30:38<57:03,  1.21it/s] 68%|██████▊   | 8646/12776 [1:30:39<53:48,  1.28it/s]                                                       68%|██████▊   | 8646/12776 [1:30:39<53:48,  1.28it/s] 68%|██████▊   | 8647/12776 [1:30:40<53:06,  1.30it/s]                                                       68%|██████▊   | 8647/12776 [1:30:40<53:06,  1.30it/s] 68%|██████▊   | 8648/12776 [1:30:40<49:23,  1.39it/s]                                                       68%|██████▊   | 8648/12776 [1:30:40<49:23,  1.39it/s] 68%|██████▊   | 8649/12776 [1:30:41<46:57,  1.46it/s]                                                       68%|██████▊   | 8649/12776 [1:30:41<46:57,  1.46it/s] 68%|██████▊   | 8650/12776 [1:30:42<43:58,  1.56it/s]                                                       68%|██████▊   | 8650/12776 [1:30:42<43:58,  1.56it/s] 68%|██████▊   | 8651/12776 [1:30:42<43:04,  1.60it/s]                                                       68%|██████▊   | 8651/12776 [1:30:42<43:04,  1.60it/s] 68%|██████▊   | 8652/12776 [1:30:43<39:46,  1.73it/s]                                                       68%|██████▊   | 8652/12776 [1:30:43<39:46,  1.73it/s] 68%|██████▊   | 8653/12776 [1:30:43<39:36,  1.73it/s]                                                       68%|██████▊   | 8653/12776 [1:30:43<39:36,  1.73it/s] 68%|██████▊   | 8654/12776 [1:30:44<36:25,  1.89it/s]                                                       68%|██████▊   | 8654/12776 [1:30:44<36:25,  1.89it/s] 68%|██████▊   | 8655/12776 [1:30:44<36:19,  1.89it/s]                                                       68%|██████▊   | 8655/12776 [1:30:44<36:19,  1.89it/s] 68%|██████▊   | 8656/12776 [1:30:45<33:28,  2.05it/s]                                                       68%|██████▊   | 8656/12776 [1:30:45<33:28,  2.05it/s] 68%|██████▊   | 8657/12776 [1:30:45<31:15,  2.20it/s]                                                       68%|██████▊   | 8657/12776 [1:30:45<31:15,  2.20it/s] 68%|██████▊   | 8658/12776 [1:30:45<30:00,  2.29it/s]                                                       68%|██████▊   | 8658/12776 [1:30:45<30:00,  2.29it/s] 68%|██████▊   | 8659/12776 [1:30:46<28:16,  2.43it/s]                                                       68%|██████▊   | 8659/12776 [1:30:46<28:16,  2.43it/s] 68%|██████▊   | 8660/12776 [1:30:46<26:48,  2.56it/s]                                                       68%|██████▊   | 8660/12776 [1:30:46<26:48,  2.56it/s] 68%|██████▊   | 8661/12776 [1:30:47<27:52,  2.46it/s]                                                       68%|██████▊   | 8661/12776 [1:30:47<27:52,  2.46it/s] 68%|██████▊   | 8662/12776 [1:30:47<26:06,  2.63it/s]                                                       68%|██████▊   | 8662/12776 [1:30:47<26:06,  2.63it/s] 68%|██████▊   | 8663/12776 [1:30:47<24:35,  2.79it/s]                                                       68%|██████▊   | 8663/12776 [1:30:47<24:35,  2.79it/s] 68%|██████▊   | 8664/12776 [1:30:47<23:19,  2.94it/s]                                                       68%|██████▊   | 8664/12776 [1:30:47<23:19,  2.94it/s] 68%|██████▊   | 8665/12776 [1:30:48<23:39,  2.90it/s]                                                       68%|██████▊   | 8665/12776 [1:30:48<23:39,  2.90it/s] 68%|██████▊   | 8666/12776 [1:30:48<22:23,  3.06it/s]                                                       68%|██████▊   | 8666/12776 [1:30:48<22:23,  3.06it/s] 68%|██████▊   | 8667/12776 [1:30:48<21:17,  3.22it/s]                                                       68%|██████▊   | 8667/12776 [1:30:48<21:17,  3.22it/s] 68%|██████▊   | 8668/12776 [1:30:49<20:26,  3.35it/s]                                                       68%|██████▊   | 8668/12776 [1:30:49<20:26,  3.35it/s] 68%|██████▊   | 8669/12776 [1:30:49<21:25,  3.20it/s]                                                       68%|██████▊   | 8669/12776 [1:30:49<21:25,  3.20it/s] 68%|██████▊   | 8670/12776 [1:30:49<20:11,  3.39it/s]                                                       68%|██████▊   | 8670/12776 [1:30:49<20:11,  3.39it/s] 68%|██████▊   | 8671/12776 [1:30:49<19:12,  3.56it/s]                                                       68%|██████▊   | 8671/12776 [1:30:49<19:12,  3.56it/s] 68%|██████▊   | 8672/12776 [1:30:50<18:27,  3.70it/s]                                                       68%|██████▊   | 8672/12776 [1:30:50<18:27,  3.70it/s] 68%|██████▊   | 8673/12776 [1:30:50<19:57,  3.43it/s]                                                       68%|██████▊   | 8673/12776 [1:30:50<19:57,  3.43it/s] 68%|██████▊   | 8674/12776 [1:30:50<18:41,  3.66it/s]                                                       68%|██████▊   | 8674/12776 [1:30:50<18:41,  3.66it/s] 68%|██████▊   | 8675/12776 [1:30:51<17:39,  3.87it/s]                                                       68%|██████▊   | 8675/12776 [1:30:51<17:39,  3.87it/s] 68%|██████▊   | 8676/12776 [1:30:51<16:48,  4.06it/s]                                                       68%|██████▊   | 8676/12776 [1:30:51<16:48,  4.06it/s] 68%|██████▊   | 8677/12776 [1:30:51<18:18,  3.73it/s]                                                       68%|██████▊   | 8677/12776 [1:30:51<18:18,  3.73it/s] 68%|██████▊   | 8678/12776 [1:30:51<17:06,  3.99it/s]                                                       68%|██████▊   | 8678/12776 [1:30:51<17:06,  3.99it/s] 68%|██████▊   | 8679/12776 [1:30:51<16:17,  4.19it/s]                                                       68%|██████▊   | 8679/12776 [1:30:51<16:17,  4.19it/s] 68%|██████▊   | 8680/12776 [1:30:52<15:35,  4.38it/s]                                                       68%|██████▊   | 8680/12776 [1:30:52<15:35,  4.38it/s] 68%|██████▊   | 8681/12776 [1:30:52<15:03,  4.53it/s]                                                       68%|██████▊   | 8681/12776 [1:30:52<15:03,  4.53it/s] 68%|██████▊   | 8682/12776 [1:30:52<16:36,  4.11it/s]                                                       68%|██████▊   | 8682/12776 [1:30:52<16:36,  4.11it/s] 68%|██████▊   | 8683/12776 [1:30:52<15:40,  4.35it/s]                                                       68%|██████▊   | 8683/12776 [1:30:52<15:40,  4.35it/s] 68%|██████▊   | 8684/12776 [1:30:53<14:56,  4.57it/s]                                                       68%|██████▊   | 8684/12776 [1:30:53<14:56,  4.57it/s] 68%|██████▊   | 8685/12776 [1:30:53<14:22,  4.74it/s]                                                       68%|██████▊   | 8685/12776 [1:30:53<14:22,  4.74it/s] 68%|█���████▊   | 8686/12776 [1:30:53<13:55,  4.90it/s]                                                       68%|██████▊   | 8686/12776 [1:30:53<13:55,  4.90it/s] 68%|██████▊   | 8687/12776 [1:30:53<13:30,  5.04it/s]                                                       68%|██████▊   | 8687/12776 [1:30:53<13:30,  5.04it/s] 68%|██████▊   | 8688/12776 [1:30:54<23:46,  2.87it/s]                                                       68%|██████▊   | 8688/12776 [1:30:54<23:46,  2.87it/s] 68%|██████▊   | 8689/12776 [1:30:55<42:28,  1.60it/s]                                                       68%|██████▊   | 8689/12776 [1:30:55<42:28,  1.60it/s] 68%|██████▊   | 8690/12776 [1:30:56<51:30,  1.32it/s]                                                       68%|██████▊   | 8690/12776 [1:30:56<51:30,  1.32it/s] 68%|██████▊   | 8691/12776 [1:30:57<56:54,  1.20it/s]                                                       68%|██████▊   | 8691/12776 [1:30:57<56:54,  1.20it/s] 68%|██████▊   | 8692/12776 [1:30:58<56:49,  1.20it/s]                                                       68%|██████▊   | 8692/12776 [1:30:58<56:49,  1.20it/s] 68%|██████▊   | 8693/12776 [1:30:59<54:55,  1.24it/s]                                                       68%|██████▊   | 8693/12776 [1:30:59<54:55,  1.24it/s] 68%|██████▊   | 8694/12776 [1:31:00<54:50,  1.24it/s]                                                       68%|██████▊   | 8694/12776 [1:31:00<54:50,  1.24it/s] 68%|██████▊   | 8695/12776 [1:31:00<53:20,  1.28it/s]                                                      {'loss': 0.8687, 'grad_norm': 4.594160556793213, 'learning_rate': 0.00010188172043010752, 'epoch': 1.35}
+{'loss': 0.4597, 'grad_norm': 1.1630065441131592, 'learning_rate': 0.00010185728250244379, 'epoch': 1.35}
+{'loss': 0.6061, 'grad_norm': 2.0793771743774414, 'learning_rate': 0.00010183284457478004, 'epoch': 1.35}
+{'loss': 0.8025, 'grad_norm': 2.358942985534668, 'learning_rate': 0.00010180840664711631, 'epoch': 1.35}
+{'loss': 0.6873, 'grad_norm': 2.1486661434173584, 'learning_rate': 0.00010178396871945258, 'epoch': 1.35}
+{'loss': 1.0033, 'grad_norm': 4.260776519775391, 'learning_rate': 0.00010175953079178884, 'epoch': 1.35}
+{'loss': 0.3625, 'grad_norm': 1.1536997556686401, 'learning_rate': 0.00010173509286412512, 'epoch': 1.35}
+{'loss': 0.7172, 'grad_norm': 2.115208625793457, 'learning_rate': 0.00010171065493646137, 'epoch': 1.35}
+{'loss': 1.299, 'grad_norm': 2.6683943271636963, 'learning_rate': 0.00010168621700879764, 'epoch': 1.35}
+{'loss': 1.0832, 'grad_norm': 1.838832139968872, 'learning_rate': 0.00010166177908113392, 'epoch': 1.35}
+{'loss': 1.1096, 'grad_norm': 3.921957492828369, 'learning_rate': 0.00010163734115347018, 'epoch': 1.35}
+{'loss': 1.1012, 'grad_norm': 2.774120569229126, 'learning_rate': 0.00010161290322580643, 'epoch': 1.35}
+{'loss': 0.8941, 'grad_norm': 2.269116163253784, 'learning_rate': 0.00010158846529814271, 'epoch': 1.35}
+{'loss': 1.0635, 'grad_norm': 2.243638038635254, 'learning_rate': 0.00010156402737047898, 'epoch': 1.35}
+{'loss': 0.4038, 'grad_norm': 1.6823785305023193, 'learning_rate': 0.00010153958944281523, 'epoch': 1.35}
+{'loss': 0.9487, 'grad_norm': 3.375223159790039, 'learning_rate': 0.0001015151515151515, 'epoch': 1.35}
+{'loss': 1.2298, 'grad_norm': 2.887037754058838, 'learning_rate': 0.00010149071358748777, 'epoch': 1.35}
+{'loss': 0.7477, 'grad_norm': 2.0357108116149902, 'learning_rate': 0.00010146627565982404, 'epoch': 1.35}
+{'loss': 1.3611, 'grad_norm': 2.5809435844421387, 'learning_rate': 0.00010144183773216031, 'epoch': 1.35}
+{'loss': 0.5658, 'grad_norm': 1.5694135427474976, 'learning_rate': 0.00010141739980449657, 'epoch': 1.35}
+{'loss': 0.9227, 'grad_norm': 2.64839243888855, 'learning_rate': 0.00010139296187683283, 'epoch': 1.35}
+{'loss': 0.2149, 'grad_norm': 0.7386291027069092, 'learning_rate': 0.00010136852394916911, 'epoch': 1.35}
+{'loss': 0.2091, 'grad_norm': 0.43771645426750183, 'learning_rate': 0.00010134408602150537, 'epoch': 1.35}
+{'loss': 0.3522, 'grad_norm': 0.7924376726150513, 'learning_rate': 0.00010131964809384162, 'epoch': 1.35}
+{'loss': 0.264, 'grad_norm': 0.5517721176147461, 'learning_rate': 0.0001012952101661779, 'epoch': 1.35}
+{'loss': 0.1477, 'grad_norm': 0.5062223672866821, 'learning_rate': 0.00010127077223851417, 'epoch': 1.35}
+{'loss': 0.3525, 'grad_norm': 0.5685814619064331, 'learning_rate': 0.00010124633431085042, 'epoch': 1.35}
+{'loss': 0.2294, 'grad_norm': 0.5402782559394836, 'learning_rate': 0.0001012218963831867, 'epoch': 1.35}
+{'loss': 0.2081, 'grad_norm': 0.5032253861427307, 'learning_rate': 0.00010119745845552296, 'epoch': 1.35}
+{'loss': 0.3869, 'grad_norm': 1.4722563028335571, 'learning_rate': 0.00010117302052785923, 'epoch': 1.35}
+{'loss': 0.3021, 'grad_norm': 0.9200348854064941, 'learning_rate': 0.0001011485826001955, 'epoch': 1.35}
+{'loss': 0.1811, 'grad_norm': 0.6962469816207886, 'learning_rate': 0.00010112414467253176, 'epoch': 1.35}
+{'loss': 0.3023, 'grad_norm': 0.8069591522216797, 'learning_rate': 0.00010109970674486802, 'epoch': 1.35}
+{'loss': 0.1662, 'grad_norm': 0.669252336025238, 'learning_rate': 0.0001010752688172043, 'epoch': 1.35}
+{'loss': 0.3415, 'grad_norm': 0.6303135752677917, 'learning_rate': 0.00010105083088954055, 'epoch': 1.35}
+{'loss': 0.284, 'grad_norm': 0.9292386770248413, 'learning_rate': 0.00010102639296187682, 'epoch': 1.35}
+{'loss': 0.2997, 'grad_norm': 1.1860474348068237, 'learning_rate': 0.0001010019550342131, 'epoch': 1.35}
+{'loss': 0.5064, 'grad_norm': 1.6328331232070923, 'learning_rate': 0.00010097751710654936, 'epoch': 1.35}
+{'loss': 0.4723, 'grad_norm': 1.2533385753631592, 'learning_rate': 0.00010095307917888561, 'epoch': 1.36}
+{'loss': 0.3477, 'grad_norm': 1.2162647247314453, 'learning_rate': 0.00010092864125122189, 'epoch': 1.36}
+{'loss': 0.2559, 'grad_norm': 1.2495274543762207, 'learning_rate': 0.00010090420332355815, 'epoch': 1.36}
+{'loss': 0.3749, 'grad_norm': 1.11675226688385, 'learning_rate': 0.00010087976539589442, 'epoch': 1.36}
+{'loss': 0.2769, 'grad_norm': 1.302672028541565, 'learning_rate': 0.0001008553274682307, 'epoch': 1.36}
+{'loss': 0.7427, 'grad_norm': 1.504712462425232, 'learning_rate': 0.00010083088954056695, 'epoch': 1.36}
+{'loss': 0.824, 'grad_norm': 1.3615977764129639, 'learning_rate': 0.00010080645161290321, 'epoch': 1.36}
+{'loss': 0.4904, 'grad_norm': 1.5800176858901978, 'learning_rate': 0.00010078201368523949, 'epoch': 1.36}
+{'loss': 0.4018, 'grad_norm': 1.6747968196868896, 'learning_rate': 0.00010075757575757574, 'epoch': 1.36}
+{'loss': 0.4234, 'grad_norm': 1.6093438863754272, 'learning_rate': 0.000100733137829912, 'epoch': 1.36}
+{'loss': 0.6212, 'grad_norm': 1.9799742698669434, 'learning_rate': 0.00010070869990224828, 'epoch': 1.36}
+{'loss': 0.5132, 'grad_norm': 1.8439103364944458, 'learning_rate': 0.00010068426197458455, 'epoch': 1.36}
+{'loss': 0.6544, 'grad_norm': 2.6742753982543945, 'learning_rate': 0.0001006598240469208, 'epoch': 1.36}
+{'loss': 0.9793, 'grad_norm': 2.446819543838501, 'learning_rate': 0.00010063538611925708, 'epoch': 1.36}
+{'loss': 0.5525, 'grad_norm': 1.334675669670105, 'learning_rate': 0.00010061094819159334, 'epoch': 1.36}
+{'loss': 0.4494, 'grad_norm': 1.5871644020080566, 'learning_rate': 0.00010058651026392961, 'epoch': 1.36}
+{'loss': 0.8743, 'grad_norm': 1.5029113292694092, 'learning_rate': 0.00010056207233626589, 'epoch': 1.36}
+{'loss': 0.806, 'grad_norm': 2.237409830093384, 'learning_rate': 0.00010053763440860214, 'epoch': 1.36}
+{'loss': 0.8528, 'grad_norm': 2.4241881370544434, 'learning_rate': 0.0001005131964809384, 'epoch': 1.36}
+{'loss': 0.5835, 'grad_norm': 2.1619865894317627, 'learning_rate': 0.00010048875855327468, 'epoch': 1.36}
+{'loss': 1.6964, 'grad_norm': 2.61045503616333, 'learning_rate': 0.00010046432062561093, 'epoch': 1.36}
+{'loss': 0.8914, 'grad_norm': 1.533729910850525, 'learning_rate': 0.0001004398826979472, 'epoch': 1.36}
+{'loss': 0.9521, 'grad_norm': 2.500321388244629, 'learning_rate': 0.00010041544477028348, 'epoch': 1.36}
+{'loss': 1.4087, 'grad_norm': 5.0488080978393555, 'learning_rate': 0.00010039100684261974, 'epoch': 1.36}
+{'loss': 0.6997, 'grad_norm': 2.802337408065796, 'learning_rate': 0.00010036656891495599, 'epoch': 1.36}
+{'loss': 0.6674, 'grad_norm': 1.9060883522033691, 'learning_rate': 0.00010034213098729227, 'epoch': 1.36}
+{'loss': 1.335, 'grad_norm': 6.120393753051758, 'learning_rate': 0.00010031769305962854, 'epoch': 1.36}
+{'loss': 0.6943, 'grad_norm': 1.4589465856552124, 'learning_rate': 0.0001002932551319648, 'epoch': 1.36}
+{'loss': 0.6131, 'grad_norm': 3.765908718109131, 'learning_rate': 0.00010026881720430108, 'epoch': 1.36}
+{'loss': 0.577, 'grad_norm': 2.0184710025787354, 'learning_rate': 0.00010024437927663733, 'epoch': 1.36}
+{'loss': 0.6143, 'grad_norm': 1.7017927169799805, 'learning_rate': 0.0001002199413489736, 'epoch': 1.36}
+{'loss': 0.6772, 'grad_norm': 3.1961417198181152, 'learning_rate': 0.00010019550342130987, 'epoch': 1.36}
+{'loss': 1.4173, 'grad_norm': 4.469157695770264, 'learning_rate': 0.00010017106549364612, 'epoch': 1.36}
+{'loss': 0.316, 'grad_norm': 0.6359954476356506, 'learning_rate': 0.00010014662756598239, 'epoch': 1.36}
+{'loss': 0.1854, 'grad_norm': 1.053896427154541, 'learning_rate': 0.00010012218963831867, 'epoch': 1.36}
+{'loss': 0.2186, 'grad_norm': 0.57310950756073, 'learning_rate': 0.00010009775171065493, 'epoch': 1.36}
+{'loss': 0.2643, 'grad_norm': 0.5779302716255188, 'learning_rate': 0.00010007331378299118, 'epoch': 1.36}
+{'loss': 0.2138, 'grad_norm': 0.4520057439804077, 'learning_rate': 0.00010004887585532746, 'epoch': 1.36}
+{'loss': 0.2805, 'grad_norm': 1.1857097148895264, 'learning_rate': 0.00010002443792766373, 'epoch': 1.36}
+ 68%|██████▊   | 8695/12776 [1:31:00<53:20,  1.28it/s] 68%|██████▊   | 8696/12776 [1:31:01<50:17,  1.35it/s]                                                       68%|██████▊   | 8696/12776 [1:31:01<50:17,  1.35it/s] 68%|██████▊   | 8697/12776 [1:31:02<50:03,  1.36it/s]                                                       68%|██████▊   | 8697/12776 [1:31:02<50:03,  1.36it/s] 68%|██████▊   | 8698/12776 [1:31:02<46:37,  1.46it/s]                                                       68%|██████▊   | 8698/12776 [1:31:02<46:37,  1.46it/s] 68%|██████▊   | 8699/12776 [1:31:03<44:21,  1.53it/s]                                                       68%|██████▊   | 8699/12776 [1:31:03<44:21,  1.53it/s] 68%|██████▊   | 8700/12776 [1:31:03<41:27,  1.64it/s]                                                       68%|██████▊   | 8700/12776 [1:31:03<41:27,  1.64it/s] 68%|██████▊   | 8701/12776 [1:31:04<42:35,  1.59it/s]                                                       68%|██████▊   | 8701/12776 [1:31:04<42:35,  1.59it/s] 68%|██████▊   | 8702/12776 [1:31:04<39:23,  1.72it/s]                                                       68%|██████▊   | 8702/12776 [1:31:04<39:23,  1.72it/s] 68%|██████▊   | 8703/12776 [1:31:05<37:50,  1.79it/s]                                                       68%|██████▊   | 8703/12776 [1:31:05<37:50,  1.79it/s] 68%|██████▊   | 8704/12776 [1:31:05<35:13,  1.93it/s]                                                       68%|██████▊   | 8704/12776 [1:31:05<35:13,  1.93it/s] 68%|██████▊   | 8705/12776 [1:31:06<33:04,  2.05it/s]                                                       68%|██████▊   | 8705/12776 [1:31:06<33:04,  2.05it/s] 68%|██████▊   | 8706/12776 [1:31:06<32:53,  2.06it/s]                                                       68%|██████▊   | 8706/12776 [1:31:06<32:53,  2.06it/s] 68%|██████▊   | 8707/12776 [1:31:07<30:54,  2.19it/s]                                                       68%|██████▊   | 8707/12776 [1:31:07<30:54,  2.19it/s] 68%|██████▊   | 8708/12776 [1:31:07<29:13,  2.32it/s]                                                       68%|██████▊   | 8708/12776 [1:31:07<29:13,  2.32it/s] 68%|██████▊   | 8709/12776 [1:31:07<29:12,  2.32it/s]                                                       68%|██████▊   | 8709/12776 [1:31:07<29:12,  2.32it/s] 68%|██████▊   | 8710/12776 [1:31:08<27:28,  2.47it/s]                                                       68%|██████▊   | 8710/12776 [1:31:08<27:28,  2.47it/s] 68%|██████▊   | 8711/12776 [1:31:08<26:02,  2.60it/s]                                                       68%|██████▊   | 8711/12776 [1:31:08<26:02,  2.60it/s] 68%|██████▊   | 8712/12776 [1:31:09<26:40,  2.54it/s]                                                       68%|██████▊   | 8712/12776 [1:31:09<26:40,  2.54it/s] 68%|██████▊   | 8713/12776 [1:31:09<24:53,  2.72it/s]                                                       68%|██████▊   | 8713/12776 [1:31:09<24:53,  2.72it/s] 68%|██████▊   | 8714/12776 [1:31:09<23:31,  2.88it/s]                                                       68%|██████▊   | 8714/12776 [1:31:09<23:31,  2.88it/s] 68%|██████▊   | 8715/12776 [1:31:10<23:30,  2.88it/s]                                                       68%|██████▊   | 8715/12776 [1:31:10<23:30,  2.88it/s] 68%|██████▊   | 8716/12776 [1:31:10<22:13,  3.05it/s]                                                       68%|██████▊   | 8716/12776 [1:31:10<22:13,  3.05it/s] 68%|██████▊   | 8717/12776 [1:31:10<21:09,  3.20it/s]                                                       68%|██████▊   | 8717/12776 [1:31:10<21:09,  3.20it/s] 68%|██████▊   | 8718/12776 [1:31:10<20:14,  3.34it/s]                                                       68%|██████▊   | 8718/12776 [1:31:10<20:14,  3.34it/s] 68%|██████▊   | 8719/12776 [1:31:11<20:40,  3.27it/s]                                                       68%|██████▊   | 8719/12776 [1:31:11<20:40,  3.27it/s] 68%|██████▊   | 8720/12776 [1:31:11<19:36,  3.45it/s]                                                       68%|██████▊   | 8720/12776 [1:31:11<19:36,  3.45it/s] 68%|██████▊   | 8721/12776 [1:31:11<18:47,  3.60it/s]                                                       68%|██████▊   | 8721/12776 [1:31:11<18:47,  3.60it/s] 68%|██████▊   | 8722/12776 [1:31:11<18:05,  3.73it/s]                                                       68%|██████▊   | 8722/12776 [1:31:11<18:05,  3.73it/s] 68%|██████▊   | 8723/12776 [1:31:12<17:29,  3.86it/s]                                                       68%|██████▊   | 8723/12776 [1:31:12<17:29,  3.86it/s] 68%|██████▊   | 8724/12776 [1:31:12<17:33,  3.85it/s]                                                       68%|██████▊   | 8724/12776 [1:31:12<17:33,  3.85it/s] 68%|██████▊   | 8725/12776 [1:31:12<16:54,  3.99it/s]                                                       68%|██████▊   | 8725/12776 [1:31:12<16:54,  3.99it/s] 68%|██████▊   | 8726/12776 [1:31:12<16:17,  4.14it/s]                                                       68%|██████▊   | 8726/12776 [1:31:12<16:17,  4.14it/s] 68%|██████▊   | 8727/12776 [1:31:13<15:42,  4.30it/s]                                                       68%|██████▊   | 8727/12776 [1:31:13<15:42,  4.30it/s] 68%|██████▊   | 8728/12776 [1:31:13<15:13,  4.43it/s]                                                       68%|██████▊   | 8728/12776 [1:31:13<15:13,  4.43it/s] 68%|██████▊   | 8729/12776 [1:31:13<16:35,  4.06it/s]                                                       68%|██████▊   | 8729/12776 [1:31:13<16:35,  4.06it/s] 68%|██████▊   | 8730/12776 [1:31:13<15:46,  4.28it/s]                                                       68%|██████▊   | 8730/12776 [1:31:13<15:46,  4.28it/s] 68%|██████▊   | 8731/12776 [1:31:13<15:07,  4.46it/s]                                                       68%|██████▊   | 8731/12776 [1:31:13<15:07,  4.46it/s] 68%|██████▊   | 8732/12776 [1:31:14<14:34,  4.62it/s]                                                       68%|██████▊   | 8732/12776 [1:31:14<14:34,  4.62it/s] 68%|██████▊   | 8733/12776 [1:31:14<14:08,  4.76it/s]                                                       68%|██████▊   | 8733/12776 [1:31:14<14:08,  4.76it/s] 68%|██████▊   | 8734/12776 [1:31:14<15:51,  4.25it/s]                                                       68%|██████▊   | 8734/12776 [1:31:14<15:51,  4.25it/s] 68%|██████▊   | 8735/12776 [1:31:14<14:56,  4.51it/s]                                                       68%|██████▊   | 8735/12776 [1:31:14<14:56,  4.51it/s] 68%|██████▊   | 8736/12776 [1:31:15<14:13,  4.74it/s]                                                       68%|██████▊   | 8736/12776 [1:31:15<14:13,  4.74it/s] 68%|██████▊   | 8737/12776 [1:31:15<13:38,  4.94it/s]                                                       68%|██████▊   | 8737/12776 [1:31:15<13:38,  4.94it/s] 68%|██████▊   | 8738/12776 [1:31:15<22:56,  2.93it/s]                                                       68%|██████▊   | 8738/12776 [1:31:15<22:56,  2.93it/s] 68%|██████▊   | 8739/12776 [1:31:17<43:21,  1.55it/s]                                                       68%|██████▊   | 8739/12776 [1:31:17<43:21,  1.55it/s] 68%|██████▊   | 8740/12776 [1:31:18<48:43,  1.38it/s]                                                       68%|██████▊   | 8740/12776 [1:31:18<48:43,  1.38it/s] 68%|██████▊   | 8741/12776 [1:31:19<51:04,  1.32it/s]                                                       68%|██████▊   | 8741/12776 [1:31:19<51:04,  1.32it/s] 68%|██████▊   | 8742/12776 [1:31:19<51:07,  1.32it/s]                                                       68%|██████▊   | 8742/12776 [1:31:19<51:07,  1.32it/s] 68%|██████▊   | 8743/12776 [1:31:20<53:55,  1.25it/s]                                                       68%|██████▊   | 8743/12776 [1:31:20<53:55,  1.25it/s] 68%|██████▊   | 8744/12776 [1:31:21<51:15,  1.31it/s]                                                       68%|██████▊   | 8744/12776 [1:31:21<51:15,  1.31it/s] 68%|██████▊   | 8745/12776 [1:31:21<48:46,  1.38it/s]                                                       68%|██████▊   | 8745/12776 [1:31:21<48:46,  1.38it/s] 68%|██████▊   | 8746/12776 [1:31:22<48:15,  1.39it/s]                                                       68%|██████▊   | 8746/12776 [1:31:22<48:15,  1.39it/s] 68%|██████▊   | 8747/12776 [1:31:23<45:22,  1.48it/s]                                                       68%|██████▊   | 8747/12776 [1:31:23<45:22,  1.48it/s] 68%|██████▊   | 8748/12776 [1:31:23<43:22,  1.55it/s]                                                       68%|██████▊   | 8748/12776 [1:31:23<43:22,  1.55it/s] 68%|██████▊   | 8749/12776 [1:31:24<40:53,  1.64it/s]                                                       68%|██████▊   | 8749/12776 [1:31:24<40:53,  1.64it/s] 68%|██████▊   | 8750/12776 [1:31:25<41:40,  1.61it/s]                                                       68%|██████▊   | 8750/12776 [1:31:25<41:40,  1.61it/s] 68%|██████▊   | 8751/12776 [1:31:25<38:41,  1.73it/s]                                                       68%|██████▊   | 8751/12776 [1:31:25<38:41,  1.73it/s] 69%|██████▊   | 8752/12776 [1:31:25<36:06,  1.86it/s]                                                       69%|██████▊   | 8752/12776 [1:31:25<36:06,  1.86it/s] 69%|██████▊   | 8753/12776 [1:31:26<35:15,  1.90it/s]                                                       69%|██████▊   | 8753/12776 [1:31:26<35:15,  1.90it/s] 69%|██████▊   | 8754/12776 [1:31:26<32:59,  2.03it/s]                                                       69%|██████▊   | 8754/12776 [1:31:26<32:59,  2.03it/s] 69%|██████▊   | 8755/12776 [1:31:27<32:41,  2.05it/s]                                                       69%|██████▊   | 8755/12776 [1:31:27<32:41,  2.05it/s] 69%|██████▊   | 8756/12776 [1:31:27<30:39,  2.19it/s]                                                       69%|██████▊   | 8756/12776 [1:31:27<30:39,  2.19it/s] 69%|██████▊   | 8757/12776 [1:31:28<29:01,  2.31it/s]                                                       69%|██████▊   | 8757/12776 [1:31:28<29:01,  2.31it/s] 69%|██████▊   | 8758/12776 [1:31:28<28:02,  2.39it/s]                                                       69%|██████▊   | 8758/12776 [1:31:28<28:02,  2.39it/s] 69%|██████▊   | 8759/12776 [1:31:28<26:31,  2.52it/s]                                                       69%|██████▊   | 8759/12776 [1:31:28<26:31,  2.52it/s] 69%|██████▊   | 8760/12776 [1:31:29<25:13,  2.65it/s]                                                       69%|██████▊   | 8760/12776 [1:31:29<25:13,  2.65it/s] 69%|██████▊   | 8761/12776 [1:31:29<24:05,  2.78it/s]                                                       69%|██████▊   | 8761/12776 [1:31:29<24:05,  2.78it/s] 69%|██████▊   | 8762/12776 [1:31:29<22:50,  2.93it/s]                                                       69%|██████▊   | 8762/12776 [1:31:29<22:50,  2.93it/s] 69%|██████▊   | 8763/12776 [1:31:30<21:51,  3.06it/s]                                                       69%|██████▊   | 8763/12776 [1:31:30<21:51,  3.06it/s] 69%|██████▊   | 8764/12776 [1:31:30<20:59,  3.18it/s]                                                       69%|██████▊   | 8764/12776 [1:31:30<20:59,  3.18it/s] 69%|██████▊   | 8765/12776 [1:31:30<22:06,  3.02it/s]                                                       69%|██████▊   | 8765/12776 [1:31:30<22:06,  3.02it/s] 69%|██████▊   | 8766/12776 [1:31:30<20:49,  3.21it/s]                                                       69%|██████▊   | 8766/12776 [1:31:30<20:49,  3.21it/s] 69%|██████▊   | 8767/12776 [1:31:31<19:47,  3.38it/s]                                                       69%|██████▊   | 8767/12776 [1:31:31<19:47,  3.38it/s] 69%|██████▊   | 8768/12776 [1:31:31<18:52,  3.54it/s]                                                       69%|██████▊   | 8768/12776 [1:31:31<18:52,  3.54it/s] 69%|██████▊   | 8769/12776 [1:31:31<20:26,  3.27it/s]                                                       69%|██████▊   | 8769/12776 [1:31:31<20:26,  3.27it/s] 69%|██████▊   | 8770/12776 [1:31:32<19:11,  3.48it/s]                                                       69%|██████▊   | 8770/12776 [1:31:32<19:11,  3.48it/s] 69%|██████▊   | 8771/12776 [1:31:32<18:12,  3.67it/s]                                                       69%|██████▊   | 8771/12776 [1:31:32<18:12,  3.67it/s] 69%|██████▊   | 8772/12776 [1:31:32<17:28,  3.82it/s]                                                       69%|██████▊   | 8772/12776 [1:31:32<17:28,  3.82it/s] 69%|██████▊   | 8773/12776 [1:31:32<16:48,  3.97it/s]                                                      {'loss': 0.3539, 'grad_norm': 0.5992613434791565, 'learning_rate': 9.999999999999999e-05, 'epoch': 1.36}
+{'loss': 0.2893, 'grad_norm': 0.8540019392967224, 'learning_rate': 9.997556207233626e-05, 'epoch': 1.36}
+{'loss': 0.2457, 'grad_norm': 0.5429804921150208, 'learning_rate': 9.995112414467252e-05, 'epoch': 1.36}
+{'loss': 0.2567, 'grad_norm': 1.0506290197372437, 'learning_rate': 9.992668621700879e-05, 'epoch': 1.36}
+{'loss': 0.1966, 'grad_norm': 0.7274707555770874, 'learning_rate': 9.990224828934506e-05, 'epoch': 1.36}
+{'loss': 0.3843, 'grad_norm': 1.6726579666137695, 'learning_rate': 9.987781036168132e-05, 'epoch': 1.36}
+{'loss': 0.304, 'grad_norm': 1.3097379207611084, 'learning_rate': 9.985337243401758e-05, 'epoch': 1.36}
+{'loss': 0.453, 'grad_norm': 1.2650363445281982, 'learning_rate': 9.982893450635386e-05, 'epoch': 1.36}
+{'loss': 0.3309, 'grad_norm': 0.9016621112823486, 'learning_rate': 9.980449657869012e-05, 'epoch': 1.36}
+{'loss': 0.3763, 'grad_norm': 1.0004808902740479, 'learning_rate': 9.978005865102637e-05, 'epoch': 1.36}
+{'loss': 0.4991, 'grad_norm': 1.2892861366271973, 'learning_rate': 9.975562072336265e-05, 'epoch': 1.36}
+{'loss': 0.3803, 'grad_norm': 1.6277682781219482, 'learning_rate': 9.973118279569892e-05, 'epoch': 1.36}
+{'loss': 0.4247, 'grad_norm': 1.4325026273727417, 'learning_rate': 9.970674486803518e-05, 'epoch': 1.36}
+{'loss': 0.4286, 'grad_norm': 1.4070262908935547, 'learning_rate': 9.968230694037145e-05, 'epoch': 1.36}
+{'loss': 0.65, 'grad_norm': 2.1237456798553467, 'learning_rate': 9.965786901270771e-05, 'epoch': 1.36}
+{'loss': 0.858, 'grad_norm': 2.7738819122314453, 'learning_rate': 9.963343108504398e-05, 'epoch': 1.36}
+{'loss': 0.3916, 'grad_norm': 1.125198483467102, 'learning_rate': 9.960899315738026e-05, 'epoch': 1.36}
+{'loss': 0.3602, 'grad_norm': 1.3788397312164307, 'learning_rate': 9.95845552297165e-05, 'epoch': 1.36}
+{'loss': 0.3027, 'grad_norm': 1.1802959442138672, 'learning_rate': 9.956011730205277e-05, 'epoch': 1.36}
+{'loss': 0.3809, 'grad_norm': 1.0281505584716797, 'learning_rate': 9.953567937438905e-05, 'epoch': 1.36}
+{'loss': 0.8278, 'grad_norm': 2.2165725231170654, 'learning_rate': 9.951124144672531e-05, 'epoch': 1.36}
+{'loss': 0.944, 'grad_norm': 2.219921350479126, 'learning_rate': 9.948680351906157e-05, 'epoch': 1.36}
+{'loss': 0.9238, 'grad_norm': 2.5214898586273193, 'learning_rate': 9.946236559139784e-05, 'epoch': 1.36}
+{'loss': 0.6352, 'grad_norm': 2.641808032989502, 'learning_rate': 9.943792766373411e-05, 'epoch': 1.36}
+{'loss': 0.7535, 'grad_norm': 1.4690858125686646, 'learning_rate': 9.941348973607037e-05, 'epoch': 1.36}
+{'loss': 0.6958, 'grad_norm': 1.5977320671081543, 'learning_rate': 9.938905180840664e-05, 'epoch': 1.37}
+{'loss': 0.846, 'grad_norm': 1.7913562059402466, 'learning_rate': 9.93646138807429e-05, 'epoch': 1.37}
+{'loss': 0.7296, 'grad_norm': 2.0344135761260986, 'learning_rate': 9.934017595307917e-05, 'epoch': 1.37}
+{'loss': 0.7046, 'grad_norm': 3.8352715969085693, 'learning_rate': 9.931573802541545e-05, 'epoch': 1.37}
+{'loss': 0.5948, 'grad_norm': 1.1829397678375244, 'learning_rate': 9.92913000977517e-05, 'epoch': 1.37}
+{'loss': 1.1317, 'grad_norm': 3.249732732772827, 'learning_rate': 9.926686217008796e-05, 'epoch': 1.37}
+{'loss': 0.8155, 'grad_norm': 2.0440311431884766, 'learning_rate': 9.924242424242424e-05, 'epoch': 1.37}
+{'loss': 0.8181, 'grad_norm': 2.4997398853302, 'learning_rate': 9.92179863147605e-05, 'epoch': 1.37}
+{'loss': 0.573, 'grad_norm': 4.196403503417969, 'learning_rate': 9.919354838709676e-05, 'epoch': 1.37}
+{'loss': 0.5787, 'grad_norm': 1.9175196886062622, 'learning_rate': 9.916911045943304e-05, 'epoch': 1.37}
+{'loss': 0.7122, 'grad_norm': 1.8270318508148193, 'learning_rate': 9.91446725317693e-05, 'epoch': 1.37}
+{'loss': 0.9239, 'grad_norm': 2.314758539199829, 'learning_rate': 9.912023460410556e-05, 'epoch': 1.37}
+{'loss': 0.8547, 'grad_norm': 1.4689182043075562, 'learning_rate': 9.909579667644183e-05, 'epoch': 1.37}
+{'loss': 0.6393, 'grad_norm': 2.735502004623413, 'learning_rate': 9.90713587487781e-05, 'epoch': 1.37}
+{'loss': 1.0392, 'grad_norm': 4.347554683685303, 'learning_rate': 9.904692082111436e-05, 'epoch': 1.37}
+{'loss': 0.525, 'grad_norm': 1.6307176351547241, 'learning_rate': 9.902248289345064e-05, 'epoch': 1.37}
+{'loss': 0.6347, 'grad_norm': 2.497677803039551, 'learning_rate': 9.899804496578689e-05, 'epoch': 1.37}
+{'loss': 0.6741, 'grad_norm': 2.2993502616882324, 'learning_rate': 9.897360703812315e-05, 'epoch': 1.37}
+{'loss': 0.4096, 'grad_norm': 1.1915621757507324, 'learning_rate': 9.894916911045943e-05, 'epoch': 1.37}
+{'loss': 0.2614, 'grad_norm': 0.5528977513313293, 'learning_rate': 9.89247311827957e-05, 'epoch': 1.37}
+{'loss': 0.2172, 'grad_norm': 0.7073773145675659, 'learning_rate': 9.890029325513195e-05, 'epoch': 1.37}
+{'loss': 0.2084, 'grad_norm': 0.5945557355880737, 'learning_rate': 9.887585532746823e-05, 'epoch': 1.37}
+{'loss': 0.1236, 'grad_norm': 0.41675809025764465, 'learning_rate': 9.885141739980449e-05, 'epoch': 1.37}
+{'loss': 0.2195, 'grad_norm': 0.6224748492240906, 'learning_rate': 9.882697947214076e-05, 'epoch': 1.37}
+{'loss': 0.5121, 'grad_norm': 1.0293275117874146, 'learning_rate': 9.880254154447702e-05, 'epoch': 1.37}
+{'loss': 0.2933, 'grad_norm': 0.8710348606109619, 'learning_rate': 9.877810361681329e-05, 'epoch': 1.37}
+{'loss': 0.1909, 'grad_norm': 0.6916895508766174, 'learning_rate': 9.875366568914955e-05, 'epoch': 1.37}
+{'loss': 0.2403, 'grad_norm': 0.8933185338973999, 'learning_rate': 9.872922776148583e-05, 'epoch': 1.37}
+{'loss': 0.3148, 'grad_norm': 1.0398600101470947, 'learning_rate': 9.870478983382208e-05, 'epoch': 1.37}
+{'loss': 0.3194, 'grad_norm': 1.0829379558563232, 'learning_rate': 9.868035190615834e-05, 'epoch': 1.37}
+{'loss': 0.2394, 'grad_norm': 0.7273138761520386, 'learning_rate': 9.865591397849462e-05, 'epoch': 1.37}
+{'loss': 0.7665, 'grad_norm': 2.8995516300201416, 'learning_rate': 9.863147605083089e-05, 'epoch': 1.37}
+{'loss': 0.3913, 'grad_norm': 1.2653355598449707, 'learning_rate': 9.860703812316714e-05, 'epoch': 1.37}
+{'loss': 0.4381, 'grad_norm': 4.484793663024902, 'learning_rate': 9.858260019550342e-05, 'epoch': 1.37}
+{'loss': 0.1318, 'grad_norm': 0.5917729735374451, 'learning_rate': 9.855816226783968e-05, 'epoch': 1.37}
+{'loss': 0.3232, 'grad_norm': 1.4675835371017456, 'learning_rate': 9.853372434017593e-05, 'epoch': 1.37}
+{'loss': 0.3359, 'grad_norm': 1.0259408950805664, 'learning_rate': 9.850928641251221e-05, 'epoch': 1.37}
+{'loss': 0.5625, 'grad_norm': 1.2516374588012695, 'learning_rate': 9.848484848484848e-05, 'epoch': 1.37}
+{'loss': 0.378, 'grad_norm': 3.212458372116089, 'learning_rate': 9.846041055718474e-05, 'epoch': 1.37}
+{'loss': 0.4768, 'grad_norm': 2.0374770164489746, 'learning_rate': 9.843597262952102e-05, 'epoch': 1.37}
+{'loss': 0.4226, 'grad_norm': 2.4900288581848145, 'learning_rate': 9.841153470185727e-05, 'epoch': 1.37}
+{'loss': 0.5211, 'grad_norm': 1.8129522800445557, 'learning_rate': 9.838709677419354e-05, 'epoch': 1.37}
+{'loss': 0.7839, 'grad_norm': 1.4478105306625366, 'learning_rate': 9.836265884652981e-05, 'epoch': 1.37}
+{'loss': 0.4166, 'grad_norm': 1.799777626991272, 'learning_rate': 9.833822091886608e-05, 'epoch': 1.37}
+{'loss': 0.8867, 'grad_norm': 3.246655225753784, 'learning_rate': 9.831378299120233e-05, 'epoch': 1.37}
+{'loss': 0.5577, 'grad_norm': 1.721030354499817, 'learning_rate': 9.828934506353861e-05, 'epoch': 1.37}
+{'loss': 0.573, 'grad_norm': 1.6854079961776733, 'learning_rate': 9.826490713587487e-05, 'epoch': 1.37}
+{'loss': 0.7653, 'grad_norm': 2.2403340339660645, 'learning_rate': 9.824046920821112e-05, 'epoch': 1.37}
+{'loss': 0.5029, 'grad_norm': 1.6862674951553345, 'learning_rate': 9.82160312805474e-05, 'epoch': 1.37}
+{'loss': 0.772, 'grad_norm': 2.492246627807617, 'learning_rate': 9.819159335288367e-05, 'epoch': 1.37}
+{'loss': 1.0772, 'grad_norm': 3.606316328048706, 'learning_rate': 9.816715542521993e-05, 'epoch': 1.37}
+{'loss': 0.3547, 'grad_norm': 1.919001579284668, 'learning_rate': 9.814271749755621e-05, 'epoch': 1.37}
+{'loss': 0.7453, 'grad_norm': 1.8840322494506836, 'learning_rate': 9.811827956989246e-05, 'epoch': 1.37}
+ 69%|██████▊   | 8773/12776 [1:31:32<16:48,  3.97it/s] 69%|██████▊   | 8774/12776 [1:31:33<18:07,  3.68it/s]                                                       69%|██████▊   | 8774/12776 [1:31:33<18:07,  3.68it/s] 69%|██████▊   | 8775/12776 [1:31:33<17:00,  3.92it/s]                                                       69%|██████▊   | 8775/12776 [1:31:33<17:00,  3.92it/s] 69%|██████▊   | 8776/12776 [1:31:33<16:03,  4.15it/s]                                                       69%|██████▊   | 8776/12776 [1:31:33<16:03,  4.15it/s] 69%|██████▊   | 8777/12776 [1:31:33<15:24,  4.33it/s]                                                       69%|██████▊   | 8777/12776 [1:31:33<15:24,  4.33it/s] 69%|██████▊   | 8778/12776 [1:31:33<14:56,  4.46it/s]                                                       69%|██████▊   | 8778/12776 [1:31:33<14:56,  4.46it/s] 69%|██████▊   | 8779/12776 [1:31:34<16:27,  4.05it/s]                                                       69%|██████▊   | 8779/12776 [1:31:34<16:27,  4.05it/s] 69%|██████▊   | 8780/12776 [1:31:34<15:34,  4.27it/s]                                                       69%|██████▊   | 8780/12776 [1:31:34<15:34,  4.27it/s] 69%|██████▊   | 8781/12776 [1:31:34<14:53,  4.47it/s]                                                       69%|██████▊   | 8781/12776 [1:31:34<14:53,  4.47it/s] 69%|██████▊   | 8782/12776 [1:31:34<14:22,  4.63it/s]                                                       69%|██████▊   | 8782/12776 [1:31:34<14:22,  4.63it/s] 69%|██████▊   | 8783/12776 [1:31:35<13:59,  4.75it/s]                                                       69%|██████▊   | 8783/12776 [1:31:35<13:59,  4.75it/s] 69%|██████▉   | 8784/12776 [1:31:35<16:19,  4.07it/s]                                                       69%|██████▉   | 8784/12776 [1:31:35<16:19,  4.07it/s] 69%|██████▉   | 8785/12776 [1:31:35<15:15,  4.36it/s]                                                       69%|██████▉   | 8785/12776 [1:31:35<15:15,  4.36it/s] 69%|██████▉   | 8786/12776 [1:31:35<14:28,  4.60it/s]                                                       69%|██████▉   | 8786/12776 [1:31:35<14:28,  4.60it/s] 69%|██████▉   | 8787/12776 [1:31:35<13:47,  4.82it/s]                                                       69%|██████▉   | 8787/12776 [1:31:35<13:47,  4.82it/s] 69%|██████▉   | 8788/12776 [1:31:36<23:09,  2.87it/s]                                                       69%|██████▉   | 8788/12776 [1:31:36<23:09,  2.87it/s] 69%|██████▉   | 8789/12776 [1:31:37<43:13,  1.54it/s]                                                       69%|██████▉   | 8789/12776 [1:31:37<43:13,  1.54it/s] 69%|██████▉   | 8790/12776 [1:31:38<48:07,  1.38it/s]                                                       69%|██████▉   | 8790/12776 [1:31:38<48:07,  1.38it/s] 69%|██████▉   | 8791/12776 [1:31:39<50:22,  1.32it/s]                                                       69%|██████▉   | 8791/12776 [1:31:39<50:22,  1.32it/s] 69%|██████▉   | 8792/12776 [1:31:40<50:36,  1.31it/s]                                                       69%|██████▉   | 8792/12776 [1:31:40<50:36,  1.31it/s] 69%|██████▉   | 8793/12776 [1:31:41<53:30,  1.24it/s]                                                       69%|██████▉   | 8793/12776 [1:31:41<53:30,  1.24it/s] 69%|██████▉   | 8794/12776 [1:31:42<55:00,  1.21it/s]                                                       69%|██████▉   | 8794/12776 [1:31:42<55:00,  1.21it/s] 69%|██████▉   | 8795/12776 [1:31:42<51:50,  1.28it/s]                                                       69%|██████▉   | 8795/12776 [1:31:42<51:50,  1.28it/s] 69%|██████▉   | 8796/12776 [1:31:43<50:43,  1.31it/s]                                                       69%|██████▉   | 8796/12776 [1:31:43<50:43,  1.31it/s] 69%|██████▉   | 8797/12776 [1:31:44<47:42,  1.39it/s]                                                       69%|██████▉   | 8797/12776 [1:31:44<47:42,  1.39it/s] 69%|██████▉   | 8798/12776 [1:31:44<45:31,  1.46it/s]                                                       69%|██████▉   | 8798/12776 [1:31:44<45:31,  1.46it/s] 69%|██████▉   | 8799/12776 [1:31:45<42:45,  1.55it/s]                                                       69%|██████▉   | 8799/12776 [1:31:45<42:45,  1.55it/s] 69%|██████▉   | 8800/12776 [1:31:46<41:47,  1.59it/s]                                                       69%|██████▉   | 8800/12776 [1:31:46<41:47,  1.59it/s]Saving model checkpoint to ./checkpoint-8800
+Configuration saved in ./checkpoint-8800/config.json
+Model weights saved in ./checkpoint-8800/model.safetensors
+Feature extractor saved in ./checkpoint-8800/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-8800/tokenizer_config.json
+Special tokens file saved in ./checkpoint-8800/special_tokens_map.json
+added tokens file saved in ./checkpoint-8800/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-7600] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 69%|██████▉   | 8801/12776 [1:31:51<2:24:04,  2.17s/it]                                                         69%|██████▉   | 8801/12776 [1:31:51<2:24:04,  2.17s/it] 69%|██████▉   | 8802/12776 [1:31:52<1:49:55,  1.66s/it]                                                         69%|██████▉   | 8802/12776 [1:31:52<1:49:55,  1.66s/it] 69%|██████▉   | 8803/12776 [1:31:52<1:26:48,  1.31s/it]                                                         69%|██████▉   | 8803/12776 [1:31:52<1:26:48,  1.31s/it] 69%|██████▉   | 8804/12776 [1:31:53<1:09:32,  1.05s/it]                                                         69%|██████▉   | 8804/12776 [1:31:53<1:09:32,  1.05s/it] 69%|██████▉   | 8805/12776 [1:31:53<57:58,  1.14it/s]                                                         69%|██████▉   | 8805/12776 [1:31:53<57:58,  1.14it/s] 69%|██████▉   | 8806/12776 [1:31:54<48:21,  1.37it/s]                                                       69%|██████▉   | 8806/12776 [1:31:54<48:21,  1.37it/s] 69%|██████▉   | 8807/12776 [1:31:54<41:13,  1.60it/s]                                                       69%|██████▉   | 8807/12776 [1:31:54<41:13,  1.60it/s] 69%|██████▉   | 8808/12776 [1:31:54<37:10,  1.78it/s]                                                       69%|██████▉   | 8808/12776 [1:31:54<37:10,  1.78it/s] 69%|██████▉   | 8809/12776 [1:31:55<32:42,  2.02it/s]                                                       69%|██████▉   | 8809/12776 [1:31:55<32:42,  2.02it/s] 69%|██████▉   | 8810/12776 [1:31:55<29:13,  2.26it/s]                                                       69%|██████▉   | 8810/12776 [1:31:55<29:13,  2.26it/s] 69%|██████▉   | 8811/12776 [1:31:55<26:34,  2.49it/s]                                                       69%|██████▉   | 8811/12776 [1:31:55<26:34,  2.49it/s] 69%|██████▉   | 8812/12776 [1:31:56<25:45,  2.56it/s]                                                       69%|██████▉   | 8812/12776 [1:31:56<25:45,  2.56it/s] 69%|██████▉   | 8813/12776 [1:31:56<23:35,  2.80it/s]                                                       69%|██████▉   | 8813/12776 [1:31:56<23:35,  2.80it/s] 69%|██████▉   | 8814/12776 [1:31:56<21:55,  3.01it/s]                                                       69%|██████▉   | 8814/12776 [1:31:56<21:55,  3.01it/s] 69%|██████▉   | 8815/12776 [1:31:57<22:57,  2.88it/s]                                                       69%|██████▉   | 8815/12776 [1:31:57<22:57,  2.88it/s] 69%|██████▉   | 8816/12776 [1:31:57<21:01,  3.14it/s]                                                       69%|██████▉   | 8816/12776 [1:31:57<21:01,  3.14it/s] 69%|██████▉   | 8817/12776 [1:31:57<19:31,  3.38it/s]                                                       69%|██████▉   | 8817/12776 [1:31:57<19:31,  3.38it/s] 69%|██████▉   | 8818/12776 [1:31:57<18:17,  3.60it/s]                                                       69%|██████▉   | 8818/12776 [1:31:57<18:17,  3.60it/s] 69%|██████▉   | 8819/12776 [1:31:58<17:30,  3.77it/s]                                                       69%|██████▉   | 8819/12776 [1:31:58<17:30,  3.77it/s] 69%|██████▉   | 8820/12776 [1:31:58<18:04,  3.65it/s]                                                       69%|██████▉   | 8820/12776 [1:31:58<18:04,  3.65it/s] 69%|██████▉   | 8821/12776 [1:31:58<16:59,  3.88it/s]                                                       69%|██████▉   | 8821/12776 [1:31:58<16:59,  3.88it/s] 69%|██████▉   | 8822/12776 [1:31:58<16:07,  4.09it/s]                                                       69%|██████▉   | 8822/12776 [1:31:58<16:07,  4.09it/s] 69%|██████▉   | 8823/12776 [1:31:59<15:32,  4.24it/s]                                                       69%|██████▉   | 8823/12776 [1:31:59<15:32,  4.24it/s] 69%|██████▉   | 8824/12776 [1:31:59<14:53,  4.42it/s]                                                       69%|██████▉   | 8824/12776 [1:31:59<14:53,  4.42it/s] 69%|██████▉   | 8825/12776 [1:31:59<15:32,  4.24it/s]                                                       69%|██████▉   | 8825/12776 [1:31:59<15:32,  4.24it/s] 69%|██████▉   | 8826/12776 [1:31:59<14:44,  4.47it/s]                                                       69%|██████▉   | 8826/12776 [1:31:59<14:44,  4.47it/s] 69%|██████▉   | 8827/12776 [1:31:59<14:06,  4.66it/s]                                                       69%|██████▉   | 8827/12776 [1:31:59<14:06,  4.66it/s] 69%|██████▉   | 8828/12776 [1:32:00<13:37,  4.83it/s]                                                       69%|██████▉   | 8828/12776 [1:32:00<13:37,  4.83it/s] 69%|██████▉   | 8829/12776 [1:32:00<13:12,  4.98it/s]                                                       69%|██████▉   | 8829/12776 [1:32:00<13:12,  4.98it/s] 69%|██████▉   | 8830/12776 [1:32:00<14:09,  4.65it/s]                                                       69%|██████▉   | 8830/12776 [1:32:00<14:09,  4.65it/s] 69%|██████▉   | 8831/12776 [1:32:00<13:26,  4.89it/s]                                                       69%|██████▉   | 8831/12776 [1:32:00<13:26,  4.89it/s] 69%|██████▉   | 8832/12776 [1:32:00<12:54,  5.09it/s]                                                       69%|█��████▉   | 8832/12776 [1:32:00<12:54,  5.09it/s] 69%|██████▉   | 8833/12776 [1:32:01<12:32,  5.24it/s]                                                       69%|██████▉   | 8833/12776 [1:32:01<12:32,  5.24it/s] 69%|██████▉   | 8834/12776 [1:32:01<12:12,  5.38it/s]                                                       69%|██████▉   | 8834/12776 [1:32:01<12:12,  5.38it/s] 69%|██████▉   | 8835/12776 [1:32:01<11:54,  5.52it/s]                                                       69%|██████▉   | 8835/12776 [1:32:01<11:54,  5.52it/s] 69%|██████▉   | 8836/12776 [1:32:01<13:31,  4.85it/s]                                                       69%|██████▉   | 8836/12776 [1:32:01<13:31,  4.85it/s] 69%|██████▉   | 8837/12776 [1:32:01<12:42,  5.17it/s]                                                       69%|██████▉   | 8837/12776 [1:32:01<12:42,  5.17it/s] 69%|██████▉   | 8838/12776 [1:32:02<22:35,  2.91it/s]                                                       69%|██████▉   | 8838/12776 [1:32:02<22:35,  2.91it/s] 69%|██████▉   | 8839/12776 [1:32:03<40:34,  1.62it/s]                                                       69%|██████▉   | 8839/12776 [1:32:03<40:34,  1.62it/s] 69%|██████▉   | 8840/12776 [1:32:04<46:09,  1.42it/s]                                                       69%|██████▉   | 8840/12776 [1:32:04<46:09,  1.42it/s] 69%|██████▉   | 8841/12776 [1:32:05<49:47,  1.32it/s]                                                       69%|██████▉   | 8841/12776 [1:32:05<49:47,  1.32it/s] 69%|██████▉   | 8842/12776 [1:32:06<49:19,  1.33it/s]                                                       69%|██████▉   | 8842/12776 [1:32:06<49:19,  1.33it/s] 69%|██████▉   | 8843/12776 [1:32:07<48:24,  1.35it/s]                                                       69%|██████▉   | 8843/12776 [1:32:07<48:24,  1.35it/s] 69%|██████▉   | 8844/12776 [1:32:07<47:53,  1.37it/s]                                                       69%|██████▉   | 8844/12776 [1:32:07<47:53,  1.37it/s] 69%|██████▉   | 8845/12776 [1:32:08<46:10,  1.42it/s]                                                       69%|██████▉   | 8845/12776 [1:32:08<46:10,  1.42it/s] 69%|██████▉   | 8846/12776 [1:32:08<44:25,  1.47it/s]                                                       69%|██████▉   | 8846/12776 [1:32:08<44:25,  1.47it/s] 69%|██████▉   | 8847/12776 [1:32:09<42:41,  1.53it/s]                                                       69%|██████▉   | 8847/12776 [1:32:09<42:41,  1.53it/s] 69%|██████▉   | 8848/12776 [1:32:10<40:51,  1.60it/s]                                                       69%|██████▉   | 8848/12776 [1:32:10<40:51,  1.60it/s] 69%|██████▉   | 8849/12776 [1:32:10<39:05,  1.67it/s]                                                       69%|██████▉   | 8849/12776 [1:32:10<39:05,  1.67it/s] 69%|██████▉   | 8850/12776 [1:32:11<38:14,  1.71it/s]                                                       69%|██████▉   | 8850/12776 [1:32:11<38:14,  1.71it/s] 69%|██████▉   | 8851/12776 [1:32:11<36:10,  1.81it/s]                                                      {'loss': 0.516, 'grad_norm': 1.7198199033737183, 'learning_rate': 9.809384164222873e-05, 'epoch': 1.37}
+{'loss': 1.4235, 'grad_norm': 3.6644818782806396, 'learning_rate': 9.8069403714565e-05, 'epoch': 1.37}
+{'loss': 1.0742, 'grad_norm': 2.1743552684783936, 'learning_rate': 9.804496578690127e-05, 'epoch': 1.37}
+{'loss': 1.6689, 'grad_norm': 4.740699291229248, 'learning_rate': 9.802052785923752e-05, 'epoch': 1.37}
+{'loss': 1.5587, 'grad_norm': 3.387509822845459, 'learning_rate': 9.79960899315738e-05, 'epoch': 1.37}
+{'loss': 0.8358, 'grad_norm': 1.966965675354004, 'learning_rate': 9.797165200391006e-05, 'epoch': 1.37}
+{'loss': 1.0338, 'grad_norm': 2.8340470790863037, 'learning_rate': 9.794721407624632e-05, 'epoch': 1.37}
+{'loss': 1.4119, 'grad_norm': 3.0556154251098633, 'learning_rate': 9.79227761485826e-05, 'epoch': 1.37}
+{'loss': 1.1295, 'grad_norm': 3.300295829772949, 'learning_rate': 9.789833822091886e-05, 'epoch': 1.37}
+{'loss': 0.8346, 'grad_norm': 2.788999557495117, 'learning_rate': 9.787390029325512e-05, 'epoch': 1.37}
+{'loss': 0.6381, 'grad_norm': 1.3150262832641602, 'learning_rate': 9.78494623655914e-05, 'epoch': 1.37}
+{'loss': 0.5895, 'grad_norm': 3.5385401248931885, 'learning_rate': 9.782502443792765e-05, 'epoch': 1.38}
+{'loss': 0.2589, 'grad_norm': 1.0753779411315918, 'learning_rate': 9.780058651026392e-05, 'epoch': 1.38}
+{'loss': 0.4002, 'grad_norm': 2.1634578704833984, 'learning_rate': 9.77761485826002e-05, 'epoch': 1.38}
+{'loss': 1.104, 'grad_norm': 2.8193857669830322, 'learning_rate': 9.775171065493646e-05, 'epoch': 1.38}
+{'loss': 0.8568, 'grad_norm': 2.0298500061035156, 'learning_rate': 9.772727272727271e-05, 'epoch': 1.38}
+{'loss': 0.3035, 'grad_norm': 0.7730556130409241, 'learning_rate': 9.770283479960899e-05, 'epoch': 1.38}
+{'loss': 0.2784, 'grad_norm': 0.873766303062439, 'learning_rate': 9.767839687194526e-05, 'epoch': 1.38}
+{'loss': 0.242, 'grad_norm': 0.5062800049781799, 'learning_rate': 9.765395894428151e-05, 'epoch': 1.38}
+{'loss': 0.2121, 'grad_norm': 0.6650285124778748, 'learning_rate': 9.762952101661779e-05, 'epoch': 1.38}
+{'loss': 0.2907, 'grad_norm': 0.42068031430244446, 'learning_rate': 9.760508308895405e-05, 'epoch': 1.38}
+{'loss': 0.1929, 'grad_norm': 0.5812735557556152, 'learning_rate': 9.758064516129031e-05, 'epoch': 1.38}
+{'loss': 0.1967, 'grad_norm': 0.9358962178230286, 'learning_rate': 9.75562072336266e-05, 'epoch': 1.38}
+{'loss': 0.1991, 'grad_norm': 0.5094256401062012, 'learning_rate': 9.753176930596284e-05, 'epoch': 1.38}
+{'loss': 0.9328, 'grad_norm': 4.9580302238464355, 'learning_rate': 9.750733137829911e-05, 'epoch': 1.38}
+{'loss': 0.2952, 'grad_norm': 0.7062557935714722, 'learning_rate': 9.748289345063539e-05, 'epoch': 1.38}
+{'loss': 0.3241, 'grad_norm': 1.7369906902313232, 'learning_rate': 9.745845552297165e-05, 'epoch': 1.38}
+{'loss': 0.3769, 'grad_norm': 0.8181784152984619, 'learning_rate': 9.74340175953079e-05, 'epoch': 1.38}
+{'loss': 0.2526, 'grad_norm': 0.6165406107902527, 'learning_rate': 9.740957966764418e-05, 'epoch': 1.38}
+{'loss': 0.4735, 'grad_norm': 0.9449729919433594, 'learning_rate': 9.738514173998045e-05, 'epoch': 1.38}
+{'loss': 0.3983, 'grad_norm': 1.4198623895645142, 'learning_rate': 9.73607038123167e-05, 'epoch': 1.38}
+{'loss': 0.3594, 'grad_norm': 1.2503142356872559, 'learning_rate': 9.733626588465298e-05, 'epoch': 1.38}
+{'loss': 0.3658, 'grad_norm': 1.537210464477539, 'learning_rate': 9.731182795698924e-05, 'epoch': 1.38}
+{'loss': 0.2598, 'grad_norm': 0.8660850524902344, 'learning_rate': 9.72873900293255e-05, 'epoch': 1.38}
+{'loss': 0.3427, 'grad_norm': 0.7644106149673462, 'learning_rate': 9.726295210166178e-05, 'epoch': 1.38}
+{'loss': 0.5187, 'grad_norm': 2.4255013465881348, 'learning_rate': 9.723851417399804e-05, 'epoch': 1.38}
+{'loss': 0.172, 'grad_norm': 0.9475809931755066, 'learning_rate': 9.72140762463343e-05, 'epoch': 1.38}
+{'loss': 0.6254, 'grad_norm': 1.3600800037384033, 'learning_rate': 9.718963831867058e-05, 'epoch': 1.38}
+{'loss': 0.3988, 'grad_norm': 1.693412184715271, 'learning_rate': 9.716520039100683e-05, 'epoch': 1.38}
+{'loss': 0.3481, 'grad_norm': 1.2597637176513672, 'learning_rate': 9.71407624633431e-05, 'epoch': 1.38}
+{'loss': 0.3203, 'grad_norm': 1.089922308921814, 'learning_rate': 9.711632453567937e-05, 'epoch': 1.38}
+{'loss': 0.5356, 'grad_norm': 1.767945647239685, 'learning_rate': 9.709188660801564e-05, 'epoch': 1.38}
+{'loss': 0.8449, 'grad_norm': 1.7489302158355713, 'learning_rate': 9.706744868035189e-05, 'epoch': 1.38}
+{'loss': 0.4853, 'grad_norm': 2.965064525604248, 'learning_rate': 9.704301075268817e-05, 'epoch': 1.38}
+{'loss': 0.993, 'grad_norm': 3.0038321018218994, 'learning_rate': 9.701857282502443e-05, 'epoch': 1.38}
+{'loss': 0.7505, 'grad_norm': 3.7745330333709717, 'learning_rate': 9.69941348973607e-05, 'epoch': 1.38}
+{'loss': 0.5765, 'grad_norm': 1.8408567905426025, 'learning_rate': 9.696969696969698e-05, 'epoch': 1.38}
+{'loss': 0.7841, 'grad_norm': 1.4488403797149658, 'learning_rate': 9.694525904203323e-05, 'epoch': 1.38}
+{'loss': 0.9474, 'grad_norm': 2.437166690826416, 'learning_rate': 9.692082111436949e-05, 'epoch': 1.38}
+{'loss': 0.6246, 'grad_norm': 1.6625169515609741, 'learning_rate': 9.689638318670577e-05, 'epoch': 1.38}
+{'loss': 1.0428, 'grad_norm': 2.3555285930633545, 'learning_rate': 9.687194525904202e-05, 'epoch': 1.38}
+{'loss': 0.5075, 'grad_norm': 2.821054220199585, 'learning_rate': 9.684750733137829e-05, 'epoch': 1.38}
+{'loss': 0.6551, 'grad_norm': 1.759642243385315, 'learning_rate': 9.682306940371455e-05, 'epoch': 1.38}
+{'loss': 1.2204, 'grad_norm': 1.687735676765442, 'learning_rate': 9.679863147605083e-05, 'epoch': 1.38}
+{'loss': 1.0335, 'grad_norm': 1.8050616979599, 'learning_rate': 9.677419354838708e-05, 'epoch': 1.38}
+{'loss': 0.9699, 'grad_norm': 7.936594009399414, 'learning_rate': 9.674975562072335e-05, 'epoch': 1.38}
+{'loss': 0.9377, 'grad_norm': 2.0747787952423096, 'learning_rate': 9.672531769305962e-05, 'epoch': 1.38}
+{'loss': 1.1424, 'grad_norm': 2.84826922416687, 'learning_rate': 9.670087976539589e-05, 'epoch': 1.38}
+{'loss': 0.5916, 'grad_norm': 1.3028104305267334, 'learning_rate': 9.667644183773214e-05, 'epoch': 1.38}
+{'loss': 1.2034, 'grad_norm': 2.0030343532562256, 'learning_rate': 9.665200391006842e-05, 'epoch': 1.38}
+{'loss': 0.9957, 'grad_norm': 3.1390602588653564, 'learning_rate': 9.662756598240468e-05, 'epoch': 1.38}
+{'loss': 0.4849, 'grad_norm': 1.9023643732070923, 'learning_rate': 9.660312805474095e-05, 'epoch': 1.38}
+{'loss': 1.2315, 'grad_norm': 5.415816307067871, 'learning_rate': 9.657869012707721e-05, 'epoch': 1.38}
+{'loss': 1.195, 'grad_norm': 2.7929747104644775, 'learning_rate': 9.655425219941348e-05, 'epoch': 1.38}
+{'loss': 0.97, 'grad_norm': 2.5922513008117676, 'learning_rate': 9.652981427174974e-05, 'epoch': 1.38}
+{'loss': 0.9625, 'grad_norm': 2.4513044357299805, 'learning_rate': 9.650537634408602e-05, 'epoch': 1.38}
+{'loss': 0.2183, 'grad_norm': 0.5424944758415222, 'learning_rate': 9.648093841642227e-05, 'epoch': 1.38}
+{'loss': 0.2732, 'grad_norm': 1.09577214717865, 'learning_rate': 9.645650048875854e-05, 'epoch': 1.38}
+{'loss': 0.1935, 'grad_norm': 0.6740018129348755, 'learning_rate': 9.643206256109481e-05, 'epoch': 1.38}
+{'loss': 0.2475, 'grad_norm': 0.7105699181556702, 'learning_rate': 9.640762463343108e-05, 'epoch': 1.38}
+{'loss': 0.273, 'grad_norm': 0.4238963723182678, 'learning_rate': 9.638318670576733e-05, 'epoch': 1.38}
+{'loss': 0.2303, 'grad_norm': 0.680479884147644, 'learning_rate': 9.635874877810361e-05, 'epoch': 1.38}
+{'loss': 0.3086, 'grad_norm': 0.9009299874305725, 'learning_rate': 9.633431085043987e-05, 'epoch': 1.38}
+{'loss': 0.2803, 'grad_norm': 0.7667379379272461, 'learning_rate': 9.630987292277614e-05, 'epoch': 1.38}
+{'loss': 0.3391, 'grad_norm': 0.854888379573822, 'learning_rate': 9.62854349951124e-05, 'epoch': 1.38}
+{'loss': 0.2158, 'grad_norm': 0.6643977761268616, 'learning_rate': 9.626099706744867e-05, 'epoch': 1.39}
+{'loss': 0.257, 'grad_norm': 0.7794937491416931, 'learning_rate': 9.623655913978493e-05, 'epoch': 1.39}
+{'loss': 0.2123, 'grad_norm': 0.7044182419776917, 'learning_rate': 9.621212121212121e-05, 'epoch': 1.39}
+ 69%|██████▉   | 8851/12776 [1:32:11<36:10,  1.81it/s] 69%|██████▉   | 8852/12776 [1:32:12<35:27,  1.84it/s]                                                       69%|██████▉   | 8852/12776 [1:32:12<35:27,  1.84it/s] 69%|██████▉   | 8853/12776 [1:32:12<33:00,  1.98it/s]                                                       69%|██████▉   | 8853/12776 [1:32:12<33:00,  1.98it/s] 69%|██████▉   | 8854/12776 [1:32:13<33:07,  1.97it/s]                                                       69%|██████▉   | 8854/12776 [1:32:13<33:07,  1.97it/s] 69%|██████▉   | 8855/12776 [1:32:13<30:40,  2.13it/s]                                                       69%|██████▉   | 8855/12776 [1:32:13<30:40,  2.13it/s] 69%|██████▉   | 8856/12776 [1:32:13<28:40,  2.28it/s]                                                       69%|██████▉   | 8856/12776 [1:32:13<28:40,  2.28it/s] 69%|██████▉   | 8857/12776 [1:32:14<28:20,  2.30it/s]                                                       69%|██████▉   | 8857/12776 [1:32:14<28:20,  2.30it/s] 69%|██████▉   | 8858/12776 [1:32:14<26:40,  2.45it/s]                                                       69%|██████▉   | 8858/12776 [1:32:14<26:40,  2.45it/s] 69%|██████▉   | 8859/12776 [1:32:14<25:04,  2.60it/s]                                                       69%|██████▉   | 8859/12776 [1:32:14<25:04,  2.60it/s] 69%|██████▉   | 8860/12776 [1:32:15<23:49,  2.74it/s]                                                       69%|██████▉   | 8860/12776 [1:32:15<23:49,  2.74it/s] 69%|██████▉   | 8861/12776 [1:32:15<24:39,  2.65it/s]                                                       69%|██████▉   | 8861/12776 [1:32:15<24:39,  2.65it/s] 69%|██████▉   | 8862/12776 [1:32:16<23:11,  2.81it/s]                                                       69%|██████▉   | 8862/12776 [1:32:16<23:11,  2.81it/s] 69%|██████▉   | 8863/12776 [1:32:16<21:52,  2.98it/s]                                                       69%|██████▉   | 8863/12776 [1:32:16<21:52,  2.98it/s] 69%|██████▉   | 8864/12776 [1:32:16<21:53,  2.98it/s]                                                       69%|██████▉   | 8864/12776 [1:32:16<21:53,  2.98it/s] 69%|██████▉   | 8865/12776 [1:32:16<20:36,  3.16it/s]                                                       69%|██████▉   | 8865/12776 [1:32:16<20:36,  3.16it/s] 69%|██████▉   | 8866/12776 [1:32:17<19:35,  3.33it/s]                                                       69%|██████▉   | 8866/12776 [1:32:17<19:35,  3.33it/s] 69%|██████▉   | 8867/12776 [1:32:17<18:40,  3.49it/s]                                                       69%|██████▉   | 8867/12776 [1:32:17<18:40,  3.49it/s] 69%|██████▉   | 8868/12776 [1:32:17<20:23,  3.19it/s]                                                       69%|██████▉   | 8868/12776 [1:32:17<20:23,  3.19it/s] 69%|██████▉   | 8869/12776 [1:32:18<19:01,  3.42it/s]                                                       69%|██████▉   | 8869/12776 [1:32:18<19:01,  3.42it/s] 69%|██████▉   | 8870/12776 [1:32:18<17:49,  3.65it/s]                                                       69%|██████▉   | 8870/12776 [1:32:18<17:49,  3.65it/s] 69%|██████▉   | 8871/12776 [1:32:18<16:54,  3.85it/s]                                                       69%|██████▉   | 8871/12776 [1:32:18<16:54,  3.85it/s] 69%|██████▉   | 8872/12776 [1:32:18<16:11,  4.02it/s]                                                       69%|██████▉   | 8872/12776 [1:32:18<16:11,  4.02it/s] 69%|██████▉   | 8873/12776 [1:32:19<17:23,  3.74it/s]                                                       69%|██████▉   | 8873/12776 [1:32:19<17:23,  3.74it/s] 69%|██████▉   | 8874/12776 [1:32:19<16:17,  3.99it/s]                                                       69%|██████▉   | 8874/12776 [1:32:19<16:17,  3.99it/s] 69%|██████▉   | 8875/12776 [1:32:19<15:23,  4.22it/s]                                                       69%|██████▉   | 8875/12776 [1:32:19<15:23,  4.22it/s] 69%|██████▉   | 8876/12776 [1:32:19<14:39,  4.44it/s]                                                       69%|██████▉   | 8876/12776 [1:32:19<14:39,  4.44it/s] 69%|██████▉   | 8877/12776 [1:32:19<14:03,  4.62it/s]                                                       69%|██████▉   | 8877/12776 [1:32:19<14:03,  4.62it/s] 69%|██████▉   | 8878/12776 [1:32:20<16:19,  3.98it/s]                                                       69%|██████▉   | 8878/12776 [1:32:20<16:19,  3.98it/s] 69%|██████▉   | 8879/12776 [1:32:20<15:05,  4.30it/s]                                                       69%|██████▉   | 8879/12776 [1:32:20<15:05,  4.30it/s] 70%|██████▉   | 8880/12776 [1:32:20<14:08,  4.59it/s]                                                       70%|██████▉   | 8880/12776 [1:32:20<14:08,  4.59it/s] 70%|██████▉   | 8881/12776 [1:32:20<13:22,  4.85it/s]                                                       70%|██████▉   | 8881/12776 [1:32:20<13:22,  4.85it/s] 70%|██████▉   | 8882/12776 [1:32:20<12:50,  5.05it/s]                                                       70%|██████▉   | 8882/12776 [1:32:20<12:50,  5.05it/s] 70%|██████▉   | 8883/12776 [1:32:21<14:11,  4.57it/s]                                                       70%|██████▉   | 8883/12776 [1:32:21<14:11,  4.57it/s] 70%|██████▉   | 8884/12776 [1:32:21<13:19,  4.87it/s]                                                       70%|██████▉   | 8884/12776 [1:32:21<13:19,  4.87it/s] 70%|██████▉   | 8885/12776 [1:32:21<12:40,  5.11it/s]                                                       70%|██████▉   | 8885/12776 [1:32:21<12:40,  5.11it/s] 70%|██████▉   | 8886/12776 [1:32:21<12:05,  5.36it/s]                                                       70%|██████▉   | 8886/12776 [1:32:21<12:05,  5.36it/s] 70%|██████▉   | 8887/12776 [1:32:21<12:04,  5.37it/s]                                                       70%|██████▉   | 8887/12776 [1:32:21<12:04,  5.37it/s] 70%|██████▉   | 8888/12776 [1:32:22<22:16,  2.91it/s]                                                       70%|██████▉   | 8888/12776 [1:32:22<22:16,  2.91it/s] 70%|██████▉   | 8889/12776 [1:32:24<46:00,  1.41it/s]                                                       70%|██████▉   | 8889/12776 [1:32:24<46:00,  1.41it/s] 70%|██████▉   | 8890/12776 [1:32:25<51:14,  1.26it/s]                                                       70%|██████▉   | 8890/12776 [1:32:25<51:14,  1.26it/s] 70%|██████▉   | 8891/12776 [1:32:25<52:15,  1.24it/s]                                                       70%|██████▉   | 8891/12776 [1:32:25<52:15,  1.24it/s] 70%|██████▉   | 8892/12776 [1:32:26<51:54,  1.25it/s]                                                       70%|██████▉   | 8892/12776 [1:32:26<51:54,  1.25it/s] 70%|██████▉   | 8893/12776 [1:32:27<50:58,  1.27it/s]                                                       70%|██████▉   | 8893/12776 [1:32:27<50:58,  1.27it/s] 70%|██████▉   | 8894/12776 [1:32:28<49:15,  1.31it/s]                                                       70%|██████▉   | 8894/12776 [1:32:28<49:15,  1.31it/s] 70%|██████▉   | 8895/12776 [1:32:28<47:22,  1.37it/s]                                                       70%|██████▉   | 8895/12776 [1:32:28<47:22,  1.37it/s] 70%|██████▉   | 8896/12776 [1:32:29<47:52,  1.35it/s]                                                       70%|██████▉   | 8896/12776 [1:32:29<47:52,  1.35it/s] 70%|██████▉   | 8897/12776 [1:32:30<45:00,  1.44it/s]                                                       70%|██████▉   | 8897/12776 [1:32:30<45:00,  1.44it/s] 70%|██████▉   | 8898/12776 [1:32:30<43:36,  1.48it/s]                                                       70%|██████▉   | 8898/12776 [1:32:30<43:36,  1.48it/s] 70%|██████▉   | 8899/12776 [1:32:31<40:40,  1.59it/s]                                                       70%|██████▉   | 8899/12776 [1:32:31<40:40,  1.59it/s] 70%|██████▉   | 8900/12776 [1:32:31<39:04,  1.65it/s]                                                       70%|██████▉   | 8900/12776 [1:32:31<39:04,  1.65it/s] 70%|██████▉   | 8901/12776 [1:32:32<36:02,  1.79it/s]                                                       70%|██████▉   | 8901/12776 [1:32:32<36:02,  1.79it/s] 70%|██████▉   | 8902/12776 [1:32:32<34:32,  1.87it/s]                                                       70%|██████▉   | 8902/12776 [1:32:32<34:32,  1.87it/s] 70%|██████▉   | 8903/12776 [1:32:33<32:17,  2.00it/s]                                                       70%|██████▉   | 8903/12776 [1:32:33<32:17,  2.00it/s] 70%|██████▉   | 8904/12776 [1:32:33<30:17,  2.13it/s]                                                       70%|██████▉   | 8904/12776 [1:32:33<30:17,  2.13it/s] 70%|██████▉   | 8905/12776 [1:32:34<30:37,  2.11it/s]                                                       70%|██████▉   | 8905/12776 [1:32:34<30:37,  2.11it/s] 70%|██████▉   | 8906/12776 [1:32:34<28:33,  2.26it/s]                                                       70%|██████▉   | 8906/12776 [1:32:34<28:33,  2.26it/s] 70%|██████▉   | 8907/12776 [1:32:34<26:38,  2.42it/s]                                                       70%|██████▉   | 8907/12776 [1:32:34<26:38,  2.42it/s] 70%|██████▉   | 8908/12776 [1:32:35<26:56,  2.39it/s]                                                       70%|██████▉   | 8908/12776 [1:32:35<26:56,  2.39it/s] 70%|██████▉   | 8909/12776 [1:32:35<25:22,  2.54it/s]                                                       70%|██████▉   | 8909/12776 [1:32:35<25:22,  2.54it/s] 70%|██████▉   | 8910/12776 [1:32:35<24:02,  2.68it/s]                                                       70%|██████▉   | 8910/12776 [1:32:35<24:02,  2.68it/s] 70%|██████▉   | 8911/12776 [1:32:36<23:13,  2.77it/s]                                                       70%|██████▉   | 8911/12776 [1:32:36<23:13,  2.77it/s] 70%|██████▉   | 8912/12776 [1:32:36<22:01,  2.92it/s]                                                       70%|██████▉   | 8912/12776 [1:32:36<22:01,  2.92it/s] 70%|██████▉   | 8913/12776 [1:32:36<21:01,  3.06it/s]                                                       70%|██████▉   | 8913/12776 [1:32:36<21:01,  3.06it/s] 70%|██████▉   | 8914/12776 [1:32:37<20:12,  3.18it/s]                                                       70%|██████▉   | 8914/12776 [1:32:37<20:12,  3.18it/s] 70%|██████▉   | 8915/12776 [1:32:37<21:48,  2.95it/s]                                                       70%|██████▉   | 8915/12776 [1:32:37<21:48,  2.95it/s] 70%|██████▉   | 8916/12776 [1:32:37<20:28,  3.14it/s]                                                       70%|██████▉   | 8916/12776 [1:32:37<20:28,  3.14it/s] 70%|██████▉   | 8917/12776 [1:32:38<19:23,  3.32it/s]                                                       70%|██████▉   | 8917/12776 [1:32:38<19:23,  3.32it/s] 70%|██████▉   | 8918/12776 [1:32:38<18:28,  3.48it/s]                                                       70%|██████▉   | 8918/12776 [1:32:38<18:28,  3.48it/s] 70%|██████▉   | 8919/12776 [1:32:38<19:16,  3.34it/s]                                                       70%|██████▉   | 8919/12776 [1:32:38<19:16,  3.34it/s] 70%|██████▉   | 8920/12776 [1:32:38<18:17,  3.51it/s]                                                       70%|██████▉   | 8920/12776 [1:32:38<18:17,  3.51it/s] 70%|██████▉   | 8921/12776 [1:32:39<17:30,  3.67it/s]                                                       70%|██████▉   | 8921/12776 [1:32:39<17:30,  3.67it/s] 70%|██████▉   | 8922/12776 [1:32:39<16:52,  3.81it/s]                                                       70%|██████▉   | 8922/12776 [1:32:39<16:52,  3.81it/s] 70%|██████▉   | 8923/12776 [1:32:39<17:31,  3.66it/s]                                                       70%|██████▉   | 8923/12776 [1:32:39<17:31,  3.66it/s] 70%|██████▉   | 8924/12776 [1:32:39<16:41,  3.84it/s]                                                       70%|██████▉   | 8924/12776 [1:32:39<16:41,  3.84it/s] 70%|██████▉   | 8925/12776 [1:32:40<15:56,  4.02it/s]                                                       70%|██████▉   | 8925/12776 [1:32:40<15:56,  4.02it/s] 70%|██████▉   | 8926/12776 [1:32:40<15:16,  4.20it/s]                                                       70%|██████▉   | 8926/12776 [1:32:40<15:16,  4.20it/s] 70%|██████▉   | 8927/12776 [1:32:40<14:59,  4.28it/s]                                                       70%|██████▉   | 8927/12776 [1:32:40<14:59,  4.28it/s] 70%|██████▉   | 8928/12776 [1:32:40<15:11,  4.22it/s]                                                       70%|██████▉   | 8928/12776 [1:32:40<15:11,  4.22it/s] 70%|██████▉   | 8929/12776 [1:32:41<14:35,  4.39it/s]                                                      {'loss': 0.5418, 'grad_norm': 1.4797452688217163, 'learning_rate': 9.618768328445746e-05, 'epoch': 1.39}
+{'loss': 0.2076, 'grad_norm': 0.6202751398086548, 'learning_rate': 9.616324535679373e-05, 'epoch': 1.39}
+{'loss': 0.2865, 'grad_norm': 0.7685756087303162, 'learning_rate': 9.613880742913e-05, 'epoch': 1.39}
+{'loss': 0.3327, 'grad_norm': 0.6632737517356873, 'learning_rate': 9.611436950146627e-05, 'epoch': 1.39}
+{'loss': 0.4345, 'grad_norm': 1.1949931383132935, 'learning_rate': 9.608993157380252e-05, 'epoch': 1.39}
+{'loss': 0.477, 'grad_norm': 0.9757672548294067, 'learning_rate': 9.60654936461388e-05, 'epoch': 1.39}
+{'loss': 0.5074, 'grad_norm': 1.91054105758667, 'learning_rate': 9.604105571847507e-05, 'epoch': 1.39}
+{'loss': 0.3122, 'grad_norm': 1.4399696588516235, 'learning_rate': 9.601661779081133e-05, 'epoch': 1.39}
+{'loss': 0.7512, 'grad_norm': 2.84247088432312, 'learning_rate': 9.59921798631476e-05, 'epoch': 1.39}
+{'loss': 0.4991, 'grad_norm': 1.674396276473999, 'learning_rate': 9.596774193548386e-05, 'epoch': 1.39}
+{'loss': 0.8064, 'grad_norm': 2.1071906089782715, 'learning_rate': 9.594330400782012e-05, 'epoch': 1.39}
+{'loss': 0.3922, 'grad_norm': 1.3319578170776367, 'learning_rate': 9.59188660801564e-05, 'epoch': 1.39}
+{'loss': 0.5648, 'grad_norm': 1.4761604070663452, 'learning_rate': 9.589442815249265e-05, 'epoch': 1.39}
+{'loss': 0.5432, 'grad_norm': 2.2004549503326416, 'learning_rate': 9.586999022482892e-05, 'epoch': 1.39}
+{'loss': 0.576, 'grad_norm': 1.39911949634552, 'learning_rate': 9.58455522971652e-05, 'epoch': 1.39}
+{'loss': 0.6516, 'grad_norm': 1.3971905708312988, 'learning_rate': 9.582111436950146e-05, 'epoch': 1.39}
+{'loss': 0.457, 'grad_norm': 1.2392330169677734, 'learning_rate': 9.579667644183771e-05, 'epoch': 1.39}
+{'loss': 0.6318, 'grad_norm': 3.725346088409424, 'learning_rate': 9.577223851417399e-05, 'epoch': 1.39}
+{'loss': 0.9208, 'grad_norm': 4.512824058532715, 'learning_rate': 9.574780058651026e-05, 'epoch': 1.39}
+{'loss': 0.852, 'grad_norm': 2.2564964294433594, 'learning_rate': 9.572336265884651e-05, 'epoch': 1.39}
+{'loss': 0.7657, 'grad_norm': 1.9278607368469238, 'learning_rate': 9.569892473118279e-05, 'epoch': 1.39}
+{'loss': 0.4866, 'grad_norm': 2.015615940093994, 'learning_rate': 9.567448680351905e-05, 'epoch': 1.39}
+{'loss': 0.7778, 'grad_norm': 2.1916158199310303, 'learning_rate': 9.565004887585532e-05, 'epoch': 1.39}
+{'loss': 0.7985, 'grad_norm': 3.4717800617218018, 'learning_rate': 9.56256109481916e-05, 'epoch': 1.39}
+{'loss': 0.7681, 'grad_norm': 1.8821927309036255, 'learning_rate': 9.560117302052785e-05, 'epoch': 1.39}
+{'loss': 0.9574, 'grad_norm': 1.9250900745391846, 'learning_rate': 9.557673509286411e-05, 'epoch': 1.39}
+{'loss': 0.8076, 'grad_norm': 2.0036773681640625, 'learning_rate': 9.555229716520039e-05, 'epoch': 1.39}
+{'loss': 0.7549, 'grad_norm': 1.968286395072937, 'learning_rate': 9.552785923753665e-05, 'epoch': 1.39}
+{'loss': 0.7469, 'grad_norm': 2.4910616874694824, 'learning_rate': 9.55034213098729e-05, 'epoch': 1.39}
+{'loss': 1.0438, 'grad_norm': 2.2749216556549072, 'learning_rate': 9.547898338220918e-05, 'epoch': 1.39}
+{'loss': 0.8261, 'grad_norm': 1.8309866189956665, 'learning_rate': 9.545454545454545e-05, 'epoch': 1.39}
+{'loss': 0.6295, 'grad_norm': 3.1002039909362793, 'learning_rate': 9.54301075268817e-05, 'epoch': 1.39}
+{'loss': 1.5787, 'grad_norm': 3.578477144241333, 'learning_rate': 9.540566959921798e-05, 'epoch': 1.39}
+{'loss': 0.5352, 'grad_norm': 0.985260009765625, 'learning_rate': 9.538123167155424e-05, 'epoch': 1.39}
+{'loss': 0.5997, 'grad_norm': 1.0070950984954834, 'learning_rate': 9.53567937438905e-05, 'epoch': 1.39}
+{'loss': 0.7481, 'grad_norm': 3.7554800510406494, 'learning_rate': 9.533235581622678e-05, 'epoch': 1.39}
+{'loss': 1.1695, 'grad_norm': 1.8878397941589355, 'learning_rate': 9.530791788856304e-05, 'epoch': 1.39}
+{'loss': 1.3039, 'grad_norm': 3.70890474319458, 'learning_rate': 9.52834799608993e-05, 'epoch': 1.39}
+{'loss': 0.1757, 'grad_norm': 0.3560074269771576, 'learning_rate': 9.525904203323558e-05, 'epoch': 1.39}
+{'loss': 0.2835, 'grad_norm': 0.8983970880508423, 'learning_rate': 9.523460410557184e-05, 'epoch': 1.39}
+{'loss': 0.1884, 'grad_norm': 0.8294395804405212, 'learning_rate': 9.52101661779081e-05, 'epoch': 1.39}
+{'loss': 0.2084, 'grad_norm': 1.0664957761764526, 'learning_rate': 9.518572825024437e-05, 'epoch': 1.39}
+{'loss': 0.2246, 'grad_norm': 1.1653019189834595, 'learning_rate': 9.516129032258064e-05, 'epoch': 1.39}
+{'loss': 0.6462, 'grad_norm': 0.8518573641777039, 'learning_rate': 9.513685239491689e-05, 'epoch': 1.39}
+{'loss': 0.278, 'grad_norm': 0.7155981063842773, 'learning_rate': 9.511241446725317e-05, 'epoch': 1.39}
+{'loss': 0.2204, 'grad_norm': 0.8205424547195435, 'learning_rate': 9.508797653958943e-05, 'epoch': 1.39}
+{'loss': 0.1775, 'grad_norm': 0.9535171389579773, 'learning_rate': 9.50635386119257e-05, 'epoch': 1.39}
+{'loss': 0.2603, 'grad_norm': 0.6753984689712524, 'learning_rate': 9.503910068426198e-05, 'epoch': 1.39}
+{'loss': 0.3896, 'grad_norm': 0.8725956082344055, 'learning_rate': 9.501466275659823e-05, 'epoch': 1.39}
+{'loss': 0.3452, 'grad_norm': 0.7949926257133484, 'learning_rate': 9.499022482893449e-05, 'epoch': 1.39}
+{'loss': 0.6093, 'grad_norm': 5.083495140075684, 'learning_rate': 9.496578690127077e-05, 'epoch': 1.39}
+{'loss': 0.1909, 'grad_norm': 0.8620098233222961, 'learning_rate': 9.494134897360704e-05, 'epoch': 1.39}
+{'loss': 0.4185, 'grad_norm': 1.0833512544631958, 'learning_rate': 9.491691104594329e-05, 'epoch': 1.39}
+{'loss': 0.3882, 'grad_norm': 1.1511584520339966, 'learning_rate': 9.489247311827956e-05, 'epoch': 1.39}
+{'loss': 0.4503, 'grad_norm': 1.5453732013702393, 'learning_rate': 9.486803519061583e-05, 'epoch': 1.39}
+{'loss': 0.422, 'grad_norm': 1.2637474536895752, 'learning_rate': 9.484359726295208e-05, 'epoch': 1.39}
+{'loss': 0.349, 'grad_norm': 1.7331559658050537, 'learning_rate': 9.481915933528836e-05, 'epoch': 1.39}
+{'loss': 0.3536, 'grad_norm': 1.1923069953918457, 'learning_rate': 9.479472140762462e-05, 'epoch': 1.39}
+{'loss': 0.3895, 'grad_norm': 3.23207950592041, 'learning_rate': 9.477028347996089e-05, 'epoch': 1.39}
+{'loss': 0.6068, 'grad_norm': 1.1913764476776123, 'learning_rate': 9.474584555229717e-05, 'epoch': 1.39}
+{'loss': 0.3515, 'grad_norm': 1.4866349697113037, 'learning_rate': 9.472140762463342e-05, 'epoch': 1.39}
+{'loss': 0.5826, 'grad_norm': 1.2634825706481934, 'learning_rate': 9.469696969696968e-05, 'epoch': 1.4}
+{'loss': 0.54, 'grad_norm': 1.2857826948165894, 'learning_rate': 9.467253176930596e-05, 'epoch': 1.4}
+{'loss': 0.4376, 'grad_norm': 2.951294422149658, 'learning_rate': 9.464809384164221e-05, 'epoch': 1.4}
+{'loss': 0.5928, 'grad_norm': 2.0184364318847656, 'learning_rate': 9.462365591397848e-05, 'epoch': 1.4}
+{'loss': 0.8246, 'grad_norm': 4.42757511138916, 'learning_rate': 9.459921798631476e-05, 'epoch': 1.4}
+{'loss': 0.658, 'grad_norm': 1.6247820854187012, 'learning_rate': 9.457478005865102e-05, 'epoch': 1.4}
+{'loss': 0.5058, 'grad_norm': 1.0989845991134644, 'learning_rate': 9.455034213098727e-05, 'epoch': 1.4}
+{'loss': 0.722, 'grad_norm': 3.0606327056884766, 'learning_rate': 9.452590420332355e-05, 'epoch': 1.4}
+{'loss': 0.8267, 'grad_norm': 2.1190545558929443, 'learning_rate': 9.450146627565982e-05, 'epoch': 1.4}
+{'loss': 0.8312, 'grad_norm': 2.202465057373047, 'learning_rate': 9.447702834799608e-05, 'epoch': 1.4}
+{'loss': 0.8328, 'grad_norm': 1.6959621906280518, 'learning_rate': 9.445259042033236e-05, 'epoch': 1.4}
+{'loss': 0.6202, 'grad_norm': 2.8653564453125, 'learning_rate': 9.442815249266861e-05, 'epoch': 1.4}
+{'loss': 0.7007, 'grad_norm': 1.8655325174331665, 'learning_rate': 9.440371456500487e-05, 'epoch': 1.4}
+{'loss': 0.42, 'grad_norm': 1.3610215187072754, 'learning_rate': 9.437927663734115e-05, 'epoch': 1.4}
+{'loss': 0.9368, 'grad_norm': 2.2759883403778076, 'learning_rate': 9.43548387096774e-05, 'epoch': 1.4}
+{'loss': 1.1112, 'grad_norm': 2.666337728500366, 'learning_rate': 9.433040078201367e-05, 'epoch': 1.4}
+{'loss': 1.377, 'grad_norm': 3.074618101119995, 'learning_rate': 9.430596285434995e-05, 'epoch': 1.4}
+ 70%|██████▉   | 8929/12776 [1:32:41<14:35,  4.39it/s] 70%|██████▉   | 8930/12776 [1:32:41<14:08,  4.53it/s]                                                       70%|██████▉   | 8930/12776 [1:32:41<14:08,  4.53it/s] 70%|██████▉   | 8931/12776 [1:32:41<13:46,  4.65it/s]                                                       70%|██████▉   | 8931/12776 [1:32:41<13:46,  4.65it/s] 70%|██████▉   | 8932/12776 [1:32:41<13:27,  4.76it/s]                                                       70%|██████▉   | 8932/12776 [1:32:41<13:27,  4.76it/s] 70%|██████▉   | 8933/12776 [1:32:42<15:35,  4.11it/s]                                                       70%|██████▉   | 8933/12776 [1:32:42<15:35,  4.11it/s] 70%|██████▉   | 8934/12776 [1:32:42<14:38,  4.38it/s]                                                       70%|██████▉   | 8934/12776 [1:32:42<14:38,  4.38it/s] 70%|██████▉   | 8935/12776 [1:32:42<13:55,  4.60it/s]                                                       70%|██████▉   | 8935/12776 [1:32:42<13:55,  4.60it/s] 70%|██████▉   | 8936/12776 [1:32:42<13:17,  4.82it/s]                                                       70%|██████▉   | 8936/12776 [1:32:42<13:17,  4.82it/s] 70%|██████▉   | 8937/12776 [1:32:42<12:50,  4.98it/s]                                                       70%|██████▉   | 8937/12776 [1:32:42<12:50,  4.98it/s] 70%|██████▉   | 8938/12776 [1:32:43<23:20,  2.74it/s]                                                       70%|██████▉   | 8938/12776 [1:32:43<23:20,  2.74it/s] 70%|██████▉   | 8939/12776 [1:32:45<45:17,  1.41it/s]                                                       70%|██████▉   | 8939/12776 [1:32:45<45:17,  1.41it/s] 70%|██████▉   | 8940/12776 [1:32:45<50:20,  1.27it/s]                                                       70%|██████▉   | 8940/12776 [1:32:45<50:20,  1.27it/s] 70%|██████▉   | 8941/12776 [1:32:46<52:31,  1.22it/s]                                                       70%|██████▉   | 8941/12776 [1:32:46<52:31,  1.22it/s] 70%|██████▉   | 8942/12776 [1:32:47<52:28,  1.22it/s]                                                       70%|██████▉   | 8942/12776 [1:32:47<52:28,  1.22it/s] 70%|██████▉   | 8943/12776 [1:32:48<50:25,  1.27it/s]                                                       70%|██████▉   | 8943/12776 [1:32:48<50:25,  1.27it/s] 70%|███████   | 8944/12776 [1:32:49<48:50,  1.31it/s]                                                       70%|███████   | 8944/12776 [1:32:49<48:50,  1.31it/s] 70%|███████   | 8945/12776 [1:32:49<46:32,  1.37it/s]                                                       70%|███████   | 8945/12776 [1:32:49<46:32,  1.37it/s] 70%|███████   | 8946/12776 [1:32:50<44:33,  1.43it/s]                                                       70%|███████   | 8946/12776 [1:32:50<44:33,  1.43it/s] 70%|███████   | 8947/12776 [1:32:51<44:21,  1.44it/s]                                                       70%|███████   | 8947/12776 [1:32:51<44:21,  1.44it/s] 70%|███████   | 8948/12776 [1:32:51<41:59,  1.52it/s]                                                       70%|███████   | 8948/12776 [1:32:51<41:59,  1.52it/s] 70%|███████   | 8949/12776 [1:32:52<40:55,  1.56it/s]                                                       70%|███████   | 8949/12776 [1:32:52<40:55,  1.56it/s] 70%|███████   | 8950/12776 [1:32:52<38:47,  1.64it/s]                                                       70%|███████   | 8950/12776 [1:32:52<38:47,  1.64it/s] 70%|███████   | 8951/12776 [1:32:53<39:10,  1.63it/s]                                                       70%|███████   | 8951/12776 [1:32:53<39:10,  1.63it/s] 70%|███████   | 8952/12776 [1:32:53<36:41,  1.74it/s]                                                       70%|███████   | 8952/12776 [1:32:53<36:41,  1.74it/s] 70%|███████   | 8953/12776 [1:32:54<34:16,  1.86it/s]                                                       70%|███████   | 8953/12776 [1:32:54<34:16,  1.86it/s] 70%|███████   | 8954/12776 [1:32:54<34:02,  1.87it/s]                                                       70%|███████   | 8954/12776 [1:32:54<34:02,  1.87it/s] 70%|███████   | 8955/12776 [1:32:55<31:41,  2.01it/s]                                                       70%|███████   | 8955/12776 [1:32:55<31:41,  2.01it/s] 70%|███████   | 8956/12776 [1:32:55<31:06,  2.05it/s]                                                       70%|███████   | 8956/12776 [1:32:55<31:06,  2.05it/s] 70%|███████   | 8957/12776 [1:32:56<29:00,  2.19it/s]                                                       70%|███████   | 8957/12776 [1:32:56<29:00,  2.19it/s] 70%|███████   | 8958/12776 [1:32:56<27:15,  2.33it/s]                                                       70%|███████   | 8958/12776 [1:32:56<27:15,  2.33it/s] 70%|███████   | 8959/12776 [1:32:56<26:42,  2.38it/s]                                                       70%|███████   | 8959/12776 [1:32:56<26:42,  2.38it/s] 70%|███████   | 8960/12776 [1:32:57<25:10,  2.53it/s]                                                       70%|███████   | 8960/12776 [1:32:57<25:10,  2.53it/s] 70%|███████   | 8961/12776 [1:32:57<24:01,  2.65it/s]                                                       70%|███████   | 8961/12776 [1:32:57<24:01,  2.65it/s] 70%|███████   | 8962/12776 [1:32:57<22:59,  2.77it/s]                                                       70%|███████   | 8962/12776 [1:32:57<22:59,  2.77it/s] 70%|███████   | 8963/12776 [1:32:58<22:06,  2.87it/s]                                                       70%|███████   | 8963/12776 [1:32:58<22:06,  2.87it/s] 70%|███████   | 8964/12776 [1:32:58<21:10,  3.00it/s]                                                       70%|███████   | 8964/12776 [1:32:58<21:10,  3.00it/s] 70%|███████   | 8965/12776 [1:32:58<20:23,  3.12it/s]                                                       70%|███████   | 8965/12776 [1:32:58<20:23,  3.12it/s] 70%|███████   | 8966/12776 [1:32:59<21:44,  2.92it/s]                                                       70%|███████   | 8966/12776 [1:32:59<21:44,  2.92it/s] 70%|███████   | 8967/12776 [1:32:59<20:21,  3.12it/s]                                                       70%|███████   | 8967/12776 [1:32:59<20:21,  3.12it/s] 70%|███████   | 8968/12776 [1:32:59<19:14,  3.30it/s]                                                       70%|███████   | 8968/12776 [1:32:59<19:14,  3.30it/s] 70%|███████   | 8969/12776 [1:32:59<18:18,  3.46it/s]                                                       70%|███████   | 8969/12776 [1:32:59<18:18,  3.46it/s] 70%|███████   | 8970/12776 [1:33:00<19:01,  3.33it/s]                                                       70%|███████   | 8970/12776 [1:33:00<19:01,  3.33it/s] 70%|███████   | 8971/12776 [1:33:00<18:00,  3.52it/s]                                                       70%|███████   | 8971/12776 [1:33:00<18:00,  3.52it/s] 70%|███████   | 8972/12776 [1:33:00<17:13,  3.68it/s]                                                       70%|███████   | 8972/12776 [1:33:00<17:13,  3.68it/s] 70%|███████   | 8973/12776 [1:33:01<16:29,  3.84it/s]                                                       70%|███████   | 8973/12776 [1:33:01<16:29,  3.84it/s] 70%|███████   | 8974/12776 [1:33:01<15:55,  3.98it/s]                                                       70%|███████   | 8974/12776 [1:33:01<15:55,  3.98it/s] 70%|███████   | 8975/12776 [1:33:01<17:02,  3.72it/s]                                                       70%|███████   | 8975/12776 [1:33:01<17:02,  3.72it/s] 70%|███████   | 8976/12776 [1:33:01<16:04,  3.94it/s]                                                       70%|███████   | 8976/12776 [1:33:01<16:04,  3.94it/s] 70%|███████   | 8977/12776 [1:33:02<15:17,  4.14it/s]                                                       70%|███████   | 8977/12776 [1:33:02<15:17,  4.14it/s] 70%|███████   | 8978/12776 [1:33:02<14:38,  4.32it/s]                                                       70%|███████   | 8978/12776 [1:33:02<14:38,  4.32it/s] 70%|███████   | 8979/12776 [1:33:02<14:12,  4.45it/s]                                                       70%|███████   | 8979/12776 [1:33:02<14:12,  4.45it/s] 70%|███████   | 8980/12776 [1:33:02<15:01,  4.21it/s]                                                       70%|███████   | 8980/12776 [1:33:02<15:01,  4.21it/s] 70%|███████   | 8981/12776 [1:33:02<14:18,  4.42it/s]                                                       70%|███████   | 8981/12776 [1:33:02<14:18,  4.42it/s] 70%|███████   | 8982/12776 [1:33:03<13:46,  4.59it/s]                                                       70%|███████   | 8982/12776 [1:33:03<13:46,  4.59it/s] 70%|███████   | 8983/12776 [1:33:03<13:21,  4.73it/s]                                                       70%|███████   | 8983/12776 [1:33:03<13:21,  4.73it/s] 70%|███████   | 8984/12776 [1:33:03<12:59,  4.86it/s]                                                       70%|███████   | 8984/12776 [1:33:03<12:59,  4.86it/s] 70%|███████   | 8985/12776 [1:33:03<14:54,  4.24it/s]                                                       70%|███████   | 8985/12776 [1:33:03<14:54,  4.24it/s] 70%|███████   | 8986/12776 [1:33:03<13:55,  4.54it/s]                                                       70%|███████   | 8986/12776 [1:33:03<13:55,  4.54it/s] 70%|███████   | 8987/12776 [1:33:04<13:14,  4.77it/s]                                                       70%|███████   | 8987/12776 [1:33:04<13:14,  4.77it/s] 70%|███████   | 8988/12776 [1:33:04<23:58,  2.63it/s]                                                       70%|███████   | 8988/12776 [1:33:04<23:58,  2.63it/s] 70%|███████   | 8989/12776 [1:33:06<45:19,  1.39it/s]                                                       70%|███████   | 8989/12776 [1:33:06<45:19,  1.39it/s] 70%|███████   | 8990/12776 [1:33:07<52:18,  1.21it/s]                                                       70%|███████   | 8990/12776 [1:33:07<52:18,  1.21it/s] 70%|███████   | 8991/12776 [1:33:08<52:23,  1.20it/s]                                                       70%|███████   | 8991/12776 [1:33:08<52:23,  1.20it/s] 70%|███████   | 8992/12776 [1:33:09<51:09,  1.23it/s]                                                       70%|███████   | 8992/12776 [1:33:09<51:09,  1.23it/s] 70%|███████   | 8993/12776 [1:33:09<51:09,  1.23it/s]                                                       70%|███████   | 8993/12776 [1:33:09<51:09,  1.23it/s] 70%|███████   | 8994/12776 [1:33:10<49:27,  1.27it/s]                                                       70%|███████   | 8994/12776 [1:33:10<49:27,  1.27it/s] 70%|███████   | 8995/12776 [1:33:11<46:19,  1.36it/s]                                                       70%|███████   | 8995/12776 [1:33:11<46:19,  1.36it/s] 70%|███████   | 8996/12776 [1:33:11<43:02,  1.46it/s]                                                       70%|███████   | 8996/12776 [1:33:11<43:02,  1.46it/s] 70%|███████   | 8997/12776 [1:33:12<40:51,  1.54it/s]                                                       70%|███████   | 8997/12776 [1:33:12<40:51,  1.54it/s] 70%|███████   | 8998/12776 [1:33:12<38:48,  1.62it/s]                                                       70%|███████   | 8998/12776 [1:33:12<38:48,  1.62it/s] 70%|███████   | 8999/12776 [1:33:13<36:58,  1.70it/s]                                                       70%|███████   | 8999/12776 [1:33:13<36:58,  1.70it/s] 70%|███████   | 9000/12776 [1:33:14<38:52,  1.62it/s]                                                       70%|███████   | 9000/12776 [1:33:14<38:52,  1.62it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.9021, 'grad_norm': 1.411201000213623, 'learning_rate': 9.428152492668621e-05, 'epoch': 1.4}
+{'loss': 1.0022, 'grad_norm': 2.165524482727051, 'learning_rate': 9.425708699902246e-05, 'epoch': 1.4}
+{'loss': 1.6909, 'grad_norm': 1.9300085306167603, 'learning_rate': 9.423264907135874e-05, 'epoch': 1.4}
+{'loss': 0.7395, 'grad_norm': 2.606879711151123, 'learning_rate': 9.4208211143695e-05, 'epoch': 1.4}
+{'loss': 1.3278, 'grad_norm': 2.456814765930176, 'learning_rate': 9.418377321603127e-05, 'epoch': 1.4}
+{'loss': 0.3545, 'grad_norm': 1.9723178148269653, 'learning_rate': 9.415933528836755e-05, 'epoch': 1.4}
+{'loss': 0.4407, 'grad_norm': 1.6386150121688843, 'learning_rate': 9.41348973607038e-05, 'epoch': 1.4}
+{'loss': 0.6822, 'grad_norm': 1.4594956636428833, 'learning_rate': 9.411045943304007e-05, 'epoch': 1.4}
+{'loss': 0.9911, 'grad_norm': 2.2683069705963135, 'learning_rate': 9.408602150537634e-05, 'epoch': 1.4}
+{'loss': 1.4219, 'grad_norm': 3.4247801303863525, 'learning_rate': 9.40615835777126e-05, 'epoch': 1.4}
+{'loss': 0.2495, 'grad_norm': 0.4289674758911133, 'learning_rate': 9.403714565004886e-05, 'epoch': 1.4}
+{'loss': 0.2006, 'grad_norm': 0.4746815860271454, 'learning_rate': 9.401270772238514e-05, 'epoch': 1.4}
+{'loss': 0.1976, 'grad_norm': 0.42531949281692505, 'learning_rate': 9.39882697947214e-05, 'epoch': 1.4}
+{'loss': 0.2068, 'grad_norm': 0.8540092706680298, 'learning_rate': 9.396383186705765e-05, 'epoch': 1.4}
+{'loss': 0.242, 'grad_norm': 0.6693518757820129, 'learning_rate': 9.393939393939393e-05, 'epoch': 1.4}
+{'loss': 0.335, 'grad_norm': 0.9643524289131165, 'learning_rate': 9.39149560117302e-05, 'epoch': 1.4}
+{'loss': 0.222, 'grad_norm': 0.5744693279266357, 'learning_rate': 9.389051808406646e-05, 'epoch': 1.4}
+{'loss': 0.3107, 'grad_norm': 1.0014822483062744, 'learning_rate': 9.386608015640274e-05, 'epoch': 1.4}
+{'loss': 0.3322, 'grad_norm': 0.8832296133041382, 'learning_rate': 9.384164222873899e-05, 'epoch': 1.4}
+{'loss': 0.254, 'grad_norm': 0.46709567308425903, 'learning_rate': 9.381720430107526e-05, 'epoch': 1.4}
+{'loss': 0.3261, 'grad_norm': 1.0916463136672974, 'learning_rate': 9.379276637341154e-05, 'epoch': 1.4}
+{'loss': 0.4645, 'grad_norm': 1.3483251333236694, 'learning_rate': 9.376832844574779e-05, 'epoch': 1.4}
+{'loss': 0.3502, 'grad_norm': 1.411783218383789, 'learning_rate': 9.374389051808405e-05, 'epoch': 1.4}
+{'loss': 0.2547, 'grad_norm': 0.6709024310112, 'learning_rate': 9.371945259042033e-05, 'epoch': 1.4}
+{'loss': 0.2641, 'grad_norm': 0.7980188727378845, 'learning_rate': 9.36950146627566e-05, 'epoch': 1.4}
+{'loss': 0.5429, 'grad_norm': 1.5928226709365845, 'learning_rate': 9.367057673509285e-05, 'epoch': 1.4}
+{'loss': 1.0404, 'grad_norm': 2.243115186691284, 'learning_rate': 9.364613880742912e-05, 'epoch': 1.4}
+{'loss': 0.5766, 'grad_norm': 2.163041591644287, 'learning_rate': 9.362170087976539e-05, 'epoch': 1.4}
+{'loss': 0.5016, 'grad_norm': 1.891203761100769, 'learning_rate': 9.359726295210165e-05, 'epoch': 1.4}
+{'loss': 0.3276, 'grad_norm': 0.8651189208030701, 'learning_rate': 9.357282502443793e-05, 'epoch': 1.4}
+{'loss': 0.6193, 'grad_norm': 2.448225736618042, 'learning_rate': 9.354838709677418e-05, 'epoch': 1.4}
+{'loss': 0.4568, 'grad_norm': 2.657435178756714, 'learning_rate': 9.352394916911045e-05, 'epoch': 1.4}
+{'loss': 1.0414, 'grad_norm': 3.460101842880249, 'learning_rate': 9.349951124144673e-05, 'epoch': 1.4}
+{'loss': 0.6641, 'grad_norm': 1.3931571245193481, 'learning_rate': 9.347507331378298e-05, 'epoch': 1.4}
+{'loss': 0.8282, 'grad_norm': 1.7879291772842407, 'learning_rate': 9.345063538611924e-05, 'epoch': 1.4}
+{'loss': 0.7365, 'grad_norm': 1.4838361740112305, 'learning_rate': 9.342619745845552e-05, 'epoch': 1.4}
+{'loss': 0.24, 'grad_norm': 1.6955534219741821, 'learning_rate': 9.340175953079179e-05, 'epoch': 1.4}
+{'loss': 0.7978, 'grad_norm': 2.0134198665618896, 'learning_rate': 9.337732160312804e-05, 'epoch': 1.4}
+{'loss': 0.6308, 'grad_norm': 1.6322916746139526, 'learning_rate': 9.335288367546432e-05, 'epoch': 1.4}
+{'loss': 0.504, 'grad_norm': 1.2652108669281006, 'learning_rate': 9.332844574780058e-05, 'epoch': 1.4}
+{'loss': 0.3422, 'grad_norm': 1.0733211040496826, 'learning_rate': 9.330400782013684e-05, 'epoch': 1.4}
+{'loss': 0.3383, 'grad_norm': 1.4503707885742188, 'learning_rate': 9.327956989247311e-05, 'epoch': 1.4}
+{'loss': 0.7733, 'grad_norm': 1.633813738822937, 'learning_rate': 9.325513196480937e-05, 'epoch': 1.4}
+{'loss': 0.3886, 'grad_norm': 1.3477745056152344, 'learning_rate': 9.323069403714564e-05, 'epoch': 1.4}
+{'loss': 0.7784, 'grad_norm': 2.0735232830047607, 'learning_rate': 9.320625610948192e-05, 'epoch': 1.4}
+{'loss': 0.88, 'grad_norm': 2.603829860687256, 'learning_rate': 9.318181818181817e-05, 'epoch': 1.4}
+{'loss': 0.9435, 'grad_norm': 1.9070336818695068, 'learning_rate': 9.315738025415443e-05, 'epoch': 1.4}
+{'loss': 0.9925, 'grad_norm': 1.9831008911132812, 'learning_rate': 9.313294232649071e-05, 'epoch': 1.41}
+{'loss': 1.2224, 'grad_norm': 2.742797374725342, 'learning_rate': 9.310850439882698e-05, 'epoch': 1.41}
+{'loss': 0.9814, 'grad_norm': 2.2004201412200928, 'learning_rate': 9.308406647116323e-05, 'epoch': 1.41}
+{'loss': 1.2202, 'grad_norm': 2.0755558013916016, 'learning_rate': 9.30596285434995e-05, 'epoch': 1.41}
+{'loss': 0.8416, 'grad_norm': 2.546623706817627, 'learning_rate': 9.303519061583577e-05, 'epoch': 1.41}
+{'loss': 1.7262, 'grad_norm': 4.5670485496521, 'learning_rate': 9.301075268817204e-05, 'epoch': 1.41}
+{'loss': 1.209, 'grad_norm': 1.8530974388122559, 'learning_rate': 9.29863147605083e-05, 'epoch': 1.41}
+{'loss': 1.0305, 'grad_norm': 1.714417815208435, 'learning_rate': 9.296187683284457e-05, 'epoch': 1.41}
+{'loss': 0.7601, 'grad_norm': 3.0805559158325195, 'learning_rate': 9.293743890518083e-05, 'epoch': 1.41}
+{'loss': 0.1582, 'grad_norm': 1.3468403816223145, 'learning_rate': 9.291300097751711e-05, 'epoch': 1.41}
+{'loss': 0.3851, 'grad_norm': 1.245559573173523, 'learning_rate': 9.288856304985336e-05, 'epoch': 1.41}
+{'loss': 0.5471, 'grad_norm': 2.107041358947754, 'learning_rate': 9.286412512218962e-05, 'epoch': 1.41}
+{'loss': 0.5434, 'grad_norm': 1.9285342693328857, 'learning_rate': 9.28396871945259e-05, 'epoch': 1.41}
+{'loss': 0.222, 'grad_norm': 0.39765217900276184, 'learning_rate': 9.281524926686217e-05, 'epoch': 1.41}
+{'loss': 0.2276, 'grad_norm': 0.596780002117157, 'learning_rate': 9.279081133919842e-05, 'epoch': 1.41}
+{'loss': 0.196, 'grad_norm': 0.4721631705760956, 'learning_rate': 9.27663734115347e-05, 'epoch': 1.41}
+{'loss': 0.4844, 'grad_norm': 2.310553550720215, 'learning_rate': 9.274193548387096e-05, 'epoch': 1.41}
+{'loss': 0.2752, 'grad_norm': 0.47364363074302673, 'learning_rate': 9.271749755620723e-05, 'epoch': 1.41}
+{'loss': 0.4011, 'grad_norm': 1.2700831890106201, 'learning_rate': 9.269305962854349e-05, 'epoch': 1.41}
+{'loss': 0.2654, 'grad_norm': 0.6444479823112488, 'learning_rate': 9.266862170087976e-05, 'epoch': 1.41}
+{'loss': 0.3546, 'grad_norm': 0.6023876667022705, 'learning_rate': 9.264418377321602e-05, 'epoch': 1.41}
+{'loss': 0.249, 'grad_norm': 0.7151503562927246, 'learning_rate': 9.26197458455523e-05, 'epoch': 1.41}
+{'loss': 0.2368, 'grad_norm': 0.5194156765937805, 'learning_rate': 9.259530791788855e-05, 'epoch': 1.41}
+{'loss': 0.2769, 'grad_norm': 0.761722207069397, 'learning_rate': 9.257086999022482e-05, 'epoch': 1.41}
+{'loss': 0.2916, 'grad_norm': 0.7770941853523254, 'learning_rate': 9.25464320625611e-05, 'epoch': 1.41}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:04,  6.21it/s][A
+  0%|          | 3/774 [00:00<02:47,  4.60it/s][A
+  1%|          | 4/774 [00:00<03:23,  3.79it/s][A
+  1%|          | 5/774 [00:01<03:20,  3.84it/s][A
+  1%|          | 6/774 [00:01<03:31,  3.63it/s][A
+  1%|          | 7/774 [00:01<03:28,  3.69it/s][A
+  1%|          | 8/774 [00:02<03:30,  3.65it/s][A
+  1%|          | 9/774 [00:02<03:18,  3.85it/s][A
+  1%|▏         | 10/774 [00:02<03:18,  3.86it/s][A
+  1%|▏         | 11/774 [00:02<03:33,  3.58it/s][A
+  2%|▏         | 12/774 [00:03<03:17,  3.85it/s][A
+  2%|▏         | 13/774 [00:03<03:10,  4.00it/s][A
+  2%|▏         | 14/774 [00:03<03:22,  3.75it/s][A
+  2%|▏         | 15/774 [00:03<03:40,  3.44it/s][A
+  2%|▏         | 16/774 [00:04<03:37,  3.49it/s][A
+  2%|▏         | 17/774 [00:04<03:14,  3.89it/s][A
+  2%|▏         | 18/774 [00:04<03:07,  4.04it/s][A
+  2%|▏         | 19/774 [00:04<03:16,  3.83it/s][A
+  3%|▎         | 20/774 [00:05<03:14,  3.88it/s][A
+  3%|▎         | 21/774 [00:05<03:16,  3.84it/s][A
+  3%|▎         | 22/774 [00:05<03:21,  3.73it/s][A
+  3%|▎         | 23/774 [00:06<03:33,  3.51it/s][A
+  3%|▎         | 24/774 [00:06<03:32,  3.53it/s][A
+  3%|▎         | 25/774 [00:06<03:37,  3.45it/s][A
+  3%|▎         | 26/774 [00:06<03:36,  3.46it/s][A
+  3%|▎         | 27/774 [00:07<03:35,  3.47it/s][A
+  4%|▎         | 28/774 [00:07<03:41,  3.37it/s][A
+  4%|▎         | 29/774 [00:07<03:44,  3.32it/s][A
+  4%|▍         | 30/774 [00:08<03:32,  3.51it/s][A
+  4%|▍         | 31/774 [00:08<03:32,  3.50it/s][A
+  4%|▍         | 32/774 [00:08<04:27,  2.77it/s][A
+  4%|▍         | 33/774 [00:09<04:09,  2.97it/s][A
+  4%|▍         | 34/774 [00:09<03:51,  3.19it/s][A
+  5%|▍         | 35/774 [00:09<03:56,  3.13it/s][A
+  5%|▍         | 36/774 [00:10<03:52,  3.18it/s][A
+  5%|▍         | 37/774 [00:10<03:51,  3.18it/s][A
+  5%|▍         | 38/774 [00:10<03:40,  3.33it/s][A
+  5%|▌         | 39/774 [00:10<03:25,  3.57it/s][A
+  5%|▌         | 40/774 [00:11<03:29,  3.51it/s][A
+  5%|▌         | 41/774 [00:11<03:26,  3.56it/s][A
+  5%|▌         | 42/774 [00:11<03:13,  3.78it/s][A
+  6%|▌         | 43/774 [00:12<03:26,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:29,  3.49it/s][A
+  6%|▌         | 45/774 [00:12<03:16,  3.71it/s][A
+  6%|▌         | 46/774 [00:12<03:01,  4.00it/s][A
+  6%|▌         | 47/774 [00:12<02:51,  4.24it/s][A
+  6%|▌         | 48/774 [00:13<02:53,  4.18it/s][A
+  6%|▋         | 49/774 [00:13<02:55,  4.14it/s][A
+  6%|▋         | 50/774 [00:13<02:55,  4.12it/s][A
+  7%|▋         | 51/774 [00:13<02:57,  4.08it/s][A
+  7%|▋         | 52/774 [00:14<02:56,  4.09it/s][A
+  7%|▋         | 53/774 [00:14<03:05,  3.88it/s][A
+  7%|▋         | 54/774 [00:14<03:09,  3.80it/s][A
+  7%|▋         | 55/774 [00:15<03:18,  3.62it/s][A
+  7%|▋         | 56/774 [00:15<03:18,  3.61it/s][A
+  7%|▋         | 57/774 [00:15<03:25,  3.48it/s][A
+  7%|▋         | 58/774 [00:15<03:25,  3.49it/s][A
+  8%|▊         | 59/774 [00:16<03:09,  3.78it/s][A
+  8%|▊         | 60/774 [00:16<02:55,  4.08it/s][A
+  8%|▊         | 61/774 [00:16<02:33,  4.65it/s][A
+  8%|▊         | 62/774 [00:16<02:31,  4.70it/s][A
+  8%|▊         | 63/774 [00:17<02:56,  4.02it/s][A
+  8%|▊         | 64/774 [00:17<02:47,  4.24it/s][A
+  8%|▊         | 65/774 [00:17<02:48,  4.20it/s][A
+  9%|▊         | 66/774 [00:17<02:46,  4.26it/s][A
+  9%|▊         | 67/774 [00:17<02:40,  4.41it/s][A
+  9%|▉         | 68/774 [00:18<02:36,  4.51it/s][A
+  9%|▉         | 69/774 [00:18<02:26,  4.81it/s][A
+  9%|▉         | 70/774 [00:18<02:34,  4.56it/s][A
+  9%|▉         | 71/774 [00:18<02:29,  4.70it/s][A
+  9%|▉         | 72/774 [00:19<02:40,  4.37it/s][A
+  9%|▉         | 73/774 [00:19<02:50,  4.11it/s][A
+ 10%|▉         | 74/774 [00:19<02:56,  3.96it/s][A
+ 10%|▉         | 75/774 [00:19<03:03,  3.81it/s][A
+ 10%|▉         | 76/774 [00:20<02:59,  3.89it/s][A
+ 10%|▉         | 77/774 [00:20<03:12,  3.62it/s][A
+ 10%|█         | 78/774 [00:20<02:53,  4.02it/s][A
+ 10%|█         | 79/774 [00:20<02:41,  4.32it/s][A
+ 10%|█         | 80/774 [00:21<02:38,  4.39it/s][A
+ 10%|█         | 81/774 [00:21<02:16,  5.07it/s][A
+ 11%|█         | 82/774 [00:21<02:16,  5.06it/s][A
+ 11%|█         | 83/774 [00:21<02:20,  4.92it/s][A
+ 11%|█         | 84/774 [00:21<02:25,  4.75it/s][A
+ 11%|█         | 85/774 [00:22<02:34,  4.45it/s][A
+ 11%|█         | 86/774 [00:22<02:42,  4.24it/s][A
+ 11%|█         | 87/774 [00:22<02:43,  4.19it/s][A
+ 11%|█▏        | 88/774 [00:22<02:32,  4.50it/s][A
+ 11%|█▏        | 89/774 [00:22<02:25,  4.70it/s][A
+ 12%|█▏        | 90/774 [00:23<02:34,  4.44it/s][A
+ 12%|█▏        | 91/774 [00:23<02:48,  4.05it/s][A
+ 12%|█▏        | 92/774 [00:23<03:03,  3.72it/s][A
+ 12%|█▏        | 93/774 [00:24<02:59,  3.80it/s][A
+ 12%|█▏        | 94/774 [00:24<03:02,  3.73it/s][A
+ 12%|█▏        | 95/774 [00:24<03:00,  3.76it/s][A
+ 12%|█▏        | 96/774 [00:24<02:55,  3.86it/s][A
+ 13%|█▎        | 97/774 [00:25<02:40,  4.22it/s][A
+ 13%|█▎        | 98/774 [00:25<02:33,  4.40it/s][A
+ 13%|█▎        | 99/774 [00:25<02:45,  4.07it/s][A
+ 13%|█▎        | 100/774 [00:25<02:57,  3.80it/s][A
+ 13%|█▎        | 101/774 [00:26<03:03,  3.67it/s][A
+ 13%|█▎        | 102/774 [00:26<03:15,  3.45it/s][A
+ 13%|█▎        | 103/774 [00:26<03:17,  3.40it/s][A
+ 13%|█▎        | 104/774 [00:27<03:16,  3.41it/s][A
+ 14%|█▎        | 105/774 [00:27<03:16,  3.40it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.12it/s][A
+ 14%|█▍        | 107/774 [00:28<03:46,  2.95it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.06it/s][A
+ 14%|█▍        | 109/774 [00:28<03:34,  3.09it/s][A
+ 14%|█▍        | 110/774 [00:28<03:24,  3.24it/s][A
+ 14%|█▍        | 111/774 [00:29<03:23,  3.26it/s][A
+ 14%|█▍        | 112/774 [00:29<03:12,  3.43it/s][A
+ 15%|█▍        | 113/774 [00:29<03:17,  3.34it/s][A
+ 15%|█▍        | 114/774 [00:30<03:21,  3.27it/s][A
+ 15%|█▍        | 115/774 [00:30<03:16,  3.36it/s][A
+ 15%|█▍        | 116/774 [00:30<03:00,  3.64it/s][A
+ 15%|█▌        | 117/774 [00:31<03:07,  3.51it/s][A
+ 15%|█▌        | 118/774 [00:31<03:04,  3.56it/s][A
+ 15%|█▌        | 119/774 [00:31<02:56,  3.70it/s][A
+ 16%|█▌        | 120/774 [00:31<03:07,  3.49it/s][A
+ 16%|█▌        | 121/774 [00:32<03:02,  3.58it/s][A
+ 16%|█▌        | 122/774 [00:32<03:04,  3.54it/s][A
+ 16%|█▌        | 123/774 [00:32<02:56,  3.69it/s][A
+ 16%|█▌        | 124/774 [00:32<02:58,  3.64it/s][A
+ 16%|█▌        | 125/774 [00:33<03:00,  3.60it/s][A
+ 16%|█▋        | 126/774 [00:33<03:07,  3.45it/s][A
+ 16%|█▋        | 127/774 [00:33<03:17,  3.27it/s][A
+ 17%|█▋        | 128/774 [00:34<03:09,  3.41it/s][A
+ 17%|█▋        | 129/774 [00:34<03:13,  3.34it/s][A
+ 17%|█▋        | 130/774 [00:34<03:17,  3.26it/s][A
+ 17%|█▋        | 131/774 [00:35<03:08,  3.42it/s][A
+ 17%|█▋        | 132/774 [00:35<03:08,  3.41it/s][A
+ 17%|█▋        | 133/774 [00:35<03:05,  3.46it/s][A
+ 17%|█▋        | 134/774 [00:35<03:03,  3.48it/s][A
+ 17%|█▋        | 135/774 [00:36<03:20,  3.18it/s][A
+ 18%|█▊        | 136/774 [00:36<03:28,  3.06it/s][A
+ 18%|█▊        | 137/774 [00:36<03:27,  3.06it/s][A
+ 18%|█▊        | 138/774 [00:37<03:22,  3.15it/s][A
+ 18%|█▊        | 139/774 [00:37<03:22,  3.13it/s][A
+ 18%|█▊        | 140/774 [00:37<03:18,  3.19it/s][A
+ 18%|█▊        | 141/774 [00:38<03:10,  3.32it/s][A
+ 18%|█▊        | 142/774 [00:38<03:20,  3.15it/s][A
+ 18%|█▊        | 143/774 [00:38<03:17,  3.20it/s][A
+ 19%|█▊        | 144/774 [00:39<03:06,  3.37it/s][A
+ 19%|█▊        | 145/774 [00:39<02:59,  3.50it/s][A
+ 19%|█▉        | 146/774 [00:39<02:47,  3.76it/s][A
+ 19%|█▉        | 147/774 [00:39<02:38,  3.95it/s][A
+ 19%|█▉        | 148/774 [00:40<02:48,  3.71it/s][A
+ 19%|█▉        | 149/774 [00:40<03:01,  3.44it/s][A
+ 19%|█▉        | 150/774 [00:40<03:03,  3.40it/s][A
+ 20%|█▉        | 151/774 [00:40<02:54,  3.58it/s][A
+ 20%|█▉        | 152/774 [00:41<02:45,  3.75it/s][A
+ 20%|█▉        | 153/774 [00:41<02:53,  3.58it/s][A
+ 20%|█▉        | 154/774 [00:41<02:48,  3.67it/s][A
+ 20%|██        | 155/774 [00:42<02:45,  3.74it/s][A
+ 20%|██        | 156/774 [00:42<02:40,  3.86it/s][A
+ 20%|██        | 157/774 [00:42<02:33,  4.02it/s][A
+ 20%|██        | 158/774 [00:42<02:37,  3.91it/s][A
+ 21%|██        | 159/774 [00:43<02:39,  3.86it/s][A
+ 21%|██        | 160/774 [00:43<02:31,  4.06it/s][A
+ 21%|██        | 161/774 [00:43<02:40,  3.81it/s][A
+ 21%|██        | 162/774 [00:43<02:47,  3.66it/s][A
+ 21%|██        | 163/774 [00:44<02:45,  3.69it/s][A
+ 21%|██        | 164/774 [00:44<02:38,  3.84it/s][A
+ 21%|██▏       | 165/774 [00:44<02:36,  3.88it/s][A
+ 21%|██▏       | 166/774 [00:44<02:40,  3.78it/s][A
+ 22%|██▏       | 167/774 [00:45<02:43,  3.71it/s][A
+ 22%|██▏       | 168/774 [00:45<02:34,  3.93it/s][A
+ 22%|██▏       | 169/774 [00:45<02:26,  4.12it/s][A
+ 22%|██▏       | 170/774 [00:45<02:35,  3.89it/s][A
+ 22%|██▏       | 171/774 [00:46<02:46,  3.62it/s][A
+ 22%|██▏       | 172/774 [00:46<02:54,  3.46it/s][A
+ 22%|██▏       | 173/774 [00:46<02:50,  3.53it/s][A
+ 22%|██▏       | 174/774 [00:47<02:42,  3.69it/s][A
+ 23%|██▎       | 175/774 [00:47<02:43,  3.65it/s][A
+ 23%|██▎       | 176/774 [00:47<02:37,  3.80it/s][A
+ 23%|██▎       | 177/774 [00:47<02:49,  3.52it/s][A
+ 23%|██▎       | 178/774 [00:48<02:35,  3.83it/s][A
+ 23%|██▎       | 179/774 [00:48<02:22,  4.17it/s][A
+ 23%|██▎       | 180/774 [00:48<02:17,  4.33it/s][A
+ 23%|██▎       | 181/774 [00:48<02:20,  4.22it/s][A
+ 24%|██▎       | 182/774 [00:48<02:23,  4.12it/s][A
+ 24%|██▎       | 183/774 [00:49<02:24,  4.09it/s][A
+ 24%|██▍       | 184/774 [00:49<02:34,  3.81it/s][A
+ 24%|██▍       | 185/774 [00:49<02:42,  3.62it/s][A
+ 24%|██▍       | 186/774 [00:50<02:39,  3.68it/s][A
+ 24%|██▍       | 187/774 [00:50<02:33,  3.82it/s][A
+ 24%|██▍       | 188/774 [00:50<02:32,  3.83it/s][A
+ 24%|██▍       | 189/774 [00:50<02:31,  3.87it/s][A
+ 25%|██▍       | 190/774 [00:51<02:25,  4.00it/s][A
+ 25%|██▍       | 191/774 [00:51<02:30,  3.87it/s][A
+ 25%|██▍       | 192/774 [00:51<02:35,  3.74it/s][A
+ 25%|██▍       | 193/774 [00:51<02:38,  3.65it/s][A
+ 25%|██▌       | 194/774 [00:52<02:48,  3.45it/s][A
+ 25%|██▌       | 195/774 [00:52<02:55,  3.29it/s][A
+ 25%|██▌       | 196/774 [00:52<02:56,  3.27it/s][A
+ 25%|██▌       | 197/774 [00:53<02:54,  3.31it/s][A
+ 26%|██▌       | 198/774 [00:53<02:45,  3.49it/s][A
+ 26%|██▌       | 199/774 [00:53<02:44,  3.50it/s][A
+ 26%|██▌       | 200/774 [00:54<02:39,  3.60it/s][A
+ 26%|██▌       | 201/774 [00:54<02:36,  3.66it/s][A
+ 26%|██▌       | 202/774 [00:54<02:33,  3.72it/s][A
+ 26%|██▌       | 203/774 [00:54<02:27,  3.88it/s][A
+ 26%|██▋       | 204/774 [00:55<02:30,  3.79it/s][A
+ 26%|██▋       | 205/774 [00:55<02:40,  3.55it/s][A
+ 27%|██▋       | 206/774 [00:55<02:36,  3.64it/s][A
+ 27%|██▋       | 207/774 [00:55<02:33,  3.70it/s][A
+ 27%|██▋       | 208/774 [00:56<02:33,  3.68it/s][A
+ 27%|██▋       | 209/774 [00:56<02:32,  3.71it/s][A
+ 27%|██▋       | 210/774 [00:56<02:31,  3.73it/s][A
+ 27%|██▋       | 211/774 [00:56<02:27,  3.81it/s][A
+ 27%|██▋       | 212/774 [00:57<02:15,  4.13it/s][A
+ 28%|██▊       | 213/774 [00:57<02:01,  4.63it/s][A
+ 28%|██▊       | 214/774 [00:57<02:03,  4.54it/s][A
+ 28%|██▊       | 215/774 [00:57<02:03,  4.52it/s][A
+ 28%|██▊       | 216/774 [00:57<02:02,  4.56it/s][A
+ 28%|██▊       | 217/774 [00:58<02:05,  4.43it/s][A
+ 28%|██▊       | 218/774 [00:58<02:11,  4.22it/s][A
+ 28%|██▊       | 219/774 [00:58<02:21,  3.93it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.96it/s][A
+ 29%|██▊       | 221/774 [00:59<02:24,  3.82it/s][A
+ 29%|██▊       | 222/774 [00:59<02:33,  3.59it/s][A
+ 29%|██▉       | 223/774 [00:59<02:50,  3.24it/s][A
+ 29%|██▉       | 224/774 [01:00<02:59,  3.06it/s][A
+ 29%|██▉       | 225/774 [01:00<03:10,  2.87it/s][A
+ 29%|██▉       | 226/774 [01:01<03:14,  2.81it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.85it/s][A
+ 29%|██▉       | 228/774 [01:01<03:02,  2.99it/s][A
+ 30%|██▉       | 229/774 [01:02<03:18,  2.75it/s][A
+ 30%|██▉       | 230/774 [01:02<03:04,  2.95it/s][A
+ 30%|██▉       | 231/774 [01:02<03:01,  2.99it/s][A
+ 30%|██▉       | 232/774 [01:03<02:52,  3.14it/s][A
+ 30%|███       | 233/774 [01:03<03:08,  2.87it/s][A
+ 30%|███       | 234/774 [01:03<03:12,  2.81it/s][A
+ 30%|███       | 235/774 [01:04<03:09,  2.84it/s][A
+ 30%|███       | 236/774 [01:04<03:14,  2.76it/s][A
+ 31%|███       | 237/774 [01:04<03:12,  2.80it/s][A
+ 31%|███       | 238/774 [01:05<03:02,  2.94it/s][A
+ 31%|███       | 239/774 [01:05<03:00,  2.96it/s][A
+ 31%|███       | 240/774 [01:05<03:00,  2.96it/s][A
+ 31%|███       | 241/774 [01:06<03:03,  2.91it/s][A
+ 31%|███▏      | 242/774 [01:06<03:22,  2.62it/s][A
+ 31%|███▏      | 243/774 [01:07<03:29,  2.53it/s][A
+ 32%|███▏      | 244/774 [01:07<03:22,  2.62it/s][A
+ 32%|███▏      | 245/774 [01:07<03:12,  2.75it/s][A
+ 32%|███▏      | 246/774 [01:08<03:10,  2.78it/s][A
+ 32%|███▏      | 247/774 [01:08<03:46,  2.33it/s][A
+ 32%|███▏      | 248/774 [01:09<03:51,  2.27it/s][A
+ 32%|███▏      | 249/774 [01:09<03:27,  2.53it/s][A
+ 32%|███▏      | 250/774 [01:09<03:20,  2.61it/s][A
+ 32%|███▏      | 251/774 [01:10<03:18,  2.64it/s][A
+ 33%|███▎      | 252/774 [01:10<03:14,  2.69it/s][A
+ 33%|███▎      | 253/774 [01:10<03:12,  2.71it/s][A
+ 33%|███▎      | 254/774 [01:11<03:07,  2.77it/s][A
+ 33%|███▎      | 255/774 [01:11<03:03,  2.83it/s][A
+ 33%|███▎      | 256/774 [01:11<02:58,  2.89it/s][A
+ 33%|███▎      | 257/774 [01:12<02:55,  2.94it/s][A
+ 33%|███▎      | 258/774 [01:12<02:41,  3.20it/s][A
+ 33%|███▎      | 259/774 [01:12<02:23,  3.58it/s][A
+ 34%|███▎      | 260/774 [01:13<02:23,  3.59it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.46it/s][A
+ 34%|███▍      | 262/774 [01:13<02:13,  3.84it/s][A
+ 34%|███▍      | 263/774 [01:13<02:05,  4.06it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.76it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.92it/s][A
+ 34%|███▍      | 266/774 [01:14<02:03,  4.13it/s][A
+ 34%|███▍      | 267/774 [01:14<02:01,  4.17it/s][A
+ 35%|███▍      | 268/774 [01:15<02:08,  3.93it/s][A
+ 35%|███▍      | 269/774 [01:15<02:14,  3.77it/s][A
+ 35%|███▍      | 270/774 [01:15<02:19,  3.61it/s][A
+ 35%|███▌      | 271/774 [01:15<02:16,  3.70it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  4.00it/s][A
+ 35%|███▌      | 273/774 [01:16<02:01,  4.13it/s][A
+ 35%|███▌      | 274/774 [01:16<02:08,  3.90it/s][A
+ 36%|███▌      | 275/774 [01:16<01:59,  4.16it/s][A
+ 36%|███▌      | 276/774 [01:17<01:53,  4.39it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.22it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.14it/s][A
+ 36%|███▌      | 279/774 [01:17<01:53,  4.35it/s][A
+ 36%|███▌      | 280/774 [01:17<01:54,  4.32it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.92it/s][A
+ 36%|███▋      | 282/774 [01:18<02:16,  3.59it/s][A
+ 37%|███▋      | 283/774 [01:18<02:12,  3.71it/s][A
+ 37%|███▋      | 284/774 [01:19<02:12,  3.69it/s][A
+ 37%|███▋      | 285/774 [01:19<02:05,  3.90it/s][A
+ 37%|███▋      | 286/774 [01:19<02:00,  4.05it/s][A
+ 37%|███▋      | 287/774 [01:19<02:12,  3.68it/s][A
+ 37%|███▋      | 288/774 [01:20<02:16,  3.56it/s][A
+ 37%|███▋      | 289/774 [01:20<02:12,  3.65it/s][A
+ 37%|███▋      | 290/774 [01:20<02:08,  3.76it/s][A
+ 38%|███▊      | 291/774 [01:20<02:07,  3.78it/s][A
+ 38%|███▊      | 292/774 [01:21<02:04,  3.86it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.20it/s][A
+ 38%|███▊      | 294/774 [01:21<01:49,  4.38it/s][A
+ 38%|███▊      | 295/774 [01:21<01:48,  4.42it/s][A
+ 38%|███▊      | 296/774 [01:22<01:43,  4.61it/s][A
+ 38%|███▊      | 297/774 [01:22<01:38,  4.85it/s][A
+ 39%|███▊      | 298/774 [01:22<01:43,  4.61it/s][A
+ 39%|███▊      | 299/774 [01:22<01:47,  4.44it/s][A
+ 39%|███▉      | 300/774 [01:22<01:53,  4.17it/s][A
+ 39%|███▉      | 301/774 [01:23<01:46,  4.45it/s][A
+ 39%|███▉      | 302/774 [01:23<01:40,  4.70it/s][A
+ 39%|███▉      | 303/774 [01:23<01:37,  4.83it/s][A
+ 39%|███▉      | 304/774 [01:23<01:26,  5.44it/s][A
+ 39%|███▉      | 305/774 [01:23<01:25,  5.50it/s][A
+ 40%|███▉      | 306/774 [01:24<01:37,  4.80it/s][A
+ 40%|███▉      | 307/774 [01:24<01:42,  4.55it/s][A
+ 40%|███▉      | 308/774 [01:24<01:38,  4.75it/s][A
+ 40%|███▉      | 309/774 [01:24<01:38,  4.71it/s][A
+ 40%|████      | 310/774 [01:25<01:43,  4.48it/s][A
+ 40%|████      | 311/774 [01:25<01:41,  4.54it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.65it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.66it/s][A
+ 41%|████      | 314/774 [01:25<01:40,  4.58it/s][A
+ 41%|████      | 315/774 [01:26<01:49,  4.21it/s][A
+ 41%|████      | 316/774 [01:26<01:40,  4.58it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.91it/s][A
+ 41%|████      | 318/774 [01:26<01:36,  4.72it/s][A
+ 41%|████      | 319/774 [01:26<01:38,  4.61it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.57it/s][A
+ 41%|████▏     | 321/774 [01:27<01:29,  5.04it/s][A
+ 42%|████▏     | 322/774 [01:27<01:24,  5.32it/s][A
+ 42%|████▏     | 323/774 [01:27<01:16,  5.88it/s][A
+ 42%|████▏     | 324/774 [01:27<01:23,  5.37it/s][A
+ 42%|████▏     | 325/774 [01:28<01:28,  5.10it/s][A
+ 42%|████▏     | 326/774 [01:28<01:25,  5.26it/s][A
+ 42%|████▏     | 327/774 [01:28<01:27,  5.09it/s][A
+ 42%|████▏     | 328/774 [01:28<01:25,  5.20it/s][A
+ 43%|████▎     | 329/774 [01:28<01:34,  4.71it/s][A
+ 43%|████▎     | 330/774 [01:29<01:30,  4.91it/s][A
+ 43%|████▎     | 331/774 [01:29<01:22,  5.39it/s][A
+ 43%|████▎     | 332/774 [01:29<01:20,  5.52it/s][A
+ 43%|████▎     | 333/774 [01:29<01:23,  5.31it/s][A
+ 43%|████▎     | 334/774 [01:29<01:27,  5.06it/s][A
+ 43%|████▎     | 335/774 [01:30<01:27,  5.02it/s][A
+ 43%|████▎     | 336/774 [01:30<01:26,  5.05it/s][A
+ 44%|████▎     | 337/774 [01:30<01:20,  5.42it/s][A
+ 44%|████▎     | 338/774 [01:30<01:15,  5.79it/s][A
+ 44%|████▍     | 339/774 [01:30<01:10,  6.17it/s][A
+ 44%|████▍     | 340/774 [01:30<01:10,  6.18it/s][A
+ 44%|████▍     | 341/774 [01:31<01:27,  4.93it/s][A
+ 44%|████▍     | 342/774 [01:31<01:36,  4.46it/s][A
+ 44%|████▍     | 343/774 [01:31<01:37,  4.41it/s][A
+ 44%|████▍     | 344/774 [01:31<01:41,  4.23it/s][A
+ 45%|████▍     | 345/774 [01:32<01:43,  4.13it/s][A
+ 45%|████▍     | 346/774 [01:32<01:46,  4.03it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.13it/s][A
+ 45%|████▍     | 348/774 [01:32<01:38,  4.31it/s][A
+ 45%|████▌     | 349/774 [01:33<01:34,  4.48it/s][A
+ 45%|████▌     | 350/774 [01:33<01:36,  4.38it/s][A
+ 45%|████▌     | 351/774 [01:33<01:37,  4.35it/s][A
+ 45%|████▌     | 352/774 [01:33<01:33,  4.51it/s][A
+ 46%|████▌     | 353/774 [01:33<01:33,  4.48it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.47it/s][A
+ 46%|████▌     | 355/774 [01:34<01:38,  4.25it/s][A
+ 46%|████▌     | 356/774 [01:34<01:48,  3.85it/s][A
+ 46%|████▌     | 357/774 [01:35<02:04,  3.34it/s][A
+ 46%|████▋     | 358/774 [01:35<02:09,  3.22it/s][A
+ 46%|████▋     | 359/774 [01:35<02:08,  3.24it/s][A
+ 47%|████▋     | 360/774 [01:36<02:08,  3.23it/s][A
+ 47%|████▋     | 361/774 [01:36<02:02,  3.37it/s][A
+ 47%|████▋     | 362/774 [01:36<02:08,  3.20it/s][A
+ 47%|████▋     | 363/774 [01:36<02:07,  3.23it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.18it/s][A
+ 47%|████▋     | 365/774 [01:37<02:05,  3.26it/s][A
+ 47%|████▋     | 366/774 [01:37<01:56,  3.51it/s][A
+ 47%|████▋     | 367/774 [01:38<01:49,  3.70it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.77it/s][A
+ 48%|████▊     | 369/774 [01:38<01:55,  3.51it/s][A
+ 48%|████▊     | 370/774 [01:39<02:09,  3.11it/s][A
+ 48%|████▊     | 371/774 [01:39<02:00,  3.35it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.34it/s][A
+ 48%|████▊     | 373/774 [01:39<01:58,  3.37it/s][A
+ 48%|████▊     | 374/774 [01:40<01:56,  3.44it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.46it/s][A
+ 49%|████▊     | 376/774 [01:40<01:59,  3.32it/s][A
+ 49%|████▊     | 377/774 [01:41<02:12,  3.00it/s][A
+ 49%|████▉     | 378/774 [01:41<02:12,  2.99it/s][A
+ 49%|████▉     | 379/774 [01:41<02:03,  3.21it/s][A
+ 49%|████▉     | 380/774 [01:42<01:52,  3.49it/s][A
+ 49%|████▉     | 381/774 [01:42<01:44,  3.75it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.86it/s][A
+ 49%|████▉     | 383/774 [01:42<01:38,  3.96it/s][A
+ 50%|████▉     | 384/774 [01:43<01:46,  3.66it/s][A
+ 50%|████▉     | 385/774 [01:43<01:55,  3.37it/s][A
+ 50%|████▉     | 386/774 [01:43<01:48,  3.59it/s][A
+ 50%|█████     | 387/774 [01:43<01:40,  3.84it/s][A
+ 50%|█████     | 388/774 [01:44<01:46,  3.62it/s][A
+ 50%|█████     | 389/774 [01:44<01:43,  3.74it/s][A
+ 50%|█████     | 390/774 [01:44<01:56,  3.29it/s][A
+ 51%|█████     | 391/774 [01:45<01:59,  3.21it/s][A
+ 51%|█████     | 392/774 [01:45<01:49,  3.50it/s][A
+ 51%|█████     | 393/774 [01:45<01:40,  3.79it/s][A
+ 51%|█████     | 394/774 [01:45<01:40,  3.77it/s][A
+ 51%|█████     | 395/774 [01:46<01:48,  3.51it/s][A
+ 51%|█████     | 396/774 [01:46<01:45,  3.59it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:48,  3.47it/s][A
+ 51%|█████▏    | 398/774 [01:46<01:43,  3.62it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:42,  3.68it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:33,  3.98it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:30,  4.11it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:30,  4.10it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:35,  3.90it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:41,  3.66it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:37,  3.79it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:39,  3.68it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:45,  3.46it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:41,  3.59it/s][A
+ 53%|█████▎    | 409/774 [01:49<01:38,  3.70it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.65it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:39,  3.64it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:40,  3.59it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:38,  3.66it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:36,  3.75it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:24,  4.23it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:25,  4.19it/s][A
+ 54%|█████▍    | 417/774 [01:51<01:23,  4.26it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.61it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:31,  3.86it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:36,  3.68it/s][A
+ 54%|█████▍    | 421/774 [01:52<01:35,  3.68it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:35,  3.70it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:36,  3.65it/s][A
+ 55%|█████▍    | 424/774 [01:53<01:34,  3.71it/s][A
+ 55%|█████▍    | 425/774 [01:53<01:23,  4.17it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:17,  4.48it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:14,  4.68it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:15,  4.57it/s][A
+ 55%|█████▌    | 429/774 [01:54<01:18,  4.40it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:23,  4.14it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:35,  3.59it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:33,  3.66it/s][A
+ 56%|█████▌    | 433/774 [01:55<01:27,  3.91it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:22,  4.12it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:21,  4.15it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:23,  4.06it/s][A
+ 56%|█████▋    | 437/774 [01:56<01:19,  4.23it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:15,  4.42it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:19,  4.24it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:23,  4.02it/s][A
+ 57%|█████▋    | 441/774 [01:57<01:29,  3.72it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:30,  3.68it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:27,  3.78it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:25,  3.84it/s][A
+ 57%|█████▋    | 445/774 [01:58<01:25,  3.84it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:23,  3.94it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:21,  4.01it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:14,  4.38it/s][A
+ 58%|█████▊    | 449/774 [01:59<01:15,  4.32it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:18,  4.14it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.26it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:11,  4.47it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:10,  4.54it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:16,  4.19it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:20,  3.95it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:24,  3.78it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:18,  4.05it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:18,  4.05it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:16,  4.12it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:21,  3.84it/s][A
+ 60%|█████▉    | 461/774 [02:02<01:28,  3.55it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:25,  3.64it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:23,  3.74it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:23,  3.72it/s][A
+ 60%|██████    | 465/774 [02:03<01:15,  4.11it/s][A
+ 60%|██████    | 466/774 [02:04<01:12,  4.26it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.51it/s][A
+ 60%|██████    | 468/774 [02:04<01:08,  4.46it/s][A
+ 61%|██████    | 469/774 [02:04<01:02,  4.85it/s][A
+ 61%|██████    | 470/774 [02:04<01:00,  5.04it/s][A
+ 61%|██████    | 471/774 [02:05<01:02,  4.86it/s][A
+ 61%|██████    | 472/774 [02:05<01:07,  4.49it/s][A
+ 61%|██████    | 473/774 [02:05<01:10,  4.27it/s][A
+ 61%|██████    | 474/774 [02:05<01:08,  4.36it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:10,  4.27it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.83it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:32,  3.23it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:33,  3.17it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:30,  3.24it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:27,  3.35it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:29,  3.28it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:27,  3.34it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:24,  3.43it/s][A
+ 63%|██████▎   | 484/774 [02:08<01:27,  3.33it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:28,  3.26it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:24,  3.39it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:25,  3.34it/s][A
+ 63%|█��████▎   | 488/774 [02:10<01:23,  3.41it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:18,  3.64it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:18,  3.63it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:16,  3.68it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.61it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:18,  3.57it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:17,  3.61it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:17,  3.61it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:22,  3.37it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.33it/s][A
+ 64%|██████▍   | 498/774 [02:12<01:21,  3.37it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:19,  3.44it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:16,  3.56it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:14,  3.68it/s][A
+ 65%|██████▍   | 502/774 [02:13<01:13,  3.68it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:19,  3.39it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:21,  3.30it/s][A
+ 65%|██████▌   | 505/774 [02:14<01:18,  3.41it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:18,  3.41it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:23,  3.20it/s][A
+ 66%|██████▌   | 508/774 [02:15<01:20,  3.29it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:19,  3.32it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.41it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:13,  3.59it/s][A
+ 66%|██████▌   | 512/774 [02:16<01:11,  3.67it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.52it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:17<01:22,  3.14it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:16,  3.35it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.65it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:08,  3.76it/s][A
+ 67%|██████▋   | 519/774 [02:18<01:11,  3.58it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:10,  3.62it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.75it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:03,  3.96it/s][A
+ 68%|██████▊   | 523/774 [02:19<01:02,  4.04it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:06,  3.79it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:06,  3.72it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:09,  3.56it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:11,  3.47it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.48it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:06,  3.68it/s][A
+ 68%|██████▊   | 530/774 [02:21<01:05,  3.73it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:05,  3.74it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:03,  3.84it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.04it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:56,  4.25it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:59,  4.05it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.88it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:02,  3.79it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:06,  3.57it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:05,  3.59it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:05,  3.59it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:02,  3.72it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.70it/s][A
+ 70%|███████   | 543/774 [02:25<01:04,  3.57it/s][A
+ 70%|███████   | 544/774 [02:25<01:04,  3.57it/s][A
+ 70%|███████   | 545/774 [02:25<01:01,  3.69it/s][A
+ 71%|███████   | 546/774 [02:26<00:58,  3.90it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.08it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.13it/s][A
+ 71%|███████   | 549/774 [02:26<00:55,  4.06it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.82it/s][A
+ 71%|███████   | 551/774 [02:27<01:01,  3.62it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:05,  3.41it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:08,  3.21it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:07,  3.24it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:06,  3.27it/s][A
+ 72%|███████▏  | 556/774 [02:28<01:03,  3.44it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:07,  3.23it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:01,  3.53it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:56,  3.82it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.51it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.71it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.02it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:50,  4.17it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:51,  4.04it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:53,  3.88it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:50,  4.16it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:46,  4.50it/s][A
+ 73%|███████▎  | 568/774 [02:31<00:47,  4.31it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.25it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.24it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:52,  3.90it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.74it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.74it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.83it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:51,  3.85it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:57,  3.47it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:55,  3.58it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:53,  3.66it/s][A
+ 75%|███████▍  | 579/774 [02:34<00:56,  3.48it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.49it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:54,  3.51it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:52,  3.65it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:50,  3.77it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.79it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:52,  3.62it/s][A
+ 76%|███████▌  | 586/774 [02:36<00:52,  3.56it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.64it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.71it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:49,  3.77it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.02it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:47,  3.89it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.64it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:50,  3.57it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.56it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:53,  3.33it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:56,  3.16it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.13it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.06it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:58,  3.01it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:57,  3.01it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:58,  2.98it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:58,  2.97it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:57,  3.00it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.94it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:56,  3.01it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:57,  2.92it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:56,  2.94it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.97it/s][A
+ 79%|███████▉  | 611/774 [02:44<00:59,  2.73it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.63it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:57,  2.82it/s][A
+ 79%|███████▉  | 614/774 [02:45<00:55,  2.88it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.03it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.08it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:50,  3.10it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:48,  3.24it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.42it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.45it/s][A
+ 80%|████████  | 621/774 [02:47<00:41,  3.71it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.97it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.92it/s][A
+ 81%|████████  | 624/774 [02:48<00:41,  3.60it/s][A
+ 81%|████████  | 625/774 [02:49<00:42,  3.54it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.29it/s][A
+ 81%|████████  | 627/774 [02:49<00:45,  3.22it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.21it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.32it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.54it/s][A
+ 82%|████████▏ | 631/774 [02:50<00:38,  3.72it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:38,  3.73it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.55it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:40,  3.44it/s][A
+ 82%|████████▏ | 635/774 [02:51<00:39,  3.50it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:40,  3.45it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:39,  3.49it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:39,  3.46it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.08it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:50,  2.64it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.67it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:46,  2.85it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.88it/s][A
+ 83%|████████▎ | 644/774 [02:54<00:41,  3.10it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:38,  3.39it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.63it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.91it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:31,  4.06it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.09it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.30it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:29,  4.24it/s][A
+ 84%|████████▍ | 652/774 [02:56<00:29,  4.17it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.86it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.09it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:27,  4.40it/s][A
+ 85%|████████▍ | 656/774 [02:57<00:27,  4.23it/s][A
+ 85%|████████▍ | 657/774 [02:57<00:26,  4.48it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.26it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.90it/s][A
+ 85%|████████▌ | 660/774 [02:58<00:30,  3.76it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.72it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.89it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.67it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.63it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:27,  3.90it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:24,  4.36it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.60it/s][A
+ 86%|████████▋ | 668/774 [03:00<00:23,  4.45it/s][A
+ 86%|████████▋ | 669/774 [03:00<00:25,  4.17it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.30it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.90it/s][A
+ 87%|████████▋ | 672/774 [03:01<00:25,  3.97it/s][A
+ 87%|████████▋ | 673/774 [03:01<00:24,  4.06it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:25,  3.99it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.19it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:22,  4.42it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:21,  4.41it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.45it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.18it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.15it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:20,  4.46it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.49it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:22,  4.13it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:23,  3.87it/s][A
+ 89%|████████▊ | 685/774 [03:04<00:24,  3.69it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:23,  3.80it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.01it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  4.01it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:20,  4.15it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.26it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.38it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.44it/s][A
+ 90%|████████▉ | 693/774 [03:06<00:18,  4.46it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.18it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.86it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.96it/s][A
+ 90%|█████████ | 697/774 [03:07<00:19,  3.96it/s][A
+ 90%|█████████ | 698/774 [03:07<00:17,  4.36it/s][A
+ 90%|█████████ | 699/774 [03:08<00:15,  4.71it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.34it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.41it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.40it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.39it/s][A
+ 91%|██���██████ | 704/774 [03:09<00:16,  4.26it/s][A
+ 91%|█████████ | 705/774 [03:09<00:14,  4.62it/s][A
+ 91%|█████████ | 706/774 [03:09<00:14,  4.80it/s][A
+ 91%|█████████▏| 707/774 [03:09<00:14,  4.68it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  4.94it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.77it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.68it/s][A
+ 92%|█████████▏| 711/774 [03:10<00:12,  4.86it/s][A
+ 92%|█████████▏| 712/774 [03:10<00:11,  5.18it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.99it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.66it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.76it/s][A
+ 93%|█████████▎| 716/774 [03:11<00:11,  5.25it/s][A
+ 93%|█████████▎| 717/774 [03:11<00:10,  5.32it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:11,  4.78it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.64it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.95it/s][A
+ 93%|█████████▎| 721/774 [03:12<00:10,  5.21it/s][A
+ 93%|█████████▎| 722/774 [03:12<00:09,  5.63it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.42it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.43it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.55it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.60it/s][A
+ 94%|█████████▍| 727/774 [03:13<00:08,  5.37it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.81it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.10it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.43it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:07,  5.41it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.56it/s][A
+ 95%|█████████▍| 733/774 [03:14<00:07,  5.55it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.60it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.73it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.82it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.82it/s][A
+ 95%|█████████▌| 738/774 [03:15<00:06,  5.59it/s][A
+ 95%|█████████▌| 739/774 [03:15<00:06,  5.52it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.43it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.11it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:06,  5.30it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.66it/s][A
+ 96%|█████████▌| 744/774 [03:16<00:05,  5.43it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.48it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.88it/s][A
+ 97%|█████████▋| 747/774 [03:17<00:06,  4.08it/s][A
+ 97%|█████████▋| 748/774 [03:17<00:06,  4.33it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.62it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.30it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.49it/s][A
+ 97%|█████████▋| 752/774 [03:18<00:04,  4.43it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.71it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.43it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.69it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.52it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.34it/s][A
+ 98%|█████████▊| 758/774 [03:19<00:03,  5.22it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.47it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.49it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.95it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:01,  6.04it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.21it/s][A
+ 99%|█████████▊| 764/774 [03:20<00:01,  6.32it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.21it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.31it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.50it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.46it/s][A
+ 99%|█████████▉| 769/774 [03:21<00:00,  5.16it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.03it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.31it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  5.05it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.93it/s][A                                                      
+                                                 [A 70%|███████   | 9000/12776 [1:36:39<38:52,  1.62it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.93it/s][A
+                                                 [A 70%|███████   | 9001/12776 [1:36:40<65:16:12, 62.24s/it]                                                          70%|███████   | 9001/12776 [1:36:40<65:16:12, 62.24s/it] 70%|███████   | 9002/12776 [1:36:40<45:49:03, 43.71s/it]                                                          70%|███████   | 9002/12776 [1:36:40<45:49:03, 43.71s/it] 70%|███████   | 9003/12776 [1:36:41<32:12:01, 30.72s/it]                                                          70%|███████   | 9003/12776 [1:36:41<32:12:01, 30.72s/it] 70%|███████   | 9004/12776 [1:36:41<22:40:01, 21.63s/it]                                                          70%|███████   | 9004/12776 [1:36:41<22:40:01, 21.63s/it] 70%|███████   | 9005/12776 [1:36:41<16:00:28, 15.28s/it]                                                          70%|███████   | 9005/12776 [1:36:41<16:00:28, 15.28s/it] 70%|███████   | 9006/12776 [1:36:42<11:19:20, 10.81s/it]                                                          70%|███████   | 9006/12776 [1:36:42<11:19:20, 10.81s/it] 70%|███████   | 9007/12776 [1:36:42<8:02:02,  7.67s/it]                                                          70%|███████   | 9007/12776 [1:36:42<8:02:02,  7.67s/it] 71%|███████   | 9008/12776 [1:36:43<5:45:24,  5.50s/it]                                                         71%|███████   | 9008/12776 [1:36:43<5:45:24,  5.50s/it] 71%|███████   | 9009/12776 [1:36:43<4:08:03,  3.95s/it]                                                         71%|███████   | 9009/12776 [1:36:43<4:08:03,  3.95s/it] 71%|███████   | 9010/12776 [1:36:43<2:59:43,  2.86s/it]                                                         71%|███████   | 9010/12776 [1:36:43<2:59:43,  2.86s/it] 71%|███████   | 9011/12776 [1:36:44<2:13:12,  2.12s/it]                                                         71%|███████   | 9011/12776 [1:36:44<2:13:12,  2.12s/it] 71%|███████   | 9012/12776 [1:36:44<1:38:46,  1.57s/it]                                                         71%|███████   | 9012/12776 [1:36:44<1:38:46,  1.57s/it] 71%|███████   | 9013/12776 [1:36:44<1:14:34,  1.19s/it]                                                         71%|███████   | 9013/12776 [1:36:44<1:14:34,  1.19s/it] 71%|███████   | 9014/12776 [1:36:45<59:43,  1.05it/s]                                                         71%|███████   | 9014/12776 [1:36:45<59:43,  1.05it/s] 71%|███████   | 9015/12776 [1:36:45<46:58,  1.33it/s]                                                       71%|███████   | 9015/12776 [1:36:45<46:58,  1.33it/s] 71%|███████   | 9016/12776 [1:36:45<37:55,  1.65it/s]                                                       71%|███████   | 9016/12776 [1:36:45<37:55,  1.65it/s] 71%|███████   | 9017/12776 [1:36:45<31:30,  1.99it/s]                                                       71%|███████   | 9017/12776 [1:36:45<31:30,  1.99it/s] 71%|███████   | 9018/12776 [1:36:46<28:12,  2.22it/s]                                                       71%|███████   | 9018/12776 [1:36:46<28:12,  2.22it/s] 71%|███████   | 9019/12776 [1:36:46<24:33,  2.55it/s]                                                       71%|███████   | 9019/12776 [1:36:46<24:33,  2.55it/s] 71%|███████   | 9020/12776 [1:36:46<21:52,  2.86it/s]                                                       71%|███████   | 9020/12776 [1:36:46<21:52,  2.86it/s] 71%|███████   | 9021/12776 [1:36:47<19:56,  3.14it/s]                                                       71%|███████   | 9021/12776 [1:36:47<19:56,  3.14it/s] 71%|███████   | 9022/12776 [1:36:47<18:24,  3.40it/s]                                                       71%|███████   | 9022/12776 [1:36:47<18:24,  3.40it/s] 71%|███████   | 9023/12776 [1:36:47<17:49,  3.51it/s]                                                       71%|███████   | 9023/12776 [1:36:47<17:49,  3.51it/s] 71%|███████   | 9024/12776 [1:36:47<16:36,  3.76it/s]                                                       71%|██��████   | 9024/12776 [1:36:47<16:36,  3.76it/s] 71%|███████   | 9025/12776 [1:36:48<15:39,  3.99it/s]                                                       71%|███████   | 9025/12776 [1:36:48<15:39,  3.99it/s] 71%|███████   | 9026/12776 [1:36:48<14:56,  4.18it/s]                                                       71%|███████   | 9026/12776 [1:36:48<14:56,  4.18it/s] 71%|███████   | 9027/12776 [1:36:48<14:28,  4.32it/s]                                                       71%|███████   | 9027/12776 [1:36:48<14:28,  4.32it/s] 71%|███████   | 9028/12776 [1:36:48<14:50,  4.21it/s]                                                       71%|███████   | 9028/12776 [1:36:48<14:50,  4.21it/s] 71%|███████   | 9029/12776 [1:36:48<14:16,  4.38it/s]                                                       71%|███████   | 9029/12776 [1:36:48<14:16,  4.38it/s] 71%|███████   | 9030/12776 [1:36:49<13:49,  4.51it/s]                                                       71%|███████   | 9030/12776 [1:36:49<13:49,  4.51it/s] 71%|███████   | 9031/12776 [1:36:49<13:26,  4.64it/s]                                                       71%|███████   | 9031/12776 [1:36:49<13:26,  4.64it/s] 71%|███████   | 9032/12776 [1:36:49<13:16,  4.70it/s]                                                       71%|███████   | 9032/12776 [1:36:49<13:16,  4.70it/s] 71%|███████   | 9033/12776 [1:36:49<15:22,  4.06it/s]                                                       71%|███████   | 9033/12776 [1:36:49<15:22,  4.06it/s] 71%|███████   | 9034/12776 [1:36:50<14:26,  4.32it/s]                                                       71%|███████   | 9034/12776 [1:36:50<14:26,  4.32it/s] 71%|███████   | 9035/12776 [1:36:50<13:44,  4.54it/s]                                                       71%|███████   | 9035/12776 [1:36:50<13:44,  4.54it/s] 71%|███████   | 9036/12776 [1:36:50<13:07,  4.75it/s]                                                       71%|███████   | 9036/12776 [1:36:50<13:07,  4.75it/s] 71%|███████   | 9037/12776 [1:36:50<12:40,  4.92it/s]                                                       71%|███████   | 9037/12776 [1:36:50<12:40,  4.92it/s] 71%|███████   | 9038/12776 [1:36:51<21:25,  2.91it/s]                                                       71%|███████   | 9038/12776 [1:36:51<21:25,  2.91it/s] 71%|███████   | 9039/12776 [1:36:52<41:57,  1.48it/s]                                                       71%|███████   | 9039/12776 [1:36:52<41:57,  1.48it/s] 71%|███████   | 9040/12776 [1:36:53<48:12,  1.29it/s]                                                       71%|███████   | 9040/12776 [1:36:53<48:12,  1.29it/s] 71%|███████   | 9041/12776 [1:36:54<51:25,  1.21it/s]                                                       71%|███████   | 9041/12776 [1:36:54<51:25,  1.21it/s] 71%|███████   | 9042/12776 [1:36:55<51:10,  1.22it/s]                                                       71%|███████   | 9042/12776 [1:36:55<51:10,  1.22it/s] 71%|███████   | 9043/12776 [1:36:56<49:28,  1.26it/s]                                                       71%|███████   | 9043/12776 [1:36:56<49:28,  1.26it/s] 71%|███████   | 9044/12776 [1:36:56<49:23,  1.26it/s]                                                       71%|███████   | 9044/12776 [1:36:56<49:23,  1.26it/s] 71%|███████   | 9045/12776 [1:36:57<48:21,  1.29it/s]                                                       71%|███████   | 9045/12776 [1:36:57<48:21,  1.29it/s] 71%|███████   | 9046/12776 [1:36:58<45:41,  1.36it/s]                                                       71%|███████   | 9046/12776 [1:36:58<45:41,  1.36it/s] 71%|███████   | 9047/12776 [1:36:59<46:27,  1.34it/s]                                                       71%|███████   | 9047/12776 [1:36:59<46:27,  1.34it/s] 71%|███████   | 9048/12776 [1:36:59<43:15,  1.44it/s]                                                       71%|███████   | 9048/12776 [1:36:59<43:15,  1.44it/s] 71%|███████   | 9049/12776 [1:37:00<42:13,  1.47it/s]                                                       71%|███████   | 9049/12776 [1:37:00<42:13,  1.47it/s] 71%|███████   | 9050/12776 [1:37:00<39:59,  1.55it/s]                                                       71%|███████   | 9050/12776 [1:37:00<39:59,  1.55it/s] 71%|███████   | 9051/12776 [1:37:01<39:30,  1.57it/s]                                                       71%|███████   | 9051/12776 [1:37:01<39:30,  1.57it/s] 71%|███████   | 9052/12776 [1:37:02<36:56,  1.68it/s]                                                       71%|███████   | 9052/12776 [1:37:02<36:56,  1.68it/s] 71%|███████   | 9053/12776 [1:37:02<35:24,  1.75it/s]                                                       71%|███████   | 9053/12776 [1:37:02<35:24,  1.75it/s] 71%|███████   | 9054/12776 [1:37:03<33:05,  1.87it/s]                                                       71%|███████   | 9054/12776 [1:37:03<33:05,  1.87it/s] 71%|███████   | 9055/12776 [1:37:03<32:35,  1.90it/s]                                                       71%|███████   | 9055/12776 [1:37:03<32:35,  1.90it/s] 71%|███████   | 9056/12776 [1:37:03<30:28,  2.03it/s]                                                       71%|███████   | 9056/12776 [1:37:03<30:28,  2.03it/s] 71%|███████   | 9057/12776 [1:37:04<28:45,  2.15it/s]                                                       71%|███████   | 9057/12776 [1:37:04<28:45,  2.15it/s] 71%|███████   | 9058/12776 [1:37:04<29:13,  2.12it/s]                                                       71%|███████   | 9058/12776 [1:37:04<29:13,  2.12it/s] 71%|███████   | 9059/12776 [1:37:05<27:14,  2.27it/s]                                                       71%|███████   | 9059/12776 [1:37:05<27:14,  2.27it/s] 71%|███████   | 9060/12776 [1:37:05<25:37,  2.42it/s]                                                       71%|███████   | 9060/12776 [1:37:05<25:37,  2.42it/s] 71%|███████   | 9061/12776 [1:37:05<25:44,  2.41it/s]                                                       71%|███████   | 9061/12776 [1:37:05<25:44,  2.41it/s] 71%|███████   | 9062/12776 [1:37:06<24:12,  2.56it/s]                                                       71%|███████   | 9062/12776 [1:37:06<24:12,  2.56it/s] 71%|███████   | 9063/12776 [1:37:06<23:01,  2.69it/s]                                                       71%|███████   | 9063/12776 [1:37:06<23:01,  2.69it/s] 71%|███████   | 9064/12776 [1:37:06<22:34,  2.74it/s]                                                       71%|███████   | 9064/12776 [1:37:06<22:34,  2.74it/s] 71%|███████   | 9065/12776 [1:37:07<22:25,  2.76it/s]                                                       71%|███████   | 9065/12776 [1:37:07<22:25,  2.76it/s] 71%|███████   | 9066/12776 [1:37:07<21:08,  2.92it/s]                                                       71%|███████   | 9066/12776 [1:37:07<21:08,  2.92it/s] 71%|███████   | 9067/12776 [1:37:07<20:02,  3.08it/s]                                                       71%|███████   | 9067/12776 [1:37:07<20:02,  3.08it/s] 71%|███████   | 9068/12776 [1:37:08<20:26,  3.02it/s]                                                       71%|███████   | 9068/12776 [1:37:08<20:26,  3.02it/s] 71%|███████   | 9069/12776 [1:37:08<19:16,  3.21it/s]                                                       71%|███████   | 9069/12776 [1:37:08<19:16,  3.21it/s] 71%|███████   | 9070/12776 [1:37:08<18:18,  3.37it/s]                                                       71%|███████   | 9070/12776 [1:37:08<18:18,  3.37it/s] 71%|███████   | 9071/12776 [1:37:09<17:30,  3.53it/s]                                                       71%|███████   | 9071/12776 [1:37:09<17:30,  3.53it/s] 71%|███████   | 9072/12776 [1:37:09<18:10,  3.40it/s]                                                       71%|███████   | 9072/12776 [1:37:09<18:10,  3.40it/s] 71%|███████   | 9073/12776 [1:37:09<17:23,  3.55it/s]                                                       71%|███████   | 9073/12776 [1:37:09<17:23,  3.55it/s] 71%|███████   | 9074/12776 [1:37:09<16:29,  3.74it/s]                                                       71%|███████   | 9074/12776 [1:37:09<16:29,  3.74it/s] 71%|███████   | 9075/12776 [1:37:10<15:42,  3.93it/s]                                                       71%|███████   | 9075/12776 [1:37:10<15:42,  3.93it/s] 71%|███████   | 9076/12776 [1:37:10<16:02,  3.84it/s]                                                       71%|███████   | 9076/12776 [1:37:10<16:02,  3.84it/s] 71%|███████   | 9077/12776 [1:37:10<15:08,  4.07it/s]                                                      {'eval_loss': 0.49010393023490906, 'eval_wer': 0.3170729115753235, 'eval_runtime': 205.5553, 'eval_samples_per_second': 60.242, 'eval_steps_per_second': 3.765, 'epoch': 1.41}
+{'loss': 0.4199, 'grad_norm': 1.107610821723938, 'learning_rate': 9.252199413489736e-05, 'epoch': 1.41}
+{'loss': 0.4704, 'grad_norm': 1.4401216506958008, 'learning_rate': 9.249755620723361e-05, 'epoch': 1.41}
+{'loss': 0.325, 'grad_norm': 0.8990141749382019, 'learning_rate': 9.247311827956989e-05, 'epoch': 1.41}
+{'loss': 0.5246, 'grad_norm': 1.2181795835494995, 'learning_rate': 9.244868035190615e-05, 'epoch': 1.41}
+{'loss': 0.3604, 'grad_norm': 0.9682489037513733, 'learning_rate': 9.242424242424242e-05, 'epoch': 1.41}
+{'loss': 0.5106, 'grad_norm': 1.354854702949524, 'learning_rate': 9.239980449657868e-05, 'epoch': 1.41}
+{'loss': 0.3421, 'grad_norm': 0.8772504925727844, 'learning_rate': 9.237536656891495e-05, 'epoch': 1.41}
+{'loss': 0.3441, 'grad_norm': 0.9204466938972473, 'learning_rate': 9.235092864125121e-05, 'epoch': 1.41}
+{'loss': 0.5436, 'grad_norm': 1.3267632722854614, 'learning_rate': 9.232649071358749e-05, 'epoch': 1.41}
+{'loss': 0.5114, 'grad_norm': 1.8505562543869019, 'learning_rate': 9.230205278592374e-05, 'epoch': 1.41}
+{'loss': 0.4848, 'grad_norm': 2.1749753952026367, 'learning_rate': 9.227761485826001e-05, 'epoch': 1.41}
+{'loss': 0.5289, 'grad_norm': 1.2745190858840942, 'learning_rate': 9.225317693059629e-05, 'epoch': 1.41}
+{'loss': 0.2345, 'grad_norm': 0.8753221035003662, 'learning_rate': 9.222873900293255e-05, 'epoch': 1.41}
+{'loss': 0.7186, 'grad_norm': 1.8300694227218628, 'learning_rate': 9.22043010752688e-05, 'epoch': 1.41}
+{'loss': 0.7047, 'grad_norm': 2.5237860679626465, 'learning_rate': 9.217986314760508e-05, 'epoch': 1.41}
+{'loss': 0.5634, 'grad_norm': 1.3644801378250122, 'learning_rate': 9.215542521994134e-05, 'epoch': 1.41}
+{'loss': 0.3977, 'grad_norm': 3.8021445274353027, 'learning_rate': 9.213098729227761e-05, 'epoch': 1.41}
+{'loss': 1.2041, 'grad_norm': 2.972228527069092, 'learning_rate': 9.210654936461387e-05, 'epoch': 1.41}
+{'loss': 0.5613, 'grad_norm': 2.1545019149780273, 'learning_rate': 9.208211143695014e-05, 'epoch': 1.41}
+{'loss': 0.5319, 'grad_norm': 2.832612991333008, 'learning_rate': 9.20576735092864e-05, 'epoch': 1.41}
+{'loss': 0.7196, 'grad_norm': 2.0338563919067383, 'learning_rate': 9.203323558162268e-05, 'epoch': 1.41}
+{'loss': 1.0724, 'grad_norm': 3.1499016284942627, 'learning_rate': 9.200879765395893e-05, 'epoch': 1.41}
+{'loss': 0.8717, 'grad_norm': 3.166935443878174, 'learning_rate': 9.19843597262952e-05, 'epoch': 1.41}
+{'loss': 1.0294, 'grad_norm': 4.040381908416748, 'learning_rate': 9.195992179863148e-05, 'epoch': 1.41}
+{'loss': 0.8536, 'grad_norm': 3.9483299255371094, 'learning_rate': 9.193548387096774e-05, 'epoch': 1.41}
+{'loss': 0.9649, 'grad_norm': 7.188598155975342, 'learning_rate': 9.191104594330399e-05, 'epoch': 1.41}
+{'loss': 1.0681, 'grad_norm': 2.007357597351074, 'learning_rate': 9.188660801564027e-05, 'epoch': 1.41}
+{'loss': 0.9114, 'grad_norm': 3.2346351146698, 'learning_rate': 9.186217008797654e-05, 'epoch': 1.41}
+{'loss': 0.883, 'grad_norm': 3.834306240081787, 'learning_rate': 9.183773216031279e-05, 'epoch': 1.41}
+{'loss': 0.8806, 'grad_norm': 4.527556419372559, 'learning_rate': 9.181329423264907e-05, 'epoch': 1.41}
+{'loss': 1.2201, 'grad_norm': 3.6661274433135986, 'learning_rate': 9.178885630498533e-05, 'epoch': 1.41}
+{'loss': 0.7024, 'grad_norm': 1.937548279762268, 'learning_rate': 9.17644183773216e-05, 'epoch': 1.41}
+{'loss': 0.8705, 'grad_norm': 2.842707872390747, 'learning_rate': 9.173998044965787e-05, 'epoch': 1.41}
+{'loss': 0.4965, 'grad_norm': 1.4602514505386353, 'learning_rate': 9.171554252199412e-05, 'epoch': 1.41}
+{'loss': 1.2006, 'grad_norm': 2.891000747680664, 'learning_rate': 9.169110459433039e-05, 'epoch': 1.41}
+{'loss': 0.4995, 'grad_norm': 1.664858102798462, 'learning_rate': 9.166666666666667e-05, 'epoch': 1.41}
+{'loss': 1.5147, 'grad_norm': 3.580497980117798, 'learning_rate': 9.164222873900293e-05, 'epoch': 1.41}
+{'loss': 1.297, 'grad_norm': 2.788262367248535, 'learning_rate': 9.161779081133918e-05, 'epoch': 1.41}
+{'loss': 0.2044, 'grad_norm': 0.46237197518348694, 'learning_rate': 9.159335288367546e-05, 'epoch': 1.41}
+{'loss': 0.2271, 'grad_norm': 0.7125651836395264, 'learning_rate': 9.156891495601173e-05, 'epoch': 1.42}
+{'loss': 0.2674, 'grad_norm': 0.6166035532951355, 'learning_rate': 9.154447702834798e-05, 'epoch': 1.42}
+{'loss': 0.2302, 'grad_norm': 0.3849860727787018, 'learning_rate': 9.152003910068426e-05, 'epoch': 1.42}
+{'loss': 0.2397, 'grad_norm': 0.6451732516288757, 'learning_rate': 9.149560117302052e-05, 'epoch': 1.42}
+{'loss': 0.229, 'grad_norm': 0.41967400908470154, 'learning_rate': 9.147116324535679e-05, 'epoch': 1.42}
+{'loss': 0.3821, 'grad_norm': 0.9496431946754456, 'learning_rate': 9.144672531769306e-05, 'epoch': 1.42}
+{'loss': 0.1948, 'grad_norm': 0.634460985660553, 'learning_rate': 9.142228739002932e-05, 'epoch': 1.42}
+{'loss': 0.2305, 'grad_norm': 0.9624339938163757, 'learning_rate': 9.139784946236558e-05, 'epoch': 1.42}
+{'loss': 0.2501, 'grad_norm': 1.082798719406128, 'learning_rate': 9.137341153470186e-05, 'epoch': 1.42}
+{'loss': 0.1449, 'grad_norm': 0.5590087175369263, 'learning_rate': 9.134897360703812e-05, 'epoch': 1.42}
+{'loss': 0.1856, 'grad_norm': 1.2364987134933472, 'learning_rate': 9.132453567937437e-05, 'epoch': 1.42}
+{'loss': 0.2644, 'grad_norm': 0.5167357325553894, 'learning_rate': 9.130009775171065e-05, 'epoch': 1.42}
+{'loss': 0.4774, 'grad_norm': 2.4510698318481445, 'learning_rate': 9.127565982404692e-05, 'epoch': 1.42}
+{'loss': 0.5563, 'grad_norm': 1.2611892223358154, 'learning_rate': 9.125122189638317e-05, 'epoch': 1.42}
+{'loss': 0.4933, 'grad_norm': 1.279375672340393, 'learning_rate': 9.122678396871945e-05, 'epoch': 1.42}
+{'loss': 0.5997, 'grad_norm': 1.3161873817443848, 'learning_rate': 9.120234604105571e-05, 'epoch': 1.42}
+{'loss': 0.2048, 'grad_norm': 0.7085786461830139, 'learning_rate': 9.117790811339198e-05, 'epoch': 1.42}
+{'loss': 0.3395, 'grad_norm': 1.8899716138839722, 'learning_rate': 9.115347018572826e-05, 'epoch': 1.42}
+{'loss': 0.5075, 'grad_norm': 2.1545331478118896, 'learning_rate': 9.112903225806451e-05, 'epoch': 1.42}
+{'loss': 0.2844, 'grad_norm': 1.0476807355880737, 'learning_rate': 9.110459433040077e-05, 'epoch': 1.42}
+{'loss': 0.481, 'grad_norm': 1.534098505973816, 'learning_rate': 9.108015640273705e-05, 'epoch': 1.42}
+{'loss': 0.3992, 'grad_norm': 1.6640297174453735, 'learning_rate': 9.105571847507331e-05, 'epoch': 1.42}
+{'loss': 0.484, 'grad_norm': 1.0398399829864502, 'learning_rate': 9.103128054740957e-05, 'epoch': 1.42}
+{'loss': 0.5006, 'grad_norm': 1.172825574874878, 'learning_rate': 9.100684261974584e-05, 'epoch': 1.42}
+{'loss': 0.3896, 'grad_norm': 1.707702875137329, 'learning_rate': 9.098240469208211e-05, 'epoch': 1.42}
+{'loss': 0.689, 'grad_norm': 1.5226316452026367, 'learning_rate': 9.095796676441836e-05, 'epoch': 1.42}
+{'loss': 0.4796, 'grad_norm': 1.6208080053329468, 'learning_rate': 9.093352883675464e-05, 'epoch': 1.42}
+{'loss': 0.5755, 'grad_norm': 2.61185622215271, 'learning_rate': 9.09090909090909e-05, 'epoch': 1.42}
+{'loss': 0.7231, 'grad_norm': 2.0593795776367188, 'learning_rate': 9.088465298142717e-05, 'epoch': 1.42}
+{'loss': 0.7263, 'grad_norm': 1.8883817195892334, 'learning_rate': 9.086021505376345e-05, 'epoch': 1.42}
+{'loss': 0.7273, 'grad_norm': 3.10384464263916, 'learning_rate': 9.08357771260997e-05, 'epoch': 1.42}
+{'loss': 0.7773, 'grad_norm': 1.468448281288147, 'learning_rate': 9.081133919843596e-05, 'epoch': 1.42}
+{'loss': 0.4677, 'grad_norm': 1.079703450202942, 'learning_rate': 9.078690127077224e-05, 'epoch': 1.42}
+{'loss': 0.5249, 'grad_norm': 2.0892856121063232, 'learning_rate': 9.076246334310849e-05, 'epoch': 1.42}
+{'loss': 0.4734, 'grad_norm': 2.025650978088379, 'learning_rate': 9.073802541544476e-05, 'epoch': 1.42}
+{'loss': 0.776, 'grad_norm': 2.4783313274383545, 'learning_rate': 9.071358748778104e-05, 'epoch': 1.42}
+{'loss': 0.772, 'grad_norm': 2.0602166652679443, 'learning_rate': 9.06891495601173e-05, 'epoch': 1.42}
+ 71%|███████   | 9077/12776 [1:37:10<15:08,  4.07it/s] 71%|███████   | 9078/12776 [1:37:10<14:26,  4.27it/s]                                                       71%|███████   | 9078/12776 [1:37:10<14:26,  4.27it/s] 71%|███████   | 9079/12776 [1:37:10<13:59,  4.41it/s]                                                       71%|███████   | 9079/12776 [1:37:10<13:59,  4.41it/s] 71%|███████   | 9080/12776 [1:37:11<13:34,  4.54it/s]                                                       71%|███████   | 9080/12776 [1:37:11<13:34,  4.54it/s] 71%|███████   | 9081/12776 [1:37:11<15:03,  4.09it/s]                                                       71%|███████   | 9081/12776 [1:37:11<15:03,  4.09it/s] 71%|███████   | 9082/12776 [1:37:11<14:13,  4.33it/s]                                                       71%|███████   | 9082/12776 [1:37:11<14:13,  4.33it/s] 71%|███████   | 9083/12776 [1:37:11<13:34,  4.53it/s]                                                       71%|███████   | 9083/12776 [1:37:11<13:34,  4.53it/s] 71%|███████   | 9084/12776 [1:37:12<13:05,  4.70it/s]                                                       71%|███████   | 9084/12776 [1:37:12<13:05,  4.70it/s] 71%|███████   | 9085/12776 [1:37:12<12:42,  4.84it/s]                                                       71%|███████   | 9085/12776 [1:37:12<12:42,  4.84it/s] 71%|███████   | 9086/12776 [1:37:12<12:18,  5.00it/s]                                                       71%|███████   | 9086/12776 [1:37:12<12:18,  5.00it/s] 71%|███████   | 9087/12776 [1:37:12<14:05,  4.37it/s]                                                       71%|███████   | 9087/12776 [1:37:12<14:05,  4.37it/s] 71%|███��███   | 9088/12776 [1:37:13<22:45,  2.70it/s]                                                       71%|███████   | 9088/12776 [1:37:13<22:45,  2.70it/s] 71%|███████   | 9089/12776 [1:37:14<42:44,  1.44it/s]                                                       71%|███████   | 9089/12776 [1:37:14<42:44,  1.44it/s] 71%|███████   | 9090/12776 [1:37:15<47:39,  1.29it/s]                                                       71%|███████   | 9090/12776 [1:37:15<47:39,  1.29it/s] 71%|███████   | 9091/12776 [1:37:16<48:57,  1.25it/s]                                                       71%|███████   | 9091/12776 [1:37:16<48:57,  1.25it/s] 71%|███████   | 9092/12776 [1:37:17<50:17,  1.22it/s]                                                       71%|███████   | 9092/12776 [1:37:17<50:17,  1.22it/s] 71%|███████   | 9093/12776 [1:37:18<50:34,  1.21it/s]                                                       71%|███████   | 9093/12776 [1:37:18<50:34,  1.21it/s] 71%|███████   | 9094/12776 [1:37:19<47:54,  1.28it/s]                                                       71%|███████   | 9094/12776 [1:37:19<47:54,  1.28it/s] 71%|███████   | 9095/12776 [1:37:19<47:29,  1.29it/s]                                                       71%|███████   | 9095/12776 [1:37:19<47:29,  1.29it/s] 71%|███████   | 9096/12776 [1:37:20<44:35,  1.38it/s]                                                       71%|███████   | 9096/12776 [1:37:20<44:35,  1.38it/s] 71%|███████   | 9097/12776 [1:37:21<41:57,  1.46it/s]                                                       71%|███████   | 9097/12776 [1:37:21<41:57,  1.46it/s] 71%|███████   | 9098/12776 [1:37:21<39:30,  1.55it/s]                                                       71%|███████   | 9098/12776 [1:37:21<39:30,  1.55it/s] 71%|███████   | 9099/12776 [1:37:22<38:29,  1.59it/s]                                                       71%|███████   | 9099/12776 [1:37:22<38:29,  1.59it/s] 71%|███████   | 9100/12776 [1:37:22<36:09,  1.69it/s]                                                       71%|███████   | 9100/12776 [1:37:22<36:09,  1.69it/s] 71%|███████   | 9101/12776 [1:37:23<35:01,  1.75it/s]                                                       71%|███████   | 9101/12776 [1:37:23<35:01,  1.75it/s] 71%|███████   | 9102/12776 [1:37:23<32:41,  1.87it/s]                                                       71%|███████   | 9102/12776 [1:37:23<32:41,  1.87it/s] 71%|███████▏  | 9103/12776 [1:37:24<32:38,  1.88it/s]                                                       71%|███████▏  | 9103/12776 [1:37:24<32:38,  1.88it/s] 71%|███████▏  | 9104/12776 [1:37:24<30:21,  2.02it/s]                                                       71%|███████▏  | 9104/12776 [1:37:24<30:21,  2.02it/s] 71%|███████▏  | 9105/12776 [1:37:24<28:25,  2.15it/s]                                                       71%|███████▏  | 9105/12776 [1:37:24<28:25,  2.15it/s] 71%|███████▏  | 9106/12776 [1:37:25<28:56,  2.11it/s]                                                       71%|███████▏  | 9106/12776 [1:37:25<28:56,  2.11it/s] 71%|███████▏  | 9107/12776 [1:37:25<26:49,  2.28it/s]                                                       71%|███████▏  | 9107/12776 [1:37:25<26:49,  2.28it/s] 71%|███████▏  | 9108/12776 [1:37:26<25:12,  2.43it/s]                                                       71%|███████▏  | 9108/12776 [1:37:26<25:12,  2.43it/s] 71%|███████▏  | 9109/12776 [1:37:26<25:36,  2.39it/s]                                                       71%|███████▏  | 9109/12776 [1:37:26<25:36,  2.39it/s] 71%|███████▏  | 9110/12776 [1:37:26<23:56,  2.55it/s]                                                       71%|███████▏  | 9110/12776 [1:37:26<23:56,  2.55it/s] 71%|███████▏  | 9111/12776 [1:37:27<22:25,  2.72it/s]                                                       71%|███████▏  | 9111/12776 [1:37:27<22:25,  2.72it/s] 71%|███████▏  | 9112/12776 [1:37:27<22:05,  2.76it/s]                                                       71%|███████▏  | 9112/12776 [1:37:27<22:05,  2.76it/s] 71%|███████▏  | 9113/12776 [1:37:27<20:51,  2.93it/s]                                                       71%|███████▏  | 9113/12776 [1:37:27<20:51,  2.93it/s] 71%|███████▏  | 9114/12776 [1:37:28<19:49,  3.08it/s]                                                       71%|███████▏  | 9114/12776 [1:37:28<19:49,  3.08it/s] 71%|███████▏  | 9115/12776 [1:37:28<18:55,  3.22it/s]                                                       71%|███████▏  | 9115/12776 [1:37:28<18:55,  3.22it/s] 71%|███████▏  | 9116/12776 [1:37:28<19:17,  3.16it/s]                                                       71%|███████▏  | 9116/12776 [1:37:28<19:17,  3.16it/s] 71%|███████▏  | 9117/12776 [1:37:29<18:18,  3.33it/s]                                                       71%|███████▏  | 9117/12776 [1:37:29<18:18,  3.33it/s] 71%|███████▏  | 9118/12776 [1:37:29<17:28,  3.49it/s]                                                       71%|███████▏  | 9118/12776 [1:37:29<17:28,  3.49it/s] 71%|███████▏  | 9119/12776 [1:37:29<16:48,  3.63it/s]                                                       71%|███████▏  | 9119/12776 [1:37:29<16:48,  3.63it/s] 71%|███████▏  | 9120/12776 [1:37:29<18:48,  3.24it/s]                                                       71%|███████▏  | 9120/12776 [1:37:29<18:48,  3.24it/s] 71%|███████▏  | 9121/12776 [1:37:30<17:32,  3.47it/s]                                                       71%|███████▏  | 9121/12776 [1:37:30<17:32,  3.47it/s] 71%|███████▏  | 9122/12776 [1:37:30<16:33,  3.68it/s]                                                       71%|███████▏  | 9122/12776 [1:37:30<16:33,  3.68it/s] 71%|███████▏  | 9123/12776 [1:37:30<15:45,  3.86it/s]                                                       71%|███████▏  | 9123/12776 [1:37:30<15:45,  3.86it/s] 71%|███████▏  | 9124/12776 [1:37:30<15:03,  4.04it/s]                                                       71%|███████▏  | 9124/12776 [1:37:30<15:03,  4.04it/s] 71%|███████▏  | 9125/12776 [1:37:31<16:13,  3.75it/s]                                                       71%|███████▏  | 9125/12776 [1:37:31<16:13,  3.75it/s] 71%|███████▏  | 9126/12776 [1:37:31<15:10,  4.01it/s]                                                       71%|███████▏  | 9126/12776 [1:37:31<15:10,  4.01it/s] 71%|███████▏  | 9127/12776 [1:37:31<14:28,  4.20it/s]                                                       71%|███████▏  | 9127/12776 [1:37:31<14:28,  4.20it/s] 71%|███████▏  | 9128/12776 [1:37:31<13:55,  4.37it/s]                                                       71%|███████▏  | 9128/12776 [1:37:31<13:55,  4.37it/s] 71%|███████▏  | 9129/12776 [1:37:32<13:30,  4.50it/s]                                                       71%|███████▏  | 9129/12776 [1:37:32<13:30,  4.50it/s] 71%|███████▏  | 9130/12776 [1:37:32<15:01,  4.05it/s]                                                       71%|███████▏  | 9130/12776 [1:37:32<15:01,  4.05it/s] 71%|███████▏  | 9131/12776 [1:37:32<14:11,  4.28it/s]                                                       71%|███████▏  | 9131/12776 [1:37:32<14:11,  4.28it/s] 71%|███████▏  | 9132/12776 [1:37:32<13:32,  4.49it/s]                                                       71%|███████▏  | 9132/12776 [1:37:32<13:32,  4.49it/s] 71%|███████▏  | 9133/12776 [1:37:32<13:03,  4.65it/s]                                                       71%|███████▏  | 9133/12776 [1:37:32<13:03,  4.65it/s] 71%|███████▏  | 9134/12776 [1:37:33<12:38,  4.80it/s]                                                       71%|███████▏  | 9134/12776 [1:37:33<12:38,  4.80it/s] 72%|███████▏  | 9135/12776 [1:37:33<14:23,  4.21it/s]                                                       72%|███████▏  | 9135/12776 [1:37:33<14:23,  4.21it/s] 72%|███████▏  | 9136/12776 [1:37:33<13:27,  4.51it/s]                                                       72%|███████▏  | 9136/12776 [1:37:33<13:27,  4.51it/s] 72%|███████▏  | 9137/12776 [1:37:33<12:44,  4.76it/s]                                                       72%|███████▏  | 9137/12776 [1:37:33<12:44,  4.76it/s] 72%|███████▏  | 9138/12776 [1:37:34<22:57,  2.64it/s]                                                       72%|███████▏  | 9138/12776 [1:37:34<22:57,  2.64it/s] 72%|███████▏  | 9139/12776 [1:37:35<42:11,  1.44it/s]                                                       72%|███████▏  | 9139/12776 [1:37:35<42:11,  1.44it/s] 72%|███████▏  | 9140/12776 [1:37:36<46:49,  1.29it/s]                                                       72%|███████▏  | 9140/12776 [1:37:36<46:49,  1.29it/s] 72%|███████▏  | 9141/12776 [1:37:37<50:57,  1.19it/s]                                                       72%|███████▏  | 9141/12776 [1:37:37<50:57,  1.19it/s] 72%|███████▏  | 9142/12776 [1:37:38<50:21,  1.20it/s]                                                       72%|███████▏  | 9142/12776 [1:37:38<50:21,  1.20it/s] 72%|███████▏  | 9143/12776 [1:37:39<49:09,  1.23it/s]                                                       72%|███████▏  | 9143/12776 [1:37:39<49:09,  1.23it/s] 72%|███████▏  | 9144/12776 [1:37:40<47:47,  1.27it/s]                                                       72%|███████▏  | 9144/12776 [1:37:40<47:47,  1.27it/s] 72%|███████▏  | 9145/12776 [1:37:40<45:58,  1.32it/s]                                                       72%|███████▏  | 9145/12776 [1:37:40<45:58,  1.32it/s] 72%|███████▏  | 9146/12776 [1:37:41<46:07,  1.31it/s]                                                       72%|███████▏  | 9146/12776 [1:37:41<46:07,  1.31it/s] 72%|███████▏  | 9147/12776 [1:37:42<43:43,  1.38it/s]                                                       72%|███████▏  | 9147/12776 [1:37:42<43:43,  1.38it/s] 72%|███████▏  | 9148/12776 [1:37:42<41:10,  1.47it/s]                                                       72%|███████▏  | 9148/12776 [1:37:42<41:10,  1.47it/s] 72%|███████▏  | 9149/12776 [1:37:43<38:53,  1.55it/s]                                                       72%|███████▏  | 9149/12776 [1:37:43<38:53,  1.55it/s] 72%|███████▏  | 9150/12776 [1:37:44<37:07,  1.63it/s]                                                       72%|███████▏  | 9150/12776 [1:37:44<37:07,  1.63it/s] 72%|███████▏  | 9151/12776 [1:37:44<35:19,  1.71it/s]                                                       72%|███████▏  | 9151/12776 [1:37:44<35:19,  1.71it/s] 72%|███████▏  | 9152/12776 [1:37:45<35:32,  1.70it/s]                                                       72%|███████▏  | 9152/12776 [1:37:45<35:32,  1.70it/s] 72%|███████▏  | 9153/12776 [1:37:45<33:20,  1.81it/s]                                                       72%|███████▏  | 9153/12776 [1:37:45<33:20,  1.81it/s] 72%|███████▏  | 9154/12776 [1:37:46<33:06,  1.82it/s]                                                       72%|███████▏  | 9154/12776 [1:37:46<33:06,  1.82it/s] 72%|███████▏  | 9155/12776 [1:37:46<30:58,  1.95it/s]                                                      {'loss': 0.5761, 'grad_norm': 2.0721070766448975, 'learning_rate': 9.066471163245355e-05, 'epoch': 1.42}
+{'loss': 0.7641, 'grad_norm': 2.9403793811798096, 'learning_rate': 9.064027370478983e-05, 'epoch': 1.42}
+{'loss': 1.1597, 'grad_norm': 2.754152536392212, 'learning_rate': 9.06158357771261e-05, 'epoch': 1.42}
+{'loss': 1.0165, 'grad_norm': 3.737293243408203, 'learning_rate': 9.059139784946236e-05, 'epoch': 1.42}
+{'loss': 0.8176, 'grad_norm': 4.649268627166748, 'learning_rate': 9.056695992179861e-05, 'epoch': 1.42}
+{'loss': 1.1525, 'grad_norm': 4.204807758331299, 'learning_rate': 9.054252199413489e-05, 'epoch': 1.42}
+{'loss': 1.4486, 'grad_norm': 1.6178537607192993, 'learning_rate': 9.051808406647115e-05, 'epoch': 1.42}
+{'loss': 0.4542, 'grad_norm': 1.8884669542312622, 'learning_rate': 9.049364613880742e-05, 'epoch': 1.42}
+{'loss': 0.7788, 'grad_norm': 2.976574182510376, 'learning_rate': 9.046920821114368e-05, 'epoch': 1.42}
+{'loss': 0.642, 'grad_norm': 2.2067227363586426, 'learning_rate': 9.044477028347995e-05, 'epoch': 1.42}
+{'loss': 0.4552, 'grad_norm': 1.5230540037155151, 'learning_rate': 9.042033235581621e-05, 'epoch': 1.42}
+{'loss': 0.8108, 'grad_norm': 1.6714625358581543, 'learning_rate': 9.039589442815249e-05, 'epoch': 1.42}
+{'loss': 0.2396, 'grad_norm': 0.5419172644615173, 'learning_rate': 9.037145650048874e-05, 'epoch': 1.42}
+{'loss': 0.2611, 'grad_norm': 0.5144709944725037, 'learning_rate': 9.034701857282501e-05, 'epoch': 1.42}
+{'loss': 0.3145, 'grad_norm': 0.7084149718284607, 'learning_rate': 9.032258064516129e-05, 'epoch': 1.42}
+{'loss': 0.265, 'grad_norm': 0.7008088827133179, 'learning_rate': 9.029814271749755e-05, 'epoch': 1.42}
+{'loss': 0.4497, 'grad_norm': 2.3117635250091553, 'learning_rate': 9.02737047898338e-05, 'epoch': 1.42}
+{'loss': 0.3003, 'grad_norm': 0.9773268103599548, 'learning_rate': 9.024926686217008e-05, 'epoch': 1.42}
+{'loss': 0.2749, 'grad_norm': 0.8467937111854553, 'learning_rate': 9.022482893450635e-05, 'epoch': 1.42}
+{'loss': 0.2498, 'grad_norm': 0.7267017960548401, 'learning_rate': 9.020039100684261e-05, 'epoch': 1.42}
+{'loss': 0.2563, 'grad_norm': 0.7348589301109314, 'learning_rate': 9.017595307917887e-05, 'epoch': 1.42}
+{'loss': 0.3757, 'grad_norm': 1.283211350440979, 'learning_rate': 9.015151515151514e-05, 'epoch': 1.42}
+{'loss': 0.177, 'grad_norm': 0.4818941652774811, 'learning_rate': 9.01270772238514e-05, 'epoch': 1.42}
+{'loss': 0.2567, 'grad_norm': 0.830980122089386, 'learning_rate': 9.010263929618768e-05, 'epoch': 1.42}
+{'loss': 0.4034, 'grad_norm': 1.4470046758651733, 'learning_rate': 9.007820136852393e-05, 'epoch': 1.42}
+{'loss': 0.377, 'grad_norm': 1.1172550916671753, 'learning_rate': 9.00537634408602e-05, 'epoch': 1.42}
+{'loss': 0.2965, 'grad_norm': 1.1227866411209106, 'learning_rate': 9.002932551319648e-05, 'epoch': 1.43}
+{'loss': 0.3678, 'grad_norm': 0.9577900767326355, 'learning_rate': 9.000488758553274e-05, 'epoch': 1.43}
+{'loss': 0.2968, 'grad_norm': 1.0543158054351807, 'learning_rate': 8.998044965786899e-05, 'epoch': 1.43}
+{'loss': 0.4421, 'grad_norm': 0.9723239541053772, 'learning_rate': 8.995601173020527e-05, 'epoch': 1.43}
+{'loss': 0.5202, 'grad_norm': 1.2686231136322021, 'learning_rate': 8.993157380254154e-05, 'epoch': 1.43}
+{'loss': 0.4751, 'grad_norm': 1.9442496299743652, 'learning_rate': 8.99071358748778e-05, 'epoch': 1.43}
+{'loss': 0.517, 'grad_norm': 2.063786745071411, 'learning_rate': 8.988269794721407e-05, 'epoch': 1.43}
+{'loss': 0.2919, 'grad_norm': 0.9845725893974304, 'learning_rate': 8.985826001955033e-05, 'epoch': 1.43}
+{'loss': 0.9835, 'grad_norm': 2.66135311126709, 'learning_rate': 8.98338220918866e-05, 'epoch': 1.43}
+{'loss': 0.7316, 'grad_norm': 2.083529233932495, 'learning_rate': 8.980938416422287e-05, 'epoch': 1.43}
+{'loss': 0.5985, 'grad_norm': 1.5067460536956787, 'learning_rate': 8.978494623655913e-05, 'epoch': 1.43}
+{'loss': 0.4876, 'grad_norm': 2.6055350303649902, 'learning_rate': 8.976050830889539e-05, 'epoch': 1.43}
+{'loss': 0.4211, 'grad_norm': 1.7742613554000854, 'learning_rate': 8.973607038123167e-05, 'epoch': 1.43}
+{'loss': 0.2719, 'grad_norm': 0.9075111746788025, 'learning_rate': 8.971163245356793e-05, 'epoch': 1.43}
+{'loss': 0.6691, 'grad_norm': 1.751970648765564, 'learning_rate': 8.968719452590418e-05, 'epoch': 1.43}
+{'loss': 0.4919, 'grad_norm': 1.8061598539352417, 'learning_rate': 8.966275659824046e-05, 'epoch': 1.43}
+{'loss': 1.2975, 'grad_norm': 5.433608055114746, 'learning_rate': 8.963831867057673e-05, 'epoch': 1.43}
+{'loss': 0.3322, 'grad_norm': 1.150429129600525, 'learning_rate': 8.961388074291299e-05, 'epoch': 1.43}
+{'loss': 0.5617, 'grad_norm': 1.8451645374298096, 'learning_rate': 8.958944281524926e-05, 'epoch': 1.43}
+{'loss': 0.9685, 'grad_norm': 2.325820207595825, 'learning_rate': 8.956500488758552e-05, 'epoch': 1.43}
+{'loss': 0.6398, 'grad_norm': 2.732015609741211, 'learning_rate': 8.954056695992179e-05, 'epoch': 1.43}
+{'loss': 0.8187, 'grad_norm': 2.809544086456299, 'learning_rate': 8.951612903225806e-05, 'epoch': 1.43}
+{'loss': 0.9776, 'grad_norm': 2.3526062965393066, 'learning_rate': 8.949169110459432e-05, 'epoch': 1.43}
+{'loss': 1.0524, 'grad_norm': 3.0225517749786377, 'learning_rate': 8.946725317693058e-05, 'epoch': 1.43}
+{'loss': 0.8393, 'grad_norm': 9.465794563293457, 'learning_rate': 8.944281524926686e-05, 'epoch': 1.43}
+{'loss': 0.9948, 'grad_norm': 2.5339739322662354, 'learning_rate': 8.941837732160312e-05, 'epoch': 1.43}
+{'loss': 0.988, 'grad_norm': 1.795351266860962, 'learning_rate': 8.939393939393938e-05, 'epoch': 1.43}
+{'loss': 1.1098, 'grad_norm': 2.1344945430755615, 'learning_rate': 8.936950146627565e-05, 'epoch': 1.43}
+{'loss': 0.7107, 'grad_norm': 2.079683542251587, 'learning_rate': 8.934506353861192e-05, 'epoch': 1.43}
+{'loss': 1.8157, 'grad_norm': 2.6614019870758057, 'learning_rate': 8.932062561094817e-05, 'epoch': 1.43}
+{'loss': 1.1497, 'grad_norm': 2.139328718185425, 'learning_rate': 8.929618768328445e-05, 'epoch': 1.43}
+{'loss': 0.3898, 'grad_norm': 1.177253246307373, 'learning_rate': 8.927174975562071e-05, 'epoch': 1.43}
+{'loss': 0.8206, 'grad_norm': 1.5603402853012085, 'learning_rate': 8.924731182795698e-05, 'epoch': 1.43}
+{'loss': 0.4282, 'grad_norm': 3.4352011680603027, 'learning_rate': 8.922287390029326e-05, 'epoch': 1.43}
+{'loss': 0.8339, 'grad_norm': 1.563423752784729, 'learning_rate': 8.919843597262951e-05, 'epoch': 1.43}
+{'loss': 1.0363, 'grad_norm': 3.0659782886505127, 'learning_rate': 8.917399804496577e-05, 'epoch': 1.43}
+{'loss': 0.1943, 'grad_norm': 0.4259863793849945, 'learning_rate': 8.914956011730205e-05, 'epoch': 1.43}
+{'loss': 0.3264, 'grad_norm': 0.8139044046401978, 'learning_rate': 8.912512218963832e-05, 'epoch': 1.43}
+{'loss': 0.2079, 'grad_norm': 0.38787421584129333, 'learning_rate': 8.910068426197457e-05, 'epoch': 1.43}
+{'loss': 0.2371, 'grad_norm': 1.820959210395813, 'learning_rate': 8.907624633431084e-05, 'epoch': 1.43}
+{'loss': 0.2075, 'grad_norm': 0.6413818597793579, 'learning_rate': 8.905180840664711e-05, 'epoch': 1.43}
+{'loss': 0.3897, 'grad_norm': 0.7421493530273438, 'learning_rate': 8.902737047898336e-05, 'epoch': 1.43}
+{'loss': 0.2849, 'grad_norm': 1.2470545768737793, 'learning_rate': 8.900293255131964e-05, 'epoch': 1.43}
+{'loss': 0.2254, 'grad_norm': 0.5670220255851746, 'learning_rate': 8.89784946236559e-05, 'epoch': 1.43}
+{'loss': 0.2496, 'grad_norm': 0.7760573625564575, 'learning_rate': 8.895405669599217e-05, 'epoch': 1.43}
+{'loss': 0.5412, 'grad_norm': 1.6587246656417847, 'learning_rate': 8.892961876832845e-05, 'epoch': 1.43}
+{'loss': 0.1799, 'grad_norm': 0.6467307806015015, 'learning_rate': 8.89051808406647e-05, 'epoch': 1.43}
+{'loss': 0.2824, 'grad_norm': 1.049241542816162, 'learning_rate': 8.888074291300096e-05, 'epoch': 1.43}
+{'loss': 0.4649, 'grad_norm': 1.2838311195373535, 'learning_rate': 8.885630498533724e-05, 'epoch': 1.43}
+{'loss': 0.392, 'grad_norm': 0.9091600775718689, 'learning_rate': 8.88318670576735e-05, 'epoch': 1.43}
+{'loss': 0.4008, 'grad_norm': 1.10474693775177, 'learning_rate': 8.880742913000976e-05, 'epoch': 1.43}
+{'loss': 0.3551, 'grad_norm': 1.1482359170913696, 'learning_rate': 8.878299120234604e-05, 'epoch': 1.43}
+ 72%|███████▏  | 9155/12776 [1:37:46<30:58,  1.95it/s] 72%|███████▏  | 9156/12776 [1:37:47<31:39,  1.91it/s]                                                       72%|███████▏  | 9156/12776 [1:37:47<31:39,  1.91it/s] 72%|███████▏  | 9157/12776 [1:37:47<29:18,  2.06it/s]                                                       72%|███████▏  | 9157/12776 [1:37:47<29:18,  2.06it/s] 72%|███████▏  | 9158/12776 [1:37:47<27:28,  2.19it/s]                                                       72%|███████▏  | 9158/12776 [1:37:47<27:28,  2.19it/s] 72%|███████▏  | 9159/12776 [1:37:48<26:01,  2.32it/s]                                                       72%|███████▏  | 9159/12776 [1:37:48<26:01,  2.32it/s] 72%|███████▏  | 9160/12776 [1:37:48<24:37,  2.45it/s]                                                       72%|███████▏  | 9160/12776 [1:37:48<24:37,  2.45it/s] 72%|███████▏  | 9161/12776 [1:37:48<23:30,  2.56it/s]                                                       72%|███████▏  | 9161/12776 [1:37:48<23:30,  2.56it/s] 72%|███████▏  | 9162/12776 [1:37:49<23:53,  2.52it/s]                                                       72%|███████▏  | 9162/12776 [1:37:49<23:53,  2.52it/s] 72%|███████▏  | 9163/12776 [1:37:49<22:51,  2.64it/s]                                                       72%|███████▏  | 9163/12776 [1:37:49<22:51,  2.64it/s] 72%|███████▏  | 9164/12776 [1:37:50<21:53,  2.75it/s]                                                       72%|███████▏  | 9164/12776 [1:37:50<21:53,  2.75it/s] 72%|███████▏  | 9165/12776 [1:37:50<20:54,  2.88it/s]                                                       72%|███████▏  | 9165/12776 [1:37:50<20:54,  2.88it/s] 72%|███████▏  | 9166/12776 [1:37:50<20:37,  2.92it/s]                                                       72%|███████▏  | 9166/12776 [1:37:50<20:37,  2.92it/s] 72%|███████▏  | 9167/12776 [1:37:51<19:49,  3.03it/s]                                                       72%|███████▏  | 9167/12776 [1:37:51<19:49,  3.03it/s] 72%|███████▏  | 9168/12776 [1:37:51<18:57,  3.17it/s]                                                       72%|███████▏  | 9168/12776 [1:37:51<18:57,  3.17it/s] 72%|███████▏  | 9169/12776 [1:37:51<20:10,  2.98it/s]                                                       72%|███████▏  | 9169/12776 [1:37:51<20:10,  2.98it/s] 72%|███████▏  | 9170/12776 [1:37:51<18:49,  3.19it/s]                                                       72%|███████▏  | 9170/12776 [1:37:51<18:49,  3.19it/s] 72%|███████▏  | 9171/12776 [1:37:52<17:42,  3.39it/s]                                                       72%|███████▏  | 9171/12776 [1:37:52<17:42,  3.39it/s] 72%|███████▏  | 9172/12776 [1:37:52<16:49,  3.57it/s]                                                       72%|███████▏  | 9172/12776 [1:37:52<16:49,  3.57it/s] 72%|███████▏  | 9173/12776 [1:37:52<16:06,  3.73it/s]                                                       72%|███████▏  | 9173/12776 [1:37:52<16:06,  3.73it/s] 72%|███████▏  | 9174/12776 [1:37:52<16:04,  3.73it/s]                                                       72%|███████▏  | 9174/12776 [1:37:52<16:04,  3.73it/s] 72%|███████▏  | 9175/12776 [1:37:53<15:23,  3.90it/s]                                                       72%|███████▏  | 9175/12776 [1:37:53<15:23,  3.90it/s] 72%|███████▏  | 9176/12776 [1:37:53<14:47,  4.06it/s]                                                       72%|███████▏  | 9176/12776 [1:37:53<14:47,  4.06it/s] 72%|███████▏  | 9177/12776 [1:37:53<14:17,  4.20it/s]                                                       72%|███████▏  | 9177/12776 [1:37:53<14:17,  4.20it/s] 72%|███████▏  | 9178/12776 [1:37:53<16:02,  3.74it/s]                                                       72%|███████▏  | 9178/12776 [1:37:53<16:02,  3.74it/s] 72%|███████▏  | 9179/12776 [1:37:54<15:00,  3.99it/s]                                                       72%|███████▏  | 9179/12776 [1:37:54<15:00,  3.99it/s] 72%|███████▏  | 9180/12776 [1:37:54<14:11,  4.22it/s]                                                       72%|███████▏  | 9180/12776 [1:37:54<14:11,  4.22it/s] 72%|███████▏  | 9181/12776 [1:37:54<13:33,  4.42it/s]                                                       72%|███████▏  | 9181/12776 [1:37:54<13:33,  4.42it/s] 72%|███████▏  | 9182/12776 [1:37:54<13:04,  4.58it/s]                                                       72%|███████▏  | 9182/12776 [1:37:54<13:04,  4.58it/s] 72%|███████▏  | 9183/12776 [1:37:55<14:51,  4.03it/s]                                                       72%|███████▏  | 9183/12776 [1:37:55<14:51,  4.03it/s] 72%|███████▏  | 9184/12776 [1:37:55<13:53,  4.31it/s]                                                       72%|███████▏  | 9184/12776 [1:37:55<13:53,  4.31it/s] 72%|███████▏  | 9185/12776 [1:37:55<13:08,  4.55it/s]                                                       72%|███████▏  | 9185/12776 [1:37:55<13:08,  4.55it/s] 72%|███████▏  | 9186/12776 [1:37:55<12:30,  4.78it/s]                                                       72%|███████▏  | 9186/12776 [1:37:55<12:30,  4.78it/s] 72%|███████▏  | 9187/12776 [1:37:55<12:05,  4.95it/s]                                                       72%|███████▏  | 9187/12776 [1:37:55<12:05,  4.95it/s] 72%|███████▏  | 9188/12776 [1:37:56<22:06,  2.70it/s]                                                       72%|███████▏  | 9188/12776 [1:37:56<22:06,  2.70it/s] 72%|███████▏  | 9189/12776 [1:37:58<43:07,  1.39it/s]                                                       72%|███████▏  | 9189/12776 [1:37:58<43:07,  1.39it/s] 72%|███████▏  | 9190/12776 [1:37:59<49:05,  1.22it/s]                                                       72%|███████▏  | 9190/12776 [1:37:59<49:05,  1.22it/s] 72%|███████▏  | 9191/12776 [1:38:00<50:31,  1.18it/s]                                                       72%|███████▏  | 9191/12776 [1:38:00<50:31,  1.18it/s] 72%|███████▏  | 9192/12776 [1:38:00<50:08,  1.19it/s]                                                       72%|███████▏  | 9192/12776 [1:38:00<50:08,  1.19it/s] 72%|███████▏  | 9193/12776 [1:38:01<48:19,  1.24it/s]                                                       72%|███████▏  | 9193/12776 [1:38:01<48:19,  1.24it/s] 72%|███████▏  | 9194/12776 [1:38:02<46:50,  1.27it/s]                                                       72%|███████▏  | 9194/12776 [1:38:02<46:50,  1.27it/s] 72%|███████▏  | 9195/12776 [1:38:03<46:09,  1.29it/s]                                                       72%|███████▏  | 9195/12776 [1:38:03<46:09,  1.29it/s] 72%|███████▏  | 9196/12776 [1:38:03<43:20,  1.38it/s]                                                       72%|███████▏  | 9196/12776 [1:38:03<43:20,  1.38it/s] 72%|███████▏  | 9197/12776 [1:38:04<40:32,  1.47it/s]                                                       72%|███████▏  | 9197/12776 [1:38:04<40:32,  1.47it/s] 72%|███████▏  | 9198/12776 [1:38:04<38:08,  1.56it/s]                                                       72%|███████▏  | 9198/12776 [1:38:04<38:08,  1.56it/s] 72%|███████▏  | 9199/12776 [1:38:05<37:15,  1.60it/s]                                                       72%|███████▏  | 9199/12776 [1:38:05<37:15,  1.60it/s] 72%|███████▏  | 9200/12776 [1:38:05<34:58,  1.70it/s]                                                       72%|███████▏  | 9200/12776 [1:38:05<34:58,  1.70it/s]Saving model checkpoint to ./checkpoint-9200
+Configuration saved in ./checkpoint-9200/config.json
+Model weights saved in ./checkpoint-9200/model.safetensors
+Feature extractor saved in ./checkpoint-9200/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-9200/tokenizer_config.json
+Special tokens file saved in ./checkpoint-9200/special_tokens_map.json
+added tokens file saved in ./checkpoint-9200/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-8000] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 72%|███████▏  | 9201/12776 [1:38:11<2:06:56,  2.13s/it]                                                         72%|███████▏  | 9201/12776 [1:38:11<2:06:56,  2.13s/it] 72%|███████▏  | 9202/12776 [1:38:12<1:36:06,  1.61s/it]                                                         72%|███████▏  | 9202/12776 [1:38:12<1:36:06,  1.61s/it] 72%|███████▏  | 9203/12776 [1:38:12<1:15:27,  1.27s/it]                                                         72%|███████▏  | 9203/12776 [1:38:12<1:15:27,  1.27s/it] 72%|███████▏  | 9204/12776 [1:38:12<1:00:04,  1.01s/it]                                                         72%|███████▏  | 9204/12776 [1:38:12<1:00:04,  1.01s/it] 72%|███████▏  | 9205/12776 [1:38:13<50:48,  1.17it/s]                                                         72%|███████▏  | 9205/12776 [1:38:13<50:48,  1.17it/s] 72%|███████▏  | 9206/12776 [1:38:13<41:50,  1.42it/s]                                                       72%|███████▏  | 9206/12776 [1:38:13<41:50,  1.42it/s] 72%|███████▏  | 9207/12776 [1:38:14<35:21,  1.68it/s]                                                       72%|███████▏  | 9207/12776 [1:38:14<35:21,  1.68it/s] 72%|███████▏  | 9208/12776 [1:38:14<32:40,  1.82it/s]                                                       72%|███████▏  | 9208/12776 [1:38:14<32:40,  1.82it/s] 72%|███████▏  | 9209/12776 [1:38:14<28:35,  2.08it/s]                                                       72%|███████▏  | 9209/12776 [1:38:14<28:35,  2.08it/s] 72%|███████▏  | 9210/12776 [1:38:15<25:29,  2.33it/s]                                                       72%|███████▏  | 9210/12776 [1:38:15<25:29,  2.33it/s] 72%|███████▏  | 9211/12776 [1:38:15<23:16,  2.55it/s]                                                       72%|███████▏  | 9211/12776 [1:38:15<23:16,  2.55it/s] 72%|███████▏  | 9212/12776 [1:38:15<23:24,  2.54it/s]                                                       72%|███████▏  | 9212/12776 [1:38:15<23:24,  2.54it/s] 72%|███████▏  | 9213/12776 [1:38:16<21:20,  2.78it/s]                                                       72%|███████▏  | 9213/12776 [1:38:16<21:20,  2.78it/s] 72%|███████▏  | 9214/12776 [1:38:16<19:45,  3.00it/s]                                                       72%|███████▏  | 9214/12776 [1:38:16<19:45,  3.00it/s] 72%|███████▏  | 9215/12776 [1:38:16<18:28,  3.21it/s]                                                       72%|███████▏  | 9215/12776 [1:38:16<18:28,  3.21it/s] 72%|███████▏  | 9216/12776 [1:38:17<18:02,  3.29it/s]                                                       72%|███████▏  | 9216/12776 [1:38:17<18:02,  3.29it/s] 72%|███████▏  | 9217/12776 [1:38:17<17:03,  3.48it/s]                                                       72%|███████▏  | 9217/12776 [1:38:17<17:03,  3.48it/s] 72%|███████▏  | 9218/12776 [1:38:17<16:20,  3.63it/s]                                                       72%|███████▏  | 9218/12776 [1:38:17<16:20,  3.63it/s] 72%|███████▏  | 9219/12776 [1:38:17<15:33,  3.81it/s]                                                       72%|███████▏  | 9219/12776 [1:38:17<15:33,  3.81it/s] 72%|███████▏  | 9220/12776 [1:38:18<15:56,  3.72it/s]                                                       72%|███████▏  | 9220/12776 [1:38:18<15:56,  3.72it/s] 72%|███████▏  | 9221/12776 [1:38:18<15:07,  3.92it/s]                                                       72%|███████▏  | 9221/12776 [1:38:18<15:07,  3.92it/s] 72%|███████▏  | 9222/12776 [1:38:18<14:27,  4.10it/s]                                                       72%|███████▏  | 9222/12776 [1:38:18<14:27,  4.10it/s] 72%|███████▏  | 9223/12776 [1:38:18<13:58,  4.23it/s]                                                       72%|███████▏  | 9223/12776 [1:38:18<13:58,  4.23it/s] 72%|███████▏  | 9224/12776 [1:38:18<13:32,  4.37it/s]                                                       72%|███████▏  | 9224/12776 [1:38:18<13:32,  4.37it/s] 72%|███████▏  | 9225/12776 [1:38:19<14:07,  4.19it/s]                                                       72%|███████▏  | 9225/12776 [1:38:19<14:07,  4.19it/s] 72%|███████▏  | 9226/12776 [1:38:19<13:25,  4.41it/s]                                                       72%|███████▏  | 9226/12776 [1:38:19<13:25,  4.41it/s] 72%|███████▏  | 9227/12776 [1:38:19<12:49,  4.62it/s]                                                       72%|███████▏  | 9227/12776 [1:38:19<12:49,  4.62it/s] 72%|███████▏  | 9228/12776 [1:38:19<12:19,  4.80it/s]                                                       72%|███████▏  | 9228/12776 [1:38:19<12:19,  4.80it/s] 72%|███████▏  | 9229/12776 [1:38:19<11:59,  4.93it/s]                                                       72%|███████▏  | 9229/12776 [1:38:19<11:59,  4.93it/s] 72%|███████▏  | 9230/12776 [1:38:20<11:38,  5.08it/s]                                                       72%|███████▏  | 9230/12776 [1:38:20<11:38,  5.08it/s] 72%|███████▏  | 9231/12776 [1:38:20<12:40,  4.66it/s]                                                       72%|███████▏  | 9231/12776 [1:38:20<12:40,  4.66it/s] 72%|███████▏  | 9232/12776 [1:38:20<12:01,  4.91it/s]                                                       72%|███████▏  | 9232/12776 [1:38:20<12:01,  4.91it/s] 72%|███████▏  | 9233/12776 [1:38:20<11:33,  5.11it/s]                                                      {'loss': 0.2164, 'grad_norm': 0.6834793090820312, 'learning_rate': 8.87585532746823e-05, 'epoch': 1.43}
+{'loss': 0.3163, 'grad_norm': 0.9541566371917725, 'learning_rate': 8.873411534701855e-05, 'epoch': 1.43}
+{'loss': 0.3442, 'grad_norm': 2.6311590671539307, 'learning_rate': 8.870967741935483e-05, 'epoch': 1.43}
+{'loss': 0.3939, 'grad_norm': 1.168099045753479, 'learning_rate': 8.86852394916911e-05, 'epoch': 1.43}
+{'loss': 0.4121, 'grad_norm': 1.946874737739563, 'learning_rate': 8.866080156402736e-05, 'epoch': 1.43}
+{'loss': 0.5132, 'grad_norm': 1.7437493801116943, 'learning_rate': 8.863636363636364e-05, 'epoch': 1.43}
+{'loss': 0.2326, 'grad_norm': 1.0139257907867432, 'learning_rate': 8.861192570869989e-05, 'epoch': 1.43}
+{'loss': 0.5857, 'grad_norm': 1.9671473503112793, 'learning_rate': 8.858748778103615e-05, 'epoch': 1.43}
+{'loss': 0.4004, 'grad_norm': 7.848803520202637, 'learning_rate': 8.856304985337243e-05, 'epoch': 1.43}
+{'loss': 0.4384, 'grad_norm': 1.3776273727416992, 'learning_rate': 8.85386119257087e-05, 'epoch': 1.43}
+{'loss': 0.4875, 'grad_norm': 1.2949081659317017, 'learning_rate': 8.851417399804495e-05, 'epoch': 1.43}
+{'loss': 0.7237, 'grad_norm': 3.020263433456421, 'learning_rate': 8.848973607038123e-05, 'epoch': 1.43}
+{'loss': 0.6773, 'grad_norm': 2.61968994140625, 'learning_rate': 8.846529814271749e-05, 'epoch': 1.44}
+{'loss': 0.903, 'grad_norm': 1.7759416103363037, 'learning_rate': 8.844086021505374e-05, 'epoch': 1.44}
+{'loss': 0.4451, 'grad_norm': 1.5969222784042358, 'learning_rate': 8.841642228739002e-05, 'epoch': 1.44}
+{'loss': 0.4141, 'grad_norm': 1.7365254163742065, 'learning_rate': 8.839198435972629e-05, 'epoch': 1.44}
+{'loss': 1.1615, 'grad_norm': 3.0730156898498535, 'learning_rate': 8.836754643206255e-05, 'epoch': 1.44}
+{'loss': 0.6218, 'grad_norm': 2.1355032920837402, 'learning_rate': 8.834310850439883e-05, 'epoch': 1.44}
+{'loss': 1.368, 'grad_norm': 2.897294759750366, 'learning_rate': 8.831867057673508e-05, 'epoch': 1.44}
+{'loss': 0.4797, 'grad_norm': 1.462666630744934, 'learning_rate': 8.829423264907135e-05, 'epoch': 1.44}
+{'loss': 0.8913, 'grad_norm': 3.690739631652832, 'learning_rate': 8.826979472140762e-05, 'epoch': 1.44}
+{'loss': 0.573, 'grad_norm': 3.024446964263916, 'learning_rate': 8.824535679374389e-05, 'epoch': 1.44}
+{'loss': 0.5313, 'grad_norm': 1.713948130607605, 'learning_rate': 8.822091886608014e-05, 'epoch': 1.44}
+{'loss': 1.3517, 'grad_norm': 3.0209193229675293, 'learning_rate': 8.819648093841642e-05, 'epoch': 1.44}
+{'loss': 1.1816, 'grad_norm': 4.99892520904541, 'learning_rate': 8.817204301075268e-05, 'epoch': 1.44}
+{'loss': 0.6138, 'grad_norm': 2.933112144470215, 'learning_rate': 8.814760508308893e-05, 'epoch': 1.44}
+{'loss': 1.1627, 'grad_norm': 1.887498378753662, 'learning_rate': 8.812316715542521e-05, 'epoch': 1.44}
+{'loss': 0.8495, 'grad_norm': 2.869887351989746, 'learning_rate': 8.809872922776148e-05, 'epoch': 1.44}
+{'loss': 0.8784, 'grad_norm': 2.865903854370117, 'learning_rate': 8.807429130009774e-05, 'epoch': 1.44}
+{'loss': 0.5029, 'grad_norm': 1.4047309160232544, 'learning_rate': 8.804985337243402e-05, 'epoch': 1.44}
+{'loss': 0.5816, 'grad_norm': 1.5280976295471191, 'learning_rate': 8.802541544477027e-05, 'epoch': 1.44}
+{'loss': 0.6115, 'grad_norm': 2.952573776245117, 'learning_rate': 8.800097751710654e-05, 'epoch': 1.44}
+{'loss': 0.3277, 'grad_norm': 0.848155677318573, 'learning_rate': 8.797653958944282e-05, 'epoch': 1.44}
+{'loss': 0.8794, 'grad_norm': 4.988794326782227, 'learning_rate': 8.795210166177907e-05, 'epoch': 1.44}
+{'loss': 0.2078, 'grad_norm': 0.3419736325740814, 'learning_rate': 8.792766373411533e-05, 'epoch': 1.44}
+{'loss': 0.2801, 'grad_norm': 0.519394040107727, 'learning_rate': 8.790322580645161e-05, 'epoch': 1.44}
+{'loss': 0.3893, 'grad_norm': 0.9486970901489258, 'learning_rate': 8.787878787878787e-05, 'epoch': 1.44}
+{'loss': 0.2653, 'grad_norm': 0.5619981288909912, 'learning_rate': 8.785434995112413e-05, 'epoch': 1.44}
+{'loss': 0.3212, 'grad_norm': 0.7176588773727417, 'learning_rate': 8.78299120234604e-05, 'epoch': 1.44}
+{'loss': 0.1653, 'grad_norm': 0.6982567310333252, 'learning_rate': 8.780547409579667e-05, 'epoch': 1.44}
+{'loss': 0.2405, 'grad_norm': 0.5017626881599426, 'learning_rate': 8.778103616813293e-05, 'epoch': 1.44}
+{'loss': 0.3396, 'grad_norm': 0.9920409917831421, 'learning_rate': 8.775659824046921e-05, 'epoch': 1.44}
+{'loss': 0.1906, 'grad_norm': 0.9404519200325012, 'learning_rate': 8.773216031280546e-05, 'epoch': 1.44}
+{'loss': 0.2486, 'grad_norm': 0.6449622511863708, 'learning_rate': 8.770772238514173e-05, 'epoch': 1.44}
+{'loss': 0.3537, 'grad_norm': 0.9190917611122131, 'learning_rate': 8.7683284457478e-05, 'epoch': 1.44}
+{'loss': 0.2703, 'grad_norm': 0.9570406079292297, 'learning_rate': 8.765884652981426e-05, 'epoch': 1.44}
+{'loss': 0.4503, 'grad_norm': 1.3468104600906372, 'learning_rate': 8.763440860215052e-05, 'epoch': 1.44}
+{'loss': 0.3656, 'grad_norm': 0.8499281406402588, 'learning_rate': 8.76099706744868e-05, 'epoch': 1.44}
+{'loss': 0.2962, 'grad_norm': 1.0467718839645386, 'learning_rate': 8.758553274682307e-05, 'epoch': 1.44}
+{'loss': 0.3295, 'grad_norm': 0.8589369654655457, 'learning_rate': 8.756109481915932e-05, 'epoch': 1.44}
+{'loss': 0.4164, 'grad_norm': 1.2273787260055542, 'learning_rate': 8.75366568914956e-05, 'epoch': 1.44}
+{'loss': 0.5016, 'grad_norm': 1.0226304531097412, 'learning_rate': 8.751221896383186e-05, 'epoch': 1.44}
+{'loss': 0.482, 'grad_norm': 1.2805366516113281, 'learning_rate': 8.748778103616812e-05, 'epoch': 1.44}
+{'loss': 0.3297, 'grad_norm': 1.0432116985321045, 'learning_rate': 8.74633431085044e-05, 'epoch': 1.44}
+{'loss': 0.52, 'grad_norm': 3.9404122829437256, 'learning_rate': 8.743890518084065e-05, 'epoch': 1.44}
+{'loss': 0.6131, 'grad_norm': 5.121649742126465, 'learning_rate': 8.741446725317692e-05, 'epoch': 1.44}
+{'loss': 0.4629, 'grad_norm': 1.2252410650253296, 'learning_rate': 8.73900293255132e-05, 'epoch': 1.44}
+{'loss': 0.6523, 'grad_norm': 1.7832077741622925, 'learning_rate': 8.736559139784945e-05, 'epoch': 1.44}
+{'loss': 0.3546, 'grad_norm': 3.1595590114593506, 'learning_rate': 8.734115347018571e-05, 'epoch': 1.44}
+{'loss': 0.9762, 'grad_norm': 1.8645236492156982, 'learning_rate': 8.731671554252199e-05, 'epoch': 1.44}
+{'loss': 0.6703, 'grad_norm': 2.1606290340423584, 'learning_rate': 8.729227761485826e-05, 'epoch': 1.44}
+{'loss': 0.5665, 'grad_norm': 3.289787530899048, 'learning_rate': 8.726783968719451e-05, 'epoch': 1.44}
+{'loss': 0.864, 'grad_norm': 2.60479736328125, 'learning_rate': 8.724340175953079e-05, 'epoch': 1.44}
+{'loss': 0.4926, 'grad_norm': 2.7393736839294434, 'learning_rate': 8.721896383186705e-05, 'epoch': 1.44}
+{'loss': 0.9697, 'grad_norm': 2.50663161277771, 'learning_rate': 8.719452590420332e-05, 'epoch': 1.44}
+{'loss': 0.3924, 'grad_norm': 1.2371330261230469, 'learning_rate': 8.71700879765396e-05, 'epoch': 1.44}
+{'loss': 0.6051, 'grad_norm': 2.1638879776000977, 'learning_rate': 8.714565004887585e-05, 'epoch': 1.44}
+{'loss': 0.8727, 'grad_norm': 1.8525105714797974, 'learning_rate': 8.712121212121211e-05, 'epoch': 1.44}
+{'loss': 0.9543, 'grad_norm': 4.00303840637207, 'learning_rate': 8.709677419354839e-05, 'epoch': 1.44}
+{'loss': 0.4322, 'grad_norm': 1.3205608129501343, 'learning_rate': 8.707233626588464e-05, 'epoch': 1.44}
+{'loss': 0.7875, 'grad_norm': 2.571690559387207, 'learning_rate': 8.70478983382209e-05, 'epoch': 1.44}
+{'loss': 1.0932, 'grad_norm': 3.207707166671753, 'learning_rate': 8.702346041055718e-05, 'epoch': 1.44}
+{'loss': 0.7563, 'grad_norm': 1.89205002784729, 'learning_rate': 8.699902248289345e-05, 'epoch': 1.44}
+{'loss': 0.8936, 'grad_norm': 1.761757493019104, 'learning_rate': 8.69745845552297e-05, 'epoch': 1.44}
+{'loss': 1.1292, 'grad_norm': 4.034841060638428, 'learning_rate': 8.695014662756598e-05, 'epoch': 1.44}
+{'loss': 2.0323, 'grad_norm': 5.492170333862305, 'learning_rate': 8.692570869990224e-05, 'epoch': 1.44}
+{'loss': 0.9711, 'grad_norm': 3.997959613800049, 'learning_rate': 8.690127077223851e-05, 'epoch': 1.45}
+{'loss': 1.0842, 'grad_norm': 3.610450267791748, 'learning_rate': 8.687683284457477e-05, 'epoch': 1.45}
+ 72%|███████▏  | 9233/12776 [1:38:20<11:33,  5.11it/s] 72%|███████▏  | 9234/12776 [1:38:20<11:12,  5.27it/s]                                                       72%|███████▏  | 9234/12776 [1:38:20<11:12,  5.27it/s] 72%|███████▏  | 9235/12776 [1:38:21<10:51,  5.44it/s]                                                       72%|███████▏  | 9235/12776 [1:38:21<10:51,  5.44it/s] 72%|███████▏  | 9236/12776 [1:38:21<10:29,  5.62it/s]                                                       72%|███████▏  | 9236/12776 [1:38:21<10:29,  5.62it/s] 72%|███████▏  | 9237/12776 [1:38:21<12:21,  4.77it/s]                                                       72%|███████▏  | 9237/12776 [1:38:21<12:21,  4.77it/s] 72%|███████▏  | 9238/12776 [1:38:22<21:05,  2.80it/s]                                                       72%|███████▏  | 9238/12776 [1:38:22<21:05,  2.80it/s] 72%|███████▏  | 9239/12776 [1:38:23<38:52,  1.52it/s]                                                       72%|███████▏  | 9239/12776 [1:38:23<38:52,  1.52it/s] 72%|███████▏  | 9240/12776 [1:38:24<43:44,  1.35it/s]                                                       72%|███████▏  | 9240/12776 [1:38:24<43:44,  1.35it/s] 72%|███████▏  | 9241/12776 [1:38:25<45:26,  1.30it/s]                                                       72%|███████▏  | 9241/12776 [1:38:25<45:26,  1.30it/s] 72%|███████▏  | 9242/12776 [1:38:26<45:18,  1.30it/s]                                                       72%|███████▏  | 9242/12776 [1:38:26<45:18,  1.30it/s] 72%|███████▏  | 9243/12776 [1:38:26<44:18,  1.33it/s]                                                       72%|███████▏  | 9243/12776 [1:38:26<44:18,  1.33it/s] 72%|███████▏  | 9244/12776 [1:38:27<43:51,  1.34it/s]                                                       72%|███████▏  | 9244/12776 [1:38:27<43:51,  1.34it/s] 72%|███████▏  | 9245/12776 [1:38:28<43:36,  1.35it/s]                                                       72%|███████▏  | 9245/12776 [1:38:28<43:36,  1.35it/s] 72%|███████▏  | 9246/12776 [1:38:28<40:50,  1.44it/s]                                                       72%|███████▏  | 9246/12776 [1:38:28<40:50,  1.44it/s] 72%|███████▏  | 9247/12776 [1:38:29<38:55,  1.51it/s]                                                       72%|███████▏  | 9247/12776 [1:38:29<38:55,  1.51it/s] 72%|███████▏  | 9248/12776 [1:38:30<36:25,  1.61it/s]                                                       72%|███████▏  | 9248/12776 [1:38:30<36:25,  1.61it/s] 72%|███████▏  | 9249/12776 [1:38:30<36:26,  1.61it/s]                                                       72%|███████▏  | 9249/12776 [1:38:30<36:26,  1.61it/s] 72%|███████▏  | 9250/12776 [1:38:31<33:49,  1.74it/s]                                                       72%|███████▏  | 9250/12776 [1:38:31<33:49,  1.74it/s] 72%|███████▏  | 9251/12776 [1:38:31<31:47,  1.85it/s]                                                       72%|███████▏  | 9251/12776 [1:38:31<31:47,  1.85it/s] 72%|███████▏  | 9252/12776 [1:38:32<30:52,  1.90it/s]                                                       72%|███████▏  | 9252/12776 [1:38:32<30:52,  1.90it/s] 72%|███████▏  | 9253/12776 [1:38:32<28:45,  2.04it/s]                                                       72%|███████▏  | 9253/12776 [1:38:32<28:45,  2.04it/s] 72%|███████▏  | 9254/12776 [1:38:32<29:35,  1.98it/s]                                                       72%|███████▏  | 9254/12776 [1:38:32<29:35,  1.98it/s] 72%|███████▏  | 9255/12776 [1:38:33<27:24,  2.14it/s]                                                       72%|███████▏  | 9255/12776 [1:38:33<27:24,  2.14it/s] 72%|███████▏  | 9256/12776 [1:38:33<25:42,  2.28it/s]                                                       72%|███████▏  | 9256/12776 [1:38:33<25:42,  2.28it/s] 72%|███████▏  | 9257/12776 [1:38:34<25:44,  2.28it/s]                                                       72%|███████▏  | 9257/12776 [1:38:34<25:44,  2.28it/s] 72%|███████▏  | 9258/12776 [1:38:34<24:10,  2.42it/s]                                                       72%|███████▏  | 9258/12776 [1:38:34<24:10,  2.42it/s] 72%|███████▏  | 9259/12776 [1:38:34<22:44,  2.58it/s]                                                       72%|███████▏  | 9259/12776 [1:38:34<22:44,  2.58it/s] 72%|███████▏  | 9260/12776 [1:38:35<23:53,  2.45it/s]                                                       72%|███████▏  | 9260/12776 [1:38:35<23:53,  2.45it/s] 72%|███████▏  | 9261/12776 [1:38:35<22:16,  2.63it/s]                                                       72%|███████▏  | 9261/12776 [1:38:35<22:16,  2.63it/s] 72%|███████▏  | 9262/12776 [1:38:35<20:58,  2.79it/s]                                                       72%|███████▏  | 9262/12776 [1:38:35<20:58,  2.79it/s] 73%|███████▎  | 9263/12776 [1:38:36<20:14,  2.89it/s]                                                       73%|███████▎  | 9263/12776 [1:38:36<20:14,  2.89it/s] 73%|███████▎  | 9264/12776 [1:38:36<20:25,  2.87it/s]                                                       73%|███████▎  | 9264/12776 [1:38:36<20:25,  2.87it/s] 73%|███████▎  | 9265/12776 [1:38:36<19:32,  3.00it/s]                                                       73%|███████▎  | 9265/12776 [1:38:36<19:32,  3.00it/s] 73%|███████▎  | 9266/12776 [1:38:37<18:47,  3.11it/s]                                                       73%|███████▎  | 9266/12776 [1:38:37<18:47,  3.11it/s] 73%|███████▎  | 9267/12776 [1:38:37<19:42,  2.97it/s]                                                       73%|███████▎  | 9267/12776 [1:38:37<19:42,  2.97it/s] 73%|███████▎  | 9268/12776 [1:38:37<18:39,  3.13it/s]                                                       73%|███████▎  | 9268/12776 [1:38:37<18:39,  3.13it/s] 73%|███████▎  | 9269/12776 [1:38:38<17:48,  3.28it/s]                                                       73%|███████▎  | 9269/12776 [1:38:38<17:48,  3.28it/s] 73%|███████▎  | 9270/12776 [1:38:38<17:07,  3.41it/s]                                                       73%|███████▎  | 9270/12776 [1:38:38<17:07,  3.41it/s] 73%|███████▎  | 9271/12776 [1:38:38<17:26,  3.35it/s]                                                       73%|███████▎  | 9271/12776 [1:38:38<17:26,  3.35it/s] 73%|███████▎  | 9272/12776 [1:38:38<16:35,  3.52it/s]                                                       73%|███████▎  | 9272/12776 [1:38:38<16:35,  3.52it/s] 73%|███████▎  | 9273/12776 [1:38:39<15:51,  3.68it/s]                                                       73%|███████▎  | 9273/12776 [1:38:39<15:51,  3.68it/s] 73%|███████▎  | 9274/12776 [1:38:39<15:10,  3.85it/s]                                                       73%|███████▎  | 9274/12776 [1:38:39<15:10,  3.85it/s] 73%|███████▎  | 9275/12776 [1:38:39<14:37,  3.99it/s]                                                       73%|███████▎  | 9275/12776 [1:38:39<14:37,  3.99it/s] 73%|███████▎  | 9276/12776 [1:38:39<15:31,  3.76it/s]                                                       73%|███████▎  | 9276/12776 [1:38:39<15:31,  3.76it/s] 73%|███████▎  | 9277/12776 [1:38:40<14:38,  3.98it/s]                                                       73%|███████▎  | 9277/12776 [1:38:40<14:38,  3.98it/s] 73%|███████▎  | 9278/12776 [1:38:40<13:54,  4.19it/s]                                                       73%|███████▎  | 9278/12776 [1:38:40<13:54,  4.19it/s] 73%|███████▎  | 9279/12776 [1:38:40<13:22,  4.36it/s]                                                       73%|███████▎  | 9279/12776 [1:38:40<13:22,  4.36it/s] 73%|███████▎  | 9280/12776 [1:38:40<12:56,  4.50it/s]                                                       73%|███████▎  | 9280/12776 [1:38:40<12:56,  4.50it/s] 73%|███████▎  | 9281/12776 [1:38:41<14:04,  4.14it/s]                                                       73%|███████▎  | 9281/12776 [1:38:41<14:04,  4.14it/s] 73%|███████▎  | 9282/12776 [1:38:41<12:58,  4.49it/s]                                                       73%|███████▎  | 9282/12776 [1:38:41<12:58,  4.49it/s] 73%|███████▎  | 9283/12776 [1:38:41<12:05,  4.81it/s]                                                       73%|███████▎  | 9283/12776 [1:38:41<12:05,  4.81it/s] 73%|███████▎  | 9284/12776 [1:38:41<11:26,  5.09it/s]                                                       73%|███████▎  | 9284/12776 [1:38:41<11:26,  5.09it/s] 73%|███████▎  | 9285/12776 [1:38:41<10:55,  5.33it/s]                                                       73%|███████▎  | 9285/12776 [1:38:41<10:55,  5.33it/s] 73%|███████▎  | 9286/12776 [1:38:41<10:26,  5.57it/s]                                                       73%|███████▎  | 9286/12776 [1:38:41<10:26,  5.57it/s] 73%|███████▎  | 9287/12776 [1:38:42<12:07,  4.80it/s]                                                       73%|███████▎  | 9287/12776 [1:38:42<12:07,  4.80it/s] 73%|███████▎  | 9288/12776 [1:38:42<21:30,  2.70it/s]                                                       73%|███████▎  | 9288/12776 [1:38:42<21:30,  2.70it/s] 73%|███████▎  | 9289/12776 [1:38:44<40:48,  1.42it/s]                                                       73%|███████▎  | 9289/12776 [1:38:44<40:48,  1.42it/s] 73%|███████▎  | 9290/12776 [1:38:45<48:35,  1.20it/s]                                                       73%|███████▎  | 9290/12776 [1:38:45<48:35,  1.20it/s] 73%|███████▎  | 9291/12776 [1:38:46<49:23,  1.18it/s]                                                       73%|███████▎  | 9291/12776 [1:38:46<49:23,  1.18it/s] 73%|███████▎  | 9292/12776 [1:38:47<48:32,  1.20it/s]                                                       73%|███████▎  | 9292/12776 [1:38:47<48:32,  1.20it/s] 73%|███████▎  | 9293/12776 [1:38:48<46:44,  1.24it/s]                                                       73%|███████▎  | 9293/12776 [1:38:48<46:44,  1.24it/s] 73%|███████▎  | 9294/12776 [1:38:48<45:30,  1.28it/s]                                                       73%|███████▎  | 9294/12776 [1:38:48<45:30,  1.28it/s] 73%|███████▎  | 9295/12776 [1:38:49<44:50,  1.29it/s]                                                       73%|███████▎  | 9295/12776 [1:38:49<44:50,  1.29it/s] 73%|███████▎  | 9296/12776 [1:38:50<44:19,  1.31it/s]                                                       73%|███████▎  | 9296/12776 [1:38:50<44:19,  1.31it/s] 73%|███████▎  | 9297/12776 [1:38:50<41:48,  1.39it/s]                                                       73%|███████▎  | 9297/12776 [1:38:50<41:48,  1.39it/s] 73%|███████▎  | 9298/12776 [1:38:51<39:19,  1.47it/s]                                                       73%|███████▎  | 9298/12776 [1:38:51<39:19,  1.47it/s] 73%|███████▎  | 9299/12776 [1:38:51<37:21,  1.55it/s]                                                       73%|███████▎  | 9299/12776 [1:38:51<37:21,  1.55it/s] 73%|███████▎  | 9300/12776 [1:38:52<36:17,  1.60it/s]                                                       73%|███████▎  | 9300/12776 [1:38:52<36:17,  1.60it/s] 73%|███████▎  | 9301/12776 [1:38:53<34:33,  1.68it/s]                                                       73%|███████▎  | 9301/12776 [1:38:53<34:33,  1.68it/s] 73%|███████▎  | 9302/12776 [1:38:53<34:48,  1.66it/s]                                                       73%|███████▎  | 9302/12776 [1:38:53<34:48,  1.66it/s] 73%|███████▎  | 9303/12776 [1:38:54<32:09,  1.80it/s]                                                       73%|███████▎  | 9303/12776 [1:38:54<32:09,  1.80it/s] 73%|███████▎  | 9304/12776 [1:38:54<30:01,  1.93it/s]                                                       73%|███████▎  | 9304/12776 [1:38:54<30:01,  1.93it/s] 73%|███████▎  | 9305/12776 [1:38:55<30:06,  1.92it/s]                                                       73%|███████▎  | 9305/12776 [1:38:55<30:06,  1.92it/s] 73%|███████▎  | 9306/12776 [1:38:55<27:54,  2.07it/s]                                                       73%|███████▎  | 9306/12776 [1:38:55<27:54,  2.07it/s] 73%|███████▎  | 9307/12776 [1:38:56<28:05,  2.06it/s]                                                       73%|███████▎  | 9307/12776 [1:38:56<28:05,  2.06it/s] 73%|███████▎  | 9308/12776 [1:38:56<25:57,  2.23it/s]                                                       73%|███████▎  | 9308/12776 [1:38:56<25:57,  2.23it/s] 73%|███████▎  | 9309/12776 [1:38:56<24:10,  2.39it/s]                                                       73%|███████▎  | 9309/12776 [1:38:56<24:10,  2.39it/s] 73%|███████▎  | 9310/12776 [1:38:57<24:38,  2.34it/s]                                                       73%|███████▎  | 9310/12776 [1:38:57<24:38,  2.34it/s] 73%|███████▎  | 9311/12776 [1:38:57<22:58,  2.51it/s]                                                      {'loss': 1.499, 'grad_norm': 2.2445812225341797, 'learning_rate': 8.685239491691104e-05, 'epoch': 1.45}
+{'loss': 0.4075, 'grad_norm': 4.479135513305664, 'learning_rate': 8.68279569892473e-05, 'epoch': 1.45}
+{'loss': 0.9783, 'grad_norm': 1.9963476657867432, 'learning_rate': 8.680351906158358e-05, 'epoch': 1.45}
+{'loss': 0.4627, 'grad_norm': 1.3088605403900146, 'learning_rate': 8.677908113391983e-05, 'epoch': 1.45}
+{'loss': 1.1461, 'grad_norm': 3.202507257461548, 'learning_rate': 8.67546432062561e-05, 'epoch': 1.45}
+{'loss': 0.9054, 'grad_norm': 2.1418967247009277, 'learning_rate': 8.673020527859237e-05, 'epoch': 1.45}
+{'loss': 0.27, 'grad_norm': 0.5720778703689575, 'learning_rate': 8.670576735092864e-05, 'epoch': 1.45}
+{'loss': 0.1783, 'grad_norm': 0.45177343487739563, 'learning_rate': 8.668132942326489e-05, 'epoch': 1.45}
+{'loss': 0.4301, 'grad_norm': 1.081150770187378, 'learning_rate': 8.665689149560117e-05, 'epoch': 1.45}
+{'loss': 0.2019, 'grad_norm': 0.7842183113098145, 'learning_rate': 8.663245356793743e-05, 'epoch': 1.45}
+{'loss': 0.2977, 'grad_norm': 0.7052162885665894, 'learning_rate': 8.66080156402737e-05, 'epoch': 1.45}
+{'loss': 0.1986, 'grad_norm': 0.5708596110343933, 'learning_rate': 8.658357771260996e-05, 'epoch': 1.45}
+{'loss': 0.1634, 'grad_norm': 0.4889955222606659, 'learning_rate': 8.655913978494623e-05, 'epoch': 1.45}
+{'loss': 0.156, 'grad_norm': 0.5861791372299194, 'learning_rate': 8.653470185728249e-05, 'epoch': 1.45}
+{'loss': 0.2483, 'grad_norm': 0.8749104142189026, 'learning_rate': 8.651026392961877e-05, 'epoch': 1.45}
+{'loss': 0.339, 'grad_norm': 0.7886466383934021, 'learning_rate': 8.648582600195502e-05, 'epoch': 1.45}
+{'loss': 0.3371, 'grad_norm': 1.1284881830215454, 'learning_rate': 8.646138807429129e-05, 'epoch': 1.45}
+{'loss': 0.2101, 'grad_norm': 11.110546112060547, 'learning_rate': 8.643695014662757e-05, 'epoch': 1.45}
+{'loss': 0.3903, 'grad_norm': 0.8709673881530762, 'learning_rate': 8.641251221896383e-05, 'epoch': 1.45}
+{'loss': 0.495, 'grad_norm': 7.446573257446289, 'learning_rate': 8.638807429130008e-05, 'epoch': 1.45}
+{'loss': 0.3358, 'grad_norm': 1.0395238399505615, 'learning_rate': 8.636363636363636e-05, 'epoch': 1.45}
+{'loss': 0.3986, 'grad_norm': 1.0077224969863892, 'learning_rate': 8.633919843597262e-05, 'epoch': 1.45}
+{'loss': 0.2916, 'grad_norm': 1.1264681816101074, 'learning_rate': 8.631476050830889e-05, 'epoch': 1.45}
+{'loss': 0.3287, 'grad_norm': 1.2706413269042969, 'learning_rate': 8.629032258064515e-05, 'epoch': 1.45}
+{'loss': 0.3033, 'grad_norm': 1.2557499408721924, 'learning_rate': 8.626588465298142e-05, 'epoch': 1.45}
+{'loss': 0.5071, 'grad_norm': 0.9736500382423401, 'learning_rate': 8.624144672531768e-05, 'epoch': 1.45}
+{'loss': 0.3498, 'grad_norm': 1.106518030166626, 'learning_rate': 8.621700879765396e-05, 'epoch': 1.45}
+{'loss': 0.462, 'grad_norm': 1.3853764533996582, 'learning_rate': 8.619257086999021e-05, 'epoch': 1.45}
+{'loss': 0.5758, 'grad_norm': 1.618614912033081, 'learning_rate': 8.616813294232648e-05, 'epoch': 1.45}
+{'loss': 0.4159, 'grad_norm': 1.4383054971694946, 'learning_rate': 8.614369501466276e-05, 'epoch': 1.45}
+{'loss': 0.4319, 'grad_norm': 1.0663392543792725, 'learning_rate': 8.611925708699902e-05, 'epoch': 1.45}
+{'loss': 0.6587, 'grad_norm': 2.9339816570281982, 'learning_rate': 8.609481915933527e-05, 'epoch': 1.45}
+{'loss': 0.6003, 'grad_norm': 2.176449775695801, 'learning_rate': 8.607038123167155e-05, 'epoch': 1.45}
+{'loss': 0.6367, 'grad_norm': 2.8206968307495117, 'learning_rate': 8.604594330400782e-05, 'epoch': 1.45}
+{'loss': 0.4951, 'grad_norm': 1.6528189182281494, 'learning_rate': 8.602150537634408e-05, 'epoch': 1.45}
+{'loss': 0.7283, 'grad_norm': 2.3281638622283936, 'learning_rate': 8.599706744868035e-05, 'epoch': 1.45}
+{'loss': 0.3834, 'grad_norm': 1.1274622678756714, 'learning_rate': 8.597262952101661e-05, 'epoch': 1.45}
+{'loss': 0.7657, 'grad_norm': 2.182931423187256, 'learning_rate': 8.594819159335287e-05, 'epoch': 1.45}
+{'loss': 0.8096, 'grad_norm': 2.473872423171997, 'learning_rate': 8.592375366568915e-05, 'epoch': 1.45}
+{'loss': 0.6692, 'grad_norm': 1.8091752529144287, 'learning_rate': 8.58993157380254e-05, 'epoch': 1.45}
+{'loss': 0.5206, 'grad_norm': 3.2494728565216064, 'learning_rate': 8.587487781036167e-05, 'epoch': 1.45}
+{'loss': 0.8174, 'grad_norm': 6.569478511810303, 'learning_rate': 8.585043988269795e-05, 'epoch': 1.45}
+{'loss': 1.2162, 'grad_norm': 3.5665347576141357, 'learning_rate': 8.582600195503421e-05, 'epoch': 1.45}
+{'loss': 1.2148, 'grad_norm': 3.3499755859375, 'learning_rate': 8.580156402737046e-05, 'epoch': 1.45}
+{'loss': 0.6484, 'grad_norm': 1.8471848964691162, 'learning_rate': 8.577712609970674e-05, 'epoch': 1.45}
+{'loss': 0.8956, 'grad_norm': 3.943901777267456, 'learning_rate': 8.575268817204301e-05, 'epoch': 1.45}
+{'loss': 1.4366, 'grad_norm': 3.428492784500122, 'learning_rate': 8.572825024437927e-05, 'epoch': 1.45}
+{'loss': 0.7291, 'grad_norm': 2.345398426055908, 'learning_rate': 8.570381231671554e-05, 'epoch': 1.45}
+{'loss': 0.9401, 'grad_norm': 2.542097806930542, 'learning_rate': 8.56793743890518e-05, 'epoch': 1.45}
+{'loss': 1.1182, 'grad_norm': 1.3730517625808716, 'learning_rate': 8.565493646138807e-05, 'epoch': 1.45}
+{'loss': 1.2957, 'grad_norm': 3.189399003982544, 'learning_rate': 8.563049853372434e-05, 'epoch': 1.45}
+{'loss': 0.4811, 'grad_norm': 2.0311431884765625, 'learning_rate': 8.56060606060606e-05, 'epoch': 1.45}
+{'loss': 0.7376, 'grad_norm': 1.2809655666351318, 'learning_rate': 8.558162267839686e-05, 'epoch': 1.45}
+{'loss': 0.5024, 'grad_norm': 1.898149013519287, 'learning_rate': 8.555718475073314e-05, 'epoch': 1.45}
+{'loss': 0.6526, 'grad_norm': 1.5057889223098755, 'learning_rate': 8.55327468230694e-05, 'epoch': 1.45}
+{'loss': 1.1135, 'grad_norm': 6.357172012329102, 'learning_rate': 8.550830889540565e-05, 'epoch': 1.45}
+{'loss': 0.1894, 'grad_norm': 0.4889274537563324, 'learning_rate': 8.548387096774193e-05, 'epoch': 1.45}
+{'loss': 0.1703, 'grad_norm': 0.653469443321228, 'learning_rate': 8.54594330400782e-05, 'epoch': 1.45}
+{'loss': 0.2752, 'grad_norm': 0.6301447153091431, 'learning_rate': 8.543499511241445e-05, 'epoch': 1.45}
+{'loss': 0.2306, 'grad_norm': 0.5737555027008057, 'learning_rate': 8.541055718475073e-05, 'epoch': 1.45}
+{'loss': 0.2238, 'grad_norm': 0.48317751288414, 'learning_rate': 8.538611925708699e-05, 'epoch': 1.45}
+{'loss': 0.25, 'grad_norm': 0.660159170627594, 'learning_rate': 8.536168132942326e-05, 'epoch': 1.45}
+{'loss': 0.1323, 'grad_norm': 0.5787570476531982, 'learning_rate': 8.533724340175954e-05, 'epoch': 1.46}
+{'loss': 0.2394, 'grad_norm': 0.7087740898132324, 'learning_rate': 8.531280547409579e-05, 'epoch': 1.46}
+{'loss': 0.2902, 'grad_norm': 1.1337648630142212, 'learning_rate': 8.528836754643205e-05, 'epoch': 1.46}
+{'loss': 0.2308, 'grad_norm': 0.6737481951713562, 'learning_rate': 8.526392961876833e-05, 'epoch': 1.46}
+{'loss': 0.1904, 'grad_norm': 0.6452515721321106, 'learning_rate': 8.52394916911046e-05, 'epoch': 1.46}
+{'loss': 0.1995, 'grad_norm': 0.5262974500656128, 'learning_rate': 8.521505376344085e-05, 'epoch': 1.46}
+{'loss': 0.4979, 'grad_norm': 1.1509872674942017, 'learning_rate': 8.519061583577712e-05, 'epoch': 1.46}
+{'loss': 0.2421, 'grad_norm': 0.8436789512634277, 'learning_rate': 8.516617790811339e-05, 'epoch': 1.46}
+{'loss': 0.2879, 'grad_norm': 1.1247527599334717, 'learning_rate': 8.514173998044964e-05, 'epoch': 1.46}
+{'loss': 0.328, 'grad_norm': 1.1355880498886108, 'learning_rate': 8.511730205278592e-05, 'epoch': 1.46}
+{'loss': 0.7011, 'grad_norm': 4.993597984313965, 'learning_rate': 8.509286412512218e-05, 'epoch': 1.46}
+{'loss': 0.1718, 'grad_norm': 0.7840385437011719, 'learning_rate': 8.506842619745845e-05, 'epoch': 1.46}
+{'loss': 0.5557, 'grad_norm': 1.4984678030014038, 'learning_rate': 8.504398826979473e-05, 'epoch': 1.46}
+{'loss': 0.548, 'grad_norm': 1.5178111791610718, 'learning_rate': 8.501955034213098e-05, 'epoch': 1.46}
+{'loss': 0.5248, 'grad_norm': 2.9656150341033936, 'learning_rate': 8.499511241446724e-05, 'epoch': 1.46}
+{'loss': 0.7294, 'grad_norm': 1.327226161956787, 'learning_rate': 8.497067448680352e-05, 'epoch': 1.46}
+ 73%|███████▎  | 9311/12776 [1:38:57<22:58,  2.51it/s] 73%|███████▎  | 9312/12776 [1:38:57<21:24,  2.70it/s]                                                       73%|███████▎  | 9312/12776 [1:38:57<21:24,  2.70it/s] 73%|███████▎  | 9313/12776 [1:38:58<20:13,  2.85it/s]                                                       73%|███████▎  | 9313/12776 [1:38:58<20:13,  2.85it/s] 73%|███████▎  | 9314/12776 [1:38:58<19:50,  2.91it/s]                                                       73%|███████▎  | 9314/12776 [1:38:58<19:50,  2.91it/s] 73%|███████▎  | 9315/12776 [1:38:58<18:46,  3.07it/s]                                                       73%|███████▎  | 9315/12776 [1:38:58<18:46,  3.07it/s] 73%|███████▎  | 9316/12776 [1:38:58<17:57,  3.21it/s]                                                       73%|███████▎  | 9316/12776 [1:38:58<17:57,  3.21it/s] 73%|███████▎  | 9317/12776 [1:38:59<17:12,  3.35it/s]                                                       73%|███████▎  | 9317/12776 [1:38:59<17:12,  3.35it/s] 73%|███████▎  | 9318/12776 [1:38:59<16:54,  3.41it/s]                                                       73%|███████▎  | 9318/12776 [1:38:59<16:54,  3.41it/s] 73%|███████▎  | 9319/12776 [1:38:59<16:14,  3.55it/s]                                                       73%|███████▎  | 9319/12776 [1:38:59<16:14,  3.55it/s] 73%|███████▎  | 9320/12776 [1:39:00<15:46,  3.65it/s]                                                       73%|███████▎  | 9320/12776 [1:39:00<15:46,  3.65it/s] 73%|███████▎  | 9321/12776 [1:39:00<15:17,  3.76it/s]                                                       73%|███████▎  | 9321/12776 [1:39:00<15:17,  3.76it/s] 73%|███████▎  | 9322/12776 [1:39:00<16:43,  3.44it/s]                                                       73%|███████▎  | 9322/12776 [1:39:00<16:43,  3.44it/s] 73%|███████▎  | 9323/12776 [1:39:00<15:42,  3.66it/s]                                                       73%|███████▎  | 9323/12776 [1:39:00<15:42,  3.66it/s] 73%|███████▎  | 9324/12776 [1:39:01<14:56,  3.85it/s]                                                       73%|███████▎  | 9324/12776 [1:39:01<14:56,  3.85it/s] 73%|███████▎  | 9325/12776 [1:39:01<14:18,  4.02it/s]                                                       73%|███████▎  | 9325/12776 [1:39:01<14:18,  4.02it/s] 73%|███████▎  | 9326/12776 [1:39:01<15:28,  3.71it/s]                                                       73%|███████▎  | 9326/12776 [1:39:01<15:28,  3.71it/s] 73%|███████▎  | 9327/12776 [1:39:01<14:29,  3.96it/s]                                                       73%|███████▎  | 9327/12776 [1:39:01<14:29,  3.96it/s] 73%|███████▎  | 9328/12776 [1:39:02<13:43,  4.19it/s]                                                       73%|███████▎  | 9328/12776 [1:39:02<13:43,  4.19it/s] 73%|███████▎  | 9329/12776 [1:39:02<13:13,  4.35it/s]                                                       73%|███████▎  | 9329/12776 [1:39:02<13:13,  4.35it/s] 73%|███████▎  | 9330/12776 [1:39:02<12:46,  4.50it/s]                                                       73%|███████▎  | 9330/12776 [1:39:02<12:46,  4.50it/s] 73%|███████▎  | 9331/12776 [1:39:02<14:11,  4.05it/s]                                                       73%|███████▎  | 9331/12776 [1:39:02<14:11,  4.05it/s] 73%|███████▎  | 9332/12776 [1:39:02<13:22,  4.29it/s]                                                       73%|███████▎  | 9332/12776 [1:39:03<13:22,  4.29it/s] 73%|███████▎  | 9333/12776 [1:39:03<12:53,  4.45it/s]                                                       73%|███████▎  | 9333/12776 [1:39:03<12:53,  4.45it/s] 73%|███████▎  | 9334/12776 [1:39:03<12:20,  4.65it/s]                                                       73%|███████▎  | 9334/12776 [1:39:03<12:20,  4.65it/s] 73%|███████▎  | 9335/12776 [1:39:03<11:55,  4.81it/s]                                                       73%|███████▎  | 9335/12776 [1:39:03<11:55,  4.81it/s] 73%|███████▎  | 9336/12776 [1:39:03<11:32,  4.97it/s]                                                       73%|███████▎  | 9336/12776 [1:39:03<11:32,  4.97it/s] 73%|███████▎  | 9337/12776 [1:39:04<13:07,  4.37it/s]                                                       73%|███████▎  | 9337/12776 [1:39:04<13:07,  4.37it/s] 73%|███████▎  | 9338/12776 [1:39:04<22:00,  2.60it/s]                                                       73%|███████▎  | 9338/12776 [1:39:04<22:00,  2.60it/s] 73%|███████▎  | 9339/12776 [1:39:06<42:22,  1.35it/s]                                                       73%|███████▎  | 9339/12776 [1:39:06<42:22,  1.35it/s] 73%|███████▎  | 9340/12776 [1:39:07<48:14,  1.19it/s]                                                       73%|███████▎  | 9340/12776 [1:39:07<48:14,  1.19it/s] 73%|███████▎  | 9341/12776 [1:39:08<48:53,  1.17it/s]                                                       73%|███████▎  | 9341/12776 [1:39:08<48:53,  1.17it/s] 73%|███████▎  | 9342/12776 [1:39:09<48:08,  1.19it/s]                                                       73%|███████▎  | 9342/12776 [1:39:09<48:08,  1.19it/s] 73%|███████▎  | 9343/12776 [1:39:09<48:03,  1.19it/s]                                                       73%|███████▎  | 9343/12776 [1:39:09<48:03,  1.19it/s] 73%|███████▎  | 9344/12776 [1:39:10<48:16,  1.18it/s]                                                       73%|███████▎  | 9344/12776 [1:39:10<48:16,  1.18it/s] 73%|███████▎  | 9345/12776 [1:39:11<45:08,  1.27it/s]                                                       73%|███████▎  | 9345/12776 [1:39:11<45:08,  1.27it/s] 73%|███████▎  | 9346/12776 [1:39:12<44:41,  1.28it/s]                                                       73%|███████▎  | 9346/12776 [1:39:12<44:41,  1.28it/s] 73%|███████▎  | 9347/12776 [1:39:12<41:41,  1.37it/s]                                                       73%|███████▎  | 9347/12776 [1:39:12<41:41,  1.37it/s] 73%|███████▎  | 9348/12776 [1:39:13<39:27,  1.45it/s]                                                       73%|███████▎  | 9348/12776 [1:39:13<39:27,  1.45it/s] 73%|███████▎  | 9349/12776 [1:39:14<36:59,  1.54it/s]                                                       73%|███████▎  | 9349/12776 [1:39:14<36:59,  1.54it/s] 73%|███████▎  | 9350/12776 [1:39:14<35:55,  1.59it/s]                                                       73%|███████▎  | 9350/12776 [1:39:14<35:55,  1.59it/s] 73%|███████▎  | 9351/12776 [1:39:15<33:36,  1.70it/s]                                                       73%|███████▎  | 9351/12776 [1:39:15<33:36,  1.70it/s] 73%|███████▎  | 9352/12776 [1:39:15<32:35,  1.75it/s]                                                       73%|███████▎  | 9352/12776 [1:39:15<32:35,  1.75it/s] 73%|███████▎  | 9353/12776 [1:39:16<30:20,  1.88it/s]                                                       73%|███████▎  | 9353/12776 [1:39:16<30:20,  1.88it/s] 73%|███████▎  | 9354/12776 [1:39:16<29:55,  1.91it/s]                                                       73%|███████▎  | 9354/12776 [1:39:16<29:55,  1.91it/s] 73%|███████▎  | 9355/12776 [1:39:16<27:56,  2.04it/s]                                                       73%|███████▎  | 9355/12776 [1:39:16<27:56,  2.04it/s] 73%|███████▎  | 9356/12776 [1:39:17<26:13,  2.17it/s]                                                       73%|███████▎  | 9356/12776 [1:39:17<26:13,  2.17it/s] 73%|███████▎  | 9357/12776 [1:39:17<27:31,  2.07it/s]                                                       73%|███████▎  | 9357/12776 [1:39:17<27:31,  2.07it/s] 73%|███████▎  | 9358/12776 [1:39:18<25:25,  2.24it/s]                                                       73%|███████▎  | 9358/12776 [1:39:18<25:25,  2.24it/s] 73%|███████▎  | 9359/12776 [1:39:18<23:46,  2.39it/s]                                                       73%|███████▎  | 9359/12776 [1:39:18<23:46,  2.39it/s] 73%|███████▎  | 9360/12776 [1:39:19<23:33,  2.42it/s]                                                       73%|███████▎  | 9360/12776 [1:39:19<23:33,  2.42it/s] 73%|███████▎  | 9361/12776 [1:39:19<22:16,  2.56it/s]                                                       73%|███████▎  | 9361/12776 [1:39:19<22:16,  2.56it/s] 73%|███████▎  | 9362/12776 [1:39:19<21:08,  2.69it/s]                                                       73%|███████▎  | 9362/12776 [1:39:19<21:08,  2.69it/s] 73%|███████▎  | 9363/12776 [1:39:20<21:44,  2.62it/s]                                                       73%|███████▎  | 9363/12776 [1:39:20<21:44,  2.62it/s] 73%|███████▎  | 9364/12776 [1:39:20<20:23,  2.79it/s]                                                       73%|███████▎  | 9364/12776 [1:39:20<20:23,  2.79it/s] 73%|███████▎  | 9365/12776 [1:39:20<19:15,  2.95it/s]                                                       73%|███████▎  | 9365/12776 [1:39:20<19:15,  2.95it/s] 73%|███████▎  | 9366/12776 [1:39:21<20:03,  2.83it/s]                                                       73%|███████▎  | 9366/12776 [1:39:21<20:03,  2.83it/s] 73%|███████▎  | 9367/12776 [1:39:21<18:42,  3.04it/s]                                                       73%|███████▎  | 9367/12776 [1:39:21<18:42,  3.04it/s] 73%|███████▎  | 9368/12776 [1:39:21<17:38,  3.22it/s]                                                       73%|███████▎  | 9368/12776 [1:39:21<17:38,  3.22it/s] 73%|███████▎  | 9369/12776 [1:39:21<16:47,  3.38it/s]                                                       73%|███████▎  | 9369/12776 [1:39:21<16:47,  3.38it/s] 73%|███████▎  | 9370/12776 [1:39:22<17:16,  3.28it/s]                                                       73%|███████▎  | 9370/12776 [1:39:22<17:16,  3.28it/s] 73%|███████▎  | 9371/12776 [1:39:22<16:19,  3.48it/s]                                                       73%|███████▎  | 9371/12776 [1:39:22<16:19,  3.48it/s] 73%|███████▎  | 9372/12776 [1:39:22<15:32,  3.65it/s]                                                       73%|███████▎  | 9372/12776 [1:39:22<15:32,  3.65it/s] 73%|███████▎  | 9373/12776 [1:39:22<14:56,  3.79it/s]                                                       73%|███████▎  | 9373/12776 [1:39:22<14:56,  3.79it/s] 73%|███████▎  | 9374/12776 [1:39:23<14:23,  3.94it/s]                                                       73%|███████▎  | 9374/12776 [1:39:23<14:23,  3.94it/s] 73%|███████▎  | 9375/12776 [1:39:23<15:14,  3.72it/s]                                                       73%|███████▎  | 9375/12776 [1:39:23<15:14,  3.72it/s] 73%|███████▎  | 9376/12776 [1:39:23<14:24,  3.93it/s]                                                       73%|███████▎  | 9376/12776 [1:39:23<14:24,  3.93it/s] 73%|███████▎  | 9377/12776 [1:39:23<13:41,  4.14it/s]                                                       73%|███████▎  | 9377/12776 [1:39:23<13:41,  4.14it/s] 73%|███████▎  | 9378/12776 [1:39:24<13:07,  4.32it/s]                                                       73%|███████▎  | 9378/12776 [1:39:24<13:07,  4.32it/s] 73%|███████▎  | 9379/12776 [1:39:24<12:47,  4.43it/s]                                                       73%|███████▎  | 9379/12776 [1:39:24<12:47,  4.43it/s] 73%|███████▎  | 9380/12776 [1:39:24<13:42,  4.13it/s]                                                       73%|███████▎  | 9380/12776 [1:39:24<13:42,  4.13it/s] 73%|███████▎  | 9381/12776 [1:39:24<13:01,  4.34it/s]                                                       73%|███████▎  | 9381/12776 [1:39:24<13:01,  4.34it/s] 73%|███████▎  | 9382/12776 [1:39:25<12:30,  4.53it/s]                                                       73%|███████▎  | 9382/12776 [1:39:25<12:30,  4.53it/s] 73%|███████▎  | 9383/12776 [1:39:25<12:05,  4.68it/s]                                                       73%|███████▎  | 9383/12776 [1:39:25<12:05,  4.68it/s] 73%|███████▎  | 9384/12776 [1:39:25<11:44,  4.82it/s]                                                       73%|███████▎  | 9384/12776 [1:39:25<11:44,  4.82it/s] 73%|███████▎  | 9385/12776 [1:39:25<13:27,  4.20it/s]                                                       73%|███████▎  | 9385/12776 [1:39:25<13:27,  4.20it/s] 73%|███████▎  | 9386/12776 [1:39:25<12:36,  4.48it/s]                                                       73%|███████▎  | 9386/12776 [1:39:25<12:36,  4.48it/s] 73%|███████▎  | 9387/12776 [1:39:26<11:58,  4.71it/s]                                                       73%|███████▎  | 9387/12776 [1:39:26<11:58,  4.71it/s] 73%|███████▎  | 9388/12776 [1:39:26<21:33,  2.62it/s]                                                       73%|███████▎  | 9388/12776 [1:39:26<21:33,  2.62it/s] 73%|███████▎  | 9389/12776 [1:39:28<41:55,  1.35it/s]                                                      {'loss': 0.4204, 'grad_norm': 1.0923349857330322, 'learning_rate': 8.494623655913979e-05, 'epoch': 1.46}
+{'loss': 0.7072, 'grad_norm': 2.7213759422302246, 'learning_rate': 8.492179863147604e-05, 'epoch': 1.46}
+{'loss': 0.7492, 'grad_norm': 1.255927324295044, 'learning_rate': 8.489736070381232e-05, 'epoch': 1.46}
+{'loss': 0.5131, 'grad_norm': 2.039738416671753, 'learning_rate': 8.487292277614858e-05, 'epoch': 1.46}
+{'loss': 0.4055, 'grad_norm': 2.313304901123047, 'learning_rate': 8.484848484848483e-05, 'epoch': 1.46}
+{'loss': 0.5852, 'grad_norm': 1.3143826723098755, 'learning_rate': 8.482404692082111e-05, 'epoch': 1.46}
+{'loss': 0.8262, 'grad_norm': 2.0478479862213135, 'learning_rate': 8.479960899315737e-05, 'epoch': 1.46}
+{'loss': 0.9234, 'grad_norm': 3.0443527698516846, 'learning_rate': 8.477517106549364e-05, 'epoch': 1.46}
+{'loss': 0.708, 'grad_norm': 5.6164727210998535, 'learning_rate': 8.475073313782992e-05, 'epoch': 1.46}
+{'loss': 0.9464, 'grad_norm': 1.4889097213745117, 'learning_rate': 8.472629521016617e-05, 'epoch': 1.46}
+{'loss': 1.1254, 'grad_norm': 2.5142250061035156, 'learning_rate': 8.470185728250243e-05, 'epoch': 1.46}
+{'loss': 1.037, 'grad_norm': 3.2503488063812256, 'learning_rate': 8.467741935483871e-05, 'epoch': 1.46}
+{'loss': 0.8223, 'grad_norm': 2.1030449867248535, 'learning_rate': 8.465298142717498e-05, 'epoch': 1.46}
+{'loss': 0.5398, 'grad_norm': 1.6335690021514893, 'learning_rate': 8.462854349951123e-05, 'epoch': 1.46}
+{'loss': 0.3341, 'grad_norm': 2.1362364292144775, 'learning_rate': 8.46041055718475e-05, 'epoch': 1.46}
+{'loss': 1.4995, 'grad_norm': 2.837796449661255, 'learning_rate': 8.457966764418377e-05, 'epoch': 1.46}
+{'loss': 1.3477, 'grad_norm': 4.29352331161499, 'learning_rate': 8.455522971652002e-05, 'epoch': 1.46}
+{'loss': 0.8274, 'grad_norm': 1.990387201309204, 'learning_rate': 8.45307917888563e-05, 'epoch': 1.46}
+{'loss': 0.6768, 'grad_norm': 1.3266398906707764, 'learning_rate': 8.450635386119257e-05, 'epoch': 1.46}
+{'loss': 0.9489, 'grad_norm': 4.102444171905518, 'learning_rate': 8.448191593352883e-05, 'epoch': 1.46}
+{'loss': 0.8909, 'grad_norm': 2.765338897705078, 'learning_rate': 8.445747800586511e-05, 'epoch': 1.46}
+{'loss': 1.2414, 'grad_norm': 2.984898805618286, 'learning_rate': 8.443304007820136e-05, 'epoch': 1.46}
+{'loss': 1.6595, 'grad_norm': 4.269297122955322, 'learning_rate': 8.440860215053763e-05, 'epoch': 1.46}
+{'loss': 0.1954, 'grad_norm': 1.1461013555526733, 'learning_rate': 8.43841642228739e-05, 'epoch': 1.46}
+{'loss': 0.7527, 'grad_norm': 2.7744386196136475, 'learning_rate': 8.435972629521017e-05, 'epoch': 1.46}
+{'loss': 0.7368, 'grad_norm': 1.470504641532898, 'learning_rate': 8.433528836754642e-05, 'epoch': 1.46}
+{'loss': 0.3352, 'grad_norm': 2.8413960933685303, 'learning_rate': 8.431085043988268e-05, 'epoch': 1.46}
+{'loss': 0.5112, 'grad_norm': 1.2316049337387085, 'learning_rate': 8.428641251221896e-05, 'epoch': 1.46}
+{'loss': 0.2659, 'grad_norm': 0.4800587594509125, 'learning_rate': 8.426197458455521e-05, 'epoch': 1.46}
+{'loss': 0.2848, 'grad_norm': 0.5300424695014954, 'learning_rate': 8.423753665689148e-05, 'epoch': 1.46}
+{'loss': 0.1509, 'grad_norm': 0.4099821448326111, 'learning_rate': 8.421309872922776e-05, 'epoch': 1.46}
+{'loss': 0.2769, 'grad_norm': 0.5445888042449951, 'learning_rate': 8.418866080156402e-05, 'epoch': 1.46}
+{'loss': 0.24, 'grad_norm': 0.7307244539260864, 'learning_rate': 8.416422287390027e-05, 'epoch': 1.46}
+{'loss': 0.174, 'grad_norm': 0.590489387512207, 'learning_rate': 8.413978494623655e-05, 'epoch': 1.46}
+{'loss': 0.2945, 'grad_norm': 0.8075463175773621, 'learning_rate': 8.411534701857282e-05, 'epoch': 1.46}
+{'loss': 0.2182, 'grad_norm': 0.6793466806411743, 'learning_rate': 8.409090909090908e-05, 'epoch': 1.46}
+{'loss': 0.347, 'grad_norm': 0.9996579885482788, 'learning_rate': 8.406647116324535e-05, 'epoch': 1.46}
+{'loss': 0.2452, 'grad_norm': 0.5806265473365784, 'learning_rate': 8.404203323558161e-05, 'epoch': 1.46}
+{'loss': 0.3277, 'grad_norm': 0.9214386343955994, 'learning_rate': 8.401759530791788e-05, 'epoch': 1.46}
+{'loss': 0.7389, 'grad_norm': 2.849910020828247, 'learning_rate': 8.399315738025415e-05, 'epoch': 1.46}
+{'loss': 0.2119, 'grad_norm': 0.5131305456161499, 'learning_rate': 8.39687194525904e-05, 'epoch': 1.46}
+{'loss': 0.3357, 'grad_norm': 2.853576898574829, 'learning_rate': 8.394428152492667e-05, 'epoch': 1.46}
+{'loss': 0.2544, 'grad_norm': 1.252537488937378, 'learning_rate': 8.391984359726295e-05, 'epoch': 1.46}
+{'loss': 0.4257, 'grad_norm': 1.4162060022354126, 'learning_rate': 8.389540566959921e-05, 'epoch': 1.46}
+{'loss': 0.4705, 'grad_norm': 1.189596176147461, 'learning_rate': 8.387096774193546e-05, 'epoch': 1.46}
+{'loss': 0.411, 'grad_norm': 0.7394349575042725, 'learning_rate': 8.384652981427174e-05, 'epoch': 1.46}
+{'loss': 0.5221, 'grad_norm': 1.2858307361602783, 'learning_rate': 8.382209188660801e-05, 'epoch': 1.46}
+{'loss': 0.407, 'grad_norm': 1.020027756690979, 'learning_rate': 8.379765395894427e-05, 'epoch': 1.46}
+{'loss': 0.4782, 'grad_norm': 1.9283796548843384, 'learning_rate': 8.377321603128054e-05, 'epoch': 1.47}
+{'loss': 0.4688, 'grad_norm': 1.3848600387573242, 'learning_rate': 8.37487781036168e-05, 'epoch': 1.47}
+{'loss': 0.8337, 'grad_norm': 1.769733190536499, 'learning_rate': 8.372434017595307e-05, 'epoch': 1.47}
+{'loss': 0.1812, 'grad_norm': 1.7718392610549927, 'learning_rate': 8.369990224828934e-05, 'epoch': 1.47}
+{'loss': 0.5035, 'grad_norm': 2.2144126892089844, 'learning_rate': 8.36754643206256e-05, 'epoch': 1.47}
+{'loss': 0.3048, 'grad_norm': 0.7767239212989807, 'learning_rate': 8.365102639296186e-05, 'epoch': 1.47}
+{'loss': 0.7307, 'grad_norm': 3.264582872390747, 'learning_rate': 8.362658846529814e-05, 'epoch': 1.47}
+{'loss': 0.6826, 'grad_norm': 1.8177357912063599, 'learning_rate': 8.36021505376344e-05, 'epoch': 1.47}
+{'loss': 0.4975, 'grad_norm': 3.0217385292053223, 'learning_rate': 8.357771260997066e-05, 'epoch': 1.47}
+{'loss': 0.3554, 'grad_norm': 1.1318516731262207, 'learning_rate': 8.355327468230693e-05, 'epoch': 1.47}
+{'loss': 0.579, 'grad_norm': 1.6963642835617065, 'learning_rate': 8.35288367546432e-05, 'epoch': 1.47}
+{'loss': 0.6147, 'grad_norm': 2.232435941696167, 'learning_rate': 8.350439882697946e-05, 'epoch': 1.47}
+{'loss': 1.0373, 'grad_norm': 3.5765178203582764, 'learning_rate': 8.347996089931573e-05, 'epoch': 1.47}
+{'loss': 0.3683, 'grad_norm': 1.6168148517608643, 'learning_rate': 8.345552297165199e-05, 'epoch': 1.47}
+{'loss': 0.5754, 'grad_norm': 1.8201894760131836, 'learning_rate': 8.343108504398826e-05, 'epoch': 1.47}
+{'loss': 0.6643, 'grad_norm': 2.91982364654541, 'learning_rate': 8.340664711632454e-05, 'epoch': 1.47}
+{'loss': 0.624, 'grad_norm': 2.993435859680176, 'learning_rate': 8.338220918866079e-05, 'epoch': 1.47}
+{'loss': 0.7056, 'grad_norm': 4.31162691116333, 'learning_rate': 8.335777126099705e-05, 'epoch': 1.47}
+{'loss': 1.2868, 'grad_norm': 3.5542516708374023, 'learning_rate': 8.333333333333333e-05, 'epoch': 1.47}
+{'loss': 0.9045, 'grad_norm': 2.0428109169006348, 'learning_rate': 8.33088954056696e-05, 'epoch': 1.47}
+{'loss': 0.8775, 'grad_norm': 2.306626796722412, 'learning_rate': 8.328445747800585e-05, 'epoch': 1.47}
+{'loss': 1.4198, 'grad_norm': 2.654895544052124, 'learning_rate': 8.326001955034212e-05, 'epoch': 1.47}
+{'loss': 0.9698, 'grad_norm': 2.5380771160125732, 'learning_rate': 8.323558162267839e-05, 'epoch': 1.47}
+{'loss': 1.164, 'grad_norm': 3.0304949283599854, 'learning_rate': 8.321114369501465e-05, 'epoch': 1.47}
+{'loss': 1.1842, 'grad_norm': 2.77571177482605, 'learning_rate': 8.318670576735092e-05, 'epoch': 1.47}
+{'loss': 1.0286, 'grad_norm': 2.4832112789154053, 'learning_rate': 8.316226783968718e-05, 'epoch': 1.47}
+{'loss': 1.3073, 'grad_norm': 4.166812419891357, 'learning_rate': 8.313782991202345e-05, 'epoch': 1.47}
+{'loss': 0.5474, 'grad_norm': 1.6507247686386108, 'learning_rate': 8.311339198435973e-05, 'epoch': 1.47}
+{'loss': 0.8368, 'grad_norm': 4.904731273651123, 'learning_rate': 8.308895405669598e-05, 'epoch': 1.47}
+{'loss': 0.5796, 'grad_norm': 1.261414647102356, 'learning_rate': 8.306451612903224e-05, 'epoch': 1.47}
+ 73%|███████▎  | 9389/12776 [1:39:28<41:55,  1.35it/s] 73%|███████▎  | 9390/12776 [1:39:29<47:20,  1.19it/s]                                                       73%|███████▎  | 9390/12776 [1:39:29<47:20,  1.19it/s] 74%|███████▎  | 9391/12776 [1:39:30<47:58,  1.18it/s]                                                       74%|███████▎  | 9391/12776 [1:39:30<47:58,  1.18it/s] 74%|███████▎  | 9392/12776 [1:39:31<48:05,  1.17it/s]                                                       74%|███████▎  | 9392/12776 [1:39:31<48:05,  1.17it/s] 74%|███████▎  | 9393/12776 [1:39:32<46:38,  1.21it/s]                                                       74%|███████▎  | 9393/12776 [1:39:32<46:38,  1.21it/s] 74%|███████▎  | 9394/12776 [1:39:32<47:19,  1.19it/s]                                                       74%|███████▎  | 9394/12776 [1:39:32<47:19,  1.19it/s] 74%|███████▎  | 9395/12776 [1:39:33<44:33,  1.26it/s]                                                       74%|███████▎  | 9395/12776 [1:39:33<44:33,  1.26it/s] 74%|███████▎  | 9396/12776 [1:39:34<42:14,  1.33it/s]                                                       74%|███████▎  | 9396/12776 [1:39:34<42:14,  1.33it/s] 74%|███████▎  | 9397/12776 [1:39:34<41:50,  1.35it/s]                                                       74%|███████▎  | 9397/12776 [1:39:34<41:50,  1.35it/s] 74%|███████▎  | 9398/12776 [1:39:35<39:09,  1.44it/s]                                                       74%|███████▎  | 9398/12776 [1:39:35<39:09,  1.44it/s] 74%|███████▎  | 9399/12776 [1:39:36<37:23,  1.51it/s]                                                       74%|███████▎  | 9399/12776 [1:39:36<37:23,  1.51it/s] 74%|███████▎  | 9400/12776 [1:39:36<35:44,  1.57it/s]                                                       74%|███████▎  | 9400/12776 [1:39:36<35:44,  1.57it/s] 74%|███████▎  | 9401/12776 [1:39:37<34:52,  1.61it/s]                                                       74%|███████▎  | 9401/12776 [1:39:37<34:52,  1.61it/s] 74%|███████▎  | 9402/12776 [1:39:37<32:45,  1.72it/s]                                                       74%|███████▎  | 9402/12776 [1:39:37<32:45,  1.72it/s] 74%|███████▎  | 9403/12776 [1:39:38<30:50,  1.82it/s]                                                       74%|███████▎  | 9403/12776 [1:39:38<30:50,  1.82it/s] 74%|███████▎  | 9404/12776 [1:39:38<29:07,  1.93it/s]                                                       74%|███████▎  | 9404/12776 [1:39:38<29:07,  1.93it/s] 74%|███████▎  | 9405/12776 [1:39:39<27:55,  2.01it/s]                                                       74%|███████▎  | 9405/12776 [1:39:39<27:55,  2.01it/s] 74%|███████▎  | 9406/12776 [1:39:39<26:48,  2.10it/s]                                                       74%|███████▎  | 9406/12776 [1:39:39<26:48,  2.10it/s] 74%|███████▎  | 9407/12776 [1:39:39<25:49,  2.17it/s]                                                       74%|███████▎  | 9407/12776 [1:39:39<25:49,  2.17it/s] 74%|███████▎  | 9408/12776 [1:39:40<24:21,  2.31it/s]                                                       74%|███████▎  | 9408/12776 [1:39:40<24:21,  2.31it/s] 74%|███████▎  | 9409/12776 [1:39:40<23:18,  2.41it/s]                                                       74%|███████▎  | 9409/12776 [1:39:40<23:18,  2.41it/s] 74%|███████▎  | 9410/12776 [1:39:41<22:09,  2.53it/s]                                                       74%|███████▎  | 9410/12776 [1:39:41<22:09,  2.53it/s] 74%|███████▎  | 9411/12776 [1:39:41<21:08,  2.65it/s]                                                       74%|███████▎  | 9411/12776 [1:39:41<21:08,  2.65it/s] 74%|███████▎  | 9412/12776 [1:39:41<22:03,  2.54it/s]                                                       74%|███████▎  | 9412/12776 [1:39:41<22:03,  2.54it/s] 74%|███████▎  | 9413/12776 [1:39:42<20:43,  2.70it/s]                                                       74%|███████▎  | 9413/12776 [1:39:42<20:43,  2.70it/s] 74%|███████▎  | 9414/12776 [1:39:42<19:32,  2.87it/s]                                                       74%|███████▎  | 9414/12776 [1:39:42<19:32,  2.87it/s] 74%|███████▎  | 9415/12776 [1:39:42<18:36,  3.01it/s]                                                       74%|███████▎  | 9415/12776 [1:39:42<18:36,  3.01it/s] 74%|███████▎  | 9416/12776 [1:39:43<19:20,  2.90it/s]                                                       74%|███████▎  | 9416/12776 [1:39:43<19:20,  2.90it/s] 74%|███████▎  | 9417/12776 [1:39:43<18:05,  3.10it/s]                                                       74%|███████▎  | 9417/12776 [1:39:43<18:05,  3.10it/s] 74%|███████▎  | 9418/12776 [1:39:43<17:05,  3.27it/s]                                                       74%|███████▎  | 9418/12776 [1:39:43<17:05,  3.27it/s] 74%|███████▎  | 9419/12776 [1:39:43<16:19,  3.43it/s]                                                       74%|███████▎  | 9419/12776 [1:39:43<16:19,  3.43it/s] 74%|███████▎  | 9420/12776 [1:39:44<16:45,  3.34it/s]                                                       74%|███████▎  | 9420/12776 [1:39:44<16:45,  3.34it/s] 74%|███████▎  | 9421/12776 [1:39:44<15:50,  3.53it/s]                                                       74%|███████▎  | 9421/12776 [1:39:44<15:50,  3.53it/s] 74%|███████▎  | 9422/12776 [1:39:44<15:08,  3.69it/s]                                                       74%|███████▎  | 9422/12776 [1:39:44<15:08,  3.69it/s] 74%|███████▍  | 9423/12776 [1:39:44<14:26,  3.87it/s]                                                       74%|███████▍  | 9423/12776 [1:39:44<14:26,  3.87it/s] 74%|███████▍  | 9424/12776 [1:39:45<15:02,  3.71it/s]                                                       74%|███████▍  | 9424/12776 [1:39:45<15:02,  3.71it/s] 74%|███████▍  | 9425/12776 [1:39:45<14:17,  3.91it/s]                                                       74%|███████▍  | 9425/12776 [1:39:45<14:17,  3.91it/s] 74%|███████▍  | 9426/12776 [1:39:45<13:38,  4.09it/s]                                                       74%|███████▍  | 9426/12776 [1:39:45<13:38,  4.09it/s] 74%|███████▍  | 9427/12776 [1:39:45<13:06,  4.26it/s]                                                       74%|███████▍  | 9427/12776 [1:39:45<13:06,  4.26it/s] 74%|███████▍  | 9428/12776 [1:39:46<12:39,  4.41it/s]                                                       74%|███████▍  | 9428/12776 [1:39:46<12:39,  4.41it/s] 74%|███████▍  | 9429/12776 [1:39:46<13:31,  4.13it/s]                                                       74%|███████▍  | 9429/12776 [1:39:46<13:31,  4.13it/s] 74%|███████▍  | 9430/12776 [1:39:46<12:53,  4.33it/s]                                                       74%|███████▍  | 9430/12776 [1:39:46<12:53,  4.33it/s] 74%|███████▍  | 9431/12776 [1:39:46<12:22,  4.51it/s]                                                       74%|███████▍  | 9431/12776 [1:39:46<12:22,  4.51it/s] 74%|███████▍  | 9432/12776 [1:39:46<12:00,  4.64it/s]                                                       74%|███████▍  | 9432/12776 [1:39:46<12:00,  4.64it/s] 74%|███████▍  | 9433/12776 [1:39:47<11:43,  4.75it/s]                                                       74%|███████▍  | 9433/12776 [1:39:47<11:43,  4.75it/s] 74%|███████▍  | 9434/12776 [1:39:47<13:17,  4.19it/s]                                                       74%|███████▍  | 9434/12776 [1:39:47<13:17,  4.19it/s] 74%|███████▍  | 9435/12776 [1:39:47<12:32,  4.44it/s]                                                       74%|███████▍  | 9435/12776 [1:39:47<12:32,  4.44it/s] 74%|███████▍  | 9436/12776 [1:39:47<11:53,  4.68it/s]                                                       74%|███████▍  | 9436/12776 [1:39:47<11:53,  4.68it/s] 74%|███████▍  | 9437/12776 [1:39:48<11:22,  4.89it/s]                                                       74%|███████▍  | 9437/12776 [1:39:48<11:22,  4.89it/s] 74%|███████▍  | 9438/12776 [1:39:48<19:32,  2.85it/s]                                                       74%|███████▍  | 9438/12776 [1:39:48<19:32,  2.85it/s] 74%|███████▍  | 9439/12776 [1:39:50<38:45,  1.43it/s]                                                       74%|███████▍  | 9439/12776 [1:39:50<38:45,  1.43it/s] 74%|███████▍  | 9440/12776 [1:39:51<44:44,  1.24it/s]                                                       74%|███████▍  | 9440/12776 [1:39:51<44:44,  1.24it/s] 74%|███████▍  | 9441/12776 [1:39:52<45:12,  1.23it/s]                                                       74%|███████▍  | 9441/12776 [1:39:52<45:12,  1.23it/s] 74%|███████▍  | 9442/12776 [1:39:52<44:23,  1.25it/s]                                                       74%|███████▍  | 9442/12776 [1:39:52<44:23,  1.25it/s] 74%|███████▍  | 9443/12776 [1:39:53<43:29,  1.28it/s]                                                       74%|███████▍  | 9443/12776 [1:39:53<43:29,  1.28it/s] 74%|███████▍  | 9444/12776 [1:39:54<42:31,  1.31it/s]                                                       74%|███████▍  | 9444/12776 [1:39:54<42:31,  1.31it/s] 74%|███████▍  | 9445/12776 [1:39:55<40:20,  1.38it/s]                                                       74%|███████▍  | 9445/12776 [1:39:55<40:20,  1.38it/s] 74%|███████▍  | 9446/12776 [1:39:55<41:19,  1.34it/s]                                                       74%|███████▍  | 9446/12776 [1:39:55<41:19,  1.34it/s] 74%|███████▍  | 9447/12776 [1:39:56<38:23,  1.45it/s]                                                       74%|███████▍  | 9447/12776 [1:39:56<38:23,  1.45it/s] 74%|███████▍  | 9448/12776 [1:39:56<36:36,  1.51it/s]                                                       74%|███████▍  | 9448/12776 [1:39:56<36:36,  1.51it/s] 74%|███████▍  | 9449/12776 [1:39:57<34:23,  1.61it/s]                                                       74%|███████▍  | 9449/12776 [1:39:57<34:23,  1.61it/s] 74%|███████▍  | 9450/12776 [1:39:58<34:40,  1.60it/s]                                                       74%|███████▍  | 9450/12776 [1:39:58<34:40,  1.60it/s] 74%|███████▍  | 9451/12776 [1:39:58<31:58,  1.73it/s]                                                       74%|███████▍  | 9451/12776 [1:39:58<31:58,  1.73it/s] 74%|███████▍  | 9452/12776 [1:39:59<30:34,  1.81it/s]                                                       74%|███████▍  | 9452/12776 [1:39:59<30:34,  1.81it/s] 74%|███████▍  | 9453/12776 [1:39:59<28:42,  1.93it/s]                                                       74%|███████▍  | 9453/12776 [1:39:59<28:42,  1.93it/s] 74%|███████▍  | 9454/12776 [1:39:59<27:12,  2.04it/s]                                                       74%|███████▍  | 9454/12776 [1:39:59<27:12,  2.04it/s] 74%|███████▍  | 9455/12776 [1:40:00<27:16,  2.03it/s]                                                       74%|███████▍  | 9455/12776 [1:40:00<27:16,  2.03it/s] 74%|███████▍  | 9456/12776 [1:40:00<25:43,  2.15it/s]                                                       74%|███████▍  | 9456/12776 [1:40:00<25:43,  2.15it/s] 74%|███████▍  | 9457/12776 [1:40:01<26:05,  2.12it/s]                                                       74%|███████▍  | 9457/12776 [1:40:01<26:05,  2.12it/s] 74%|███████▍  | 9458/12776 [1:40:01<24:27,  2.26it/s]                                                       74%|███████▍  | 9458/12776 [1:40:01<24:27,  2.26it/s] 74%|███████▍  | 9459/12776 [1:40:02<22:56,  2.41it/s]                                                       74%|███████▍  | 9459/12776 [1:40:02<22:56,  2.41it/s] 74%|███████▍  | 9460/12776 [1:40:02<22:48,  2.42it/s]                                                       74%|███████▍  | 9460/12776 [1:40:02<22:48,  2.42it/s] 74%|███████▍  | 9461/12776 [1:40:02<21:33,  2.56it/s]                                                       74%|███████▍  | 9461/12776 [1:40:02<21:33,  2.56it/s] 74%|███████▍  | 9462/12776 [1:40:03<20:35,  2.68it/s]                                                       74%|███████▍  | 9462/12776 [1:40:03<20:35,  2.68it/s] 74%|███████▍  | 9463/12776 [1:40:03<19:41,  2.80it/s]                                                       74%|███████▍  | 9463/12776 [1:40:03<19:41,  2.80it/s] 74%|███████▍  | 9464/12776 [1:40:03<19:03,  2.90it/s]                                                       74%|███████▍  | 9464/12776 [1:40:03<19:03,  2.90it/s] 74%|███████▍  | 9465/12776 [1:40:04<18:17,  3.02it/s]                                                       74%|███████▍  | 9465/12776 [1:40:04<18:17,  3.02it/s] 74%|███████▍  | 9466/12776 [1:40:04<17:34,  3.14it/s]                                                       74%|███████▍  | 9466/12776 [1:40:04<17:34,  3.14it/s] 74%|███████▍  | 9467/12776 [1:40:04<18:43,  2.94it/s]                                                      {'loss': 0.2298, 'grad_norm': 0.3774799704551697, 'learning_rate': 8.304007820136852e-05, 'epoch': 1.47}
+{'loss': 0.1553, 'grad_norm': 0.3805764615535736, 'learning_rate': 8.301564027370479e-05, 'epoch': 1.47}
+{'loss': 0.2439, 'grad_norm': 0.48448646068573, 'learning_rate': 8.299120234604104e-05, 'epoch': 1.47}
+{'loss': 0.2297, 'grad_norm': 0.5146775841712952, 'learning_rate': 8.296676441837732e-05, 'epoch': 1.47}
+{'loss': 0.2025, 'grad_norm': 0.5531075596809387, 'learning_rate': 8.294232649071358e-05, 'epoch': 1.47}
+{'loss': 0.2208, 'grad_norm': 0.606508195400238, 'learning_rate': 8.291788856304985e-05, 'epoch': 1.47}
+{'loss': 0.253, 'grad_norm': 0.7570547461509705, 'learning_rate': 8.289345063538611e-05, 'epoch': 1.47}
+{'loss': 0.2676, 'grad_norm': 0.5803053379058838, 'learning_rate': 8.286901270772238e-05, 'epoch': 1.47}
+{'loss': 0.1975, 'grad_norm': 0.5528594255447388, 'learning_rate': 8.284457478005864e-05, 'epoch': 1.47}
+{'loss': 0.2068, 'grad_norm': 0.7934523820877075, 'learning_rate': 8.282013685239492e-05, 'epoch': 1.47}
+{'loss': 0.2537, 'grad_norm': 0.7662394046783447, 'learning_rate': 8.279569892473117e-05, 'epoch': 1.47}
+{'loss': 0.247, 'grad_norm': 1.507385492324829, 'learning_rate': 8.277126099706743e-05, 'epoch': 1.47}
+{'loss': 0.2944, 'grad_norm': 0.87679523229599, 'learning_rate': 8.274682306940371e-05, 'epoch': 1.47}
+{'loss': 0.3211, 'grad_norm': 1.0118069648742676, 'learning_rate': 8.272238514173998e-05, 'epoch': 1.47}
+{'loss': 0.1825, 'grad_norm': 0.34879270195961, 'learning_rate': 8.269794721407623e-05, 'epoch': 1.47}
+{'loss': 0.3664, 'grad_norm': 0.8371734619140625, 'learning_rate': 8.267350928641251e-05, 'epoch': 1.47}
+{'loss': 0.3631, 'grad_norm': 1.5805662870407104, 'learning_rate': 8.264907135874877e-05, 'epoch': 1.47}
+{'loss': 0.309, 'grad_norm': 0.7317862510681152, 'learning_rate': 8.262463343108502e-05, 'epoch': 1.47}
+{'loss': 0.3384, 'grad_norm': 1.2464789152145386, 'learning_rate': 8.26001955034213e-05, 'epoch': 1.47}
+{'loss': 0.4214, 'grad_norm': 0.7848882079124451, 'learning_rate': 8.257575757575757e-05, 'epoch': 1.47}
+{'loss': 0.4749, 'grad_norm': 3.3078129291534424, 'learning_rate': 8.255131964809383e-05, 'epoch': 1.47}
+{'loss': 0.5161, 'grad_norm': 6.444918155670166, 'learning_rate': 8.252688172043011e-05, 'epoch': 1.47}
+{'loss': 0.5021, 'grad_norm': 1.534546136856079, 'learning_rate': 8.250244379276636e-05, 'epoch': 1.47}
+{'loss': 0.4859, 'grad_norm': 1.4558238983154297, 'learning_rate': 8.247800586510263e-05, 'epoch': 1.47}
+{'loss': 0.262, 'grad_norm': 0.8588922023773193, 'learning_rate': 8.24535679374389e-05, 'epoch': 1.47}
+{'loss': 0.7627, 'grad_norm': 1.7501471042633057, 'learning_rate': 8.242913000977517e-05, 'epoch': 1.47}
+{'loss': 0.3825, 'grad_norm': 4.139252185821533, 'learning_rate': 8.240469208211142e-05, 'epoch': 1.47}
+{'loss': 0.9586, 'grad_norm': 3.472306489944458, 'learning_rate': 8.23802541544477e-05, 'epoch': 1.47}
+{'loss': 0.6018, 'grad_norm': 1.744738221168518, 'learning_rate': 8.235581622678396e-05, 'epoch': 1.47}
+{'loss': 0.5255, 'grad_norm': 2.3839869499206543, 'learning_rate': 8.233137829912021e-05, 'epoch': 1.47}
+{'loss': 0.3971, 'grad_norm': 1.5559438467025757, 'learning_rate': 8.230694037145649e-05, 'epoch': 1.47}
+{'loss': 0.783, 'grad_norm': 7.917959690093994, 'learning_rate': 8.228250244379276e-05, 'epoch': 1.47}
+{'loss': 0.6701, 'grad_norm': 2.138312816619873, 'learning_rate': 8.225806451612902e-05, 'epoch': 1.47}
+{'loss': 0.8102, 'grad_norm': 3.2708122730255127, 'learning_rate': 8.22336265884653e-05, 'epoch': 1.47}
+{'loss': 0.9962, 'grad_norm': 2.629610300064087, 'learning_rate': 8.220918866080155e-05, 'epoch': 1.48}
+{'loss': 0.6238, 'grad_norm': 2.9563984870910645, 'learning_rate': 8.218475073313782e-05, 'epoch': 1.48}
+{'loss': 0.965, 'grad_norm': 3.6667652130126953, 'learning_rate': 8.21603128054741e-05, 'epoch': 1.48}
+{'loss': 0.9697, 'grad_norm': 2.6509387493133545, 'learning_rate': 8.213587487781036e-05, 'epoch': 1.48}
+{'loss': 0.9831, 'grad_norm': 2.9789719581604004, 'learning_rate': 8.211143695014661e-05, 'epoch': 1.48}
+{'loss': 1.0215, 'grad_norm': 3.1920371055603027, 'learning_rate': 8.208699902248289e-05, 'epoch': 1.48}
+{'loss': 0.941, 'grad_norm': 2.6322896480560303, 'learning_rate': 8.206256109481915e-05, 'epoch': 1.48}
+{'loss': 1.4716, 'grad_norm': 3.8062565326690674, 'learning_rate': 8.20381231671554e-05, 'epoch': 1.48}
+{'loss': 1.0996, 'grad_norm': 1.7152727842330933, 'learning_rate': 8.201368523949168e-05, 'epoch': 1.48}
+{'loss': 1.2918, 'grad_norm': 3.0590524673461914, 'learning_rate': 8.198924731182795e-05, 'epoch': 1.48}
+{'loss': 0.8202, 'grad_norm': 2.010657548904419, 'learning_rate': 8.196480938416421e-05, 'epoch': 1.48}
+{'loss': 0.5172, 'grad_norm': 1.9447054862976074, 'learning_rate': 8.194037145650049e-05, 'epoch': 1.48}
+{'loss': 0.4932, 'grad_norm': 1.379649043083191, 'learning_rate': 8.191593352883674e-05, 'epoch': 1.48}
+{'loss': 0.8522, 'grad_norm': 2.9022107124328613, 'learning_rate': 8.189149560117301e-05, 'epoch': 1.48}
+{'loss': 1.4347, 'grad_norm': 2.831977128982544, 'learning_rate': 8.186705767350929e-05, 'epoch': 1.48}
+{'loss': 1.0414, 'grad_norm': 1.9703986644744873, 'learning_rate': 8.184261974584555e-05, 'epoch': 1.48}
+{'loss': 0.2257, 'grad_norm': 0.4396578371524811, 'learning_rate': 8.18181818181818e-05, 'epoch': 1.48}
+{'loss': 0.3489, 'grad_norm': 0.6509549617767334, 'learning_rate': 8.179374389051808e-05, 'epoch': 1.48}
+{'loss': 0.1851, 'grad_norm': 0.587127685546875, 'learning_rate': 8.176930596285435e-05, 'epoch': 1.48}
+{'loss': 0.2927, 'grad_norm': 0.4554985463619232, 'learning_rate': 8.17448680351906e-05, 'epoch': 1.48}
+{'loss': 0.154, 'grad_norm': 0.3806717097759247, 'learning_rate': 8.172043010752688e-05, 'epoch': 1.48}
+{'loss': 0.2549, 'grad_norm': 0.5629563331604004, 'learning_rate': 8.169599217986314e-05, 'epoch': 1.48}
+{'loss': 0.2599, 'grad_norm': 0.7409935593605042, 'learning_rate': 8.16715542521994e-05, 'epoch': 1.48}
+{'loss': 0.3104, 'grad_norm': 1.7930724620819092, 'learning_rate': 8.164711632453568e-05, 'epoch': 1.48}
+{'loss': 0.3755, 'grad_norm': 0.7437885999679565, 'learning_rate': 8.162267839687193e-05, 'epoch': 1.48}
+{'loss': 0.2964, 'grad_norm': 0.7157233357429504, 'learning_rate': 8.15982404692082e-05, 'epoch': 1.48}
+{'loss': 0.1982, 'grad_norm': 0.8939926028251648, 'learning_rate': 8.157380254154448e-05, 'epoch': 1.48}
+{'loss': 0.4599, 'grad_norm': 1.4006277322769165, 'learning_rate': 8.154936461388073e-05, 'epoch': 1.48}
+{'loss': 0.4006, 'grad_norm': 1.421652913093567, 'learning_rate': 8.1524926686217e-05, 'epoch': 1.48}
+{'loss': 0.229, 'grad_norm': 1.1265709400177002, 'learning_rate': 8.150048875855327e-05, 'epoch': 1.48}
+{'loss': 0.3313, 'grad_norm': 1.248371958732605, 'learning_rate': 8.147605083088954e-05, 'epoch': 1.48}
+{'loss': 0.526, 'grad_norm': 1.0506163835525513, 'learning_rate': 8.145161290322579e-05, 'epoch': 1.48}
+{'loss': 0.3684, 'grad_norm': 0.9769994616508484, 'learning_rate': 8.142717497556207e-05, 'epoch': 1.48}
+{'loss': 0.4004, 'grad_norm': 1.5080924034118652, 'learning_rate': 8.140273704789833e-05, 'epoch': 1.48}
+{'loss': 0.5374, 'grad_norm': 2.587855339050293, 'learning_rate': 8.13782991202346e-05, 'epoch': 1.48}
+{'loss': 0.4843, 'grad_norm': 1.3442978858947754, 'learning_rate': 8.135386119257087e-05, 'epoch': 1.48}
+{'loss': 0.3719, 'grad_norm': 1.6153483390808105, 'learning_rate': 8.132942326490713e-05, 'epoch': 1.48}
+{'loss': 1.0182, 'grad_norm': 2.865086317062378, 'learning_rate': 8.130498533724339e-05, 'epoch': 1.48}
+{'loss': 0.4096, 'grad_norm': 1.1315138339996338, 'learning_rate': 8.128054740957967e-05, 'epoch': 1.48}
+{'loss': 0.5152, 'grad_norm': 1.3342171907424927, 'learning_rate': 8.125610948191592e-05, 'epoch': 1.48}
+{'loss': 0.4946, 'grad_norm': 1.2070798873901367, 'learning_rate': 8.123167155425218e-05, 'epoch': 1.48}
+{'loss': 0.4527, 'grad_norm': 1.9415396451950073, 'learning_rate': 8.120723362658846e-05, 'epoch': 1.48}
+{'loss': 0.2465, 'grad_norm': 1.080844759941101, 'learning_rate': 8.118279569892473e-05, 'epoch': 1.48}
+{'loss': 0.6034, 'grad_norm': 1.6477303504943848, 'learning_rate': 8.115835777126098e-05, 'epoch': 1.48}
+ 74%|███████▍  | 9467/12776 [1:40:04<18:43,  2.94it/s] 74%|███████▍  | 9468/12776 [1:40:05<17:37,  3.13it/s]                                                       74%|███████▍  | 9468/12776 [1:40:05<17:37,  3.13it/s] 74%|███████▍  | 9469/12776 [1:40:05<16:39,  3.31it/s]                                                       74%|███████▍  | 9469/12776 [1:40:05<16:39,  3.31it/s] 74%|███████▍  | 9470/12776 [1:40:05<15:50,  3.48it/s]                                                       74%|███████▍  | 9470/12776 [1:40:05<15:50,  3.48it/s] 74%|███████▍  | 9471/12776 [1:40:05<16:52,  3.27it/s]                                                       74%|███████▍  | 9471/12776 [1:40:05<16:52,  3.27it/s] 74%|███████▍  | 9472/12776 [1:40:06<15:47,  3.49it/s]                                                       74%|███████▍  | 9472/12776 [1:40:06<15:47,  3.49it/s] 74%|███████▍  | 9473/12776 [1:40:06<14:57,  3.68it/s]                                                       74%|███████▍  | 9473/12776 [1:40:06<14:57,  3.68it/s] 74%|███████▍  | 9474/12776 [1:40:06<14:15,  3.86it/s]                                                       74%|███████▍  | 9474/12776 [1:40:06<14:15,  3.86it/s] 74%|███████▍  | 9475/12776 [1:40:06<13:39,  4.03it/s]                                                       74%|███████▍  | 9475/12776 [1:40:06<13:39,  4.03it/s] 74%|███████▍  | 9476/12776 [1:40:07<14:59,  3.67it/s]                                                       74%|███████▍  | 9476/12776 [1:40:07<14:59,  3.67it/s] 74%|███████▍  | 9477/12776 [1:40:07<14:02,  3.91it/s]                                                       74%|███████▍  | 9477/12776 [1:40:07<14:02,  3.91it/s] 74%|███████▍  | 9478/12776 [1:40:07<13:20,  4.12it/s]                                                       74%|███████▍  | 9478/12776 [1:40:07<13:20,  4.12it/s] 74%|███████▍  | 9479/12776 [1:40:07<12:52,  4.27it/s]                                                       74%|███████▍  | 9479/12776 [1:40:07<12:52,  4.27it/s] 74%|███████▍  | 9480/12776 [1:40:07<12:22,  4.44it/s]                                                       74%|███████▍  | 9480/12776 [1:40:07<12:22,  4.44it/s] 74%|███████▍  | 9481/12776 [1:40:08<13:36,  4.04it/s]                                                       74%|███████▍  | 9481/12776 [1:40:08<13:36,  4.04it/s] 74%|███████▍  | 9482/12776 [1:40:08<12:49,  4.28it/s]                                                       74%|███████▍  | 9482/12776 [1:40:08<12:49,  4.28it/s] 74%|███████▍  | 9483/12776 [1:40:08<12:16,  4.47it/s]                                                       74%|███████▍  | 9483/12776 [1:40:08<12:16,  4.47it/s] 74%|███████▍  | 9484/12776 [1:40:08<11:51,  4.63it/s]                                                       74%|███████▍  | 9484/12776 [1:40:08<11:51,  4.63it/s] 74%|███████▍  | 9485/12776 [1:40:09<11:29,  4.77it/s]                                                       74%|███████▍  | 9485/12776 [1:40:09<11:29,  4.77it/s] 74%|███████▍  | 9486/12776 [1:40:09<12:52,  4.26it/s]                                                       74%|███████▍  | 9486/12776 [1:40:09<12:52,  4.26it/s] 74%|███████▍  | 9487/12776 [1:40:09<12:04,  4.54it/s]                                                       74%|███████▍  | 9487/12776 [1:40:09<12:04,  4.54it/s] 74%|███████▍  | 9488/12776 [1:40:10<19:38,  2.79it/s]                                                       74%|███████▍  | 9488/12776 [1:40:10<19:38,  2.79it/s] 74%|███████▍  | 9489/12776 [1:40:11<38:25,  1.43it/s]                                                       74%|███████▍  | 9489/12776 [1:40:11<38:25,  1.43it/s] 74%|███████▍  | 9490/12776 [1:40:12<43:08,  1.27it/s]                                                       74%|███████▍  | 9490/12776 [1:40:12<43:08,  1.27it/s] 74%|███████▍  | 9491/12776 [1:40:13<44:03,  1.24it/s]                                                       74%|███████▍  | 9491/12776 [1:40:13<44:03,  1.24it/s] 74%|███████▍  | 9492/12776 [1:40:14<43:31,  1.26it/s]                                                       74%|███████▍  | 9492/12776 [1:40:14<43:31,  1.26it/s] 74%|███████▍  | 9493/12776 [1:40:15<42:11,  1.30it/s]                                                       74%|███████▍  | 9493/12776 [1:40:15<42:11,  1.30it/s] 74%|███████▍  | 9494/12776 [1:40:15<40:36,  1.35it/s]                                                       74%|███████▍  | 9494/12776 [1:40:15<40:36,  1.35it/s] 74%|███████▍  | 9495/12776 [1:40:16<40:05,  1.36it/s]                                                       74%|███████▍  | 9495/12776 [1:40:16<40:05,  1.36it/s] 74%|███████▍  | 9496/12776 [1:40:17<38:12,  1.43it/s]                                                       74%|███████▍  | 9496/12776 [1:40:17<38:12,  1.43it/s] 74%|███████▍  | 9497/12776 [1:40:17<36:53,  1.48it/s]                                                       74%|███████▍  | 9497/12776 [1:40:17<36:53,  1.48it/s] 74%|███████▍  | 9498/12776 [1:40:18<34:52,  1.57it/s]                                                       74%|███████▍  | 9498/12776 [1:40:18<34:52,  1.57it/s] 74%|███████▍  | 9499/12776 [1:40:18<33:13,  1.64it/s]                                                       74%|███████▍  | 9499/12776 [1:40:18<33:13,  1.64it/s] 74%|███████▍  | 9500/12776 [1:40:19<31:38,  1.73it/s]                                                       74%|███████▍  | 9500/12776 [1:40:19<31:38,  1.73it/s] 74%|███████▍  | 9501/12776 [1:40:19<30:51,  1.77it/s]                                                       74%|███████▍  | 9501/12776 [1:40:19<30:51,  1.77it/s] 74%|███████▍  | 9502/12776 [1:40:20<29:03,  1.88it/s]                                                       74%|███████▍  | 9502/12776 [1:40:20<29:03,  1.88it/s] 74%|███████▍  | 9503/12776 [1:40:20<26:53,  2.03it/s]                                                       74%|███████▍  | 9503/12776 [1:40:20<26:53,  2.03it/s] 74%|███████▍  | 9504/12776 [1:40:21<25:51,  2.11it/s]                                                       74%|███████▍  | 9504/12776 [1:40:21<25:51,  2.11it/s] 74%|███████▍  | 9505/12776 [1:40:21<24:43,  2.20it/s]                                                       74%|███████▍  | 9505/12776 [1:40:21<24:43,  2.20it/s] 74%|███████▍  | 9506/12776 [1:40:21<23:48,  2.29it/s]                                                       74%|███████▍  | 9506/12776 [1:40:21<23:48,  2.29it/s] 74%|███████▍  | 9507/12776 [1:40:22<22:49,  2.39it/s]                                                       74%|███████▍  | 9507/12776 [1:40:22<22:49,  2.39it/s] 74%|███████▍  | 9508/12776 [1:40:22<21:42,  2.51it/s]                                                       74%|███████▍  | 9508/12776 [1:40:22<21:42,  2.51it/s] 74%|███████▍  | 9509/12776 [1:40:23<21:53,  2.49it/s]                                                       74%|███████▍  | 9509/12776 [1:40:23<21:53,  2.49it/s] 74%|███████▍  | 9510/12776 [1:40:23<20:48,  2.62it/s]                                                       74%|███████▍  | 9510/12776 [1:40:23<20:48,  2.62it/s] 74%|███████▍  | 9511/12776 [1:40:23<19:55,  2.73it/s]                                                       74%|███████▍  | 9511/12776 [1:40:23<19:55,  2.73it/s] 74%|███████▍  | 9512/12776 [1:40:24<19:07,  2.84it/s]                                                       74%|███████▍  | 9512/12776 [1:40:24<19:07,  2.84it/s] 74%|███████▍  | 9513/12776 [1:40:24<18:34,  2.93it/s]                                                       74%|███████▍  | 9513/12776 [1:40:24<18:34,  2.93it/s] 74%|███████▍  | 9514/12776 [1:40:24<17:42,  3.07it/s]                                                       74%|███████▍  | 9514/12776 [1:40:24<17:42,  3.07it/s] 74%|███████▍  | 9515/12776 [1:40:24<16:59,  3.20it/s]                                                       74%|███████▍  | 9515/12776 [1:40:24<16:59,  3.20it/s] 74%|███████▍  | 9516/12776 [1:40:25<16:22,  3.32it/s]                                                       74%|███████▍  | 9516/12776 [1:40:25<16:22,  3.32it/s] 74%|███████▍  | 9517/12776 [1:40:25<16:18,  3.33it/s]                                                       74%|███████▍  | 9517/12776 [1:40:25<16:18,  3.33it/s] 74%|███████▍  | 9518/12776 [1:40:25<15:39,  3.47it/s]                                                       74%|███████▍  | 9518/12776 [1:40:25<15:39,  3.47it/s] 75%|███████▍  | 9519/12776 [1:40:26<15:05,  3.60it/s]                                                       75%|███████▍  | 9519/12776 [1:40:26<15:05,  3.60it/s] 75%|███████▍  | 9520/12776 [1:40:26<14:37,  3.71it/s]                                                       75%|███████▍  | 9520/12776 [1:40:26<14:37,  3.71it/s] 75%|███████▍  | 9521/12776 [1:40:26<15:59,  3.39it/s]                                                       75%|███████▍  | 9521/12776 [1:40:26<15:59,  3.39it/s] 75%|███████▍  | 9522/12776 [1:40:26<15:08,  3.58it/s]                                                       75%|███████▍  | 9522/12776 [1:40:26<15:08,  3.58it/s] 75%|███████▍  | 9523/12776 [1:40:27<14:24,  3.76it/s]                                                       75%|███████▍  | 9523/12776 [1:40:27<14:24,  3.76it/s] 75%|███████▍  | 9524/12776 [1:40:27<13:48,  3.92it/s]                                                       75%|███████▍  | 9524/12776 [1:40:27<13:48,  3.92it/s] 75%|███████▍  | 9525/12776 [1:40:27<14:50,  3.65it/s]                                                       75%|███████▍  | 9525/12776 [1:40:27<14:50,  3.65it/s] 75%|███████▍  | 9526/12776 [1:40:27<13:54,  3.89it/s]                                                       75%|███████▍  | 9526/12776 [1:40:27<13:54,  3.89it/s] 75%|███████▍  | 9527/12776 [1:40:28<13:13,  4.09it/s]                                                       75%|███████▍  | 9527/12776 [1:40:28<13:13,  4.09it/s] 75%|███████▍  | 9528/12776 [1:40:28<12:42,  4.26it/s]                                                       75%|███████▍  | 9528/12776 [1:40:28<12:42,  4.26it/s] 75%|███████▍  | 9529/12776 [1:40:28<12:19,  4.39it/s]                                                       75%|███████▍  | 9529/12776 [1:40:28<12:19,  4.39it/s] 75%|███████▍  | 9530/12776 [1:40:28<13:20,  4.06it/s]                                                       75%|███████▍  | 9530/12776 [1:40:28<13:20,  4.06it/s] 75%|███████▍  | 9531/12776 [1:40:29<12:42,  4.26it/s]                                                       75%|███████▍  | 9531/12776 [1:40:29<12:42,  4.26it/s] 75%|███████▍  | 9532/12776 [1:40:29<12:08,  4.45it/s]                                                       75%|███████▍  | 9532/12776 [1:40:29<12:08,  4.45it/s] 75%|███████▍  | 9533/12776 [1:40:29<11:44,  4.61it/s]                                                       75%|███████▍  | 9533/12776 [1:40:29<11:44,  4.61it/s] 75%|███████▍  | 9534/12776 [1:40:29<11:24,  4.73it/s]                                                       75%|███████▍  | 9534/12776 [1:40:29<11:24,  4.73it/s] 75%|███████▍  | 9535/12776 [1:40:29<12:52,  4.19it/s]                                                       75%|███████▍  | 9535/12776 [1:40:29<12:52,  4.19it/s] 75%|███████▍  | 9536/12776 [1:40:30<12:07,  4.46it/s]                                                       75%|███████▍  | 9536/12776 [1:40:30<12:07,  4.46it/s] 75%|███████▍  | 9537/12776 [1:40:30<11:30,  4.69it/s]                                                       75%|███████▍  | 9537/12776 [1:40:30<11:30,  4.69it/s] 75%|███████▍  | 9538/12776 [1:40:31<20:45,  2.60it/s]                                                       75%|███████▍  | 9538/12776 [1:40:31<20:45,  2.60it/s] 75%|███████▍  | 9539/12776 [1:40:32<38:08,  1.41it/s]                                                       75%|███████▍  | 9539/12776 [1:40:32<38:08,  1.41it/s] 75%|███████▍  | 9540/12776 [1:40:33<41:48,  1.29it/s]                                                       75%|███████▍  | 9540/12776 [1:40:33<41:48,  1.29it/s] 75%|███████▍  | 9541/12776 [1:40:34<43:05,  1.25it/s]                                                       75%|███████▍  | 9541/12776 [1:40:34<43:05,  1.25it/s] 75%|███████▍  | 9542/12776 [1:40:35<42:30,  1.27it/s]                                                       75%|███████▍  | 9542/12776 [1:40:35<42:30,  1.27it/s] 75%|███████▍  | 9543/12776 [1:40:35<41:26,  1.30it/s]                                                       75%|███████▍  | 9543/12776 [1:40:35<41:26,  1.30it/s] 75%|███████▍  | 9544/12776 [1:40:36<40:00,  1.35it/s]                                                       75%|███████▍  | 9544/12776 [1:40:36<40:00,  1.35it/s] 75%|███████▍  | 9545/12776 [1:40:37<39:38,  1.36it/s]                                                      {'loss': 0.6707, 'grad_norm': 2.1239099502563477, 'learning_rate': 8.113391984359726e-05, 'epoch': 1.48}
+{'loss': 0.5054, 'grad_norm': 1.421504259109497, 'learning_rate': 8.110948191593352e-05, 'epoch': 1.48}
+{'loss': 0.4603, 'grad_norm': 3.8904707431793213, 'learning_rate': 8.108504398826979e-05, 'epoch': 1.48}
+{'loss': 0.5213, 'grad_norm': 1.8012278079986572, 'learning_rate': 8.106060606060607e-05, 'epoch': 1.48}
+{'loss': 1.315, 'grad_norm': 3.616649627685547, 'learning_rate': 8.103616813294232e-05, 'epoch': 1.48}
+{'loss': 0.9852, 'grad_norm': 2.4311721324920654, 'learning_rate': 8.101173020527858e-05, 'epoch': 1.48}
+{'loss': 0.8551, 'grad_norm': 1.9556763172149658, 'learning_rate': 8.098729227761486e-05, 'epoch': 1.48}
+{'loss': 0.4723, 'grad_norm': 2.610199213027954, 'learning_rate': 8.096285434995111e-05, 'epoch': 1.48}
+{'loss': 1.0211, 'grad_norm': 1.7846730947494507, 'learning_rate': 8.093841642228738e-05, 'epoch': 1.48}
+{'loss': 1.0827, 'grad_norm': 2.926453113555908, 'learning_rate': 8.091397849462365e-05, 'epoch': 1.48}
+{'loss': 1.0163, 'grad_norm': 2.9660816192626953, 'learning_rate': 8.088954056695992e-05, 'epoch': 1.48}
+{'loss': 0.6645, 'grad_norm': 2.5968966484069824, 'learning_rate': 8.086510263929617e-05, 'epoch': 1.48}
+{'loss': 1.0715, 'grad_norm': 4.448347091674805, 'learning_rate': 8.084066471163245e-05, 'epoch': 1.48}
+{'loss': 1.1425, 'grad_norm': 2.39528226852417, 'learning_rate': 8.081622678396871e-05, 'epoch': 1.48}
+{'loss': 0.9069, 'grad_norm': 2.856279134750366, 'learning_rate': 8.079178885630498e-05, 'epoch': 1.48}
+{'loss': 0.7697, 'grad_norm': 2.2662487030029297, 'learning_rate': 8.076735092864126e-05, 'epoch': 1.48}
+{'loss': 1.2625, 'grad_norm': 2.555561065673828, 'learning_rate': 8.074291300097751e-05, 'epoch': 1.48}
+{'loss': 1.1531, 'grad_norm': 1.9698431491851807, 'learning_rate': 8.071847507331377e-05, 'epoch': 1.48}
+{'loss': 0.41, 'grad_norm': 2.155409336090088, 'learning_rate': 8.069403714565005e-05, 'epoch': 1.48}
+{'loss': 0.8686, 'grad_norm': 2.2174770832061768, 'learning_rate': 8.06695992179863e-05, 'epoch': 1.48}
+{'loss': 1.2679, 'grad_norm': 3.0642926692962646, 'learning_rate': 8.064516129032257e-05, 'epoch': 1.49}
+{'loss': 0.8235, 'grad_norm': 3.7173335552215576, 'learning_rate': 8.062072336265885e-05, 'epoch': 1.49}
+{'loss': 0.2204, 'grad_norm': 0.439689964056015, 'learning_rate': 8.059628543499511e-05, 'epoch': 1.49}
+{'loss': 0.2014, 'grad_norm': 0.3609512448310852, 'learning_rate': 8.057184750733136e-05, 'epoch': 1.49}
+{'loss': 0.1972, 'grad_norm': 0.530227541923523, 'learning_rate': 8.054740957966764e-05, 'epoch': 1.49}
+{'loss': 0.2325, 'grad_norm': 0.6065565943717957, 'learning_rate': 8.05229716520039e-05, 'epoch': 1.49}
+{'loss': 0.3771, 'grad_norm': 0.9413346648216248, 'learning_rate': 8.049853372434017e-05, 'epoch': 1.49}
+{'loss': 0.2029, 'grad_norm': 0.57166588306427, 'learning_rate': 8.047409579667645e-05, 'epoch': 1.49}
+{'loss': 0.2793, 'grad_norm': 0.8557118773460388, 'learning_rate': 8.04496578690127e-05, 'epoch': 1.49}
+{'loss': 0.764, 'grad_norm': 3.641554117202759, 'learning_rate': 8.042521994134896e-05, 'epoch': 1.49}
+{'loss': 0.1924, 'grad_norm': 0.5618619918823242, 'learning_rate': 8.040078201368524e-05, 'epoch': 1.49}
+{'loss': 0.3731, 'grad_norm': 1.0576132535934448, 'learning_rate': 8.03763440860215e-05, 'epoch': 1.49}
+{'loss': 0.2424, 'grad_norm': 0.5164195895195007, 'learning_rate': 8.035190615835776e-05, 'epoch': 1.49}
+{'loss': 0.1442, 'grad_norm': 0.5563727617263794, 'learning_rate': 8.032746823069404e-05, 'epoch': 1.49}
+{'loss': 0.3146, 'grad_norm': 0.9167290329933167, 'learning_rate': 8.03030303030303e-05, 'epoch': 1.49}
+{'loss': 0.259, 'grad_norm': 0.6889357566833496, 'learning_rate': 8.027859237536655e-05, 'epoch': 1.49}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 8.027859237536655e-05, 'epoch': 1.49}
+{'loss': 0.3823, 'grad_norm': 0.6606603860855103, 'learning_rate': 8.025415444770283e-05, 'epoch': 1.49}
+{'loss': 0.4172, 'grad_norm': 1.8341822624206543, 'learning_rate': 8.02297165200391e-05, 'epoch': 1.49}
+{'loss': 0.3575, 'grad_norm': 1.2621349096298218, 'learning_rate': 8.020527859237536e-05, 'epoch': 1.49}
+{'loss': 0.3535, 'grad_norm': 0.7620284557342529, 'learning_rate': 8.018084066471163e-05, 'epoch': 1.49}
+{'loss': 0.6989, 'grad_norm': 1.7038427591323853, 'learning_rate': 8.015640273704789e-05, 'epoch': 1.49}
+{'loss': 0.4143, 'grad_norm': 1.006224274635315, 'learning_rate': 8.013196480938415e-05, 'epoch': 1.49}
+{'loss': 0.3454, 'grad_norm': 1.3613799810409546, 'learning_rate': 8.010752688172043e-05, 'epoch': 1.49}
+{'loss': 0.3727, 'grad_norm': 2.3779265880584717, 'learning_rate': 8.008308895405668e-05, 'epoch': 1.49}
+{'loss': 0.4661, 'grad_norm': 1.3316359519958496, 'learning_rate': 8.005865102639295e-05, 'epoch': 1.49}
+{'loss': 0.6581, 'grad_norm': 2.452564239501953, 'learning_rate': 8.003421309872923e-05, 'epoch': 1.49}
+{'loss': 0.7966, 'grad_norm': 2.7504520416259766, 'learning_rate': 8.000977517106549e-05, 'epoch': 1.49}
+{'loss': 0.9771, 'grad_norm': 2.999328851699829, 'learning_rate': 7.998533724340174e-05, 'epoch': 1.49}
+{'loss': 0.6239, 'grad_norm': 2.237630605697632, 'learning_rate': 7.996089931573802e-05, 'epoch': 1.49}
+{'loss': 0.421, 'grad_norm': 1.3302862644195557, 'learning_rate': 7.993646138807429e-05, 'epoch': 1.49}
+{'loss': 0.6495, 'grad_norm': 1.9129620790481567, 'learning_rate': 7.991202346041055e-05, 'epoch': 1.49}
+{'loss': 0.607, 'grad_norm': 1.6199758052825928, 'learning_rate': 7.988758553274682e-05, 'epoch': 1.49}
+{'loss': 1.0782, 'grad_norm': 2.3590195178985596, 'learning_rate': 7.986314760508308e-05, 'epoch': 1.49}
+{'loss': 0.9664, 'grad_norm': 2.1550941467285156, 'learning_rate': 7.983870967741935e-05, 'epoch': 1.49}
+{'loss': 0.9356, 'grad_norm': 4.303733825683594, 'learning_rate': 7.981427174975562e-05, 'epoch': 1.49}
+{'loss': 0.6391, 'grad_norm': 2.1720597743988037, 'learning_rate': 7.978983382209188e-05, 'epoch': 1.49}
+{'loss': 0.6076, 'grad_norm': 2.204709053039551, 'learning_rate': 7.976539589442814e-05, 'epoch': 1.49}
+{'loss': 1.1823, 'grad_norm': 2.43810772895813, 'learning_rate': 7.974095796676442e-05, 'epoch': 1.49}
+{'loss': 0.3912, 'grad_norm': 1.9804140329360962, 'learning_rate': 7.971652003910068e-05, 'epoch': 1.49}
+{'loss': 0.8703, 'grad_norm': 2.124976396560669, 'learning_rate': 7.969208211143693e-05, 'epoch': 1.49}
+{'loss': 0.9466, 'grad_norm': 1.996584415435791, 'learning_rate': 7.966764418377321e-05, 'epoch': 1.49}
+{'loss': 0.7035, 'grad_norm': 1.8637217283248901, 'learning_rate': 7.964320625610948e-05, 'epoch': 1.49}
+{'loss': 0.8354, 'grad_norm': 4.413573741912842, 'learning_rate': 7.961876832844574e-05, 'epoch': 1.49}
+{'loss': 1.1676, 'grad_norm': 2.3614120483398438, 'learning_rate': 7.959433040078201e-05, 'epoch': 1.49}
+{'loss': 1.3798, 'grad_norm': 2.0499792098999023, 'learning_rate': 7.956989247311827e-05, 'epoch': 1.49}
+{'loss': 0.8426, 'grad_norm': 2.0810813903808594, 'learning_rate': 7.954545454545454e-05, 'epoch': 1.49}
+{'loss': 1.0149, 'grad_norm': 2.218729257583618, 'learning_rate': 7.952101661779082e-05, 'epoch': 1.49}
+{'loss': 0.1697, 'grad_norm': 1.0656377077102661, 'learning_rate': 7.949657869012707e-05, 'epoch': 1.49}
+{'loss': 0.6699, 'grad_norm': 2.4055192470550537, 'learning_rate': 7.947214076246333e-05, 'epoch': 1.49}
+{'loss': 0.7338, 'grad_norm': 1.910409927368164, 'learning_rate': 7.944770283479961e-05, 'epoch': 1.49}
+{'loss': 0.3543, 'grad_norm': 1.1563485860824585, 'learning_rate': 7.942326490713587e-05, 'epoch': 1.49}
+{'loss': 0.1907, 'grad_norm': 0.43213382363319397, 'learning_rate': 7.939882697947213e-05, 'epoch': 1.49}
+{'loss': 0.2515, 'grad_norm': 0.5200718641281128, 'learning_rate': 7.93743890518084e-05, 'epoch': 1.49}
+{'loss': 0.2597, 'grad_norm': 1.2368117570877075, 'learning_rate': 7.934995112414467e-05, 'epoch': 1.49}
+{'loss': 0.3166, 'grad_norm': 0.9348035454750061, 'learning_rate': 7.932551319648093e-05, 'epoch': 1.49}
+{'loss': 0.2521, 'grad_norm': 0.7543662190437317, 'learning_rate': 7.93010752688172e-05, 'epoch': 1.49}
+{'loss': 0.1899, 'grad_norm': 0.44818371534347534, 'learning_rate': 7.927663734115346e-05, 'epoch': 1.49}
+ 75%|███████▍  | 9545/12776 [1:40:37<39:38,  1.36it/s] 75%|███████▍  | 9546/12776 [1:40:37<37:55,  1.42it/s]                                                       75%|███████▍  | 9546/12776 [1:40:37<37:55,  1.42it/s] 75%|███████▍  | 9547/12776 [1:40:38<36:16,  1.48it/s]                                                       75%|███████▍  | 9547/12776 [1:40:38<36:16,  1.48it/s] 75%|███████▍  | 9548/12776 [1:40:39<34:41,  1.55it/s]                                                       75%|███████▍  | 9548/12776 [1:40:39<34:41,  1.55it/s] 75%|███████▍  | 9549/12776 [1:40:39<33:29,  1.61it/s]                                                       75%|███████▍  | 9549/12776 [1:40:39<33:29,  1.61it/s] 75%|███████▍  | 9550/12776 [1:40:40<32:00,  1.68it/s]                                                       75%|███████▍  | 9550/12776 [1:40:40<32:00,  1.68it/s] 75%|███████▍  | 9551/12776 [1:40:40<31:40,  1.70it/s]                                                       75%|███████▍  | 9551/12776 [1:40:40<31:40,  1.70it/s] 75%|███████▍  | 9552/12776 [1:40:41<29:46,  1.80it/s]                                                       75%|███████▍  | 9552/12776 [1:40:41<29:46,  1.80it/s] 75%|███████▍  | 9553/12776 [1:40:41<30:45,  1.75it/s]                                                       75%|███████▍  | 9553/12776 [1:40:41<30:45,  1.75it/s] 75%|███████▍  | 9554/12776 [1:40:42<28:20,  1.90it/s]                                                       75%|███████▍  | 9554/12776 [1:40:42<28:20,  1.90it/s] 75%|███████▍  | 9555/12776 [1:40:42<28:04,  1.91it/s]                                                       75%|███████▍  | 9555/12776 [1:40:42<28:04,  1.91it/s] 75%|███████▍  | 9556/12776 [1:40:43<25:53,  2.07it/s]                                                       75%|███████▍  | 9556/12776 [1:40:43<25:53,  2.07it/s] 75%|███████▍  | 9557/12776 [1:40:43<24:09,  2.22it/s]                                                       75%|███████▍  | 9557/12776 [1:40:43<24:09,  2.22it/s] 75%|███████▍  | 9558/12776 [1:40:43<23:21,  2.30it/s]                                                       75%|███████▍  | 9558/12776 [1:40:43<23:21,  2.30it/s] 75%|███████▍  | 9559/12776 [1:40:44<21:53,  2.45it/s]                                                       75%|███████▍  | 9559/12776 [1:40:44<21:53,  2.45it/s] 75%|███████▍  | 9560/12776 [1:40:44<20:42,  2.59it/s]                                                       75%|███████▍  | 9560/12776 [1:40:44<20:42,  2.59it/s] 75%|███████▍  | 9561/12776 [1:40:45<22:10,  2.42it/s]                                                       75%|███████▍  | 9561/12776 [1:40:45<22:10,  2.42it/s] 75%|███████▍  | 9562/12776 [1:40:45<20:24,  2.63it/s]                                                       75%|███████▍  | 9562/12776 [1:40:45<20:24,  2.63it/s] 75%|███████▍  | 9563/12776 [1:40:45<19:01,  2.82it/s]                                                       75%|███████▍  | 9563/12776 [1:40:45<19:01,  2.82it/s] 75%|███████▍  | 9564/12776 [1:40:45<17:54,  2.99it/s]                                                       75%|███████▍  | 9564/12776 [1:40:45<17:54,  2.99it/s] 75%|███████▍  | 9565/12776 [1:40:46<19:03,  2.81it/s]                                                       75%|███████▍  | 9565/12776 [1:40:46<19:03,  2.81it/s] 75%|███████▍  | 9566/12776 [1:40:46<17:38,  3.03it/s]                                                       75%|███████▍  | 9566/12776 [1:40:46<17:38,  3.03it/s] 75%|███████▍  | 9567/12776 [1:40:46<16:35,  3.22it/s]                                                       75%|███████▍  | 9567/12776 [1:40:46<16:35,  3.22it/s] 75%|███████▍  | 9568/12776 [1:40:47<15:42,  3.40it/s]                                                       75%|███████▍  | 9568/12776 [1:40:47<15:42,  3.40it/s] 75%|███████▍  | 9569/12776 [1:40:47<16:15,  3.29it/s]                                                       75%|███████▍  | 9569/12776 [1:40:47<16:15,  3.29it/s] 75%|███████▍  | 9570/12776 [1:40:47<15:22,  3.47it/s]                                                       75%|███████▍  | 9570/12776 [1:40:47<15:22,  3.47it/s] 75%|███████▍  | 9571/12776 [1:40:47<14:39,  3.64it/s]                                                       75%|███████▍  | 9571/12776 [1:40:47<14:39,  3.64it/s] 75%|███████▍  | 9572/12776 [1:40:48<14:01,  3.81it/s]                                                       75%|███████▍  | 9572/12776 [1:40:48<14:01,  3.81it/s] 75%|███████▍  | 9573/12776 [1:40:48<14:35,  3.66it/s]                                                       75%|███████▍  | 9573/12776 [1:40:48<14:35,  3.66it/s] 75%|███████▍  | 9574/12776 [1:40:48<13:46,  3.87it/s]                                                       75%|███████▍  | 9574/12776 [1:40:48<13:46,  3.87it/s] 75%|███████▍  | 9575/12776 [1:40:48<13:04,  4.08it/s]                                                       75%|███████▍  | 9575/12776 [1:40:48<13:04,  4.08it/s] 75%|███████▍  | 9576/12776 [1:40:49<12:32,  4.25it/s]                                                       75%|███████▍  | 9576/12776 [1:40:49<12:32,  4.25it/s] 75%|███████▍  | 9577/12776 [1:40:49<12:10,  4.38it/s]                                                       75%|███████▍  | 9577/12776 [1:40:49<12:10,  4.38it/s] 75%|███████▍  | 9578/12776 [1:40:49<12:52,  4.14it/s]                                                       75%|███████▍  | 9578/12776 [1:40:49<12:52,  4.14it/s] 75%|███████▍  | 9579/12776 [1:40:49<12:19,  4.32it/s]                                                       75%|███████▍  | 9579/12776 [1:40:49<12:19,  4.32it/s] 75%|███████▍  | 9580/12776 [1:40:50<11:54,  4.47it/s]                                                       75%|███████▍  | 9580/12776 [1:40:50<11:54,  4.47it/s] 75%|███████▍  | 9581/12776 [1:40:50<11:33,  4.61it/s]                                                       75%|███████▍  | 9581/12776 [1:40:50<11:33,  4.61it/s] 75%|███████▌  | 9582/12776 [1:40:50<11:19,  4.70it/s]                                                       75%|███████▌  | 9582/12776 [1:40:50<11:19,  4.70it/s] 75%|███████��  | 9583/12776 [1:40:50<13:04,  4.07it/s]                                                       75%|███████▌  | 9583/12776 [1:40:50<13:04,  4.07it/s] 75%|███████▌  | 9584/12776 [1:40:50<12:15,  4.34it/s]                                                       75%|███████▌  | 9584/12776 [1:40:50<12:15,  4.34it/s] 75%|███████▌  | 9585/12776 [1:40:51<11:44,  4.53it/s]                                                       75%|███████▌  | 9585/12776 [1:40:51<11:44,  4.53it/s] 75%|███████▌  | 9586/12776 [1:40:51<11:11,  4.75it/s]                                                       75%|███████▌  | 9586/12776 [1:40:51<11:11,  4.75it/s] 75%|███████▌  | 9587/12776 [1:40:51<10:45,  4.94it/s]                                                       75%|███████▌  | 9587/12776 [1:40:51<10:45,  4.94it/s] 75%|███████▌  | 9588/12776 [1:40:52<17:58,  2.96it/s]                                                       75%|███████▌  | 9588/12776 [1:40:52<17:58,  2.96it/s] 75%|███████▌  | 9589/12776 [1:40:53<34:44,  1.53it/s]                                                       75%|███████▌  | 9589/12776 [1:40:53<34:44,  1.53it/s] 75%|███████▌  | 9590/12776 [1:40:54<39:40,  1.34it/s]                                                       75%|███████▌  | 9590/12776 [1:40:54<39:40,  1.34it/s] 75%|███████▌  | 9591/12776 [1:40:55<41:53,  1.27it/s]                                                       75%|███████▌  | 9591/12776 [1:40:55<41:53,  1.27it/s] 75%|███████▌  | 9592/12776 [1:40:56<41:16,  1.29it/s]                                                       75%|███████▌  | 9592/12776 [1:40:56<41:16,  1.29it/s] 75%|███████▌  | 9593/12776 [1:40:56<40:54,  1.30it/s]                                                       75%|███████▌  | 9593/12776 [1:40:56<40:54,  1.30it/s] 75%|███████▌  | 9594/12776 [1:40:57<40:24,  1.31it/s]                                                       75%|███████▌  | 9594/12776 [1:40:57<40:24,  1.31it/s] 75%|███████▌  | 9595/12776 [1:40:58<38:22,  1.38it/s]                                                       75%|███████▌  | 9595/12776 [1:40:58<38:22,  1.38it/s] 75%|███████▌  | 9596/12776 [1:40:58<36:27,  1.45it/s]                                                       75%|███████▌  | 9596/12776 [1:40:58<36:27,  1.45it/s] 75%|███████▌  | 9597/12776 [1:40:59<34:25,  1.54it/s]                                                       75%|███████▌  | 9597/12776 [1:40:59<34:25,  1.54it/s] 75%|███████▌  | 9598/12776 [1:40:59<32:30,  1.63it/s]                                                       75%|███████▌  | 9598/12776 [1:40:59<32:30,  1.63it/s] 75%|███████▌  | 9599/12776 [1:41:00<30:46,  1.72it/s]                                                       75%|███████▌  | 9599/12776 [1:41:00<30:46,  1.72it/s] 75%|███████▌  | 9600/12776 [1:41:00<29:02,  1.82it/s]                                                       75%|███████▌  | 9600/12776 [1:41:00<29:02,  1.82it/s]Saving model checkpoint to ./checkpoint-9600
+Configuration saved in ./checkpoint-9600/config.json
+Model weights saved in ./checkpoint-9600/model.safetensors
+Feature extractor saved in ./checkpoint-9600/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-9600/tokenizer_config.json
+Special tokens file saved in ./checkpoint-9600/special_tokens_map.json
+added tokens file saved in ./checkpoint-9600/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-8400] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 75%|███████▌  | 9601/12776 [1:41:06<1:52:26,  2.12s/it]                                                         75%|███████▌  | 9601/12776 [1:41:06<1:52:26,  2.12s/it] 75%|███████▌  | 9602/12776 [1:41:07<1:25:13,  1.61s/it]                                                         75%|███████▌  | 9602/12776 [1:41:07<1:25:13,  1.61s/it] 75%|███████▌  | 9603/12776 [1:41:07<1:05:52,  1.25s/it]                                                         75%|███████▌  | 9603/12776 [1:41:07<1:05:52,  1.25s/it] 75%|███████▌  | 9604/12776 [1:41:08<54:18,  1.03s/it]                                                         75%|███████▌  | 9604/12776 [1:41:08<54:18,  1.03s/it] 75%|███████▌  | 9605/12776 [1:41:08<44:08,  1.20it/s]                                                       75%|███████▌  | 9605/12776 [1:41:08<44:08,  1.20it/s] 75%|███████▌  | 9606/12776 [1:41:08<36:45,  1.44it/s]                                                       75%|███████▌  | 9606/12776 [1:41:08<36:45,  1.44it/s] 75%|███████▌  | 9607/12776 [1:41:09<32:31,  1.62it/s]                                                       75%|███████▌  | 9607/12776 [1:41:09<32:31,  1.62it/s] 75%|███████▌  | 9608/12776 [1:41:09<27:56,  1.89it/s]                                                       75%|███████▌  | 9608/12776 [1:41:09<27:56,  1.89it/s] 75%|███████▌  | 9609/12776 [1:41:09<24:36,  2.14it/s]                                                       75%|███████▌  | 9609/12776 [1:41:09<24:36,  2.14it/s] 75%|███████▌  | 9610/12776 [1:41:10<24:02,  2.19it/s]                                                       75%|███████▌  | 9610/12776 [1:41:10<24:02,  2.19it/s] 75%|███████▌  | 9611/12776 [1:41:10<21:33,  2.45it/s]                                                       75%|███████▌  | 9611/12776 [1:41:10<21:33,  2.45it/s] 75%|███████▌  | 9612/12776 [1:41:10<19:32,  2.70it/s]                                                       75%|███████▌  | 9612/12776 [1:41:10<19:32,  2.70it/s] 75%|███████▌  | 9613/12776 [1:41:11<20:50,  2.53it/s]                                                       75%|███████▌  | 9613/12776 [1:41:11<20:50,  2.53it/s] 75%|███████▌  | 9614/12776 [1:41:11<18:53,  2.79it/s]                                                       75%|███████▌  | 9614/12776 [1:41:11<18:53,  2.79it/s] 75%|███████▌  | 9615/12776 [1:41:11<17:20,  3.04it/s]                                                       75%|███████▌  | 9615/12776 [1:41:11<17:20,  3.04it/s] 75%|███████▌  | 9616/12776 [1:41:12<16:08,  3.26it/s]                                                       75%|███████▌  | 9616/12776 [1:41:12<16:08,  3.26it/s] 75%|███████▌  | 9617/12776 [1:41:12<16:33,  3.18it/s]                                                       75%|███████▌  | 9617/12776 [1:41:12<16:33,  3.18it/s] 75%|███████▌  | 9618/12776 [1:41:12<15:22,  3.42it/s]                                                       75%|███████▌  | 9618/12776 [1:41:12<15:22,  3.42it/s] 75%|███████▌  | 9619/12776 [1:41:12<14:23,  3.66it/s]                                                       75%|███████▌  | 9619/12776 [1:41:12<14:23,  3.66it/s] 75%|███████▌  | 9620/12776 [1:41:13<13:38,  3.86it/s]                                                       75%|███████▌  | 9620/12776 [1:41:13<13:38,  3.86it/s] 75%|███████▌  | 9621/12776 [1:41:13<13:01,  4.04it/s]                                                       75%|███████▌  | 9621/12776 [1:41:13<13:01,  4.04it/s] 75%|███████▌  | 9622/12776 [1:41:13<14:05,  3.73it/s]                                                       75%|███████▌  | 9622/12776 [1:41:13<14:05,  3.73it/s] 75%|███████▌  | 9623/12776 [1:41:13<13:10,  3.99it/s]                                                       75%|███████▌  | 9623/12776 [1:41:13<13:10,  3.99it/s] 75%|███████▌  | 9624/12776 [1:41:14<12:24,  4.23it/s]                                                      {'loss': 0.2408, 'grad_norm': 0.6756463050842285, 'learning_rate': 7.925219941348973e-05, 'epoch': 1.49}
+{'loss': 0.2446, 'grad_norm': 1.1319537162780762, 'learning_rate': 7.9227761485826e-05, 'epoch': 1.49}
+{'loss': 0.2676, 'grad_norm': 1.639535903930664, 'learning_rate': 7.920332355816226e-05, 'epoch': 1.49}
+{'loss': 0.4021, 'grad_norm': 0.6942873001098633, 'learning_rate': 7.917888563049852e-05, 'epoch': 1.49}
+{'loss': 0.2719, 'grad_norm': 0.61109858751297, 'learning_rate': 7.91544477028348e-05, 'epoch': 1.49}
+{'loss': 0.4091, 'grad_norm': 0.8826609253883362, 'learning_rate': 7.913000977517107e-05, 'epoch': 1.49}
+{'loss': 0.3353, 'grad_norm': 2.2055704593658447, 'learning_rate': 7.910557184750732e-05, 'epoch': 1.5}
+{'loss': 0.5081, 'grad_norm': 4.32928991317749, 'learning_rate': 7.90811339198436e-05, 'epoch': 1.5}
+{'loss': 0.2946, 'grad_norm': 0.8008466958999634, 'learning_rate': 7.905669599217986e-05, 'epoch': 1.5}
+{'loss': 0.3579, 'grad_norm': 1.2389719486236572, 'learning_rate': 7.903225806451613e-05, 'epoch': 1.5}
+{'loss': 0.4454, 'grad_norm': 1.683296799659729, 'learning_rate': 7.900782013685239e-05, 'epoch': 1.5}
+{'loss': 0.3898, 'grad_norm': 1.3349992036819458, 'learning_rate': 7.898338220918865e-05, 'epoch': 1.5}
+{'loss': 0.4127, 'grad_norm': 0.86898273229599, 'learning_rate': 7.895894428152492e-05, 'epoch': 1.5}
+{'loss': 0.262, 'grad_norm': 1.2158864736557007, 'learning_rate': 7.89345063538612e-05, 'epoch': 1.5}
+{'loss': 0.3522, 'grad_norm': 1.0197027921676636, 'learning_rate': 7.891006842619745e-05, 'epoch': 1.5}
+{'loss': 0.4608, 'grad_norm': 2.6261444091796875, 'learning_rate': 7.888563049853371e-05, 'epoch': 1.5}
+{'loss': 0.4477, 'grad_norm': 1.4743295907974243, 'learning_rate': 7.886119257086999e-05, 'epoch': 1.5}
+{'loss': 0.2981, 'grad_norm': 1.690913438796997, 'learning_rate': 7.883675464320626e-05, 'epoch': 1.5}
+{'loss': 0.5513, 'grad_norm': 1.879437804222107, 'learning_rate': 7.881231671554251e-05, 'epoch': 1.5}
+{'loss': 0.677, 'grad_norm': 2.7271459102630615, 'learning_rate': 7.878787878787879e-05, 'epoch': 1.5}
+{'loss': 0.787, 'grad_norm': 3.2245678901672363, 'learning_rate': 7.876344086021505e-05, 'epoch': 1.5}
+{'loss': 0.7679, 'grad_norm': 1.3379689455032349, 'learning_rate': 7.87390029325513e-05, 'epoch': 1.5}
+{'loss': 0.613, 'grad_norm': 1.8764911890029907, 'learning_rate': 7.871456500488758e-05, 'epoch': 1.5}
+{'loss': 0.643, 'grad_norm': 3.4302899837493896, 'learning_rate': 7.869012707722385e-05, 'epoch': 1.5}
+{'loss': 0.4512, 'grad_norm': 2.4290006160736084, 'learning_rate': 7.866568914956011e-05, 'epoch': 1.5}
+{'loss': 0.7265, 'grad_norm': 2.3394429683685303, 'learning_rate': 7.864125122189639e-05, 'epoch': 1.5}
+{'loss': 0.7219, 'grad_norm': 3.2177608013153076, 'learning_rate': 7.861681329423264e-05, 'epoch': 1.5}
+{'loss': 0.921, 'grad_norm': 2.652704954147339, 'learning_rate': 7.85923753665689e-05, 'epoch': 1.5}
+{'loss': 0.8148, 'grad_norm': 1.5639492273330688, 'learning_rate': 7.856793743890518e-05, 'epoch': 1.5}
+{'loss': 0.429, 'grad_norm': 1.7422173023223877, 'learning_rate': 7.854349951124145e-05, 'epoch': 1.5}
+{'loss': 0.9982, 'grad_norm': 2.2326202392578125, 'learning_rate': 7.85190615835777e-05, 'epoch': 1.5}
+{'loss': 0.8497, 'grad_norm': 2.4067230224609375, 'learning_rate': 7.849462365591398e-05, 'epoch': 1.5}
+{'loss': 1.1441, 'grad_norm': 5.408649444580078, 'learning_rate': 7.847018572825024e-05, 'epoch': 1.5}
+{'loss': 1.1049, 'grad_norm': 2.413092613220215, 'learning_rate': 7.84457478005865e-05, 'epoch': 1.5}
+{'loss': 1.061, 'grad_norm': 2.305614471435547, 'learning_rate': 7.842130987292277e-05, 'epoch': 1.5}
+{'loss': 0.5431, 'grad_norm': 2.732689619064331, 'learning_rate': 7.839687194525904e-05, 'epoch': 1.5}
+{'loss': 1.5274, 'grad_norm': 2.2175774574279785, 'learning_rate': 7.83724340175953e-05, 'epoch': 1.5}
+{'loss': 1.4016, 'grad_norm': 3.7100279331207275, 'learning_rate': 7.834799608993158e-05, 'epoch': 1.5}
+{'loss': 0.6064, 'grad_norm': 2.078692674636841, 'learning_rate': 7.832355816226783e-05, 'epoch': 1.5}
+{'loss': 0.5273, 'grad_norm': 2.7273752689361572, 'learning_rate': 7.82991202346041e-05, 'epoch': 1.5}
+{'loss': 0.3828, 'grad_norm': 1.3551000356674194, 'learning_rate': 7.827468230694037e-05, 'epoch': 1.5}
+{'loss': 1.221, 'grad_norm': 3.331238269805908, 'learning_rate': 7.825024437927664e-05, 'epoch': 1.5}
+{'loss': 0.799, 'grad_norm': 2.849250555038452, 'learning_rate': 7.822580645161289e-05, 'epoch': 1.5}
+{'loss': 1.6444, 'grad_norm': 2.9601728916168213, 'learning_rate': 7.820136852394917e-05, 'epoch': 1.5}
+{'loss': 0.2622, 'grad_norm': 0.5723337531089783, 'learning_rate': 7.817693059628543e-05, 'epoch': 1.5}
+{'loss': 0.2687, 'grad_norm': 0.8948503732681274, 'learning_rate': 7.815249266862169e-05, 'epoch': 1.5}
+{'loss': 0.2752, 'grad_norm': 0.6991376876831055, 'learning_rate': 7.812805474095796e-05, 'epoch': 1.5}
+{'loss': 0.1794, 'grad_norm': 0.6468504667282104, 'learning_rate': 7.810361681329423e-05, 'epoch': 1.5}
+{'loss': 0.2153, 'grad_norm': 0.5929017066955566, 'learning_rate': 7.807917888563049e-05, 'epoch': 1.5}
+{'loss': 0.2172, 'grad_norm': 7.306670188903809, 'learning_rate': 7.805474095796674e-05, 'epoch': 1.5}
+{'loss': 0.2575, 'grad_norm': 0.5348293781280518, 'learning_rate': 7.803030303030302e-05, 'epoch': 1.5}
+{'loss': 0.229, 'grad_norm': 0.5054227709770203, 'learning_rate': 7.800586510263929e-05, 'epoch': 1.5}
+{'loss': 0.1961, 'grad_norm': 0.5968636274337769, 'learning_rate': 7.798142717497555e-05, 'epoch': 1.5}
+{'loss': 0.328, 'grad_norm': 1.1959178447723389, 'learning_rate': 7.795698924731183e-05, 'epoch': 1.5}
+{'loss': 0.294, 'grad_norm': 1.3676921129226685, 'learning_rate': 7.793255131964808e-05, 'epoch': 1.5}
+{'loss': 0.2935, 'grad_norm': 0.5510010123252869, 'learning_rate': 7.790811339198435e-05, 'epoch': 1.5}
+{'loss': 0.2587, 'grad_norm': 1.0297011137008667, 'learning_rate': 7.788367546432063e-05, 'epoch': 1.5}
+{'loss': 0.4379, 'grad_norm': 2.1265127658843994, 'learning_rate': 7.785923753665688e-05, 'epoch': 1.5}
+{'loss': 0.5179, 'grad_norm': 1.283608317375183, 'learning_rate': 7.783479960899314e-05, 'epoch': 1.5}
+{'loss': 0.3469, 'grad_norm': 0.9307227730751038, 'learning_rate': 7.781036168132942e-05, 'epoch': 1.5}
+{'loss': 0.3251, 'grad_norm': 1.4115785360336304, 'learning_rate': 7.778592375366568e-05, 'epoch': 1.5}
+{'loss': 0.4412, 'grad_norm': 1.2950360774993896, 'learning_rate': 7.776148582600194e-05, 'epoch': 1.5}
+{'loss': 0.5847, 'grad_norm': 1.8613760471343994, 'learning_rate': 7.773704789833821e-05, 'epoch': 1.5}
+{'loss': 0.4761, 'grad_norm': 1.611189365386963, 'learning_rate': 7.771260997067448e-05, 'epoch': 1.5}
+{'loss': 0.3882, 'grad_norm': 1.2871978282928467, 'learning_rate': 7.768817204301074e-05, 'epoch': 1.5}
+{'loss': 0.3907, 'grad_norm': 0.9030357599258423, 'learning_rate': 7.766373411534701e-05, 'epoch': 1.5}
+{'loss': 0.4686, 'grad_norm': 1.4141883850097656, 'learning_rate': 7.763929618768327e-05, 'epoch': 1.5}
+{'loss': 0.5278, 'grad_norm': 2.3717682361602783, 'learning_rate': 7.761485826001954e-05, 'epoch': 1.5}
+{'loss': 0.3828, 'grad_norm': 0.8933207988739014, 'learning_rate': 7.759042033235582e-05, 'epoch': 1.5}
+{'loss': 0.6257, 'grad_norm': 1.469948649406433, 'learning_rate': 7.756598240469207e-05, 'epoch': 1.51}
+{'loss': 0.5167, 'grad_norm': 2.732593297958374, 'learning_rate': 7.754154447702833e-05, 'epoch': 1.51}
+{'loss': 0.4295, 'grad_norm': 2.5369060039520264, 'learning_rate': 7.751710654936461e-05, 'epoch': 1.51}
+{'loss': 0.3771, 'grad_norm': 3.549316883087158, 'learning_rate': 7.749266862170088e-05, 'epoch': 1.51}
+{'loss': 0.6011, 'grad_norm': 1.7720410823822021, 'learning_rate': 7.746823069403713e-05, 'epoch': 1.51}
+{'loss': 0.237, 'grad_norm': 0.6939887404441833, 'learning_rate': 7.74437927663734e-05, 'epoch': 1.51}
+{'loss': 1.1664, 'grad_norm': 2.6376233100891113, 'learning_rate': 7.741935483870967e-05, 'epoch': 1.51}
+{'loss': 0.7487, 'grad_norm': 2.496608018875122, 'learning_rate': 7.739491691104593e-05, 'epoch': 1.51}
+{'loss': 0.8468, 'grad_norm': 2.7778053283691406, 'learning_rate': 7.73704789833822e-05, 'epoch': 1.51}
+{'loss': 0.9794, 'grad_norm': 2.1602282524108887, 'learning_rate': 7.734604105571846e-05, 'epoch': 1.51}
+ 75%|███████▌  | 9624/12776 [1:41:14<12:24,  4.23it/s] 75%|███████▌  | 9625/12776 [1:41:14<11:46,  4.46it/s]                                                       75%|███████▌  | 9625/12776 [1:41:14<11:46,  4.46it/s] 75%|███████▌  | 9626/12776 [1:41:14<11:18,  4.64it/s]                                                       75%|███████▌  | 9626/12776 [1:41:14<11:18,  4.64it/s] 75%|███████▌  | 9627/12776 [1:41:14<12:45,  4.11it/s]                                                       75%|███████▌  | 9627/12776 [1:41:14<12:45,  4.11it/s] 75%|███████▌  | 9628/12776 [1:41:15<11:54,  4.40it/s]                                                       75%|███████▌  | 9628/12776 [1:41:15<11:54,  4.40it/s] 75%|███████▌  | 9629/12776 [1:41:15<11:14,  4.67it/s]                                                       75%|███████▌  | 9629/12776 [1:41:15<11:14,  4.67it/s] 75%|███████▌  | 9630/12776 [1:41:15<10:43,  4.89it/s]                                                       75%|███████▌  | 9630/12776 [1:41:15<10:43,  4.89it/s] 75%|███████▌  | 9631/12776 [1:41:15<10:18,  5.09it/s]                                                       75%|███████▌  | 9631/12776 [1:41:15<10:18,  5.09it/s] 75%|███████▌  | 9632/12776 [1:41:15<11:47,  4.44it/s]                                                       75%|███████▌  | 9632/12776 [1:41:15<11:47,  4.44it/s] 75%|███████▌  | 9633/12776 [1:41:16<10:59,  4.77it/s]                                                       75%|███████▌  | 9633/12776 [1:41:16<10:59,  4.77it/s] 75%|███████▌  | 9634/12776 [1:41:16<10:23,  5.04it/s]                                                       75%|███████▌  | 9634/12776 [1:41:16<10:23,  5.04it/s] 75%|███████▌  | 9635/12776 [1:41:16<09:51,  5.31it/s]                                                       75%|███████▌  | 9635/12776 [1:41:16<09:51,  5.31it/s] 75%|███████▌  | 9636/12776 [1:41:16<09:28,  5.53it/s]                                                       75%|███████▌  | 9636/12776 [1:41:16<09:28,  5.53it/s] 75%|███████▌  | 9637/12776 [1:41:16<09:12,  5.68it/s]                                                       75%|███████▌  | 9637/12776 [1:41:16<09:12,  5.68it/s] 75%|███████▌  | 9638/12776 [1:41:17<16:43,  3.13it/s]                                                       75%|███████▌  | 9638/12776 [1:41:17<16:43,  3.13it/s] 75%|███████▌  | 9639/12776 [1:41:18<32:17,  1.62it/s]                                                       75%|███████▌  | 9639/12776 [1:41:18<32:17,  1.62it/s] 75%|███████▌  | 9640/12776 [1:41:19<38:25,  1.36it/s]                                                       75%|███████▌  | 9640/12776 [1:41:19<38:25,  1.36it/s] 75%|███████▌  | 9641/12776 [1:41:20<40:06,  1.30it/s]                                                       75%|███████▌  | 9641/12776 [1:41:20<40:06,  1.30it/s] 75%|███████▌  | 9642/12776 [1:41:21<40:04,  1.30it/s]                                                       75%|███████▌  | 9642/12776 [1:41:21<40:04,  1.30it/s] 75%|███████▌  | 9643/12776 [1:41:22<39:13,  1.33it/s]                                                       75%|███████▌  | 9643/12776 [1:41:22<39:13,  1.33it/s] 75%|███████▌  | 9644/12776 [1:41:22<38:30,  1.36it/s]                                                       75%|███████▌  | 9644/12776 [1:41:22<38:30,  1.36it/s] 75%|███████▌  | 9645/12776 [1:41:23<36:29,  1.43it/s]                                                       75%|███████▌  | 9645/12776 [1:41:23<36:29,  1.43it/s] 76%|███████▌  | 9646/12776 [1:41:23<34:31,  1.51it/s]                                                       76%|███████▌  | 9646/12776 [1:41:23<34:31,  1.51it/s] 76%|███████▌  | 9647/12776 [1:41:24<32:34,  1.60it/s]                                                       76%|███████▌  | 9647/12776 [1:41:24<32:34,  1.60it/s] 76%|███████▌  | 9648/12776 [1:41:25<32:37,  1.60it/s]                                                       76%|███████▌  | 9648/12776 [1:41:25<32:37,  1.60it/s] 76%|███████▌  | 9649/12776 [1:41:25<30:34,  1.70it/s]                                                       76%|███████▌  | 9649/12776 [1:41:25<30:34,  1.70it/s] 76%|███████▌  | 9650/12776 [1:41:26<28:45,  1.81it/s]                                                       76%|███████▌  | 9650/12776 [1:41:26<28:45,  1.81it/s] 76%|███████▌  | 9651/12776 [1:41:26<27:05,  1.92it/s]                                                       76%|███████▌  | 9651/12776 [1:41:26<27:05,  1.92it/s] 76%|███████▌  | 9652/12776 [1:41:26<25:30,  2.04it/s]                                                       76%|███████▌  | 9652/12776 [1:41:26<25:30,  2.04it/s] 76%|███████▌  | 9653/12776 [1:41:27<25:29,  2.04it/s]                                                       76%|███████▌  | 9653/12776 [1:41:27<25:29,  2.04it/s] 76%|███████▌  | 9654/12776 [1:41:27<23:55,  2.18it/s]                                                       76%|███████▌  | 9654/12776 [1:41:27<23:55,  2.18it/s] 76%|███████▌  | 9655/12776 [1:41:28<22:33,  2.31it/s]                                                       76%|███████▌  | 9655/12776 [1:41:28<22:33,  2.31it/s] 76%|███████▌  | 9656/12776 [1:41:28<22:19,  2.33it/s]                                                       76%|███████▌  | 9656/12776 [1:41:28<22:19,  2.33it/s] 76%|███████▌  | 9657/12776 [1:41:28<21:08,  2.46it/s]                                                       76%|███████▌  | 9657/12776 [1:41:28<21:08,  2.46it/s] 76%|███████▌  | 9658/12776 [1:41:29<20:02,  2.59it/s]                                                       76%|███████▌  | 9658/12776 [1:41:29<20:02,  2.59it/s] 76%|███████▌  | 9659/12776 [1:41:29<21:13,  2.45it/s]                                                       76%|███████▌  | 9659/12776 [1:41:29<21:13,  2.45it/s] 76%|███████▌  | 9660/12776 [1:41:30<19:45,  2.63it/s]                                                       76%|███████▌  | 9660/12776 [1:41:30<19:45,  2.63it/s] 76%|███████▌  | 9661/12776 [1:41:30<18:33,  2.80it/s]                                                       76%|███████▌  | 9661/12776 [1:41:30<18:33,  2.80it/s] 76%|███████▌  | 9662/12776 [1:41:30<17:33,  2.96it/s]                                                       76%|███████▌  | 9662/12776 [1:41:30<17:33,  2.96it/s] 76%|███████▌  | 9663/12776 [1:41:30<17:41,  2.93it/s]                                                       76%|███████▌  | 9663/12776 [1:41:30<17:41,  2.93it/s] 76%|███████▌  | 9664/12776 [1:41:31<16:32,  3.14it/s]                                                       76%|███████▌  | 9664/12776 [1:41:31<16:32,  3.14it/s] 76%|███████▌  | 9665/12776 [1:41:31<15:38,  3.32it/s]                                                       76%|███████▌  | 9665/12776 [1:41:31<15:38,  3.32it/s] 76%|███████▌  | 9666/12776 [1:41:31<15:11,  3.41it/s]                                                       76%|███████▌  | 9666/12776 [1:41:31<15:11,  3.41it/s] 76%|███████▌  | 9667/12776 [1:41:32<15:45,  3.29it/s]                                                       76%|███████▌  | 9667/12776 [1:41:32<15:45,  3.29it/s] 76%|███████▌  | 9668/12776 [1:41:32<14:54,  3.47it/s]                                                       76%|███████▌  | 9668/12776 [1:41:32<14:54,  3.47it/s] 76%|███████▌  | 9669/12776 [1:41:32<14:19,  3.61it/s]                                                       76%|███████▌  | 9669/12776 [1:41:32<14:19,  3.61it/s] 76%|███████▌  | 9670/12776 [1:41:32<13:48,  3.75it/s]                                                       76%|███████▌  | 9670/12776 [1:41:32<13:48,  3.75it/s] 76%|███████▌  | 9671/12776 [1:41:33<15:15,  3.39it/s]                                                       76%|███████▌  | 9671/12776 [1:41:33<15:15,  3.39it/s] 76%|███████▌  | 9672/12776 [1:41:33<14:25,  3.59it/s]                                                       76%|███████▌  | 9672/12776 [1:41:33<14:25,  3.59it/s] 76%|███████▌  | 9673/12776 [1:41:33<13:44,  3.76it/s]                                                       76%|███████▌  | 9673/12776 [1:41:33<13:44,  3.76it/s] 76%|███████▌  | 9674/12776 [1:41:33<13:10,  3.92it/s]                                                       76%|███████▌  | 9674/12776 [1:41:33<13:10,  3.92it/s] 76%|███████▌  | 9675/12776 [1:41:34<14:02,  3.68it/s]                                                       76%|███████▌  | 9675/12776 [1:41:34<14:02,  3.68it/s] 76%|███████▌  | 9676/12776 [1:41:34<13:13,  3.91it/s]                                                       76%|███████▌  | 9676/12776 [1:41:34<13:13,  3.91it/s] 76%|███████▌  | 9677/12776 [1:41:34<12:31,  4.12it/s]                                                       76%|███████▌  | 9677/12776 [1:41:34<12:31,  4.12it/s] 76%|███████▌  | 9678/12776 [1:41:34<11:59,  4.30it/s]                                                       76%|███████▌  | 9678/12776 [1:41:34<11:59,  4.30it/s] 76%|███████▌  | 9679/12776 [1:41:35<11:18,  4.56it/s]                                                       76%|███████▌  | 9679/12776 [1:41:35<11:18,  4.56it/s] 76%|███████▌  | 9680/12776 [1:41:35<12:21,  4.17it/s]                                                       76%|███████▌  | 9680/12776 [1:41:35<12:21,  4.17it/s] 76%|███████▌  | 9681/12776 [1:41:35<11:35,  4.45it/s]                                                       76%|███████▌  | 9681/12776 [1:41:35<11:35,  4.45it/s] 76%|███████▌  | 9682/12776 [1:41:35<10:49,  4.76it/s]                                                       76%|███████▌  | 9682/12776 [1:41:35<10:49,  4.76it/s] 76%|███████▌  | 9683/12776 [1:41:35<10:16,  5.02it/s]                                                       76%|███████▌  | 9683/12776 [1:41:35<10:16,  5.02it/s] 76%|███████▌  | 9684/12776 [1:41:36<09:52,  5.22it/s]                                                       76%|███████▌  | 9684/12776 [1:41:36<09:52,  5.22it/s] 76%|███████▌  | 9685/12776 [1:41:36<09:29,  5.42it/s]                                                       76%|███████▌  | 9685/12776 [1:41:36<09:29,  5.42it/s] 76%|███████▌  | 9686/12776 [1:41:36<10:10,  5.06it/s]                                                       76%|███████▌  | 9686/12776 [1:41:36<10:10,  5.06it/s] 76%|███████▌  | 9687/12776 [1:41:36<09:37,  5.35it/s]                                                       76%|███████▌  | 9687/12776 [1:41:36<09:37,  5.35it/s] 76%|███████▌  | 9688/12776 [1:41:37<17:42,  2.91it/s]                                                       76%|███████▌  | 9688/12776 [1:41:37<17:42,  2.91it/s] 76%|███████▌  | 9689/12776 [1:41:38<37:10,  1.38it/s]                                                       76%|███████▌  | 9689/12776 [1:41:38<37:10,  1.38it/s] 76%|███████▌  | 9690/12776 [1:41:39<40:20,  1.27it/s]                                                       76%|███████▌  | 9690/12776 [1:41:39<40:20,  1.27it/s] 76%|███████▌  | 9691/12776 [1:41:40<40:59,  1.25it/s]                                                       76%|███████▌  | 9691/12776 [1:41:40<40:59,  1.25it/s] 76%|███████▌  | 9692/12776 [1:41:41<40:13,  1.28it/s]                                                       76%|███████▌  | 9692/12776 [1:41:41<40:13,  1.28it/s] 76%|███████▌  | 9693/12776 [1:41:42<39:11,  1.31it/s]                                                       76%|███████▌  | 9693/12776 [1:41:42<39:11,  1.31it/s] 76%|███████▌  | 9694/12776 [1:41:42<37:35,  1.37it/s]                                                       76%|███████▌  | 9694/12776 [1:41:42<37:35,  1.37it/s] 76%|███████▌  | 9695/12776 [1:41:43<37:46,  1.36it/s]                                                       76%|███████▌  | 9695/12776 [1:41:43<37:46,  1.36it/s] 76%|███████▌  | 9696/12776 [1:41:44<35:48,  1.43it/s]                                                       76%|███████▌  | 9696/12776 [1:41:44<35:48,  1.43it/s] 76%|███████▌  | 9697/12776 [1:41:44<34:27,  1.49it/s]                                                       76%|███████▌  | 9697/12776 [1:41:44<34:27,  1.49it/s] 76%|███████▌  | 9698/12776 [1:41:45<32:40,  1.57it/s]                                                       76%|███████▌  | 9698/12776 [1:41:45<32:40,  1.57it/s] 76%|███████▌  | 9699/12776 [1:41:45<32:01,  1.60it/s]                                                       76%|███████▌  | 9699/12776 [1:41:45<32:01,  1.60it/s] 76%|███████▌  | 9700/12776 [1:41:46<30:22,  1.69it/s]                                                       76%|███████▌  | 9700/12776 [1:41:46<30:22,  1.69it/s] 76%|███████▌  | 9701/12776 [1:41:47<30:32,  1.68it/s]                                                       76%|███████▌  | 9701/12776 [1:41:47<30:32,  1.68it/s] 76%|███████▌  | 9702/12776 [1:41:47<28:30,  1.80it/s]                                                      {'loss': 0.953, 'grad_norm': 2.9261624813079834, 'learning_rate': 7.732160312805473e-05, 'epoch': 1.51}
+{'loss': 0.8946, 'grad_norm': 3.67445969581604, 'learning_rate': 7.729716520039101e-05, 'epoch': 1.51}
+{'loss': 1.5854, 'grad_norm': 2.395833730697632, 'learning_rate': 7.727272727272726e-05, 'epoch': 1.51}
+{'loss': 0.5556, 'grad_norm': 2.6899259090423584, 'learning_rate': 7.724828934506352e-05, 'epoch': 1.51}
+{'loss': 1.3441, 'grad_norm': 1.9388134479522705, 'learning_rate': 7.72238514173998e-05, 'epoch': 1.51}
+{'loss': 1.3142, 'grad_norm': 3.582058906555176, 'learning_rate': 7.719941348973607e-05, 'epoch': 1.51}
+{'loss': 0.64, 'grad_norm': 1.9698890447616577, 'learning_rate': 7.717497556207232e-05, 'epoch': 1.51}
+{'loss': 1.1611, 'grad_norm': 2.8033182621002197, 'learning_rate': 7.71505376344086e-05, 'epoch': 1.51}
+{'loss': 0.9295, 'grad_norm': 3.156467914581299, 'learning_rate': 7.712609970674486e-05, 'epoch': 1.51}
+{'loss': 0.5743, 'grad_norm': 1.1335583925247192, 'learning_rate': 7.710166177908113e-05, 'epoch': 1.51}
+{'loss': 0.6852, 'grad_norm': 2.90801739692688, 'learning_rate': 7.707722385141739e-05, 'epoch': 1.51}
+{'loss': 0.8531, 'grad_norm': 2.2356724739074707, 'learning_rate': 7.705278592375366e-05, 'epoch': 1.51}
+{'loss': 0.524, 'grad_norm': 1.6373522281646729, 'learning_rate': 7.702834799608992e-05, 'epoch': 1.51}
+{'loss': 0.5397, 'grad_norm': 1.7506439685821533, 'learning_rate': 7.70039100684262e-05, 'epoch': 1.51}
+{'loss': 0.6136, 'grad_norm': 1.3206775188446045, 'learning_rate': 7.697947214076245e-05, 'epoch': 1.51}
+{'loss': 0.2406, 'grad_norm': 0.5027822256088257, 'learning_rate': 7.695503421309871e-05, 'epoch': 1.51}
+{'loss': 0.2308, 'grad_norm': 0.5172881484031677, 'learning_rate': 7.693059628543499e-05, 'epoch': 1.51}
+{'loss': 0.2369, 'grad_norm': 0.5118494033813477, 'learning_rate': 7.690615835777126e-05, 'epoch': 1.51}
+{'loss': 0.131, 'grad_norm': 0.42256832122802734, 'learning_rate': 7.688172043010751e-05, 'epoch': 1.51}
+{'loss': 0.1728, 'grad_norm': 0.41296517848968506, 'learning_rate': 7.685728250244379e-05, 'epoch': 1.51}
+{'loss': 0.1832, 'grad_norm': 1.4930837154388428, 'learning_rate': 7.683284457478005e-05, 'epoch': 1.51}
+{'loss': 0.2564, 'grad_norm': 0.46577295660972595, 'learning_rate': 7.680840664711632e-05, 'epoch': 1.51}
+{'loss': 0.3131, 'grad_norm': 0.6412208676338196, 'learning_rate': 7.678396871945258e-05, 'epoch': 1.51}
+{'loss': 0.4227, 'grad_norm': 1.1731911897659302, 'learning_rate': 7.675953079178885e-05, 'epoch': 1.51}
+{'loss': 0.3043, 'grad_norm': 0.7713680267333984, 'learning_rate': 7.673509286412511e-05, 'epoch': 1.51}
+{'loss': 0.3916, 'grad_norm': 0.8555247187614441, 'learning_rate': 7.671065493646139e-05, 'epoch': 1.51}
+{'loss': 0.2323, 'grad_norm': 1.737226128578186, 'learning_rate': 7.668621700879764e-05, 'epoch': 1.51}
+{'loss': 0.3248, 'grad_norm': 2.419440746307373, 'learning_rate': 7.66617790811339e-05, 'epoch': 1.51}
+{'loss': 0.37, 'grad_norm': 1.3268059492111206, 'learning_rate': 7.663734115347018e-05, 'epoch': 1.51}
+{'loss': 0.5048, 'grad_norm': 0.7393986582756042, 'learning_rate': 7.661290322580645e-05, 'epoch': 1.51}
+{'loss': 0.3139, 'grad_norm': 1.3429745435714722, 'learning_rate': 7.65884652981427e-05, 'epoch': 1.51}
+{'loss': 0.5404, 'grad_norm': 1.2537013292312622, 'learning_rate': 7.656402737047898e-05, 'epoch': 1.51}
+{'loss': 0.4524, 'grad_norm': 1.820609211921692, 'learning_rate': 7.653958944281524e-05, 'epoch': 1.51}
+{'loss': 0.2998, 'grad_norm': 1.4393458366394043, 'learning_rate': 7.651515151515151e-05, 'epoch': 1.51}
+{'loss': 0.2722, 'grad_norm': 0.7819269895553589, 'learning_rate': 7.649071358748777e-05, 'epoch': 1.51}
+{'loss': 0.3758, 'grad_norm': 1.3116579055786133, 'learning_rate': 7.646627565982404e-05, 'epoch': 1.51}
+{'loss': 0.4362, 'grad_norm': 1.4176701307296753, 'learning_rate': 7.64418377321603e-05, 'epoch': 1.51}
+{'loss': 0.4902, 'grad_norm': 1.270081877708435, 'learning_rate': 7.641739980449658e-05, 'epoch': 1.51}
+{'loss': 0.5373, 'grad_norm': 1.4358556270599365, 'learning_rate': 7.639296187683283e-05, 'epoch': 1.51}
+{'loss': 0.6467, 'grad_norm': 2.2279257774353027, 'learning_rate': 7.63685239491691e-05, 'epoch': 1.51}
+{'loss': 0.6748, 'grad_norm': 1.7583202123641968, 'learning_rate': 7.634408602150538e-05, 'epoch': 1.51}
+{'loss': 0.7467, 'grad_norm': 1.953381896018982, 'learning_rate': 7.631964809384164e-05, 'epoch': 1.51}
+{'loss': 0.5326, 'grad_norm': 2.358065605163574, 'learning_rate': 7.629521016617789e-05, 'epoch': 1.51}
+{'loss': 0.6585, 'grad_norm': 2.178169012069702, 'learning_rate': 7.627077223851417e-05, 'epoch': 1.51}
+{'loss': 0.7107, 'grad_norm': 2.588186502456665, 'learning_rate': 7.624633431085043e-05, 'epoch': 1.51}
+{'loss': 0.578, 'grad_norm': 3.952768087387085, 'learning_rate': 7.62218963831867e-05, 'epoch': 1.51}
+{'loss': 0.3684, 'grad_norm': 1.8726907968521118, 'learning_rate': 7.619745845552296e-05, 'epoch': 1.51}
+{'loss': 0.8064, 'grad_norm': 11.03661823272705, 'learning_rate': 7.617302052785923e-05, 'epoch': 1.51}
+{'loss': 0.6892, 'grad_norm': 1.7992969751358032, 'learning_rate': 7.61485826001955e-05, 'epoch': 1.51}
+{'loss': 0.4726, 'grad_norm': 1.6284939050674438, 'learning_rate': 7.612414467253177e-05, 'epoch': 1.51}
+{'loss': 1.0102, 'grad_norm': 4.366304874420166, 'learning_rate': 7.609970674486802e-05, 'epoch': 1.51}
+{'loss': 1.1905, 'grad_norm': 2.8104560375213623, 'learning_rate': 7.607526881720429e-05, 'epoch': 1.51}
+{'loss': 0.7504, 'grad_norm': 3.4192862510681152, 'learning_rate': 7.605083088954057e-05, 'epoch': 1.51}
+{'loss': 0.7833, 'grad_norm': 3.793952465057373, 'learning_rate': 7.602639296187683e-05, 'epoch': 1.51}
+{'loss': 0.9845, 'grad_norm': 3.155576229095459, 'learning_rate': 7.600195503421308e-05, 'epoch': 1.52}
+{'loss': 1.1077, 'grad_norm': 2.9574239253997803, 'learning_rate': 7.597751710654936e-05, 'epoch': 1.52}
+{'loss': 0.9307, 'grad_norm': 2.5134990215301514, 'learning_rate': 7.595307917888563e-05, 'epoch': 1.52}
+{'loss': 1.0685, 'grad_norm': 2.6495211124420166, 'learning_rate': 7.592864125122188e-05, 'epoch': 1.52}
+{'loss': 0.8099, 'grad_norm': 2.684701919555664, 'learning_rate': 7.590420332355816e-05, 'epoch': 1.52}
+{'loss': 1.5933, 'grad_norm': 2.9182863235473633, 'learning_rate': 7.587976539589442e-05, 'epoch': 1.52}
+{'loss': 0.9618, 'grad_norm': 3.108677387237549, 'learning_rate': 7.585532746823068e-05, 'epoch': 1.52}
+{'loss': 0.213, 'grad_norm': 1.0323936939239502, 'learning_rate': 7.583088954056696e-05, 'epoch': 1.52}
+{'loss': 0.8856, 'grad_norm': 2.6111085414886475, 'learning_rate': 7.580645161290321e-05, 'epoch': 1.52}
+{'loss': 0.4987, 'grad_norm': 0.9904088973999023, 'learning_rate': 7.578201368523948e-05, 'epoch': 1.52}
+{'loss': 0.7905, 'grad_norm': 1.8084644079208374, 'learning_rate': 7.575757575757576e-05, 'epoch': 1.52}
+{'loss': 0.2475, 'grad_norm': 0.8530622720718384, 'learning_rate': 7.573313782991202e-05, 'epoch': 1.52}
+{'loss': 0.2024, 'grad_norm': 0.5058773756027222, 'learning_rate': 7.570869990224827e-05, 'epoch': 1.52}
+{'loss': 0.185, 'grad_norm': 0.4346686005592346, 'learning_rate': 7.568426197458455e-05, 'epoch': 1.52}
+{'loss': 0.2676, 'grad_norm': 0.6962853074073792, 'learning_rate': 7.565982404692082e-05, 'epoch': 1.52}
+{'loss': 0.243, 'grad_norm': 0.5338491201400757, 'learning_rate': 7.563538611925707e-05, 'epoch': 1.52}
+{'loss': 0.2168, 'grad_norm': 0.5298436284065247, 'learning_rate': 7.561094819159335e-05, 'epoch': 1.52}
+{'loss': 0.2639, 'grad_norm': 0.6006895899772644, 'learning_rate': 7.558651026392961e-05, 'epoch': 1.52}
+{'loss': 0.2938, 'grad_norm': 1.1772099733352661, 'learning_rate': 7.556207233626588e-05, 'epoch': 1.52}
+{'loss': 0.2997, 'grad_norm': 2.0881564617156982, 'learning_rate': 7.553763440860215e-05, 'epoch': 1.52}
+{'loss': 0.3687, 'grad_norm': 0.7094396352767944, 'learning_rate': 7.55131964809384e-05, 'epoch': 1.52}
+{'loss': 0.2226, 'grad_norm': 0.6079855561256409, 'learning_rate': 7.548875855327467e-05, 'epoch': 1.52}
+{'loss': 0.2565, 'grad_norm': 0.6672441363334656, 'learning_rate': 7.546432062561095e-05, 'epoch': 1.52}
+{'loss': 0.3589, 'grad_norm': 0.7931745648384094, 'learning_rate': 7.543988269794721e-05, 'epoch': 1.52}
+ 76%|███████▌  | 9702/12776 [1:41:47<28:30,  1.80it/s] 76%|███████▌  | 9703/12776 [1:41:48<28:21,  1.81it/s]                                                       76%|███████▌  | 9703/12776 [1:41:48<28:21,  1.81it/s] 76%|███████▌  | 9704/12776 [1:41:48<26:39,  1.92it/s]                                                       76%|███████▌  | 9704/12776 [1:41:48<26:39,  1.92it/s] 76%|███████▌  | 9705/12776 [1:41:49<26:22,  1.94it/s]                                                       76%|███████▌  | 9705/12776 [1:41:49<26:22,  1.94it/s] 76%|███████▌  | 9706/12776 [1:41:49<24:37,  2.08it/s]                                                       76%|███████▌  | 9706/12776 [1:41:49<24:37,  2.08it/s] 76%|███████▌  | 9707/12776 [1:41:49<23:08,  2.21it/s]                                                       76%|███████▌  | 9707/12776 [1:41:49<23:08,  2.21it/s] 76%|███████▌  | 9708/12776 [1:41:50<22:23,  2.28it/s]                                                       76%|███████▌  | 9708/12776 [1:41:50<22:23,  2.28it/s] 76%|███████▌  | 9709/12776 [1:41:50<21:06,  2.42it/s]                                                       76%|███████▌  | 9709/12776 [1:41:50<21:06,  2.42it/s] 76%|███████▌  | 9710/12776 [1:41:50<20:04,  2.55it/s]                                                       76%|███████▌  | 9710/12776 [1:41:50<20:04,  2.55it/s] 76%|███████▌  | 9711/12776 [1:41:51<20:53,  2.45it/s]                                                       76%|███████▌  | 9711/12776 [1:41:51<20:53,  2.45it/s] 76%|███████▌  | 9712/12776 [1:41:51<19:52,  2.57it/s]                                                       76%|███████▌  | 9712/12776 [1:41:51<19:52,  2.57it/s] 76%|███████▌  | 9713/12776 [1:41:52<18:34,  2.75it/s]                                                       76%|███████▌  | 9713/12776 [1:41:52<18:34,  2.75it/s] 76%|███████▌  | 9714/12776 [1:41:52<17:32,  2.91it/s]                                                       76%|███████▌  | 9714/12776 [1:41:52<17:32,  2.91it/s] 76%|███████▌  | 9715/12776 [1:41:52<17:30,  2.92it/s]                                                       76%|███████▌  | 9715/12776 [1:41:52<17:30,  2.92it/s] 76%|███████▌  | 9716/12776 [1:41:52<16:31,  3.09it/s]                                                       76%|███████▌  | 9716/12776 [1:41:52<16:31,  3.09it/s] 76%|███████▌  | 9717/12776 [1:41:53<15:42,  3.25it/s]                                                       76%|███████▌  | 9717/12776 [1:41:53<15:42,  3.25it/s] 76%|███████▌  | 9718/12776 [1:41:53<15:02,  3.39it/s]                                                       76%|███████▌  | 9718/12776 [1:41:53<15:02,  3.39it/s] 76%|███████▌  | 9719/12776 [1:41:53<15:46,  3.23it/s]                                                       76%|███████▌  | 9719/12776 [1:41:53<15:46,  3.23it/s] 76%|███████▌  | 9720/12776 [1:41:54<14:53,  3.42it/s]                                                       76%|███████▌  | 9720/12776 [1:41:54<14:53,  3.42it/s] 76%|███████▌  | 9721/12776 [1:41:54<14:12,  3.58it/s]                                                       76%|███████▌  | 9721/12776 [1:41:54<14:12,  3.58it/s] 76%|███████▌  | 9722/12776 [1:41:54<13:40,  3.72it/s]                                                       76%|███████▌  | 9722/12776 [1:41:54<13:40,  3.72it/s] 76%|███████▌  | 9723/12776 [1:41:54<14:45,  3.45it/s]                                                       76%|███████▌  | 9723/12776 [1:41:54<14:45,  3.45it/s] 76%|███████▌  | 9724/12776 [1:41:55<13:51,  3.67it/s]                                                       76%|███████▌  | 9724/12776 [1:41:55<13:51,  3.67it/s] 76%|███████▌  | 9725/12776 [1:41:55<13:07,  3.87it/s]                                                       76%|███████▌  | 9725/12776 [1:41:55<13:07,  3.87it/s] 76%|███████▌  | 9726/12776 [1:41:55<12:28,  4.07it/s]                                                       76%|███████▌  | 9726/12776 [1:41:55<12:28,  4.07it/s] 76%|███████▌  | 9727/12776 [1:41:55<13:28,  3.77it/s]                                                       76%|███████▌  | 9727/12776 [1:41:55<13:28,  3.77it/s] 76%|███████▌  | 9728/12776 [1:41:56<12:40,  4.01it/s]                                                       76%|███████▌  | 9728/12776 [1:41:56<12:40,  4.01it/s] 76%|███████▌  | 9729/12776 [1:41:56<12:03,  4.21it/s]                                                       76%|███████▌  | 9729/12776 [1:41:56<12:03,  4.21it/s] 76%|███████▌  | 9730/12776 [1:41:56<11:35,  4.38it/s]                                                       76%|███████▌  | 9730/12776 [1:41:56<11:35,  4.38it/s] 76%|███████▌  | 9731/12776 [1:41:56<11:12,  4.53it/s]                                                       76%|███████▌  | 9731/12776 [1:41:56<11:12,  4.53it/s] 76%|███████▌  | 9732/12776 [1:41:57<12:33,  4.04it/s]                                                       76%|███████▌  | 9732/12776 [1:41:57<12:33,  4.04it/s] 76%|███████▌  | 9733/12776 [1:41:57<11:47,  4.30it/s]                                                       76%|███████▌  | 9733/12776 [1:41:57<11:47,  4.30it/s] 76%|███████▌  | 9734/12776 [1:41:57<11:13,  4.52it/s]                                                       76%|███████▌  | 9734/12776 [1:41:57<11:13,  4.52it/s] 76%|███████▌  | 9735/12776 [1:41:57<10:48,  4.69it/s]                                                       76%|███████▌  | 9735/12776 [1:41:57<10:48,  4.69it/s] 76%|███████▌  | 9736/12776 [1:41:57<10:25,  4.86it/s]                                                       76%|███████▌  | 9736/12776 [1:41:57<10:25,  4.86it/s] 76%|███████▌  | 9737/12776 [1:41:57<10:07,  5.00it/s]                                                       76%|███████▌  | 9737/12776 [1:41:57<10:07,  5.00it/s] 76%|███████▌  | 9738/12776 [1:41:58<17:44,  2.85it/s]                                                       76%|███████▌  | 9738/12776 [1:41:58<17:44,  2.85it/s] 76%|███████▌  | 9739/12776 [1:41:59<31:34,  1.60it/s]                                                       76%|███████▌  | 9739/12776 [1:41:59<31:34,  1.60it/s] 76%|███████▌  | 9740/12776 [1:42:00<36:14,  1.40it/s]                                                       76%|███████▌  | 9740/12776 [1:42:00<36:14,  1.40it/s] 76%|███████▌  | 9741/12776 [1:42:01<37:56,  1.33it/s]                                                       76%|███████▌  | 9741/12776 [1:42:01<37:56,  1.33it/s] 76%|███████▋  | 9742/12776 [1:42:02<37:59,  1.33it/s]                                                       76%|███████▋  | 9742/12776 [1:42:02<37:59,  1.33it/s] 76%|███████▋  | 9743/12776 [1:42:03<37:51,  1.34it/s]                                                       76%|███████▋  | 9743/12776 [1:42:03<37:51,  1.34it/s] 76%|███████▋  | 9744/12776 [1:42:03<37:51,  1.34it/s]                                                       76%|███████▋  | 9744/12776 [1:42:03<37:51,  1.34it/s] 76%|███████▋  | 9745/12776 [1:42:04<36:04,  1.40it/s]                                                       76%|███████▋  | 9745/12776 [1:42:04<36:04,  1.40it/s] 76%|███████▋  | 9746/12776 [1:42:05<34:19,  1.47it/s]                                                       76%|███████▋  | 9746/12776 [1:42:05<34:19,  1.47it/s] 76%|███████▋  | 9747/12776 [1:42:05<32:43,  1.54it/s]                                                       76%|███████▋  | 9747/12776 [1:42:05<32:43,  1.54it/s] 76%|███████▋  | 9748/12776 [1:42:06<31:35,  1.60it/s]                                                       76%|███████▋  | 9748/12776 [1:42:06<31:35,  1.60it/s] 76%|███████▋  | 9749/12776 [1:42:06<30:03,  1.68it/s]                                                       76%|███████▋  | 9749/12776 [1:42:06<30:03,  1.68it/s] 76%|███████▋  | 9750/12776 [1:42:07<28:54,  1.74it/s]                                                       76%|███████▋  | 9750/12776 [1:42:07<28:54,  1.74it/s] 76%|███████▋  | 9751/12776 [1:42:07<27:10,  1.86it/s]                                                       76%|███████▋  | 9751/12776 [1:42:07<27:10,  1.86it/s] 76%|███████▋  | 9752/12776 [1:42:08<25:47,  1.95it/s]                                                       76%|███████▋  | 9752/12776 [1:42:08<25:47,  1.95it/s] 76%|███████▋  | 9753/12776 [1:42:08<25:28,  1.98it/s]                                                       76%|███████▋  | 9753/12776 [1:42:08<25:28,  1.98it/s] 76%|███████▋  | 9754/12776 [1:42:09<23:59,  2.10it/s]                                                       76%|███████▋  | 9754/12776 [1:42:09<23:59,  2.10it/s] 76%|███████▋  | 9755/12776 [1:42:09<23:58,  2.10it/s]                                                       76%|███████▋  | 9755/12776 [1:42:09<23:58,  2.10it/s] 76%|███████▋  | 9756/12776 [1:42:10<22:37,  2.22it/s]                                                       76%|███████▋  | 9756/12776 [1:42:10<22:37,  2.22it/s] 76%|███████▋  | 9757/12776 [1:42:10<21:21,  2.36it/s]                                                       76%|███████▋  | 9757/12776 [1:42:10<21:21,  2.36it/s] 76%|███████▋  | 9758/12776 [1:42:10<21:14,  2.37it/s]                                                       76%|███████▋  | 9758/12776 [1:42:10<21:14,  2.37it/s] 76%|███████▋  | 9759/12776 [1:42:11<20:01,  2.51it/s]                                                       76%|███████▋  | 9759/12776 [1:42:11<20:01,  2.51it/s] 76%|███████▋  | 9760/12776 [1:42:11<19:03,  2.64it/s]                                                       76%|███████▋  | 9760/12776 [1:42:11<19:03,  2.64it/s] 76%|███████▋  | 9761/12776 [1:42:11<18:11,  2.76it/s]                                                       76%|███████▋  | 9761/12776 [1:42:11<18:11,  2.76it/s] 76%|███████▋  | 9762/12776 [1:42:12<17:30,  2.87it/s]                                                       76%|███████▋  | 9762/12776 [1:42:12<17:30,  2.87it/s] 76%|███████▋  | 9763/12776 [1:42:12<16:43,  3.00it/s]                                                       76%|███████▋  | 9763/12776 [1:42:12<16:43,  3.00it/s] 76%|███████▋  | 9764/12776 [1:42:12<16:02,  3.13it/s]                                                       76%|███████▋  | 9764/12776 [1:42:12<16:02,  3.13it/s] 76%|███████▋  | 9765/12776 [1:42:13<17:11,  2.92it/s]                                                       76%|███████▋  | 9765/12776 [1:42:13<17:11,  2.92it/s] 76%|███████▋  | 9766/12776 [1:42:13<16:07,  3.11it/s]                                                       76%|███████▋  | 9766/12776 [1:42:13<16:07,  3.11it/s] 76%|███████▋  | 9767/12776 [1:42:13<15:16,  3.28it/s]                                                       76%|███████▋  | 9767/12776 [1:42:13<15:16,  3.28it/s] 76%|███████▋  | 9768/12776 [1:42:13<14:30,  3.45it/s]                                                       76%|███████▋  | 9768/12776 [1:42:13<14:30,  3.45it/s] 76%|███████▋  | 9769/12776 [1:42:14<15:33,  3.22it/s]                                                       76%|███████▋  | 9769/12776 [1:42:14<15:33,  3.22it/s] 76%|███████▋  | 9770/12776 [1:42:14<14:36,  3.43it/s]                                                       76%|███████▋  | 9770/12776 [1:42:14<14:36,  3.43it/s] 76%|███████▋  | 9771/12776 [1:42:14<13:53,  3.61it/s]                                                       76%|███████▋  | 9771/12776 [1:42:14<13:53,  3.61it/s] 76%|███████▋  | 9772/12776 [1:42:15<13:20,  3.75it/s]                                                       76%|███████▋  | 9772/12776 [1:42:15<13:20,  3.75it/s] 76%|███████▋  | 9773/12776 [1:42:15<12:51,  3.89it/s]                                                       76%|███████▋  | 9773/12776 [1:42:15<12:51,  3.89it/s] 77%|███████▋  | 9774/12776 [1:42:15<13:32,  3.70it/s]                                                       77%|███████▋  | 9774/12776 [1:42:15<13:32,  3.70it/s] 77%|███████▋  | 9775/12776 [1:42:15<12:48,  3.90it/s]                                                       77%|███████▋  | 9775/12776 [1:42:15<12:48,  3.90it/s] 77%|███████▋  | 9776/12776 [1:42:15<12:12,  4.10it/s]                                                       77%|███████▋  | 9776/12776 [1:42:15<12:12,  4.10it/s] 77%|███████▋  | 9777/12776 [1:42:16<11:43,  4.26it/s]                                                       77%|███████▋  | 9777/12776 [1:42:16<11:43,  4.26it/s] 77%|███████▋  | 9778/12776 [1:42:16<11:22,  4.39it/s]                                                       77%|███████▋  | 9778/12776 [1:42:16<11:22,  4.39it/s] 77%|███████▋  | 9779/12776 [1:42:16<11:47,  4.23it/s]                                                       77%|███████▋  | 9779/12776 [1:42:16<11:47,  4.23it/s] 77%|███████▋  | 9780/12776 [1:42:16<11:28,  4.35it/s]                                                      {'loss': 0.1624, 'grad_norm': 0.5384275913238525, 'learning_rate': 7.541544477028346e-05, 'epoch': 1.52}
+{'loss': 0.2575, 'grad_norm': 0.7884646058082581, 'learning_rate': 7.539100684261974e-05, 'epoch': 1.52}
+{'loss': 0.3014, 'grad_norm': 0.8177517652511597, 'learning_rate': 7.536656891495601e-05, 'epoch': 1.52}
+{'loss': 0.1904, 'grad_norm': 1.0651932954788208, 'learning_rate': 7.534213098729226e-05, 'epoch': 1.52}
+{'loss': 0.2615, 'grad_norm': 1.055517554283142, 'learning_rate': 7.531769305962854e-05, 'epoch': 1.52}
+{'loss': 0.5792, 'grad_norm': 1.7985464334487915, 'learning_rate': 7.52932551319648e-05, 'epoch': 1.52}
+{'loss': 0.5109, 'grad_norm': 3.7123966217041016, 'learning_rate': 7.526881720430107e-05, 'epoch': 1.52}
+{'loss': 0.4718, 'grad_norm': 1.6406701803207397, 'learning_rate': 7.524437927663735e-05, 'epoch': 1.52}
+{'loss': 0.4324, 'grad_norm': 1.475159764289856, 'learning_rate': 7.52199413489736e-05, 'epoch': 1.52}
+{'loss': 0.663, 'grad_norm': 3.5248284339904785, 'learning_rate': 7.519550342130986e-05, 'epoch': 1.52}
+{'loss': 0.4683, 'grad_norm': 2.4344213008880615, 'learning_rate': 7.517106549364614e-05, 'epoch': 1.52}
+{'loss': 0.6168, 'grad_norm': 1.7357611656188965, 'learning_rate': 7.51466275659824e-05, 'epoch': 1.52}
+{'loss': 0.779, 'grad_norm': 8.591681480407715, 'learning_rate': 7.512218963831866e-05, 'epoch': 1.52}
+{'loss': 0.513, 'grad_norm': 3.0469913482666016, 'learning_rate': 7.509775171065493e-05, 'epoch': 1.52}
+{'loss': 0.5417, 'grad_norm': 4.075806617736816, 'learning_rate': 7.50733137829912e-05, 'epoch': 1.52}
+{'loss': 0.5785, 'grad_norm': 1.6552000045776367, 'learning_rate': 7.504887585532745e-05, 'epoch': 1.52}
+{'loss': 0.8276, 'grad_norm': 2.410032033920288, 'learning_rate': 7.502443792766373e-05, 'epoch': 1.52}
+{'loss': 0.654, 'grad_norm': 1.1762977838516235, 'learning_rate': 7.5e-05, 'epoch': 1.52}
+{'loss': 0.7655, 'grad_norm': 5.700626373291016, 'learning_rate': 7.497556207233626e-05, 'epoch': 1.52}
+{'loss': 1.2639, 'grad_norm': 3.678863763809204, 'learning_rate': 7.495112414467252e-05, 'epoch': 1.52}
+{'loss': 0.81, 'grad_norm': 2.683408737182617, 'learning_rate': 7.492668621700879e-05, 'epoch': 1.52}
+{'loss': 0.763, 'grad_norm': 2.2185420989990234, 'learning_rate': 7.490224828934507e-05, 'epoch': 1.52}
+{'loss': 0.7293, 'grad_norm': 2.0188112258911133, 'learning_rate': 7.487781036168132e-05, 'epoch': 1.52}
+{'loss': 1.0011, 'grad_norm': 2.3393964767456055, 'learning_rate': 7.485337243401758e-05, 'epoch': 1.52}
+{'loss': 0.8906, 'grad_norm': 2.7806484699249268, 'learning_rate': 7.482893450635386e-05, 'epoch': 1.52}
+{'loss': 0.8827, 'grad_norm': 2.0687015056610107, 'learning_rate': 7.480449657869011e-05, 'epoch': 1.52}
+{'loss': 1.36, 'grad_norm': 2.678903818130493, 'learning_rate': 7.478005865102639e-05, 'epoch': 1.52}
+{'loss': 0.89, 'grad_norm': 2.200491428375244, 'learning_rate': 7.475562072336265e-05, 'epoch': 1.52}
+{'loss': 1.4945, 'grad_norm': 2.768153429031372, 'learning_rate': 7.473118279569892e-05, 'epoch': 1.52}
+{'loss': 1.177, 'grad_norm': 6.975987434387207, 'learning_rate': 7.470674486803518e-05, 'epoch': 1.52}
+{'loss': 1.1726, 'grad_norm': 2.4341022968292236, 'learning_rate': 7.468230694037145e-05, 'epoch': 1.52}
+{'loss': 0.6998, 'grad_norm': 4.012300491333008, 'learning_rate': 7.465786901270771e-05, 'epoch': 1.52}
+{'loss': 0.8953, 'grad_norm': 2.198310136795044, 'learning_rate': 7.463343108504398e-05, 'epoch': 1.52}
+{'loss': 0.5788, 'grad_norm': 1.1857575178146362, 'learning_rate': 7.460899315738026e-05, 'epoch': 1.52}
+{'loss': 0.6252, 'grad_norm': 2.221917152404785, 'learning_rate': 7.458455522971651e-05, 'epoch': 1.52}
+{'loss': 0.6908, 'grad_norm': 4.886264801025391, 'learning_rate': 7.456011730205277e-05, 'epoch': 1.52}
+{'loss': 0.8521, 'grad_norm': 2.3509812355041504, 'learning_rate': 7.453567937438905e-05, 'epoch': 1.52}
+{'loss': 0.1878, 'grad_norm': 0.35476207733154297, 'learning_rate': 7.45112414467253e-05, 'epoch': 1.52}
+{'loss': 0.2251, 'grad_norm': 3.2828149795532227, 'learning_rate': 7.448680351906158e-05, 'epoch': 1.52}
+{'loss': 0.19, 'grad_norm': 0.5039660334587097, 'learning_rate': 7.446236559139785e-05, 'epoch': 1.52}
+{'loss': 0.2284, 'grad_norm': 0.5465099215507507, 'learning_rate': 7.443792766373411e-05, 'epoch': 1.53}
+{'loss': 0.326, 'grad_norm': 1.618552803993225, 'learning_rate': 7.441348973607038e-05, 'epoch': 1.53}
+{'loss': 0.3094, 'grad_norm': 0.585069477558136, 'learning_rate': 7.438905180840664e-05, 'epoch': 1.53}
+{'loss': 0.3384, 'grad_norm': 1.64920175075531, 'learning_rate': 7.43646138807429e-05, 'epoch': 1.53}
+{'loss': 0.538, 'grad_norm': 1.8453181982040405, 'learning_rate': 7.434017595307917e-05, 'epoch': 1.53}
+{'loss': 0.255, 'grad_norm': 0.8853111267089844, 'learning_rate': 7.431573802541543e-05, 'epoch': 1.53}
+{'loss': 0.1581, 'grad_norm': 0.9123733043670654, 'learning_rate': 7.42913000977517e-05, 'epoch': 1.53}
+{'loss': 0.2773, 'grad_norm': 0.7811870574951172, 'learning_rate': 7.426686217008796e-05, 'epoch': 1.53}
+{'loss': 0.4379, 'grad_norm': 0.824766993522644, 'learning_rate': 7.424242424242424e-05, 'epoch': 1.53}
+{'loss': 0.4431, 'grad_norm': 1.7148675918579102, 'learning_rate': 7.42179863147605e-05, 'epoch': 1.53}
+{'loss': 0.3849, 'grad_norm': 1.5956602096557617, 'learning_rate': 7.419354838709677e-05, 'epoch': 1.53}
+{'loss': 0.4567, 'grad_norm': 1.4418418407440186, 'learning_rate': 7.416911045943304e-05, 'epoch': 1.53}
+{'loss': 0.5558, 'grad_norm': 1.1421548128128052, 'learning_rate': 7.41446725317693e-05, 'epoch': 1.53}
+{'loss': 0.3335, 'grad_norm': 2.0118582248687744, 'learning_rate': 7.412023460410557e-05, 'epoch': 1.53}
+{'loss': 0.4836, 'grad_norm': 1.003547191619873, 'learning_rate': 7.409579667644183e-05, 'epoch': 1.53}
+{'loss': 0.4633, 'grad_norm': 1.8095251321792603, 'learning_rate': 7.40713587487781e-05, 'epoch': 1.53}
+{'loss': 0.2844, 'grad_norm': 0.8199507594108582, 'learning_rate': 7.404692082111436e-05, 'epoch': 1.53}
+{'loss': 0.5131, 'grad_norm': 1.0319750308990479, 'learning_rate': 7.402248289345063e-05, 'epoch': 1.53}
+{'loss': 0.5979, 'grad_norm': 1.8906883001327515, 'learning_rate': 7.399804496578689e-05, 'epoch': 1.53}
+{'loss': 0.288, 'grad_norm': 0.7006099820137024, 'learning_rate': 7.397360703812316e-05, 'epoch': 1.53}
+{'loss': 0.7689, 'grad_norm': 2.609555721282959, 'learning_rate': 7.394916911045943e-05, 'epoch': 1.53}
+{'loss': 0.6709, 'grad_norm': 1.5572997331619263, 'learning_rate': 7.392473118279569e-05, 'epoch': 1.53}
+{'loss': 0.4997, 'grad_norm': 1.1742448806762695, 'learning_rate': 7.390029325513196e-05, 'epoch': 1.53}
+{'loss': 0.636, 'grad_norm': 1.8424162864685059, 'learning_rate': 7.387585532746823e-05, 'epoch': 1.53}
+{'loss': 0.7927, 'grad_norm': 1.5619405508041382, 'learning_rate': 7.38514173998045e-05, 'epoch': 1.53}
+{'loss': 1.1046, 'grad_norm': 3.71563982963562, 'learning_rate': 7.382697947214076e-05, 'epoch': 1.53}
+{'loss': 0.3665, 'grad_norm': 1.4069294929504395, 'learning_rate': 7.380254154447702e-05, 'epoch': 1.53}
+{'loss': 0.6521, 'grad_norm': 2.09309983253479, 'learning_rate': 7.377810361681329e-05, 'epoch': 1.53}
+{'loss': 0.8215, 'grad_norm': 1.4793744087219238, 'learning_rate': 7.375366568914955e-05, 'epoch': 1.53}
+{'loss': 0.4851, 'grad_norm': 1.229730248451233, 'learning_rate': 7.372922776148582e-05, 'epoch': 1.53}
+{'loss': 0.5851, 'grad_norm': 1.6996830701828003, 'learning_rate': 7.370478983382208e-05, 'epoch': 1.53}
+{'loss': 0.881, 'grad_norm': 1.705766201019287, 'learning_rate': 7.368035190615835e-05, 'epoch': 1.53}
+{'loss': 0.3521, 'grad_norm': 1.5375521183013916, 'learning_rate': 7.365591397849463e-05, 'epoch': 1.53}
+{'loss': 0.9904, 'grad_norm': 2.4411001205444336, 'learning_rate': 7.363147605083088e-05, 'epoch': 1.53}
+{'loss': 1.0197, 'grad_norm': 2.59539794921875, 'learning_rate': 7.360703812316715e-05, 'epoch': 1.53}
+{'loss': 0.8143, 'grad_norm': 1.2721023559570312, 'learning_rate': 7.358260019550342e-05, 'epoch': 1.53}
+{'loss': 1.3697, 'grad_norm': 3.4059696197509766, 'learning_rate': 7.355816226783968e-05, 'epoch': 1.53}
+{'loss': 0.8997, 'grad_norm': 1.2390133142471313, 'learning_rate': 7.353372434017595e-05, 'epoch': 1.53}
+ 77%|███████▋  | 9780/12776 [1:42:16<11:28,  4.35it/s] 77%|███████▋  | 9781/12776 [1:42:17<11:02,  4.52it/s]                                                       77%|███████▋  | 9781/12776 [1:42:17<11:02,  4.52it/s] 77%|███████▋  | 9782/12776 [1:42:17<10:41,  4.67it/s]                                                       77%|███████▋  | 9782/12776 [1:42:17<10:41,  4.67it/s] 77%|███████▋  | 9783/12776 [1:42:17<10:27,  4.77it/s]                                                       77%|███████▋  | 9783/12776 [1:42:17<10:27,  4.77it/s] 77%|███████▋  | 9784/12776 [1:42:17<11:39,  4.28it/s]                                                       77%|███████▋  | 9784/12776 [1:42:17<11:39,  4.28it/s] 77%|███████▋  | 9785/12776 [1:42:17<11:04,  4.50it/s]                                                       77%|███████▋  | 9785/12776 [1:42:17<11:04,  4.50it/s] 77%|███████▋  | 9786/12776 [1:42:18<10:33,  4.72it/s]                                                       77%|███████▋  | 9786/12776 [1:42:18<10:33,  4.72it/s] 77%|███████▋  | 9787/12776 [1:42:18<10:09,  4.91it/s]                                                       77%|███████▋  | 9787/12776 [1:42:18<10:09,  4.91it/s] 77%|███████▋  | 9788/12776 [1:42:19<17:05,  2.91it/s]                                                       77%|███████▋  | 9788/12776 [1:42:19<17:05,  2.91it/s] 77%|███████▋  | 9789/12776 [1:42:20<34:04,  1.46it/s]                                                       77%|███████▋  | 9789/12776 [1:42:20<34:04,  1.46it/s] 77%|███████▋  | 9790/12776 [1:42:21<38:43,  1.29it/s]                                                       77%|███████▋  | 9790/12776 [1:42:21<38:43,  1.29it/s] 77%|███████▋  | 9791/12776 [1:42:22<40:15,  1.24it/s]                                                       77%|███████▋  | 9791/12776 [1:42:22<40:15,  1.24it/s] 77%|███████▋  | 9792/12776 [1:42:23<40:15,  1.24it/s]                                                       77%|███████▋  | 9792/12776 [1:42:23<40:15,  1.24it/s] 77%|███████▋  | 9793/12776 [1:42:23<39:05,  1.27it/s]                                                       77%|███████▋  | 9793/12776 [1:42:23<39:05,  1.27it/s] 77%|███████▋  | 9794/12776 [1:42:24<37:52,  1.31it/s]                                                       77%|███████▋  | 9794/12776 [1:42:24<37:52,  1.31it/s] 77%|███████▋  | 9795/12776 [1:42:25<35:55,  1.38it/s]                                                       77%|███████▋  | 9795/12776 [1:42:25<35:55,  1.38it/s] 77%|███████▋  | 9796/12776 [1:42:25<33:46,  1.47it/s]                                                       77%|███████▋  | 9796/12776 [1:42:25<33:46,  1.47it/s] 77%|███████▋  | 9797/12776 [1:42:26<32:01,  1.55it/s]                                                       77%|███████▋  | 9797/12776 [1:42:26<32:01,  1.55it/s] 77%|███████▋  | 9798/12776 [1:42:26<31:29,  1.58it/s]                                                       77%|███████▋  | 9798/12776 [1:42:26<31:29,  1.58it/s] 77%|███████▋  | 9799/12776 [1:42:27<29:34,  1.68it/s]                                                       77%|███████▋  | 9799/12776 [1:42:27<29:34,  1.68it/s] 77%|███████▋  | 9800/12776 [1:42:27<27:50,  1.78it/s]                                                       77%|███████▋  | 9800/12776 [1:42:27<27:50,  1.78it/s] 77%|███████▋  | 9801/12776 [1:42:28<26:33,  1.87it/s]                                                       77%|███████▋  | 9801/12776 [1:42:28<26:33,  1.87it/s] 77%|███████▋  | 9802/12776 [1:42:28<25:06,  1.97it/s]                                                       77%|███████▋  | 9802/12776 [1:42:28<25:06,  1.97it/s] 77%|███████▋  | 9803/12776 [1:42:29<24:06,  2.06it/s]                                                       77%|███████▋  | 9803/12776 [1:42:29<24:06,  2.06it/s] 77%|███████▋  | 9804/12776 [1:42:29<22:46,  2.17it/s]                                                       77%|███████▋  | 9804/12776 [1:42:29<22:46,  2.17it/s] 77%|███████▋  | 9805/12776 [1:42:30<21:41,  2.28it/s]                                                       77%|███████▋  | 9805/12776 [1:42:30<21:41,  2.28it/s] 77%|███████▋  | 9806/12776 [1:42:30<22:49,  2.17it/s]                                                       77%|███████▋  | 9806/12776 [1:42:30<22:49,  2.17it/s] 77%|███████▋  | 9807/12776 [1:42:30<21:16,  2.33it/s]                                                       77%|███████▋  | 9807/12776 [1:42:30<21:16,  2.33it/s] 77%|███████▋  | 9808/12776 [1:42:31<20:04,  2.46it/s]                                                       77%|███████▋  | 9808/12776 [1:42:31<20:04,  2.46it/s] 77%|███████▋  | 9809/12776 [1:42:31<20:22,  2.43it/s]                                                       77%|███████▋  | 9809/12776 [1:42:31<20:22,  2.43it/s] 77%|███████▋  | 9810/12776 [1:42:32<19:11,  2.58it/s]                                                       77%|███████▋  | 9810/12776 [1:42:32<19:11,  2.58it/s] 77%|███████▋  | 9811/12776 [1:42:32<18:09,  2.72it/s]                                                       77%|███████▋  | 9811/12776 [1:42:32<18:09,  2.72it/s] 77%|███████▋  | 9812/12776 [1:42:32<17:48,  2.78it/s]                                                       77%|███████▋  | 9812/12776 [1:42:32<17:48,  2.78it/s] 77%|███████▋  | 9813/12776 [1:42:33<16:47,  2.94it/s]                                                       77%|███████▋  | 9813/12776 [1:42:33<16:47,  2.94it/s] 77%|███████▋  | 9814/12776 [1:42:33<15:56,  3.10it/s]                                                       77%|███████▋  | 9814/12776 [1:42:33<15:56,  3.10it/s] 77%|███████▋  | 9815/12776 [1:42:33<15:14,  3.24it/s]                                                       77%|███████▋  | 9815/12776 [1:42:33<15:14,  3.24it/s] 77%|███████▋  | 9816/12776 [1:42:33<15:21,  3.21it/s]                                                       77%|███████▋  | 9816/12776 [1:42:33<15:21,  3.21it/s] 77%|███████▋  | 9817/12776 [1:42:34<14:31,  3.40it/s]                                                       77%|███████▋  | 9817/12776 [1:42:34<14:31,  3.40it/s] 77%|███████▋  | 9818/12776 [1:42:34<13:55,  3.54it/s]                                                       77%|███████▋  | 9818/12776 [1:42:34<13:55,  3.54it/s] 77%|███████▋  | 9819/12776 [1:42:34<13:28,  3.66it/s]                                                       77%|███████▋  | 9819/12776 [1:42:34<13:28,  3.66it/s] 77%|███████▋  | 9820/12776 [1:42:35<15:10,  3.25it/s]                                                       77%|███████▋  | 9820/12776 [1:42:35<15:10,  3.25it/s] 77%|███████▋  | 9821/12776 [1:42:35<14:12,  3.47it/s]                                                       77%|███████▋  | 9821/12776 [1:42:35<14:12,  3.47it/s] 77%|███████▋  | 9822/12776 [1:42:35<13:26,  3.66it/s]                                                       77%|███████▋  | 9822/12776 [1:42:35<13:26,  3.66it/s] 77%|███████▋  | 9823/12776 [1:42:35<12:49,  3.84it/s]                                                       77%|███████▋  | 9823/12776 [1:42:35<12:49,  3.84it/s] 77%|███████▋  | 9824/12776 [1:42:36<12:18,  4.00it/s]                                                       77%|███████▋  | 9824/12776 [1:42:36<12:18,  4.00it/s] 77%|███████▋  | 9825/12776 [1:42:36<12:51,  3.83it/s]                                                       77%|███████▋  | 9825/12776 [1:42:36<12:51,  3.83it/s] 77%|███████▋  | 9826/12776 [1:42:36<12:10,  4.04it/s]                                                       77%|███████▋  | 9826/12776 [1:42:36<12:10,  4.04it/s] 77%|███████▋  | 9827/12776 [1:42:36<11:39,  4.22it/s]                                                       77%|███████▋  | 9827/12776 [1:42:36<11:39,  4.22it/s] 77%|███████▋  | 9828/12776 [1:42:36<11:13,  4.38it/s]                                                       77%|███████▋  | 9828/12776 [1:42:36<11:13,  4.38it/s] 77%|███████▋  | 9829/12776 [1:42:37<10:54,  4.50it/s]                                                       77%|███████▋  | 9829/12776 [1:42:37<10:54,  4.50it/s] 77%|███████▋  | 9830/12776 [1:42:37<11:34,  4.24it/s]                                                       77%|███████▋  | 9830/12776 [1:42:37<11:34,  4.24it/s] 77%|███████▋  | 9831/12776 [1:42:37<11:01,  4.45it/s]                                                       77%|███████▋  | 9831/12776 [1:42:37<11:01,  4.45it/s] 77%|███████▋  | 9832/12776 [1:42:37<10:39,  4.61it/s]                                                       77%|███████▋  | 9832/12776 [1:42:37<10:39,  4.61it/s] 77%|███████▋  | 9833/12776 [1:42:38<10:23,  4.72it/s]                                                       77%|███████▋  | 9833/12776 [1:42:38<10:23,  4.72it/s] 77%|███████▋  | 9834/12776 [1:42:38<10:07,  4.85it/s]                                                       77%|███████▋  | 9834/12776 [1:42:38<10:07,  4.85it/s] 77%|███████▋  | 9835/12776 [1:42:38<11:31,  4.25it/s]                                                       77%|███████▋  | 9835/12776 [1:42:38<11:31,  4.25it/s] 77%|███████▋  | 9836/12776 [1:42:38<10:49,  4.53it/s]                                                       77%|███████▋  | 9836/12776 [1:42:38<10:49,  4.53it/s] 77%|███████▋  | 9837/12776 [1:42:38<10:14,  4.78it/s]                                                       77%|███████▋  | 9837/12776 [1:42:38<10:14,  4.78it/s] 77%|███████▋  | 9838/12776 [1:42:39<18:49,  2.60it/s]                                                       77%|███████▋  | 9838/12776 [1:42:39<18:49,  2.60it/s] 77%|███████▋  | 9839/12776 [1:42:41<38:31,  1.27it/s]                                                       77%|███████▋  | 9839/12776 [1:42:41<38:31,  1.27it/s] 77%|███████▋  | 9840/12776 [1:42:42<42:08,  1.16it/s]                                                       77%|███████▋  | 9840/12776 [1:42:42<42:08,  1.16it/s] 77%|███████▋  | 9841/12776 [1:42:43<44:11,  1.11it/s]                                                       77%|███████▋  | 9841/12776 [1:42:43<44:11,  1.11it/s] 77%|███████▋  | 9842/12776 [1:42:44<45:21,  1.08it/s]                                                       77%|███████▋  | 9842/12776 [1:42:44<45:21,  1.08it/s] 77%|███████▋  | 9843/12776 [1:42:45<43:47,  1.12it/s]                                                       77%|███████▋  | 9843/12776 [1:42:45<43:47,  1.12it/s] 77%|███████▋  | 9844/12776 [1:42:45<41:23,  1.18it/s]                                                       77%|███████▋  | 9844/12776 [1:42:45<41:23,  1.18it/s] 77%|███████▋  | 9845/12776 [1:42:46<39:32,  1.24it/s]                                                       77%|███████▋  | 9845/12776 [1:42:46<39:32,  1.24it/s] 77%|███████▋  | 9846/12776 [1:42:47<37:09,  1.31it/s]                                                       77%|███████▋  | 9846/12776 [1:42:47<37:09,  1.31it/s] 77%|███████▋  | 9847/12776 [1:42:47<35:01,  1.39it/s]                                                       77%|███████▋  | 9847/12776 [1:42:47<35:01,  1.39it/s] 77%|███████▋  | 9848/12776 [1:42:48<32:50,  1.49it/s]                                                       77%|███████▋  | 9848/12776 [1:42:48<32:50,  1.49it/s] 77%|███████▋  | 9849/12776 [1:42:49<30:57,  1.58it/s]                                                       77%|███████▋  | 9849/12776 [1:42:49<30:57,  1.58it/s] 77%|███████▋  | 9850/12776 [1:42:49<29:17,  1.67it/s]                                                       77%|███████▋  | 9850/12776 [1:42:49<29:17,  1.67it/s] 77%|███████▋  | 9851/12776 [1:42:50<29:27,  1.65it/s]                                                       77%|███████▋  | 9851/12776 [1:42:50<29:27,  1.65it/s] 77%|███████▋  | 9852/12776 [1:42:50<27:22,  1.78it/s]                                                       77%|███████▋  | 9852/12776 [1:42:50<27:22,  1.78it/s] 77%|███████▋  | 9853/12776 [1:42:51<27:28,  1.77it/s]                                                       77%|███████▋  | 9853/12776 [1:42:51<27:28,  1.77it/s] 77%|███████▋  | 9854/12776 [1:42:51<25:34,  1.90it/s]                                                       77%|███████▋  | 9854/12776 [1:42:51<25:34,  1.90it/s] 77%|███████▋  | 9855/12776 [1:42:52<25:16,  1.93it/s]                                                       77%|███████▋  | 9855/12776 [1:42:52<25:16,  1.93it/s] 77%|███████▋  | 9856/12776 [1:42:52<23:29,  2.07it/s]                                                       77%|███████▋  | 9856/12776 [1:42:52<23:29,  2.07it/s] 77%|███████▋  | 9857/12776 [1:42:52<22:03,  2.21it/s]                                                       77%|███████▋  | 9857/12776 [1:42:52<22:03,  2.21it/s] 77%|███████▋  | 9858/12776 [1:42:53<21:05,  2.31it/s]                                                      {'loss': 1.2236, 'grad_norm': 2.6192235946655273, 'learning_rate': 7.350928641251221e-05, 'epoch': 1.53}
+{'loss': 0.9419, 'grad_norm': 2.4125397205352783, 'learning_rate': 7.348484848484848e-05, 'epoch': 1.53}
+{'loss': 1.8505, 'grad_norm': 3.147789716720581, 'learning_rate': 7.346041055718474e-05, 'epoch': 1.53}
+{'loss': 0.5833, 'grad_norm': 2.7218902111053467, 'learning_rate': 7.343597262952101e-05, 'epoch': 1.53}
+{'loss': 0.3275, 'grad_norm': 1.0626882314682007, 'learning_rate': 7.341153470185727e-05, 'epoch': 1.53}
+{'loss': 0.8926, 'grad_norm': 2.7859811782836914, 'learning_rate': 7.338709677419354e-05, 'epoch': 1.53}
+{'loss': 0.5686, 'grad_norm': 2.831791877746582, 'learning_rate': 7.33626588465298e-05, 'epoch': 1.53}
+{'loss': 0.5434, 'grad_norm': 2.355695962905884, 'learning_rate': 7.333822091886607e-05, 'epoch': 1.53}
+{'loss': 0.5117, 'grad_norm': 1.321400761604309, 'learning_rate': 7.331378299120235e-05, 'epoch': 1.53}
+{'loss': 0.2592, 'grad_norm': 0.5803260207176208, 'learning_rate': 7.32893450635386e-05, 'epoch': 1.53}
+{'loss': 0.1622, 'grad_norm': 0.40602564811706543, 'learning_rate': 7.326490713587488e-05, 'epoch': 1.53}
+{'loss': 0.2955, 'grad_norm': 1.1273137331008911, 'learning_rate': 7.324046920821114e-05, 'epoch': 1.53}
+{'loss': 0.2276, 'grad_norm': 0.4123140573501587, 'learning_rate': 7.32160312805474e-05, 'epoch': 1.53}
+{'loss': 0.1589, 'grad_norm': 0.5335753560066223, 'learning_rate': 7.319159335288367e-05, 'epoch': 1.53}
+{'loss': 0.3083, 'grad_norm': 0.8474439382553101, 'learning_rate': 7.316715542521993e-05, 'epoch': 1.53}
+{'loss': 0.1776, 'grad_norm': 0.48397138714790344, 'learning_rate': 7.31427174975562e-05, 'epoch': 1.53}
+{'loss': 0.2416, 'grad_norm': 0.5532123446464539, 'learning_rate': 7.311827956989246e-05, 'epoch': 1.53}
+{'loss': 0.2122, 'grad_norm': 0.656062126159668, 'learning_rate': 7.309384164222873e-05, 'epoch': 1.53}
+{'loss': 0.2524, 'grad_norm': 1.0843610763549805, 'learning_rate': 7.3069403714565e-05, 'epoch': 1.53}
+{'loss': 0.437, 'grad_norm': 0.891477644443512, 'learning_rate': 7.304496578690126e-05, 'epoch': 1.53}
+{'loss': 0.2953, 'grad_norm': 1.0225049257278442, 'learning_rate': 7.302052785923754e-05, 'epoch': 1.53}
+{'loss': 0.492, 'grad_norm': 1.2247625589370728, 'learning_rate': 7.299608993157379e-05, 'epoch': 1.53}
+{'loss': 0.3037, 'grad_norm': 0.9728368520736694, 'learning_rate': 7.297165200391007e-05, 'epoch': 1.53}
+{'loss': 0.1679, 'grad_norm': 1.27720046043396, 'learning_rate': 7.294721407624633e-05, 'epoch': 1.53}
+{'loss': 0.1985, 'grad_norm': 0.6740818023681641, 'learning_rate': 7.29227761485826e-05, 'epoch': 1.53}
+{'loss': 0.4624, 'grad_norm': 1.0346983671188354, 'learning_rate': 7.289833822091886e-05, 'epoch': 1.53}
+{'loss': 0.1886, 'grad_norm': 1.6507031917572021, 'learning_rate': 7.287390029325513e-05, 'epoch': 1.54}
+{'loss': 0.3239, 'grad_norm': 1.1130856275558472, 'learning_rate': 7.284946236559139e-05, 'epoch': 1.54}
+{'loss': 0.5766, 'grad_norm': 1.4110175371170044, 'learning_rate': 7.282502443792766e-05, 'epoch': 1.54}
+{'loss': 0.4304, 'grad_norm': 15.8782377243042, 'learning_rate': 7.280058651026392e-05, 'epoch': 1.54}
+{'loss': 0.4212, 'grad_norm': 2.349193811416626, 'learning_rate': 7.277614858260019e-05, 'epoch': 1.54}
+{'loss': 0.7621, 'grad_norm': 2.3084702491760254, 'learning_rate': 7.275171065493645e-05, 'epoch': 1.54}
+{'loss': 0.6079, 'grad_norm': 2.6022043228149414, 'learning_rate': 7.272727272727273e-05, 'epoch': 1.54}
+{'loss': 0.505, 'grad_norm': 1.9148122072219849, 'learning_rate': 7.270283479960898e-05, 'epoch': 1.54}
+{'loss': 0.9097, 'grad_norm': 3.0425705909729004, 'learning_rate': 7.267839687194526e-05, 'epoch': 1.54}
+{'loss': 0.6503, 'grad_norm': 2.4766037464141846, 'learning_rate': 7.265395894428152e-05, 'epoch': 1.54}
+{'loss': 0.8135, 'grad_norm': 2.7663185596466064, 'learning_rate': 7.262952101661779e-05, 'epoch': 1.54}
+{'loss': 0.4896, 'grad_norm': 1.486636996269226, 'learning_rate': 7.260508308895405e-05, 'epoch': 1.54}
+{'loss': 0.3703, 'grad_norm': 1.2840673923492432, 'learning_rate': 7.258064516129032e-05, 'epoch': 1.54}
+{'loss': 0.7038, 'grad_norm': 1.9025120735168457, 'learning_rate': 7.255620723362658e-05, 'epoch': 1.54}
+{'loss': 0.5779, 'grad_norm': 3.2480320930480957, 'learning_rate': 7.253176930596285e-05, 'epoch': 1.54}
+{'loss': 1.0415, 'grad_norm': 2.224262237548828, 'learning_rate': 7.250733137829911e-05, 'epoch': 1.54}
+{'loss': 1.0646, 'grad_norm': 2.9639852046966553, 'learning_rate': 7.248289345063538e-05, 'epoch': 1.54}
+{'loss': 0.7018, 'grad_norm': 2.493807077407837, 'learning_rate': 7.245845552297164e-05, 'epoch': 1.54}
+{'loss': 0.4942, 'grad_norm': 2.37644624710083, 'learning_rate': 7.243401759530792e-05, 'epoch': 1.54}
+{'loss': 1.0169, 'grad_norm': 2.5154953002929688, 'learning_rate': 7.240957966764417e-05, 'epoch': 1.54}
+{'loss': 1.0743, 'grad_norm': 1.9995393753051758, 'learning_rate': 7.238514173998045e-05, 'epoch': 1.54}
+{'loss': 1.4467, 'grad_norm': 2.441619396209717, 'learning_rate': 7.236070381231671e-05, 'epoch': 1.54}
+{'loss': 0.5267, 'grad_norm': 2.3574626445770264, 'learning_rate': 7.233626588465298e-05, 'epoch': 1.54}
+{'loss': 0.769, 'grad_norm': 1.9852875471115112, 'learning_rate': 7.231182795698924e-05, 'epoch': 1.54}
+{'loss': 1.0293, 'grad_norm': 5.952073574066162, 'learning_rate': 7.228739002932551e-05, 'epoch': 1.54}
+{'loss': 1.2532, 'grad_norm': 3.0914673805236816, 'learning_rate': 7.226295210166177e-05, 'epoch': 1.54}
+{'loss': 0.9199, 'grad_norm': 2.5825436115264893, 'learning_rate': 7.223851417399804e-05, 'epoch': 1.54}
+{'loss': 1.0303, 'grad_norm': 2.3879377841949463, 'learning_rate': 7.22140762463343e-05, 'epoch': 1.54}
+{'loss': 0.1337, 'grad_norm': 0.8389413356781006, 'learning_rate': 7.218963831867057e-05, 'epoch': 1.54}
+{'loss': 0.8003, 'grad_norm': 1.7144027948379517, 'learning_rate': 7.216520039100683e-05, 'epoch': 1.54}
+{'loss': 1.1162, 'grad_norm': 3.8174052238464355, 'learning_rate': 7.214076246334311e-05, 'epoch': 1.54}
+{'loss': 1.6102, 'grad_norm': 3.3473262786865234, 'learning_rate': 7.211632453567936e-05, 'epoch': 1.54}
+{'loss': 0.6261, 'grad_norm': 1.6419070959091187, 'learning_rate': 7.209188660801564e-05, 'epoch': 1.54}
+{'loss': 0.198, 'grad_norm': 0.6053900718688965, 'learning_rate': 7.20674486803519e-05, 'epoch': 1.54}
+{'loss': 0.2027, 'grad_norm': 0.49573662877082825, 'learning_rate': 7.204301075268816e-05, 'epoch': 1.54}
+{'loss': 0.1637, 'grad_norm': 0.4826977849006653, 'learning_rate': 7.201857282502443e-05, 'epoch': 1.54}
+{'loss': 0.1888, 'grad_norm': 0.7587850689888, 'learning_rate': 7.19941348973607e-05, 'epoch': 1.54}
+{'loss': 0.1634, 'grad_norm': 0.565876841545105, 'learning_rate': 7.196969696969696e-05, 'epoch': 1.54}
+{'loss': 0.2195, 'grad_norm': 0.4626244604587555, 'learning_rate': 7.194525904203323e-05, 'epoch': 1.54}
+{'loss': 0.3624, 'grad_norm': 1.5018632411956787, 'learning_rate': 7.19208211143695e-05, 'epoch': 1.54}
+{'loss': 0.2698, 'grad_norm': 0.6011160016059875, 'learning_rate': 7.189638318670576e-05, 'epoch': 1.54}
+{'loss': 0.3386, 'grad_norm': 1.015252947807312, 'learning_rate': 7.187194525904202e-05, 'epoch': 1.54}
+{'loss': 0.1781, 'grad_norm': 1.5964446067810059, 'learning_rate': 7.18475073313783e-05, 'epoch': 1.54}
+{'loss': 0.237, 'grad_norm': 1.3437423706054688, 'learning_rate': 7.182306940371455e-05, 'epoch': 1.54}
+{'loss': 0.2588, 'grad_norm': 0.8205925822257996, 'learning_rate': 7.179863147605083e-05, 'epoch': 1.54}
+{'loss': 0.3411, 'grad_norm': 1.1015372276306152, 'learning_rate': 7.17741935483871e-05, 'epoch': 1.54}
+{'loss': 0.2773, 'grad_norm': 3.2911665439605713, 'learning_rate': 7.174975562072335e-05, 'epoch': 1.54}
+{'loss': 0.4853, 'grad_norm': 6.5218186378479, 'learning_rate': 7.172531769305963e-05, 'epoch': 1.54}
+{'loss': 0.233, 'grad_norm': 1.0145297050476074, 'learning_rate': 7.170087976539589e-05, 'epoch': 1.54}
+{'loss': 0.3493, 'grad_norm': 1.0070481300354004, 'learning_rate': 7.167644183773216e-05, 'epoch': 1.54}
+{'loss': 0.5796, 'grad_norm': 2.540808916091919, 'learning_rate': 7.165200391006842e-05, 'epoch': 1.54}
+{'loss': 0.4986, 'grad_norm': 2.2027969360351562, 'learning_rate': 7.162756598240468e-05, 'epoch': 1.54}
+ 77%|███████▋  | 9858/12776 [1:42:53<21:05,  2.31it/s] 77%|███████▋  | 9859/12776 [1:42:53<19:54,  2.44it/s]                                                       77%|███████▋  | 9859/12776 [1:42:53<19:54,  2.44it/s] 77%|███████▋  | 9860/12776 [1:42:54<19:02,  2.55it/s]                                                       77%|███████▋  | 9860/12776 [1:42:54<19:02,  2.55it/s] 77%|███████▋  | 9861/12776 [1:42:54<19:53,  2.44it/s]                                                       77%|███████▋  | 9861/12776 [1:42:54<19:53,  2.44it/s] 77%|███████▋  | 9862/12776 [1:42:54<18:46,  2.59it/s]                                                       77%|███████▋  | 9862/12776 [1:42:54<18:46,  2.59it/s] 77%|███████▋  | 9863/12776 [1:42:55<17:45,  2.73it/s]                                                       77%|███████▋  | 9863/12776 [1:42:55<17:45,  2.73it/s] 77%|███████▋  | 9864/12776 [1:42:55<16:50,  2.88it/s]                                                       77%|███████▋  | 9864/12776 [1:42:55<16:50,  2.88it/s] 77%|███████▋  | 9865/12776 [1:42:55<16:48,  2.89it/s]                                                       77%|███████▋  | 9865/12776 [1:42:55<16:48,  2.89it/s] 77%|███████▋  | 9866/12776 [1:42:56<15:56,  3.04it/s]                                                       77%|███████▋  | 9866/12776 [1:42:56<15:56,  3.04it/s] 77%|███████▋  | 9867/12776 [1:42:56<15:09,  3.20it/s]                                                       77%|███████▋  | 9867/12776 [1:42:56<15:09,  3.20it/s] 77%|███████▋  | 9868/12776 [1:42:56<14:33,  3.33it/s]                                                       77%|███████▋  | 9868/12776 [1:42:56<14:33,  3.33it/s] 77%|███████▋  | 9869/12776 [1:42:56<14:07,  3.43it/s]                                                       77%|███████▋  | 9869/12776 [1:42:56<14:07,  3.43it/s] 77%|███████▋  | 9870/12776 [1:42:57<13:31,  3.58it/s]                                                       77%|███████▋  | 9870/12776 [1:42:57<13:31,  3.58it/s] 77%|███████▋  | 9871/12776 [1:42:57<13:02,  3.71it/s]                                                       77%|███████▋  | 9871/12776 [1:42:57<13:02,  3.71it/s] 77%|███████▋  | 9872/12776 [1:42:57<12:37,  3.83it/s]                                                       77%|███████▋  | 9872/12776 [1:42:57<12:37,  3.83it/s] 77%|███████▋  | 9873/12776 [1:42:57<12:36,  3.84it/s]                                                       77%|███████▋  | 9873/12776 [1:42:57<12:36,  3.84it/s] 77%|███████▋  | 9874/12776 [1:42:58<12:10,  3.97it/s]                                                       77%|███████▋  | 9874/12776 [1:42:58<12:10,  3.97it/s] 77%|███████▋  | 9875/12776 [1:42:58<11:47,  4.10it/s]                                                       77%|███████▋  | 9875/12776 [1:42:58<11:47,  4.10it/s] 77%|███████▋  | 9876/12776 [1:42:58<11:21,  4.26it/s]                                                       77%|███████▋  | 9876/12776 [1:42:58<11:21,  4.26it/s] 77%|███████▋  | 9877/12776 [1:42:58<11:01,  4.38it/s]                                                       77%|███████▋  | 9877/12776 [1:42:58<11:01,  4.38it/s] 77%|███████▋  | 9878/12776 [1:42:59<11:21,  4.25it/s]                                                       77%|███████▋  | 9878/12776 [1:42:59<11:21,  4.25it/s] 77%|███████▋  | 9879/12776 [1:42:59<10:54,  4.43it/s]                                                       77%|███████▋  | 9879/12776 [1:42:59<10:54,  4.43it/s] 77%|███████▋  | 9880/12776 [1:42:59<10:33,  4.57it/s]                                                       77%|███████▋  | 9880/12776 [1:42:59<10:33,  4.57it/s] 77%|███████▋  | 9881/12776 [1:42:59<10:15,  4.70it/s]                                                       77%|███████▋  | 9881/12776 [1:42:59<10:15,  4.70it/s] 77%|███████▋  | 9882/12776 [1:42:59<10:01,  4.81it/s]                                                       77%|███████▋  | 9882/12776 [1:42:59<10:01,  4.81it/s] 77%|███████▋  | 9883/12776 [1:43:00<09:50,  4.90it/s]                                                       77%|███████▋  | 9883/12776 [1:43:00<09:50,  4.90it/s] 77%|███████▋  | 9884/12776 [1:43:00<10:20,  4.66it/s]                                                       77%|███████▋  | 9884/12776 [1:43:00<10:20,  4.66it/s] 77%|███████▋  | 9885/12776 [1:43:00<09:59,  4.82it/s]                                                       77%|███████▋  | 9885/12776 [1:43:00<09:59,  4.82it/s] 77%|███████▋  | 9886/12776 [1:43:00<09:40,  4.98it/s]                                                       77%|███████▋  | 9886/12776 [1:43:00<09:40,  4.98it/s] 77%|███████▋  | 9887/12776 [1:43:00<09:25,  5.11it/s]                                                       77%|███████▋  | 9887/12776 [1:43:00<09:25,  5.11it/s] 77%|███████▋  | 9888/12776 [1:43:01<17:03,  2.82it/s]                                                       77%|███████▋  | 9888/12776 [1:43:01<17:03,  2.82it/s] 77%|███████▋  | 9889/12776 [1:43:03<33:45,  1.43it/s]                                                       77%|███████▋  | 9889/12776 [1:43:03<33:45,  1.43it/s] 77%|███████▋  | 9890/12776 [1:43:04<38:42,  1.24it/s]                                                       77%|███████▋  | 9890/12776 [1:43:04<38:42,  1.24it/s] 77%|███████▋  | 9891/12776 [1:43:05<41:09,  1.17it/s]                                                       77%|███████▋  | 9891/12776 [1:43:05<41:09,  1.17it/s] 77%|███████▋  | 9892/12776 [1:43:05<40:36,  1.18it/s]                                                       77%|███████▋  | 9892/12776 [1:43:05<40:36,  1.18it/s] 77%|███████▋  | 9893/12776 [1:43:06<38:29,  1.25it/s]                                                       77%|███████▋  | 9893/12776 [1:43:06<38:29,  1.25it/s] 77%|███████▋  | 9894/12776 [1:43:07<37:06,  1.29it/s]                                                       77%|███████▋  | 9894/12776 [1:43:07<37:06,  1.29it/s] 77%|███████▋  | 9895/12776 [1:43:07<35:09,  1.37it/s]                                                       77%|███████▋  | 9895/12776 [1:43:07<35:09,  1.37it/s] 77%|███████▋  | 9896/12776 [1:43:08<33:19,  1.44it/s]                                                       77%|███████▋  | 9896/12776 [1:43:08<33:19,  1.44it/s] 77%|███████▋  | 9897/12776 [1:43:09<31:29,  1.52it/s]                                                       77%|███████▋  | 9897/12776 [1:43:09<31:29,  1.52it/s] 77%|███████▋  | 9898/12776 [1:43:09<29:45,  1.61it/s]                                                       77%|███████▋  | 9898/12776 [1:43:09<29:45,  1.61it/s] 77%|███████▋  | 9899/12776 [1:43:10<28:12,  1.70it/s]                                                       77%|███████▋  | 9899/12776 [1:43:10<28:12,  1.70it/s] 77%|███████▋  | 9900/12776 [1:43:10<27:47,  1.72it/s]                                                       77%|███████▋  | 9900/12776 [1:43:10<27:47,  1.72it/s] 77%|███████▋  | 9901/12776 [1:43:11<26:02,  1.84it/s]                                                       77%|███████▋  | 9901/12776 [1:43:11<26:02,  1.84it/s] 78%|███████▊  | 9902/12776 [1:43:11<27:18,  1.75it/s]                                                       78%|███████▊  | 9902/12776 [1:43:11<27:18,  1.75it/s] 78%|███████▊  | 9903/12776 [1:43:12<25:19,  1.89it/s]                                                       78%|███████▊  | 9903/12776 [1:43:12<25:19,  1.89it/s] 78%|███████▊  | 9904/12776 [1:43:12<24:57,  1.92it/s]                                                       78%|███████▊  | 9904/12776 [1:43:12<24:57,  1.92it/s] 78%|███████▊  | 9905/12776 [1:43:13<23:14,  2.06it/s]                                                       78%|███████▊  | 9905/12776 [1:43:13<23:14,  2.06it/s] 78%|███████▊  | 9906/12776 [1:43:13<21:53,  2.18it/s]                                                       78%|███████▊  | 9906/12776 [1:43:13<21:53,  2.18it/s] 78%|███████▊  | 9907/12776 [1:43:14<23:05,  2.07it/s]                                                       78%|███████▊  | 9907/12776 [1:43:14<23:05,  2.07it/s] 78%|███████▊  | 9908/12776 [1:43:14<21:27,  2.23it/s]                                                       78%|███████▊  | 9908/12776 [1:43:14<21:27,  2.23it/s] 78%|███████▊  | 9909/12776 [1:43:14<20:07,  2.37it/s]                                                       78%|███████▊  | 9909/12776 [1:43:14<20:07,  2.37it/s] 78%|███████▊  | 9910/12776 [1:43:15<19:40,  2.43it/s]                                                       78%|███████▊  | 9910/12776 [1:43:15<19:40,  2.43it/s] 78%|███████▊  | 9911/12776 [1:43:15<18:33,  2.57it/s]                                                       78%|███████▊  | 9911/12776 [1:43:15<18:33,  2.57it/s] 78%|███████▊  | 9912/12776 [1:43:15<17:39,  2.70it/s]                                                       78%|███████▊  | 9912/12776 [1:43:15<17:39,  2.70it/s] 78%|███████▊  | 9913/12776 [1:43:16<16:57,  2.81it/s]                                                       78%|███████▊  | 9913/12776 [1:43:16<16:57,  2.81it/s] 78%|███████▊  | 9914/12776 [1:43:16<16:09,  2.95it/s]                                                       78%|███████▊  | 9914/12776 [1:43:16<16:09,  2.95it/s] 78%|███████▊  | 9915/12776 [1:43:16<15:28,  3.08it/s]                                                       78%|███████▊  | 9915/12776 [1:43:16<15:28,  3.08it/s] 78%|███████▊  | 9916/12776 [1:43:17<14:52,  3.20it/s]                                                       78%|███████▊  | 9916/12776 [1:43:17<14:52,  3.20it/s] 78%|███████▊  | 9917/12776 [1:43:17<15:53,  3.00it/s]                                                       78%|███████▊  | 9917/12776 [1:43:17<15:53,  3.00it/s] 78%|███████▊  | 9918/12776 [1:43:17<14:56,  3.19it/s]                                                       78%|███████▊  | 9918/12776 [1:43:17<14:56,  3.19it/s] 78%|███████▊  | 9919/12776 [1:43:17<14:14,  3.34it/s]                                                       78%|███████▊  | 9919/12776 [1:43:17<14:14,  3.34it/s] 78%|███████▊  | 9920/12776 [1:43:18<13:37,  3.50it/s]                                                       78%|███████▊  | 9920/12776 [1:43:18<13:37,  3.50it/s] 78%|███████▊  | 9921/12776 [1:43:18<14:29,  3.28it/s]                                                       78%|███████▊  | 9921/12776 [1:43:18<14:29,  3.28it/s] 78%|███████▊  | 9922/12776 [1:43:18<13:37,  3.49it/s]                                                       78%|███████▊  | 9922/12776 [1:43:18<13:37,  3.49it/s] 78%|███████▊  | 9923/12776 [1:43:19<12:56,  3.67it/s]                                                       78%|███████▊  | 9923/12776 [1:43:19<12:56,  3.67it/s] 78%|███████▊  | 9924/12776 [1:43:19<12:23,  3.84it/s]                                                       78%|███████▊  | 9924/12776 [1:43:19<12:23,  3.84it/s] 78%|███████▊  | 9925/12776 [1:43:19<12:32,  3.79it/s]                                                       78%|███████▊  | 9925/12776 [1:43:19<12:32,  3.79it/s] 78%|███████▊  | 9926/12776 [1:43:19<11:56,  3.98it/s]                                                       78%|███████▊  | 9926/12776 [1:43:19<11:56,  3.98it/s] 78%|███████▊  | 9927/12776 [1:43:20<11:20,  4.19it/s]                                                       78%|███████▊  | 9927/12776 [1:43:20<11:20,  4.19it/s] 78%|███████▊  | 9928/12776 [1:43:20<10:54,  4.35it/s]                                                       78%|███████▊  | 9928/12776 [1:43:20<10:54,  4.35it/s] 78%|███████▊  | 9929/12776 [1:43:20<10:33,  4.49it/s]                                                       78%|███████▊  | 9929/12776 [1:43:20<10:33,  4.49it/s] 78%|███████▊  | 9930/12776 [1:43:20<11:12,  4.23it/s]                                                       78%|███████▊  | 9930/12776 [1:43:20<11:12,  4.23it/s] 78%|███████▊  | 9931/12776 [1:43:20<10:40,  4.44it/s]                                                       78%|███████▊  | 9931/12776 [1:43:20<10:40,  4.44it/s] 78%|███████▊  | 9932/12776 [1:43:21<10:14,  4.63it/s]                                                       78%|███████▊  | 9932/12776 [1:43:21<10:14,  4.63it/s] 78%|███████▊  | 9933/12776 [1:43:21<09:55,  4.77it/s]                                                       78%|███████▊  | 9933/12776 [1:43:21<09:55,  4.77it/s] 78%|███████▊  | 9934/12776 [1:43:21<09:39,  4.91it/s]                                                       78%|███████▊  | 9934/12776 [1:43:21<09:39,  4.91it/s] 78%|███████▊  | 9935/12776 [1:43:21<09:23,  5.04it/s]                                                       78%|███████▊  | 9935/12776 [1:43:21<09:23,  5.04it/s] 78%|███████▊  | 9936/12776 [1:43:21<10:47,  4.39it/s]                                                      {'loss': 0.427, 'grad_norm': 2.4842472076416016, 'learning_rate': 7.160312805474095e-05, 'epoch': 1.54}
+{'loss': 0.4044, 'grad_norm': 2.618431568145752, 'learning_rate': 7.157869012707721e-05, 'epoch': 1.54}
+{'loss': 0.5528, 'grad_norm': 2.222273349761963, 'learning_rate': 7.155425219941349e-05, 'epoch': 1.54}
+{'loss': 0.5638, 'grad_norm': 1.5825259685516357, 'learning_rate': 7.152981427174974e-05, 'epoch': 1.54}
+{'loss': 0.4018, 'grad_norm': 12.67103099822998, 'learning_rate': 7.150537634408601e-05, 'epoch': 1.54}
+{'loss': 0.5984, 'grad_norm': 1.7103662490844727, 'learning_rate': 7.148093841642229e-05, 'epoch': 1.54}
+{'loss': 0.6249, 'grad_norm': 1.5723588466644287, 'learning_rate': 7.145650048875854e-05, 'epoch': 1.54}
+{'loss': 0.6729, 'grad_norm': 1.4810919761657715, 'learning_rate': 7.143206256109482e-05, 'epoch': 1.54}
+{'loss': 0.373, 'grad_norm': 2.4991166591644287, 'learning_rate': 7.140762463343108e-05, 'epoch': 1.54}
+{'loss': 0.3133, 'grad_norm': 1.0830026865005493, 'learning_rate': 7.138318670576735e-05, 'epoch': 1.54}
+{'loss': 0.5774, 'grad_norm': 1.9214036464691162, 'learning_rate': 7.135874877810361e-05, 'epoch': 1.54}
+{'loss': 0.7693, 'grad_norm': 1.623348355293274, 'learning_rate': 7.133431085043988e-05, 'epoch': 1.54}
+{'loss': 0.7588, 'grad_norm': 2.111285924911499, 'learning_rate': 7.130987292277614e-05, 'epoch': 1.55}
+{'loss': 0.633, 'grad_norm': 4.0085248947143555, 'learning_rate': 7.12854349951124e-05, 'epoch': 1.55}
+{'loss': 0.5439, 'grad_norm': 1.885757327079773, 'learning_rate': 7.126099706744868e-05, 'epoch': 1.55}
+{'loss': 0.5721, 'grad_norm': 2.1260712146759033, 'learning_rate': 7.123655913978494e-05, 'epoch': 1.55}
+{'loss': 0.598, 'grad_norm': 1.1092246770858765, 'learning_rate': 7.12121212121212e-05, 'epoch': 1.55}
+{'loss': 1.0518, 'grad_norm': 2.1099350452423096, 'learning_rate': 7.118768328445748e-05, 'epoch': 1.55}
+{'loss': 0.974, 'grad_norm': 2.708494186401367, 'learning_rate': 7.116324535679373e-05, 'epoch': 1.55}
+{'loss': 1.064, 'grad_norm': 2.1030566692352295, 'learning_rate': 7.113880742913001e-05, 'epoch': 1.55}
+{'loss': 0.6343, 'grad_norm': 1.5087107419967651, 'learning_rate': 7.111436950146627e-05, 'epoch': 1.55}
+{'loss': 1.0731, 'grad_norm': 2.708134651184082, 'learning_rate': 7.108993157380254e-05, 'epoch': 1.55}
+{'loss': 1.0076, 'grad_norm': 2.4557900428771973, 'learning_rate': 7.10654936461388e-05, 'epoch': 1.55}
+{'loss': 0.6316, 'grad_norm': 4.119977951049805, 'learning_rate': 7.104105571847507e-05, 'epoch': 1.55}
+{'loss': 1.0986, 'grad_norm': 9.010337829589844, 'learning_rate': 7.101661779081133e-05, 'epoch': 1.55}
+{'loss': 1.4029, 'grad_norm': 3.3070311546325684, 'learning_rate': 7.09921798631476e-05, 'epoch': 1.55}
+{'loss': 0.467, 'grad_norm': 1.8002490997314453, 'learning_rate': 7.096774193548386e-05, 'epoch': 1.55}
+{'loss': 0.6966, 'grad_norm': 1.5535470247268677, 'learning_rate': 7.094330400782013e-05, 'epoch': 1.55}
+{'loss': 0.4247, 'grad_norm': 1.3492341041564941, 'learning_rate': 7.091886608015639e-05, 'epoch': 1.55}
+{'loss': 0.8019, 'grad_norm': 1.93979811668396, 'learning_rate': 7.089442815249267e-05, 'epoch': 1.55}
+{'loss': 0.8177, 'grad_norm': 1.897113561630249, 'learning_rate': 7.086999022482892e-05, 'epoch': 1.55}
+{'loss': 0.2577, 'grad_norm': 0.4913039207458496, 'learning_rate': 7.08455522971652e-05, 'epoch': 1.55}
+{'loss': 0.2678, 'grad_norm': 0.9754254221916199, 'learning_rate': 7.082111436950146e-05, 'epoch': 1.55}
+{'loss': 0.2229, 'grad_norm': 0.5141871571540833, 'learning_rate': 7.079667644183773e-05, 'epoch': 1.55}
+{'loss': 0.2285, 'grad_norm': 0.7318794131278992, 'learning_rate': 7.0772238514174e-05, 'epoch': 1.55}
+{'loss': 0.2582, 'grad_norm': 1.011592984199524, 'learning_rate': 7.074780058651026e-05, 'epoch': 1.55}
+{'loss': 0.3078, 'grad_norm': 0.5550967454910278, 'learning_rate': 7.072336265884652e-05, 'epoch': 1.55}
+{'loss': 0.1527, 'grad_norm': 0.6541218757629395, 'learning_rate': 7.069892473118279e-05, 'epoch': 1.55}
+{'loss': 0.309, 'grad_norm': 0.6758529543876648, 'learning_rate': 7.067448680351905e-05, 'epoch': 1.55}
+{'loss': 0.234, 'grad_norm': 0.5618512630462646, 'learning_rate': 7.065004887585532e-05, 'epoch': 1.55}
+{'loss': 0.3533, 'grad_norm': 0.8279595971107483, 'learning_rate': 7.062561094819158e-05, 'epoch': 1.55}
+{'loss': 0.1606, 'grad_norm': 0.7879864573478699, 'learning_rate': 7.060117302052786e-05, 'epoch': 1.55}
+{'loss': 0.33, 'grad_norm': 0.8414984941482544, 'learning_rate': 7.057673509286411e-05, 'epoch': 1.55}
+{'loss': 0.3478, 'grad_norm': 0.6734075546264648, 'learning_rate': 7.055229716520039e-05, 'epoch': 1.55}
+{'loss': 0.4642, 'grad_norm': 0.876607358455658, 'learning_rate': 7.052785923753666e-05, 'epoch': 1.55}
+{'loss': 0.4359, 'grad_norm': 1.32351553440094, 'learning_rate': 7.050342130987292e-05, 'epoch': 1.55}
+{'loss': 0.3068, 'grad_norm': 0.7632149457931519, 'learning_rate': 7.047898338220918e-05, 'epoch': 1.55}
+{'loss': 0.2885, 'grad_norm': 1.024189829826355, 'learning_rate': 7.045454545454545e-05, 'epoch': 1.55}
+{'loss': 0.3361, 'grad_norm': 1.50118887424469, 'learning_rate': 7.043010752688171e-05, 'epoch': 1.55}
+{'loss': 0.7564, 'grad_norm': 3.2080376148223877, 'learning_rate': 7.040566959921798e-05, 'epoch': 1.55}
+{'loss': 0.471, 'grad_norm': 1.7170158624649048, 'learning_rate': 7.038123167155424e-05, 'epoch': 1.55}
+{'loss': 0.4193, 'grad_norm': 2.007455825805664, 'learning_rate': 7.035679374389051e-05, 'epoch': 1.55}
+{'loss': 0.4327, 'grad_norm': 1.438209891319275, 'learning_rate': 7.033235581622677e-05, 'epoch': 1.55}
+{'loss': 0.5447, 'grad_norm': 1.6576300859451294, 'learning_rate': 7.030791788856304e-05, 'epoch': 1.55}
+{'loss': 0.9359, 'grad_norm': 1.656591534614563, 'learning_rate': 7.02834799608993e-05, 'epoch': 1.55}
+{'loss': 0.7214, 'grad_norm': 2.055999279022217, 'learning_rate': 7.025904203323558e-05, 'epoch': 1.55}
+{'loss': 0.8004, 'grad_norm': 2.960740089416504, 'learning_rate': 7.023460410557183e-05, 'epoch': 1.55}
+{'loss': 0.2811, 'grad_norm': 1.1211111545562744, 'learning_rate': 7.021016617790811e-05, 'epoch': 1.55}
+{'loss': 0.4153, 'grad_norm': 0.9293329119682312, 'learning_rate': 7.018572825024438e-05, 'epoch': 1.55}
+{'loss': 0.6008, 'grad_norm': 1.6393110752105713, 'learning_rate': 7.016129032258064e-05, 'epoch': 1.55}
+{'loss': 0.3159, 'grad_norm': 2.427534341812134, 'learning_rate': 7.01368523949169e-05, 'epoch': 1.55}
+{'loss': 0.677, 'grad_norm': 2.0107951164245605, 'learning_rate': 7.011241446725317e-05, 'epoch': 1.55}
+{'loss': 0.4769, 'grad_norm': 1.3930466175079346, 'learning_rate': 7.008797653958944e-05, 'epoch': 1.55}
+{'loss': 0.6934, 'grad_norm': 3.030660390853882, 'learning_rate': 7.00635386119257e-05, 'epoch': 1.55}
+{'loss': 0.3575, 'grad_norm': 1.1675511598587036, 'learning_rate': 7.003910068426196e-05, 'epoch': 1.55}
+{'loss': 0.7557, 'grad_norm': 2.445157527923584, 'learning_rate': 7.001466275659823e-05, 'epoch': 1.55}
+{'loss': 0.7685, 'grad_norm': 2.884352445602417, 'learning_rate': 6.99902248289345e-05, 'epoch': 1.55}
+{'loss': 0.7295, 'grad_norm': 1.9358322620391846, 'learning_rate': 6.996578690127077e-05, 'epoch': 1.55}
+{'loss': 0.6324, 'grad_norm': 2.95656681060791, 'learning_rate': 6.994134897360702e-05, 'epoch': 1.55}
+{'loss': 0.4526, 'grad_norm': 1.8009204864501953, 'learning_rate': 6.99169110459433e-05, 'epoch': 1.55}
+{'loss': 0.9872, 'grad_norm': 3.0422096252441406, 'learning_rate': 6.989247311827957e-05, 'epoch': 1.55}
+{'loss': 0.8607, 'grad_norm': 4.271030902862549, 'learning_rate': 6.986803519061583e-05, 'epoch': 1.55}
+{'loss': 0.8247, 'grad_norm': 2.318971633911133, 'learning_rate': 6.98435972629521e-05, 'epoch': 1.55}
+{'loss': 0.9417, 'grad_norm': 1.7466161251068115, 'learning_rate': 6.981915933528836e-05, 'epoch': 1.55}
+{'loss': 1.8184, 'grad_norm': 2.927077531814575, 'learning_rate': 6.979472140762463e-05, 'epoch': 1.55}
+{'loss': 0.8251, 'grad_norm': 2.963963747024536, 'learning_rate': 6.977028347996089e-05, 'epoch': 1.55}
+{'loss': 0.8453, 'grad_norm': 2.6687653064727783, 'learning_rate': 6.974584555229716e-05, 'epoch': 1.56}
+{'loss': 0.7546, 'grad_norm': 3.3215959072113037, 'learning_rate': 6.972140762463342e-05, 'epoch': 1.56}
+ 78%|███████▊  | 9936/12776 [1:43:21<10:47,  4.39it/s] 78%|███████▊  | 9937/12776 [1:43:22<10:09,  4.66it/s]                                                       78%|███████▊  | 9937/12776 [1:43:22<10:09,  4.66it/s] 78%|███████▊  | 9938/12776 [1:43:22<16:14,  2.91it/s]                                                       78%|███████▊  | 9938/12776 [1:43:22<16:14,  2.91it/s] 78%|███████▊  | 9939/12776 [1:43:24<30:12,  1.57it/s]                                                       78%|███████▊  | 9939/12776 [1:43:24<30:12,  1.57it/s] 78%|███████▊  | 9940/12776 [1:43:25<34:14,  1.38it/s]                                                       78%|███████▊  | 9940/12776 [1:43:25<34:14,  1.38it/s] 78%|███████▊  | 9941/12776 [1:43:25<35:16,  1.34it/s]                                                       78%|███████▊  | 9941/12776 [1:43:25<35:16,  1.34it/s] 78%|███████▊  | 9942/12776 [1:43:26<37:01,  1.28it/s]                                                       78%|███████▊  | 9942/12776 [1:43:26<37:01,  1.28it/s] 78%|███████▊  | 9943/12776 [1:43:27<35:37,  1.33it/s]                                                       78%|███████▊  | 9943/12776 [1:43:27<35:37,  1.33it/s] 78%|███████▊  | 9944/12776 [1:43:28<34:04,  1.39it/s]                                                       78%|███████▊  | 9944/12776 [1:43:28<34:04,  1.39it/s] 78%|███████▊  | 9945/12776 [1:43:28<33:45,  1.40it/s]                                                       78%|███████▊  | 9945/12776 [1:43:28<33:45,  1.40it/s] 78%|███████▊  | 9946/12776 [1:43:29<31:50,  1.48it/s]                                                       78%|███████▊  | 9946/12776 [1:43:29<31:50,  1.48it/s] 78%|███████▊  | 9947/12776 [1:43:29<30:25,  1.55it/s]                                                       78%|███████▊  | 9947/12776 [1:43:29<30:25,  1.55it/s] 78%|███████▊  | 9948/12776 [1:43:30<28:49,  1.64it/s]                                                       78%|███████▊  | 9948/12776 [1:43:30<28:49,  1.64it/s] 78%|███████▊  | 9949/12776 [1:43:31<29:32,  1.59it/s]                                                       78%|███████▊  | 9949/12776 [1:43:31<29:32,  1.59it/s] 78%|███████▊  | 9950/12776 [1:43:31<27:37,  1.71it/s]                                                       78%|███████▊  | 9950/12776 [1:43:31<27:37,  1.71it/s] 78%|███████▊  | 9951/12776 [1:43:32<25:46,  1.83it/s]                                                       78%|███████▊  | 9951/12776 [1:43:32<25:46,  1.83it/s] 78%|███████▊  | 9952/12776 [1:43:32<24:24,  1.93it/s]                                                       78%|███████▊  | 9952/12776 [1:43:32<24:24,  1.93it/s] 78%|███████▊  | 9953/12776 [1:43:32<23:08,  2.03it/s]                                                       78%|███████▊  | 9953/12776 [1:43:32<23:08,  2.03it/s] 78%|███████▊  | 9954/12776 [1:43:33<22:58,  2.05it/s]                                                       78%|███████▊  | 9954/12776 [1:43:33<22:58,  2.05it/s] 78%|███████▊  | 9955/12776 [1:43:33<21:34,  2.18it/s]                                                       78%|███████▊  | 9955/12776 [1:43:33<21:34,  2.18it/s] 78%|███████▊  | 9956/12776 [1:43:34<20:30,  2.29it/s]                                                       78%|███████▊  | 9956/12776 [1:43:34<20:30,  2.29it/s] 78%|███████▊  | 9957/12776 [1:43:34<19:57,  2.35it/s]                                                       78%|███████▊  | 9957/12776 [1:43:34<19:57,  2.35it/s] 78%|███████▊  | 9958/12776 [1:43:34<18:59,  2.47it/s]                                                       78%|███████▊  | 9958/12776 [1:43:34<18:59,  2.47it/s] 78%|███████▊  | 9959/12776 [1:43:35<18:11,  2.58it/s]                                                       78%|███████▊  | 9959/12776 [1:43:35<18:11,  2.58it/s] 78%|███████▊  | 9960/12776 [1:43:35<18:50,  2.49it/s]                                                       78%|███████▊  | 9960/12776 [1:43:35<18:50,  2.49it/s] 78%|███████▊  | 9961/12776 [1:43:36<17:52,  2.63it/s]                                                       78%|███████▊  | 9961/12776 [1:43:36<17:52,  2.63it/s] 78%|███████▊  | 9962/12776 [1:43:36<17:00,  2.76it/s]                                                       78%|███████▊  | 9962/12776 [1:43:36<17:00,  2.76it/s] 78%|███████▊  | 9963/12776 [1:43:36<16:13,  2.89it/s]                                                       78%|███████▊  | 9963/12776 [1:43:36<16:13,  2.89it/s] 78%|███████▊  | 9964/12776 [1:43:37<16:07,  2.91it/s]                                                       78%|███████▊  | 9964/12776 [1:43:37<16:07,  2.91it/s] 78%|███████▊  | 9965/12776 [1:43:37<15:23,  3.04it/s]                                                       78%|███████▊  | 9965/12776 [1:43:37<15:23,  3.04it/s] 78%|███████▊  | 9966/12776 [1:43:37<14:43,  3.18it/s]                                                       78%|███████▊  | 9966/12776 [1:43:37<14:43,  3.18it/s] 78%|███████▊  | 9967/12776 [1:43:37<14:08,  3.31it/s]                                                       78%|███████▊  | 9967/12776 [1:43:37<14:08,  3.31it/s] 78%|███████▊  | 9968/12776 [1:43:38<13:41,  3.42it/s]                                                       78%|███████▊  | 9968/12776 [1:43:38<13:41,  3.42it/s] 78%|███████▊  | 9969/12776 [1:43:38<13:11,  3.55it/s]                                                       78%|███████▊  | 9969/12776 [1:43:38<13:11,  3.55it/s] 78%|███████▊  | 9970/12776 [1:43:38<12:44,  3.67it/s]                                                       78%|███████▊  | 9970/12776 [1:43:38<12:44,  3.67it/s] 78%|███████▊  | 9971/12776 [1:43:38<12:20,  3.79it/s]                                                       78%|███████▊  | 9971/12776 [1:43:38<12:20,  3.79it/s] 78%|███████▊  | 9972/12776 [1:43:39<13:38,  3.43it/s]                                                       78%|███████▊  | 9972/12776 [1:43:39<13:38,  3.43it/s] 78%|███████▊  | 9973/12776 [1:43:39<12:48,  3.65it/s]                                                       78%|███████▊  | 9973/12776 [1:43:39<12:48,  3.65it/s] 78%|███████▊  | 9974/12776 [1:43:39<12:05,  3.86it/s]                                                       78%|███████▊  | 9974/12776 [1:43:39<12:05,  3.86it/s] 78%|███████▊  | 9975/12776 [1:43:39<11:29,  4.06it/s]                                                       78%|███████▊  | 9975/12776 [1:43:39<11:29,  4.06it/s] 78%|███████▊  | 9976/12776 [1:43:40<12:23,  3.77it/s]                                                       78%|███████▊  | 9976/12776 [1:43:40<12:23,  3.77it/s] 78%|███████▊  | 9977/12776 [1:43:40<11:43,  3.98it/s]                                                       78%|███████▊  | 9977/12776 [1:43:40<11:43,  3.98it/s] 78%|███████▊  | 9978/12776 [1:43:40<11:11,  4.17it/s]                                                       78%|███████▊  | 9978/12776 [1:43:40<11:11,  4.17it/s] 78%|███████▊  | 9979/12776 [1:43:40<10:43,  4.34it/s]                                                       78%|███████▊  | 9979/12776 [1:43:40<10:43,  4.34it/s] 78%|███████▊  | 9980/12776 [1:43:41<10:20,  4.51it/s]                                                       78%|███████▊  | 9980/12776 [1:43:41<10:20,  4.51it/s] 78%|███████▊  | 9981/12776 [1:43:41<11:24,  4.08it/s]                                                       78%|███████▊  | 9981/12776 [1:43:41<11:24,  4.08it/s] 78%|███████▊  | 9982/12776 [1:43:41<10:45,  4.33it/s]                                                       78%|███████▊  | 9982/12776 [1:43:41<10:45,  4.33it/s] 78%|███████▊  | 9983/12776 [1:43:41<10:16,  4.53it/s]                                                       78%|███████▊  | 9983/12776 [1:43:41<10:16,  4.53it/s] 78%|███████▊  | 9984/12776 [1:43:41<09:54,  4.70it/s]                                                       78%|███████▊  | 9984/12776 [1:43:41<09:54,  4.70it/s] 78%|███████▊  | 9985/12776 [1:43:42<09:36,  4.84it/s]                                                       78%|███████▊  | 9985/12776 [1:43:42<09:36,  4.84it/s] 78%|███████▊  | 9986/12776 [1:43:42<09:19,  4.99it/s]                                                       78%|███████▊  | 9986/12776 [1:43:42<09:19,  4.99it/s] 78%|███████▊  | 9987/12776 [1:43:42<10:35,  4.39it/s]                                                       78%|███████▊  | 9987/12776 [1:43:42<10:35,  4.39it/s] 78%|███████▊  | 9988/12776 [1:43:43<16:54,  2.75it/s]                                                       78%|███████▊  | 9988/12776 [1:43:43<16:54,  2.75it/s] 78%|███████▊  | 9989/12776 [1:43:44<29:03,  1.60it/s]                                                       78%|███████▊  | 9989/12776 [1:43:44<29:03,  1.60it/s] 78%|███████▊  | 9990/12776 [1:43:45<33:57,  1.37it/s]                                                       78%|███████▊  | 9990/12776 [1:43:45<33:57,  1.37it/s] 78%|███████▊  | 9991/12776 [1:43:46<36:14,  1.28it/s]                                                       78%|███████▊  | 9991/12776 [1:43:46<36:14,  1.28it/s] 78%|███████▊  | 9992/12776 [1:43:47<35:59,  1.29it/s]                                                       78%|███████▊  | 9992/12776 [1:43:47<35:59,  1.29it/s] 78%|███████▊  | 9993/12776 [1:43:47<35:29,  1.31it/s]                                                       78%|███████▊  | 9993/12776 [1:43:47<35:29,  1.31it/s] 78%|███████▊  | 9994/12776 [1:43:48<34:35,  1.34it/s]                                                       78%|███████▊  | 9994/12776 [1:43:48<34:35,  1.34it/s] 78%|███████▊  | 9995/12776 [1:43:49<35:02,  1.32it/s]                                                       78%|███████▊  | 9995/12776 [1:43:49<35:02,  1.32it/s] 78%|███████▊  | 9996/12776 [1:43:50<33:10,  1.40it/s]                                                       78%|███████▊  | 9996/12776 [1:43:50<33:10,  1.40it/s] 78%|███████▊  | 9997/12776 [1:43:50<31:07,  1.49it/s]                                                       78%|███████▊  | 9997/12776 [1:43:50<31:07,  1.49it/s] 78%|███████▊  | 9998/12776 [1:43:51<29:31,  1.57it/s]                                                       78%|███████▊  | 9998/12776 [1:43:51<29:31,  1.57it/s] 78%|███████▊  | 9999/12776 [1:43:51<28:39,  1.61it/s]                                                       78%|███████▊  | 9999/12776 [1:43:51<28:39,  1.61it/s] 78%|███████▊  | 10000/12776 [1:43:52<27:07,  1.71it/s]                                                        78%|███████▊  | 10000/12776 [1:43:52<27:07,  1.71it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 1.5532, 'grad_norm': 2.8623807430267334, 'learning_rate': 6.969696969696969e-05, 'epoch': 1.56}
+{'loss': 0.8139, 'grad_norm': 2.0836265087127686, 'learning_rate': 6.967253176930596e-05, 'epoch': 1.56}
+{'loss': 0.8304, 'grad_norm': 1.388414978981018, 'learning_rate': 6.964809384164222e-05, 'epoch': 1.56}
+{'loss': 0.1613, 'grad_norm': 0.9456003904342651, 'learning_rate': 6.96236559139785e-05, 'epoch': 1.56}
+{'loss': 0.121, 'grad_norm': 0.5043039917945862, 'learning_rate': 6.959921798631476e-05, 'epoch': 1.56}
+{'loss': 0.2142, 'grad_norm': 0.7678887248039246, 'learning_rate': 6.957478005865102e-05, 'epoch': 1.56}
+{'loss': 0.2479, 'grad_norm': 0.7166454792022705, 'learning_rate': 6.955034213098729e-05, 'epoch': 1.56}
+{'loss': 0.2025, 'grad_norm': 0.4818023145198822, 'learning_rate': 6.952590420332355e-05, 'epoch': 1.56}
+{'loss': 0.1756, 'grad_norm': 0.3814961314201355, 'learning_rate': 6.950146627565982e-05, 'epoch': 1.56}
+{'loss': 0.2258, 'grad_norm': 0.6317948698997498, 'learning_rate': 6.947702834799608e-05, 'epoch': 1.56}
+{'loss': 0.2127, 'grad_norm': 0.6837334036827087, 'learning_rate': 6.945259042033235e-05, 'epoch': 1.56}
+{'loss': 0.2307, 'grad_norm': 0.6503742337226868, 'learning_rate': 6.942815249266861e-05, 'epoch': 1.56}
+{'loss': 0.3621, 'grad_norm': 0.9390753507614136, 'learning_rate': 6.940371456500488e-05, 'epoch': 1.56}
+{'loss': 0.2944, 'grad_norm': 1.152475118637085, 'learning_rate': 6.937927663734116e-05, 'epoch': 1.56}
+{'loss': 0.1876, 'grad_norm': 0.7496026754379272, 'learning_rate': 6.93548387096774e-05, 'epoch': 1.56}
+{'loss': 0.2536, 'grad_norm': 0.6168410778045654, 'learning_rate': 6.933040078201368e-05, 'epoch': 1.56}
+{'loss': 0.3867, 'grad_norm': 1.393839716911316, 'learning_rate': 6.930596285434995e-05, 'epoch': 1.56}
+{'loss': 0.1936, 'grad_norm': 0.946195125579834, 'learning_rate': 6.928152492668621e-05, 'epoch': 1.56}
+{'loss': 0.3014, 'grad_norm': 1.4753316640853882, 'learning_rate': 6.925708699902248e-05, 'epoch': 1.56}
+{'loss': 0.4375, 'grad_norm': 1.1015839576721191, 'learning_rate': 6.923264907135874e-05, 'epoch': 1.56}
+{'loss': 0.701, 'grad_norm': 1.402594804763794, 'learning_rate': 6.920821114369501e-05, 'epoch': 1.56}
+{'loss': 0.3449, 'grad_norm': 2.3376095294952393, 'learning_rate': 6.918377321603127e-05, 'epoch': 1.56}
+{'loss': 0.2373, 'grad_norm': 0.5967291593551636, 'learning_rate': 6.915933528836754e-05, 'epoch': 1.56}
+{'loss': 0.4976, 'grad_norm': 2.35986065864563, 'learning_rate': 6.91348973607038e-05, 'epoch': 1.56}
+{'loss': 0.4235, 'grad_norm': 1.496276617050171, 'learning_rate': 6.911045943304007e-05, 'epoch': 1.56}
+{'loss': 0.5994, 'grad_norm': 3.5515880584716797, 'learning_rate': 6.908602150537635e-05, 'epoch': 1.56}
+{'loss': 0.3619, 'grad_norm': 1.0382581949234009, 'learning_rate': 6.90615835777126e-05, 'epoch': 1.56}
+{'loss': 0.7072, 'grad_norm': 1.326045274734497, 'learning_rate': 6.903714565004888e-05, 'epoch': 1.56}
+{'loss': 0.9478, 'grad_norm': 3.173861265182495, 'learning_rate': 6.901270772238514e-05, 'epoch': 1.56}
+{'loss': 0.5995, 'grad_norm': 1.280656099319458, 'learning_rate': 6.898826979472139e-05, 'epoch': 1.56}
+{'loss': 0.415, 'grad_norm': 1.5048918724060059, 'learning_rate': 6.896383186705767e-05, 'epoch': 1.56}
+{'loss': 0.7435, 'grad_norm': 1.7540141344070435, 'learning_rate': 6.893939393939393e-05, 'epoch': 1.56}
+{'loss': 0.6369, 'grad_norm': 1.4080053567886353, 'learning_rate': 6.89149560117302e-05, 'epoch': 1.56}
+{'loss': 0.4578, 'grad_norm': 1.5890631675720215, 'learning_rate': 6.889051808406646e-05, 'epoch': 1.56}
+{'loss': 0.6443, 'grad_norm': 1.6475311517715454, 'learning_rate': 6.886608015640273e-05, 'epoch': 1.56}
+{'loss': 0.5241, 'grad_norm': 1.769564151763916, 'learning_rate': 6.8841642228739e-05, 'epoch': 1.56}
+{'loss': 0.7585, 'grad_norm': 2.2985165119171143, 'learning_rate': 6.881720430107526e-05, 'epoch': 1.56}
+{'loss': 0.8987, 'grad_norm': 1.865506649017334, 'learning_rate': 6.879276637341154e-05, 'epoch': 1.56}
+{'loss': 1.1079, 'grad_norm': 2.5502965450286865, 'learning_rate': 6.876832844574779e-05, 'epoch': 1.56}
+{'loss': 1.0968, 'grad_norm': 3.633234977722168, 'learning_rate': 6.874389051808407e-05, 'epoch': 1.56}
+{'loss': 0.5893, 'grad_norm': 2.496279239654541, 'learning_rate': 6.871945259042033e-05, 'epoch': 1.56}
+{'loss': 0.8225, 'grad_norm': 2.3446414470672607, 'learning_rate': 6.869501466275658e-05, 'epoch': 1.56}
+{'loss': 0.819, 'grad_norm': 1.9929825067520142, 'learning_rate': 6.867057673509286e-05, 'epoch': 1.56}
+{'loss': 1.3806, 'grad_norm': 5.4597086906433105, 'learning_rate': 6.864613880742913e-05, 'epoch': 1.56}
+{'loss': 0.9163, 'grad_norm': 1.8970288038253784, 'learning_rate': 6.862170087976539e-05, 'epoch': 1.56}
+{'loss': 0.9537, 'grad_norm': 2.849325656890869, 'learning_rate': 6.859726295210166e-05, 'epoch': 1.56}
+{'loss': 0.5998, 'grad_norm': 1.5763049125671387, 'learning_rate': 6.857282502443792e-05, 'epoch': 1.56}
+{'loss': 1.1581, 'grad_norm': 2.1057538986206055, 'learning_rate': 6.854838709677419e-05, 'epoch': 1.56}
+{'loss': 1.0438, 'grad_norm': 3.0501420497894287, 'learning_rate': 6.852394916911045e-05, 'epoch': 1.56}
+{'loss': 0.4397, 'grad_norm': 1.4863300323486328, 'learning_rate': 6.849951124144673e-05, 'epoch': 1.56}
+{'loss': 0.5282, 'grad_norm': 1.986098289489746, 'learning_rate': 6.847507331378298e-05, 'epoch': 1.56}
+{'loss': 0.4203, 'grad_norm': 3.5367484092712402, 'learning_rate': 6.845063538611926e-05, 'epoch': 1.56}
+{'loss': 0.8021, 'grad_norm': 3.17246413230896, 'learning_rate': 6.842619745845552e-05, 'epoch': 1.56}
+{'loss': 0.1862, 'grad_norm': 0.6066386103630066, 'learning_rate': 6.840175953079177e-05, 'epoch': 1.56}
+{'loss': 0.1316, 'grad_norm': 0.8987689018249512, 'learning_rate': 6.837732160312805e-05, 'epoch': 1.56}
+{'loss': 0.2883, 'grad_norm': 0.6625165343284607, 'learning_rate': 6.835288367546432e-05, 'epoch': 1.56}
+{'loss': 0.2005, 'grad_norm': 0.5179951786994934, 'learning_rate': 6.832844574780058e-05, 'epoch': 1.56}
+{'loss': 0.3602, 'grad_norm': 0.7913885116577148, 'learning_rate': 6.830400782013685e-05, 'epoch': 1.56}
+{'loss': 0.3084, 'grad_norm': 0.5309231281280518, 'learning_rate': 6.827956989247311e-05, 'epoch': 1.56}
+{'loss': 0.4261, 'grad_norm': 1.3907285928726196, 'learning_rate': 6.825513196480938e-05, 'epoch': 1.56}
+{'loss': 0.2173, 'grad_norm': 0.567585289478302, 'learning_rate': 6.823069403714564e-05, 'epoch': 1.56}
+{'loss': 0.2504, 'grad_norm': 0.8944092392921448, 'learning_rate': 6.820625610948192e-05, 'epoch': 1.56}
+{'loss': 0.3839, 'grad_norm': 0.9722375273704529, 'learning_rate': 6.818181818181817e-05, 'epoch': 1.57}
+{'loss': 0.1622, 'grad_norm': 0.5953250527381897, 'learning_rate': 6.815738025415444e-05, 'epoch': 1.57}
+{'loss': 0.1617, 'grad_norm': 0.7407553195953369, 'learning_rate': 6.813294232649071e-05, 'epoch': 1.57}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:40,  4.80it/s][A
+  0%|          | 3/774 [00:00<03:09,  4.06it/s][A
+  1%|          | 4/774 [00:01<03:29,  3.68it/s][A
+  1%|          | 5/774 [00:01<03:24,  3.76it/s][A
+  1%|          | 6/774 [00:01<03:34,  3.57it/s][A
+  1%|          | 7/774 [00:01<03:31,  3.63it/s][A
+  1%|          | 8/774 [00:02<03:32,  3.61it/s][A
+  1%|          | 9/774 [00:02<03:19,  3.83it/s][A
+  1%|▏         | 10/774 [00:02<03:18,  3.84it/s][A
+  1%|▏         | 11/774 [00:02<03:34,  3.56it/s][A
+  2%|▏         | 12/774 [00:03<03:19,  3.82it/s][A
+  2%|▏         | 13/774 [00:03<03:11,  3.97it/s][A
+  2%|▏         | 14/774 [00:03<03:21,  3.78it/s][A
+  2%|▏         | 15/774 [00:04<03:39,  3.46it/s][A
+  2%|▏         | 16/774 [00:04<03:38,  3.47it/s][A
+  2%|▏         | 17/774 [00:04<03:15,  3.87it/s][A
+  2%|▏         | 18/774 [00:04<03:09,  3.99it/s][A
+  2%|▏         | 19/774 [00:05<03:19,  3.79it/s][A
+  3%|▎         | 20/774 [00:05<03:15,  3.86it/s][A
+  3%|▎         | 21/774 [00:05<03:19,  3.77it/s][A
+  3%|▎         | 22/774 [00:05<03:24,  3.67it/s][A
+  3%|▎         | 23/774 [00:06<03:35,  3.48it/s][A
+  3%|▎         | 24/774 [00:06<03:35,  3.49it/s][A
+  3%|▎         | 25/774 [00:06<03:41,  3.38it/s][A
+  3%|▎         | 26/774 [00:07<03:38,  3.42it/s][A
+  3%|▎         | 27/774 [00:07<03:37,  3.44it/s][A
+  4%|▎         | 28/774 [00:07<03:42,  3.35it/s][A
+  4%|▎         | 29/774 [00:07<03:46,  3.29it/s][A
+  4%|▍         | 30/774 [00:08<03:33,  3.49it/s][A
+  4%|▍         | 31/774 [00:08<03:33,  3.48it/s][A
+  4%|▍         | 32/774 [00:08<04:07,  3.00it/s][A
+  4%|▍         | 33/774 [00:09<03:54,  3.16it/s][A
+  4%|▍         | 34/774 [00:09<03:39,  3.37it/s][A
+  5%|▍         | 35/774 [00:09<03:46,  3.26it/s][A
+  5%|▍         | 36/774 [00:10<03:47,  3.25it/s][A
+  5%|▍         | 37/774 [00:10<03:47,  3.24it/s][A
+  5%|▍         | 38/774 [00:10<03:39,  3.35it/s][A
+  5%|▌         | 39/774 [00:10<03:24,  3.60it/s][A
+  5%|▌         | 40/774 [00:11<03:28,  3.52it/s][A
+  5%|▌         | 41/774 [00:11<03:25,  3.57it/s][A
+  5%|▌         | 42/774 [00:11<03:13,  3.79it/s][A
+  6%|▌         | 43/774 [00:12<03:25,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:29,  3.48it/s][A
+  6%|▌         | 45/774 [00:12<03:18,  3.68it/s][A
+  6%|▌         | 46/774 [00:12<03:03,  3.98it/s][A
+  6%|▌         | 47/774 [00:12<02:49,  4.29it/s][A
+  6%|▌         | 48/774 [00:13<02:52,  4.22it/s][A
+  6%|▋         | 49/774 [00:13<02:53,  4.17it/s][A
+  6%|▋         | 50/774 [00:13<02:57,  4.08it/s][A
+  7%|▋         | 51/774 [00:14<02:59,  4.04it/s][A
+  7%|▋         | 52/774 [00:14<02:55,  4.11it/s][A
+  7%|▋         | 53/774 [00:14<03:05,  3.88it/s][A
+  7%|▋         | 54/774 [00:14<03:10,  3.78it/s][A
+  7%|▋         | 55/774 [00:15<03:20,  3.59it/s][A
+  7%|▋         | 56/774 [00:15<03:18,  3.62it/s][A
+  7%|▋         | 57/774 [00:15<03:25,  3.49it/s][A
+  7%|▋         | 58/774 [00:15<03:24,  3.50it/s][A
+  8%|▊         | 59/774 [00:16<03:08,  3.79it/s][A
+  8%|▊         | 60/774 [00:16<02:54,  4.09it/s][A
+  8%|▊         | 61/774 [00:16<02:31,  4.70it/s][A
+  8%|▊         | 62/774 [00:16<02:31,  4.71it/s][A
+  8%|▊         | 63/774 [00:17<02:56,  4.03it/s][A
+  8%|▊         | 64/774 [00:17<02:47,  4.24it/s][A
+  8%|▊         | 65/774 [00:17<02:49,  4.19it/s][A
+  9%|▊         | 66/774 [00:17<02:45,  4.28it/s][A
+  9%|▊         | 67/774 [00:17<02:40,  4.41it/s][A
+  9%|▉         | 68/774 [00:18<02:36,  4.51it/s][A
+  9%|▉         | 69/774 [00:18<02:27,  4.76it/s][A
+  9%|▉         | 70/774 [00:18<02:35,  4.52it/s][A
+  9%|▉         | 71/774 [00:18<02:31,  4.65it/s][A
+  9%|▉         | 72/774 [00:19<02:40,  4.37it/s][A
+  9%|▉         | 73/774 [00:19<02:50,  4.11it/s][A
+ 10%|▉         | 74/774 [00:19<02:57,  3.95it/s][A
+ 10%|▉         | 75/774 [00:19<03:03,  3.80it/s][A
+ 10%|▉         | 76/774 [00:20<02:59,  3.89it/s][A
+ 10%|▉         | 77/774 [00:20<03:13,  3.61it/s][A
+ 10%|█         | 78/774 [00:20<02:54,  4.00it/s][A
+ 10%|█         | 79/774 [00:20<02:42,  4.29it/s][A
+ 10%|█         | 80/774 [00:21<02:39,  4.36it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.03it/s][A
+ 11%|█         | 82/774 [00:21<02:16,  5.07it/s][A
+ 11%|█         | 83/774 [00:21<02:20,  4.91it/s][A
+ 11%|█         | 84/774 [00:21<02:27,  4.69it/s][A
+ 11%|█         | 85/774 [00:22<02:36,  4.41it/s][A
+ 11%|█         | 86/774 [00:22<02:43,  4.22it/s][A
+ 11%|█         | 87/774 [00:22<02:42,  4.22it/s][A
+ 11%|█▏        | 88/774 [00:22<02:31,  4.51it/s][A
+ 11%|█▏        | 89/774 [00:22<02:26,  4.68it/s][A
+ 12%|█▏        | 90/774 [00:23<02:34,  4.41it/s][A
+ 12%|█▏        | 91/774 [00:23<02:49,  4.04it/s][A
+ 12%|█▏        | 92/774 [00:23<03:01,  3.76it/s][A
+ 12%|█▏        | 93/774 [00:24<02:58,  3.82it/s][A
+ 12%|█▏        | 94/774 [00:24<03:05,  3.67it/s][A
+ 12%|█▏        | 95/774 [00:24<03:03,  3.71it/s][A
+ 12%|█▏        | 96/774 [00:24<02:58,  3.81it/s][A
+ 13%|█▎        | 97/774 [00:25<02:42,  4.17it/s][A
+ 13%|█▎        | 98/774 [00:25<02:35,  4.35it/s][A
+ 13%|█▎        | 99/774 [00:25<02:47,  4.02it/s][A
+ 13%|█▎        | 100/774 [00:25<02:59,  3.76it/s][A
+ 13%|█▎        | 101/774 [00:26<03:02,  3.69it/s][A
+ 13%|█▎        | 102/774 [00:26<03:14,  3.45it/s][A
+ 13%|█▎        | 103/774 [00:26<03:17,  3.39it/s][A
+ 13%|█▎        | 104/774 [00:27<03:16,  3.42it/s][A
+ 14%|█▎        | 105/774 [00:27<03:14,  3.45it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.11it/s][A
+ 14%|█▍        | 107/774 [00:28<03:46,  2.94it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.07it/s][A
+ 14%|█▍        | 109/774 [00:28<03:35,  3.09it/s][A
+ 14%|█▍        | 110/774 [00:29<03:24,  3.24it/s][A
+ 14%|█▍        | 111/774 [00:29<03:23,  3.26it/s][A
+ 14%|█▍        | 112/774 [00:29<03:13,  3.43it/s][A
+ 15%|█▍        | 113/774 [00:29<03:18,  3.34it/s][A
+ 15%|█▍        | 114/774 [00:30<03:21,  3.27it/s][A
+ 15%|█▍        | 115/774 [00:30<03:17,  3.34it/s][A
+ 15%|█▍        | 116/774 [00:30<03:01,  3.62it/s][A
+ 15%|█▌        | 117/774 [00:31<03:08,  3.49it/s][A
+ 15%|█▌        | 118/774 [00:31<03:08,  3.48it/s][A
+ 15%|█▌        | 119/774 [00:31<03:00,  3.63it/s][A
+ 16%|█▌        | 120/774 [00:31<03:10,  3.44it/s][A
+ 16%|█▌        | 121/774 [00:32<03:05,  3.52it/s][A
+ 16%|█▌        | 122/774 [00:32<03:08,  3.46it/s][A
+ 16%|█▌        | 123/774 [00:32<02:59,  3.63it/s][A
+ 16%|█▌        | 124/774 [00:33<03:00,  3.59it/s][A
+ 16%|█▌        | 125/774 [00:33<03:01,  3.57it/s][A
+ 16%|█▋        | 126/774 [00:33<03:14,  3.32it/s][A
+ 16%|█▋        | 127/774 [00:34<03:23,  3.19it/s][A
+ 17%|█▋        | 128/774 [00:34<03:12,  3.36it/s][A
+ 17%|█▋        | 129/774 [00:34<03:21,  3.20it/s][A
+ 17%|█▋        | 130/774 [00:34<03:25,  3.13it/s][A
+ 17%|█▋        | 131/774 [00:35<03:15,  3.30it/s][A
+ 17%|█▋        | 132/774 [00:35<03:13,  3.32it/s][A
+ 17%|█▋        | 133/774 [00:35<03:08,  3.40it/s][A
+ 17%|█▋        | 134/774 [00:36<03:06,  3.43it/s][A
+ 17%|█▋        | 135/774 [00:36<03:22,  3.16it/s][A
+ 18%|█▊        | 136/774 [00:36<03:27,  3.08it/s][A
+ 18%|█▊        | 137/774 [00:37<03:27,  3.07it/s][A
+ 18%|█▊        | 138/774 [00:37<03:24,  3.12it/s][A
+ 18%|█▊        | 139/774 [00:37<03:24,  3.10it/s][A
+ 18%|█▊        | 140/774 [00:38<03:19,  3.18it/s][A
+ 18%|█▊        | 141/774 [00:38<03:12,  3.28it/s][A
+ 18%|█▊        | 142/774 [00:38<03:22,  3.13it/s][A
+ 18%|█▊        | 143/774 [00:38<03:19,  3.16it/s][A
+ 19%|█▊        | 144/774 [00:39<03:09,  3.33it/s][A
+ 19%|█▊        | 145/774 [00:39<03:01,  3.46it/s][A
+ 19%|█▉        | 146/774 [00:39<02:50,  3.68it/s][A
+ 19%|█▉        | 147/774 [00:39<02:41,  3.88it/s][A
+ 19%|█▉        | 148/774 [00:40<02:51,  3.64it/s][A
+ 19%|█▉        | 149/774 [00:40<03:04,  3.39it/s][A
+ 19%|█▉        | 150/774 [00:40<03:06,  3.34it/s][A
+ 20%|█▉        | 151/774 [00:41<02:58,  3.49it/s][A
+ 20%|█▉        | 152/774 [00:41<02:48,  3.70it/s][A
+ 20%|█▉        | 153/774 [00:41<02:55,  3.54it/s][A
+ 20%|█▉        | 154/774 [00:42<02:53,  3.57it/s][A
+ 20%|██        | 155/774 [00:42<02:49,  3.65it/s][A
+ 20%|██        | 156/774 [00:42<02:44,  3.76it/s][A
+ 20%|██        | 157/774 [00:42<02:37,  3.91it/s][A
+ 20%|██        | 158/774 [00:43<02:41,  3.82it/s][A
+ 21%|██        | 159/774 [00:43<02:42,  3.77it/s][A
+ 21%|██        | 160/774 [00:43<02:34,  3.98it/s][A
+ 21%|██        | 161/774 [00:43<02:42,  3.78it/s][A
+ 21%|██        | 162/774 [00:44<02:49,  3.61it/s][A
+ 21%|██        | 163/774 [00:44<02:47,  3.64it/s][A
+ 21%|██        | 164/774 [00:44<02:41,  3.77it/s][A
+ 21%|██▏       | 165/774 [00:44<02:38,  3.83it/s][A
+ 21%|██▏       | 166/774 [00:45<02:43,  3.71it/s][A
+ 22%|██▏       | 167/774 [00:45<02:46,  3.65it/s][A
+ 22%|██▏       | 168/774 [00:45<02:37,  3.86it/s][A
+ 22%|██▏       | 169/774 [00:45<02:29,  4.05it/s][A
+ 22%|██▏       | 170/774 [00:46<02:37,  3.84it/s][A
+ 22%|██▏       | 171/774 [00:46<02:47,  3.60it/s][A
+ 22%|██▏       | 172/774 [00:46<02:55,  3.44it/s][A
+ 22%|██▏       | 173/774 [00:47<02:51,  3.51it/s][A
+ 22%|██▏       | 174/774 [00:47<02:43,  3.67it/s][A
+ 23%|██▎       | 175/774 [00:47<02:44,  3.65it/s][A
+ 23%|██▎       | 176/774 [00:47<02:37,  3.79it/s][A
+ 23%|██▎       | 177/774 [00:48<02:50,  3.49it/s][A
+ 23%|██▎       | 178/774 [00:48<02:34,  3.86it/s][A
+ 23%|██▎       | 179/774 [00:48<02:21,  4.21it/s][A
+ 23%|██▎       | 180/774 [00:48<02:16,  4.35it/s][A
+ 23%|██▎       | 181/774 [00:49<02:19,  4.24it/s][A
+ 24%|██▎       | 182/774 [00:49<02:23,  4.14it/s][A
+ 24%|██▎       | 183/774 [00:49<02:23,  4.12it/s][A
+ 24%|██▍       | 184/774 [00:49<02:34,  3.82it/s][A
+ 24%|██▍       | 185/774 [00:50<02:43,  3.60it/s][A
+ 24%|██▍       | 186/774 [00:50<02:41,  3.64it/s][A
+ 24%|██▍       | 187/774 [00:50<02:35,  3.78it/s][A
+ 24%|██▍       | 188/774 [00:50<02:34,  3.80it/s][A
+ 24%|██▍       | 189/774 [00:51<02:32,  3.85it/s][A
+ 25%|██▍       | 190/774 [00:51<02:27,  3.97it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.82it/s][A
+ 25%|██▍       | 192/774 [00:51<02:35,  3.74it/s][A
+ 25%|██▍       | 193/774 [00:52<02:39,  3.64it/s][A
+ 25%|██▌       | 194/774 [00:52<02:48,  3.44it/s][A
+ 25%|██▌       | 195/774 [00:52<02:56,  3.28it/s][A
+ 25%|██▌       | 196/774 [00:53<02:56,  3.27it/s][A
+ 25%|██▌       | 197/774 [00:53<02:54,  3.31it/s][A
+ 26%|██▌       | 198/774 [00:53<02:45,  3.49it/s][A
+ 26%|██▌       | 199/774 [00:54<02:45,  3.47it/s][A
+ 26%|██▌       | 200/774 [00:54<02:38,  3.63it/s][A
+ 26%|██▌       | 201/774 [00:54<02:35,  3.68it/s][A
+ 26%|██▌       | 202/774 [00:54<02:33,  3.72it/s][A
+ 26%|██▌       | 203/774 [00:55<02:26,  3.89it/s][A
+ 26%|██▋       | 204/774 [00:55<02:30,  3.79it/s][A
+ 26%|██▋       | 205/774 [00:55<02:40,  3.55it/s][A
+ 27%|██▋       | 206/774 [00:55<02:35,  3.65it/s][A
+ 27%|██▋       | 207/774 [00:56<02:33,  3.70it/s][A
+ 27%|██▋       | 208/774 [00:56<02:34,  3.66it/s][A
+ 27%|██▋       | 209/774 [00:56<02:33,  3.69it/s][A
+ 27%|██▋       | 210/774 [00:57<02:32,  3.70it/s][A
+ 27%|██▋       | 211/774 [00:57<02:28,  3.78it/s][A
+ 27%|██▋       | 212/774 [00:57<02:17,  4.08it/s][A
+ 28%|██▊       | 213/774 [00:57<02:02,  4.59it/s][A
+ 28%|██▊       | 214/774 [00:57<02:04,  4.51it/s][A
+ 28%|██▊       | 215/774 [00:58<02:03,  4.52it/s][A
+ 28%|██▊       | 216/774 [00:58<02:02,  4.54it/s][A
+ 28%|██▊       | 217/774 [00:58<02:06,  4.42it/s][A
+ 28%|██▊       | 218/774 [00:58<02:12,  4.21it/s][A
+ 28%|██▊       | 219/774 [00:59<02:20,  3.95it/s][A
+ 28%|██▊       | 220/774 [00:59<02:19,  3.97it/s][A
+ 29%|██▊       | 221/774 [00:59<02:24,  3.82it/s][A
+ 29%|██▊       | 222/774 [00:59<02:33,  3.59it/s][A
+ 29%|██▉       | 223/774 [01:00<02:50,  3.23it/s][A
+ 29%|██▉       | 224/774 [01:00<03:00,  3.05it/s][A
+ 29%|██▉       | 225/774 [01:01<03:11,  2.87it/s][A
+ 29%|██▉       | 226/774 [01:01<03:14,  2.81it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.86it/s][A
+ 29%|██▉       | 228/774 [01:02<03:04,  2.97it/s][A
+ 30%|██▉       | 229/774 [01:02<03:20,  2.72it/s][A
+ 30%|██▉       | 230/774 [01:02<03:06,  2.92it/s][A
+ 30%|██▉       | 231/774 [01:03<03:03,  2.96it/s][A
+ 30%|██▉       | 232/774 [01:03<02:55,  3.10it/s][A
+ 30%|███       | 233/774 [01:03<03:09,  2.85it/s][A
+ 30%|███       | 234/774 [01:04<03:13,  2.80it/s][A
+ 30%|███       | 235/774 [01:04<03:11,  2.81it/s][A
+ 30%|███       | 236/774 [01:04<03:13,  2.78it/s][A
+ 31%|███       | 237/774 [01:05<03:11,  2.81it/s][A
+ 31%|███       | 238/774 [01:05<03:01,  2.95it/s][A
+ 31%|███       | 239/774 [01:05<03:00,  2.96it/s][A
+ 31%|███       | 240/774 [01:06<02:59,  2.97it/s][A
+ 31%|███       | 241/774 [01:06<03:02,  2.92it/s][A
+ 31%|███▏      | 242/774 [01:07<03:13,  2.75it/s][A
+ 31%|███▏      | 243/774 [01:07<03:23,  2.61it/s][A
+ 32%|███▏      | 244/774 [01:07<03:18,  2.68it/s][A
+ 32%|███▏      | 245/774 [01:08<03:09,  2.79it/s][A
+ 32%|███▏      | 246/774 [01:08<03:08,  2.81it/s][A
+ 32%|███▏      | 247/774 [01:09<03:34,  2.45it/s][A
+ 32%|███▏      | 248/774 [01:09<03:43,  2.35it/s][A
+ 32%|███▏      | 249/774 [01:09<03:22,  2.59it/s][A
+ 32%|███▏      | 250/774 [01:10<03:17,  2.65it/s][A
+ 32%|███▏      | 251/774 [01:10<03:16,  2.66it/s][A
+ 33%|███▎      | 252/774 [01:10<03:13,  2.69it/s][A
+ 33%|███▎      | 253/774 [01:11<03:12,  2.70it/s][A
+ 33%|███▎      | 254/774 [01:11<03:08,  2.76it/s][A
+ 33%|███▎      | 255/774 [01:11<03:02,  2.84it/s][A
+ 33%|███▎      | 256/774 [01:12<02:58,  2.90it/s][A
+ 33%|███▎      | 257/774 [01:12<02:56,  2.93it/s][A
+ 33%|███▎      | 258/774 [01:12<02:41,  3.19it/s][A
+ 33%|███▎      | 259/774 [01:13<02:23,  3.59it/s][A
+ 34%|███▎      | 260/774 [01:13<02:23,  3.59it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.46it/s][A
+ 34%|███▍      | 262/774 [01:13<02:13,  3.84it/s][A
+ 34%|███▍      | 263/774 [01:14<02:06,  4.05it/s][A
+ 34%|███▍      | 264/774 [01:14<02:14,  3.78it/s][A
+ 34%|███▍      | 265/774 [01:14<02:10,  3.91it/s][A
+ 34%|███▍      | 266/774 [01:14<02:04,  4.09it/s][A
+ 34%|███▍      | 267/774 [01:15<02:02,  4.13it/s][A
+ 35%|███▍      | 268/774 [01:15<02:09,  3.92it/s][A
+ 35%|███▍      | 269/774 [01:15<02:14,  3.75it/s][A
+ 35%|███▍      | 270/774 [01:15<02:20,  3.59it/s][A
+ 35%|███▌      | 271/774 [01:16<02:17,  3.66it/s][A
+ 35%|███▌      | 272/774 [01:16<02:06,  3.98it/s][A
+ 35%|███▌      | 273/774 [01:16<02:01,  4.13it/s][A
+ 35%|███▌      | 274/774 [01:16<02:06,  3.95it/s][A
+ 36%|███▌      | 275/774 [01:17<01:59,  4.16it/s][A
+ 36%|███▌      | 276/774 [01:17<01:53,  4.37it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.22it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.16it/s][A
+ 36%|███▌      | 279/774 [01:17<01:53,  4.35it/s][A
+ 36%|███▌      | 280/774 [01:18<01:55,  4.29it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.92it/s][A
+ 36%|███▋      | 282/774 [01:18<02:16,  3.61it/s][A
+ 37%|███▋      | 283/774 [01:19<02:11,  3.73it/s][A
+ 37%|███▋      | 284/774 [01:19<02:12,  3.69it/s][A
+ 37%|███▋      | 285/774 [01:19<02:05,  3.90it/s][A
+ 37%|███▋      | 286/774 [01:19<02:00,  4.05it/s][A
+ 37%|███▋      | 287/774 [01:20<02:11,  3.71it/s][A
+ 37%|███▋      | 288/774 [01:20<02:15,  3.58it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.63it/s][A
+ 37%|███▋      | 290/774 [01:20<02:09,  3.74it/s][A
+ 38%|███▊      | 291/774 [01:21<02:07,  3.78it/s][A
+ 38%|███▊      | 292/774 [01:21<02:04,  3.86it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.20it/s][A
+ 38%|███▊      | 294/774 [01:21<01:51,  4.32it/s][A
+ 38%|███▊      | 295/774 [01:22<01:49,  4.37it/s][A
+ 38%|███▊      | 296/774 [01:22<01:44,  4.58it/s][A
+ 38%|███▊      | 297/774 [01:22<01:38,  4.83it/s][A
+ 39%|███▊      | 298/774 [01:22<01:43,  4.58it/s][A
+ 39%|███▊      | 299/774 [01:22<01:47,  4.40it/s][A
+ 39%|███▉      | 300/774 [01:23<01:54,  4.13it/s][A
+ 39%|███▉      | 301/774 [01:23<01:48,  4.34it/s][A
+ 39%|███▉      | 302/774 [01:23<01:41,  4.66it/s][A
+ 39%|███▉      | 303/774 [01:23<01:38,  4.79it/s][A
+ 39%|███▉      | 304/774 [01:23<01:27,  5.40it/s][A
+ 39%|███▉      | 305/774 [01:24<01:26,  5.43it/s][A
+ 40%|███▉      | 306/774 [01:24<01:38,  4.75it/s][A
+ 40%|███▉      | 307/774 [01:24<01:43,  4.51it/s][A
+ 40%|███▉      | 308/774 [01:24<01:37,  4.76it/s][A
+ 40%|███▉      | 309/774 [01:25<01:38,  4.70it/s][A
+ 40%|████      | 310/774 [01:25<01:44,  4.44it/s][A
+ 40%|████      | 311/774 [01:25<01:42,  4.50it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.62it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.65it/s][A
+ 41%|████      | 314/774 [01:26<01:40,  4.58it/s][A
+ 41%|████      | 315/774 [01:26<01:49,  4.20it/s][A
+ 41%|████      | 316/774 [01:26<01:40,  4.56it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.89it/s][A
+ 41%|████      | 318/774 [01:27<01:37,  4.70it/s][A
+ 41%|████      | 319/774 [01:27<01:38,  4.61it/s][A
+ 41%|████▏     | 320/774 [01:27<01:39,  4.56it/s][A
+ 41%|████▏     | 321/774 [01:27<01:31,  4.95it/s][A
+ 42%|████▏     | 322/774 [01:27<01:25,  5.27it/s][A
+ 42%|████▏     | 323/774 [01:27<01:17,  5.84it/s][A
+ 42%|████▏     | 324/774 [01:28<01:23,  5.36it/s][A
+ 42%|████▏     | 325/774 [01:28<01:27,  5.14it/s][A
+ 42%|████▏     | 326/774 [01:28<01:24,  5.30it/s][A
+ 42%|████▏     | 327/774 [01:28<01:27,  5.08it/s][A
+ 42%|████▏     | 328/774 [01:28<01:25,  5.20it/s][A
+ 43%|████▎     | 329/774 [01:29<01:34,  4.72it/s][A
+ 43%|████▎     | 330/774 [01:29<01:30,  4.92it/s][A
+ 43%|████▎     | 331/774 [01:29<01:20,  5.50it/s][A
+ 43%|████▎     | 332/774 [01:29<01:18,  5.61it/s][A
+ 43%|████▎     | 333/774 [01:29<01:22,  5.38it/s][A
+ 43%|████▎     | 334/774 [01:30<01:26,  5.11it/s][A
+ 43%|████▎     | 335/774 [01:30<01:26,  5.07it/s][A
+ 43%|████▎     | 336/774 [01:30<01:25,  5.09it/s][A
+ 44%|████▎     | 337/774 [01:30<01:19,  5.52it/s][A
+ 44%|████▎     | 338/774 [01:30<01:14,  5.86it/s][A
+ 44%|████▍     | 339/774 [01:30<01:10,  6.20it/s][A
+ 44%|████▍     | 340/774 [01:31<01:10,  6.19it/s][A
+ 44%|████▍     | 341/774 [01:31<01:27,  4.95it/s][A
+ 44%|████▍     | 342/774 [01:31<01:36,  4.47it/s][A
+ 44%|████▍     | 343/774 [01:31<01:36,  4.46it/s][A
+ 44%|████▍     | 344/774 [01:32<01:40,  4.26it/s][A
+ 45%|████▍     | 345/774 [01:32<01:44,  4.11it/s][A
+ 45%|████▍     | 346/774 [01:32<01:46,  4.03it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.13it/s][A
+ 45%|████▍     | 348/774 [01:33<01:37,  4.36it/s][A
+ 45%|████▌     | 349/774 [01:33<01:34,  4.52it/s][A
+ 45%|████▌     | 350/774 [01:33<01:37,  4.36it/s][A
+ 45%|████▌     | 351/774 [01:33<01:37,  4.32it/s][A
+ 45%|████▌     | 352/774 [01:34<01:34,  4.46it/s][A
+ 46%|████▌     | 353/774 [01:34<01:33,  4.48it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.49it/s][A
+ 46%|████▌     | 355/774 [01:34<01:38,  4.25it/s][A
+ 46%|████▌     | 356/774 [01:35<01:48,  3.86it/s][A
+ 46%|████▌     | 357/774 [01:35<02:03,  3.36it/s][A
+ 46%|████▋     | 358/774 [01:35<02:08,  3.24it/s][A
+ 46%|████▋     | 359/774 [01:36<02:07,  3.25it/s][A
+ 47%|████▋     | 360/774 [01:36<02:08,  3.23it/s][A
+ 47%|████▋     | 361/774 [01:36<02:01,  3.40it/s][A
+ 47%|████▋     | 362/774 [01:36<02:08,  3.22it/s][A
+ 47%|████▋     | 363/774 [01:37<02:07,  3.23it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.19it/s][A
+ 47%|████▋     | 365/774 [01:37<02:04,  3.28it/s][A
+ 47%|████▋     | 366/774 [01:38<01:55,  3.53it/s][A
+ 47%|████▋     | 367/774 [01:38<01:49,  3.71it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.78it/s][A
+ 48%|████▊     | 369/774 [01:38<01:53,  3.55it/s][A
+ 48%|████▊     | 370/774 [01:39<02:08,  3.15it/s][A
+ 48%|████▊     | 371/774 [01:39<01:59,  3.36it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.34it/s][A
+ 48%|████▊     | 373/774 [01:40<01:57,  3.40it/s][A
+ 48%|████▊     | 374/774 [01:40<01:55,  3.47it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.46it/s][A
+ 49%|████▊     | 376/774 [01:41<01:59,  3.33it/s][A
+ 49%|████▊     | 377/774 [01:41<02:11,  3.02it/s][A
+ 49%|████▉     | 378/774 [01:41<02:12,  2.98it/s][A
+ 49%|████▉     | 379/774 [01:42<02:03,  3.20it/s][A
+ 49%|████▉     | 380/774 [01:42<01:53,  3.47it/s][A
+ 49%|████▉     | 381/774 [01:42<01:44,  3.76it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.87it/s][A
+ 49%|████▉     | 383/774 [01:43<01:39,  3.92it/s][A
+ 50%|████▉     | 384/774 [01:43<01:47,  3.62it/s][A
+ 50%|████▉     | 385/774 [01:43<01:55,  3.36it/s][A
+ 50%|████▉     | 386/774 [01:43<01:48,  3.57it/s][A
+ 50%|█████     | 387/774 [01:44<01:41,  3.80it/s][A
+ 50%|█████     | 388/774 [01:44<01:47,  3.60it/s][A
+ 50%|█████     | 389/774 [01:44<01:43,  3.72it/s][A
+ 50%|█████     | 390/774 [01:45<01:57,  3.28it/s][A
+ 51%|█████     | 391/774 [01:45<01:58,  3.23it/s][A
+ 51%|█████     | 392/774 [01:45<01:48,  3.50it/s][A
+ 51%|█████     | 393/774 [01:45<01:40,  3.79it/s][A
+ 51%|█████     | 394/774 [01:46<01:41,  3.76it/s][A
+ 51%|█████     | 395/774 [01:46<01:48,  3.50it/s][A
+ 51%|█████     | 396/774 [01:46<01:45,  3.57it/s][A
+ 51%|█████▏    | 397/774 [01:47<01:48,  3.46it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:43,  3.63it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:42,  3.65it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:35,  3.92it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:31,  4.07it/s][A
+ 52%|█████▏    | 402/774 [01:48<01:31,  4.08it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:35,  3.89it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:41,  3.65it/s][A
+ 52%|█████▏    | 405/774 [01:49<01:37,  3.78it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:40,  3.67it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:45,  3.47it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:41,  3.59it/s][A
+ 53%|█████▎    | 409/774 [01:50<01:38,  3.70it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.65it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:38,  3.67it/s][A
+ 53%|█████▎    | 412/774 [01:51<01:40,  3.60it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:38,  3.66it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:35,  3.75it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:24,  4.23it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:24,  4.21it/s][A
+ 54%|█████▍    | 417/774 [01:52<01:23,  4.28it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.61it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:31,  3.86it/s][A
+ 54%|█████▍    | 420/774 [01:53<01:36,  3.67it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:36,  3.67it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:36,  3.66it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:37,  3.61it/s][A
+ 55%|█████▍    | 424/774 [01:54<01:35,  3.67it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:24,  4.15it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:17,  4.46it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:14,  4.65it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:16,  4.53it/s][A
+ 55%|█████▌    | 429/774 [01:55<01:18,  4.37it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:22,  4.17it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:35,  3.59it/s][A
+ 56%|█████▌    | 432/774 [01:56<01:34,  3.62it/s][A
+ 56%|█████▌    | 433/774 [01:56<01:27,  3.89it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:22,  4.11it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:22,  4.13it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:23,  4.04it/s][A
+ 56%|█████▋    | 437/774 [01:57<01:20,  4.18it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:16,  4.39it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:19,  4.20it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:23,  4.00it/s][A
+ 57%|█████▋    | 441/774 [01:58<01:27,  3.80it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:29,  3.71it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:27,  3.79it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:25,  3.88it/s][A
+ 57%|█████▋    | 445/774 [01:59<01:25,  3.86it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:22,  3.96it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:21,  4.02it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:14,  4.39it/s][A
+ 58%|█████▊    | 449/774 [02:00<01:14,  4.35it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:17,  4.17it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.27it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:11,  4.48it/s][A
+ 59%|█████▊    | 453/774 [02:01<01:10,  4.55it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:15,  4.22it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:20,  3.96it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:24,  3.75it/s][A
+ 59%|█████▉    | 457/774 [02:02<01:18,  4.02it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:18,  4.03it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:16,  4.10it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:22,  3.82it/s][A
+ 60%|█████▉    | 461/774 [02:03<01:29,  3.51it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:26,  3.61it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:24,  3.69it/s][A
+ 60%|█████▉    | 464/774 [02:04<01:23,  3.69it/s][A
+ 60%|██████    | 465/774 [02:04<01:15,  4.09it/s][A
+ 60%|██████    | 466/774 [02:04<01:12,  4.22it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.47it/s][A
+ 60%|██████    | 468/774 [02:04<01:08,  4.44it/s][A
+ 61%|██████    | 469/774 [02:05<01:02,  4.85it/s][A
+ 61%|██████    | 470/774 [02:05<01:00,  5.02it/s][A
+ 61%|██████    | 471/774 [02:05<01:02,  4.83it/s][A
+ 61%|██████    | 472/774 [02:05<01:07,  4.46it/s][A
+ 61%|██████    | 473/774 [02:05<01:09,  4.33it/s][A
+ 61%|██████    | 474/774 [02:06<01:08,  4.39it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:09,  4.29it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.83it/s][A
+ 62%|██████▏   | 477/774 [02:07<01:31,  3.24it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:32,  3.19it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:30,  3.25it/s][A
+ 62%|██████▏   | 480/774 [02:08<01:27,  3.35it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:28,  3.32it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:26,  3.38it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:24,  3.46it/s][A
+ 63%|██████▎   | 484/774 [02:09<01:25,  3.39it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:26,  3.33it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:23,  3.44it/s][A
+ 63%|██████▎   | 487/774 [02:10<01:25,  3.37it/s][A
+ 63%|██████▎   | 488/774 [02:10<01:22,  3.45it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:17,  3.67it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:17,  3.65it/s][A
+ 63%|██████▎   | 491/774 [02:11<01:16,  3.69it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.60it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:18,  3.56it/s][A
+ 64%|██████▍   | 494/774 [02:12<01:17,  3.60it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:17,  3.58it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:23,  3.34it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.32it/s][A
+ 64%|██████▍   | 498/774 [02:13<01:22,  3.35it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:20,  3.43it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:17,  3.52it/s][A
+ 65%|██████▍   | 501/774 [02:14<01:14,  3.64it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:14,  3.66it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:20,  3.37it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:22,  3.27it/s][A
+ 65%|██████▌   | 505/774 [02:15<01:19,  3.40it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:19,  3.38it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:24,  3.18it/s][A
+ 66%|██████▌   | 508/774 [02:16<01:21,  3.25it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:19,  3.33it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.40it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:13,  3.58it/s][A
+ 66%|██████▌   | 512/774 [02:17<01:11,  3.66it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:14,  3.51it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.39it/s][A
+ 67%|██████▋   | 515/774 [02:18<01:22,  3.13it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:17,  3.35it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.65it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:07,  3.77it/s][A
+ 67%|██████▋   | 519/774 [02:19<01:10,  3.60it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:10,  3.63it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.73it/s][A
+ 67%|██████▋   | 522/774 [02:20<01:03,  3.96it/s][A
+ 68%|██████▊   | 523/774 [02:20<01:02,  4.04it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:06,  3.77it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:07,  3.70it/s][A
+ 68%|██████▊   | 526/774 [02:21<01:09,  3.56it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:10,  3.48it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.50it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:06,  3.68it/s][A
+ 68%|██████▊   | 530/774 [02:22<01:05,  3.73it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:04,  3.75it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:02,  3.85it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.05it/s][A
+ 69%|██████▉   | 534/774 [02:23<00:56,  4.24it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:58,  4.08it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.89it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:01,  3.82it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.58it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:04,  3.62it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.62it/s][A
+ 70%|██████▉   | 541/774 [02:25<01:02,  3.71it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.70it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.62it/s][A
+ 70%|███████   | 544/774 [02:25<01:03,  3.61it/s][A
+ 70%|███████   | 545/774 [02:26<01:01,  3.72it/s][A
+ 71%|███████   | 546/774 [02:26<00:58,  3.92it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.07it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.15it/s][A
+ 71%|███████   | 549/774 [02:27<00:55,  4.08it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.83it/s][A
+ 71%|███████   | 551/774 [02:27<01:01,  3.65it/s][A
+ 71%|███████▏  | 552/774 [02:28<01:03,  3.48it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:08,  3.25it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:07,  3.28it/s][A
+ 72%|███████▏  | 555/774 [02:29<01:06,  3.28it/s][A
+ 72%|███████▏  | 556/774 [02:29<01:02,  3.47it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:06,  3.26it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:00,  3.55it/s][A
+ 72%|███████▏  | 559/774 [02:30<00:55,  3.84it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.55it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:56,  3.74it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.04it/s][A
+ 73%|███████▎  | 563/774 [02:31<00:50,  4.19it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:52,  4.03it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:53,  3.89it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:49,  4.17it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:45,  4.50it/s][A
+ 73%|███████▎  | 568/774 [02:32<00:47,  4.32it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.23it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.23it/s][A
+ 74%|███████▍  | 571/774 [02:33<00:52,  3.88it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.72it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.73it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.81it/s][A
+ 74%|███████▍  | 575/774 [02:34<00:52,  3.82it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:57,  3.44it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:55,  3.54it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:54,  3.60it/s][A
+ 75%|███████▍  | 579/774 [02:35<00:56,  3.43it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:56,  3.46it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:55,  3.50it/s][A
+ 75%|███████▌  | 582/774 [02:36<00:53,  3.61it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:50,  3.77it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.79it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:52,  3.61it/s][A
+ 76%|███████▌  | 586/774 [02:37<00:52,  3.56it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.66it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.72it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:48,  3.78it/s][A
+ 76%|███████▌  | 590/774 [02:38<00:45,  4.02it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:47,  3.89it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:50,  3.63it/s][A
+ 77%|███████▋  | 593/774 [02:39<00:50,  3.58it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.57it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:53,  3.32it/s][A
+ 77%|███████▋  | 596/774 [02:40<00:56,  3.15it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.13it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.05it/s][A
+ 77%|███████▋  | 599/774 [02:41<00:57,  3.02it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:57,  3.02it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:57,  2.99it/s][A
+ 78%|███████▊  | 602/774 [02:42<00:57,  2.97it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:56,  3.01it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.95it/s][A
+ 78%|███████▊  | 605/774 [02:43<00:56,  3.01it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:57,  2.92it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:56,  2.94it/s][A
+ 79%|███████▊  | 608/774 [02:44<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.97it/s][A
+ 79%|███████▉  | 611/774 [02:45<00:59,  2.74it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.63it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:56,  2.83it/s][A
+ 79%|███████▉  | 614/774 [02:46<00:55,  2.88it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.02it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.08it/s][A
+ 80%|███████▉  | 617/774 [02:47<00:50,  3.09it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:48,  3.23it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.40it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.46it/s][A
+ 80%|████████  | 621/774 [02:48<00:41,  3.72it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.97it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.94it/s][A
+ 81%|████████  | 624/774 [02:48<00:41,  3.61it/s][A
+ 81%|████████  | 625/774 [02:49<00:41,  3.56it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.31it/s][A
+ 81%|████████  | 627/774 [02:49<00:45,  3.22it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.21it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.31it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.54it/s][A
+ 82%|████████▏ | 631/774 [02:51<00:38,  3.73it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:37,  3.74it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.58it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:40,  3.48it/s][A
+ 82%|████████▏ | 635/774 [02:52<00:39,  3.54it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:39,  3.47it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:38,  3.53it/s][A
+ 82%|████████▏ | 638/774 [02:53<00:38,  3.49it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.10it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:49,  2.70it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.71it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.87it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.89it/s][A
+ 83%|████████▎ | 644/774 [02:55<00:41,  3.10it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:38,  3.39it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.62it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.87it/s][A
+ 84%|████████▎ | 648/774 [02:56<00:32,  3.92it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:31,  3.99it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:29,  4.22it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:29,  4.19it/s][A
+ 84%|████████▍ | 652/774 [02:57<00:29,  4.08it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.84it/s][A
+ 84%|██████��█▍ | 654/774 [02:57<00:29,  4.07it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:27,  4.35it/s][A
+ 85%|████████▍ | 656/774 [02:58<00:28,  4.17it/s][A
+ 85%|████████▍ | 657/774 [02:58<00:26,  4.37it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.19it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.86it/s][A
+ 85%|████████▌ | 660/774 [02:59<00:30,  3.74it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.68it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:29,  3.86it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.65it/s][A
+ 86%|████████▌ | 664/774 [03:00<00:30,  3.60it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:28,  3.89it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:25,  4.30it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.57it/s][A
+ 86%|████████▋ | 668/774 [03:01<00:23,  4.46it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.18it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:24,  4.32it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.93it/s][A
+ 87%|████████▋ | 672/774 [03:02<00:25,  4.00it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:24,  4.09it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.02it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.24it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:22,  4.42it/s][A
+ 87%|████████▋ | 677/774 [03:03<00:22,  4.40it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.51it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.21it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.17it/s][A
+ 88%|████████▊ | 681/774 [03:04<00:20,  4.44it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.47it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:21,  4.15it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:23,  3.87it/s][A
+ 89%|████████▊ | 685/774 [03:05<00:24,  3.68it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:23,  3.79it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.02it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  4.03it/s][A
+ 89%|████████▉ | 689/774 [03:06<00:20,  4.18it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.29it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.40it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.47it/s][A
+ 90%|████████▉ | 693/774 [03:07<00:18,  4.47it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.19it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.85it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.92it/s][A
+ 90%|█████████ | 697/774 [03:08<00:19,  3.85it/s][A
+ 90%|█████████ | 698/774 [03:08<00:17,  4.26it/s][A
+ 90%|█████████ | 699/774 [03:08<00:16,  4.63it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.28it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.36it/s][A
+ 91%|█████████ | 702/774 [03:09<00:16,  4.39it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.38it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.25it/s][A
+ 91%|█████████ | 705/774 [03:09<00:14,  4.60it/s][A
+ 91%|█████████ | 706/774 [03:10<00:14,  4.77it/s][A
+ 91%|█████████▏| 707/774 [03:10<00:14,  4.67it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  5.00it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.81it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.69it/s][A
+ 92%|█████████▏| 711/774 [03:11<00:12,  4.86it/s][A
+ 92%|█████████▏| 712/774 [03:11<00:12,  5.10it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.93it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.64it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.74it/s][A
+ 93%|█████████▎| 716/774 [03:12<00:11,  5.23it/s][A
+ 93%|█████████▎| 717/774 [03:12<00:10,  5.30it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:11,  4.73it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.62it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.94it/s][A
+ 93%|█████████▎| 721/774 [03:13<00:10,  5.20it/s][A
+ 93%|█████████▎| 722/774 [03:13<00:09,  5.60it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.41it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.36it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.51it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.65it/s][A
+ 94%|█████████▍| 727/774 [03:14<00:08,  5.39it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.81it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.08it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.35it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:08,  5.36it/s][A
+ 95%|█████████▍| 732/774 [03:15<00:07,  5.56it/s][A
+ 95%|█████████▍| 733/774 [03:15<00:07,  5.54it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.59it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.70it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.76it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.68it/s][A
+ 95%|█████████▌| 738/774 [03:16<00:06,  5.51it/s][A
+ 95%|█████████▌| 739/774 [03:16<00:06,  5.57it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.45it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.11it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:06,  5.29it/s][A
+ 96%|█████████▌| 743/774 [03:17<00:05,  5.60it/s][A
+ 96%|█████████▌| 744/774 [03:17<00:05,  5.38it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.46it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.86it/s][A
+ 97%|█████████▋| 747/774 [03:18<00:06,  4.07it/s][A
+ 97%|█████████▋| 748/774 [03:18<00:06,  4.28it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.58it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.27it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.46it/s][A
+ 97%|█████████▋| 752/774 [03:19<00:05,  4.40it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.69it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.31it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.63it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.46it/s][A
+ 98%|█████████▊| 757/774 [03:20<00:03,  5.29it/s][A
+ 98%|█████████▊| 758/774 [03:20<00:03,  5.12it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.38it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.38it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.85it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:01,  6.01it/s][A
+ 99%|█████████▊| 763/774 [03:21<00:01,  6.19it/s][A
+ 99%|█████████▊| 764/774 [03:21<00:01,  6.30it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.22it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.33it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.49it/s][A
+ 99%|█████████▉| 768/774 [03:22<00:01,  5.52it/s][A
+ 99%|█████████▉| 769/774 [03:22<00:00,  5.22it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.08it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.35it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  5.07it/s][A
+100%|█████████▉| 773/774 [03:23<00:00,  4.90it/s][A                                                       
+                                                 [A 78%|███████▊  | 10000/12776 [1:47:18<27:07,  1.71it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.90it/s][A
+                                                 [ASaving model checkpoint to ./checkpoint-10000
+Configuration saved in ./checkpoint-10000/config.json
+Model weights saved in ./checkpoint-10000/model.safetensors
+Feature extractor saved in ./checkpoint-10000/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-10000/tokenizer_config.json
+Special tokens file saved in ./checkpoint-10000/special_tokens_map.json
+added tokens file saved in ./checkpoint-10000/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-8800] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 78%|███████▊  | 10001/12776 [1:47:33<51:27:55, 66.77s/it]                                                           78%|███████▊  | 10001/12776 [1:47:33<51:27:55, 66.77s/it] 78%|███████▊  | 10002/12776 [1:47:33<36:08:29, 46.90s/it]                                                           78%|███████▊  | 10002/12776 [1:47:33<36:08:29, 46.90s/it] 78%|███████▊  | 10003/12776 [1:47:34<25:23:12, 32.96s/it]                                                           78%|███████▊  | 10003/12776 [1:47:34<25:23:12, 32.96s/it] 78%|███████▊  | 10004/12776 [1:47:34<17:51:23, 23.19s/it]                                                           78%|███████▊  | 10004/12776 [1:47:34<17:51:23, 23.19s/it] 78%|███████▊  | 10005/12776 [1:47:35<12:36:29, 16.38s/it]                                                           78%|███████▊  | 10005/12776 [1:47:35<12:36:29, 16.38s/it] 78%|███████▊  | 10006/12776 [1:47:35<8:54:24, 11.58s/it]                                                           78%|███████▊  | 10006/12776 [1:47:35<8:54:24, 11.58s/it] 78%|███████▊  | 10007/12776 [1:47:36<6:18:50,  8.21s/it]                                                          78%|███████▊  | 10007/12776 [1:47:36<6:18:50,  8.21s/it] 78%|███████▊  | 10008/12776 [1:47:36<4:31:13,  5.88s/it]                                                          78%|███████▊  | 10008/12776 [1:47:36<4:31:13,  5.88s/it] 78%|███████▊  | 10009/12776 [1:47:36<3:14:15,  4.21s/it]                                                          78%|███████▊  | 10009/12776 [1:47:36<3:14:15,  4.21s/it] 78%|███████▊  | 10010/12776 [1:47:37<2:20:17,  3.04s/it]                                                          78%|███████▊  | 10010/12776 [1:47:37<2:20:17,  3.04s/it] 78%|███████▊  | 10011/12776 [1:47:37<1:44:18,  2.26s/it]                                                          78%|███████▊  | 10011/12776 [1:47:37<1:44:18,  2.26s/it] 78%|███████▊  | 10012/12776 [1:47:37<1:17:10,  1.68s/it]                                                          78%|███████▊  | 10012/12776 [1:47:37<1:17:10,  1.68s/it] 78%|███████▊  | 10013/12776 [1:47:38<58:02,  1.26s/it]                                                          78%|███████▊  | 10013/12776 [1:47:38<58:02,  1.26s/it] 78%|███████▊  | 10014/12776 [1:47:38<45:46,  1.01it/s]                                                        78%|███████▊  | 10014/12776 [1:47:38<45:46,  1.01it/s] 78%|███████▊  | 10015/12776 [1:47:38<35:44,  1.29it/s]                                                        78%|███████▊  | 10015/12776 [1:47:38<35:44,  1.29it/s] 78%|███████▊  | 10016/12776 [1:47:39<28:34,  1.61it/s]                                                        78%|███████▊  | 10016/12776 [1:47:39<28:34,  1.61it/s] 78%|███████▊  | 10017/12776 [1:47:39<23:30,  1.96it/s]                                                        78%|███████▊  | 10017/12776 [1:47:39<23:30,  1.96it/s] 78%|███████▊  | 10018/12776 [1:47:39<20:56,  2.19it/s]                                                        78%|███████▊  | 10018/12776 [1:47:39<20:56,  2.19it/s] 78%|███████▊  | 10019/12776 [1:47:39<17:51,  2.57it/s]                                                        78%|███████▊  | 10019/12776 [1:47:39<17:51,  2.57it/s] 78%|███████▊  | 10020/12776 [1:47:40<15:41,  2.93it/s]                                                        78%|███████▊  | 10020/12776 [1:47:40<15:41,  2.93it/s] 78%|███████▊  | 10021/12776 [1:47:40<14:05,  3.26it/s]                                                        78%|███████▊  | 10021/12776 [1:47:40<14:05,  3.26it/s] 78%|███████▊  | 10022/12776 [1:47:40<12:54,  3.55it/s]                                                        78%|███████▊  | 10022/12776 [1:47:40<12:54,  3.55it/s] 78%|███████▊  | 10023/12776 [1:47:40<13:43,  3.34it/s]                                                        78%|███████▊  | 10023/12776 [1:47:40<13:43,  3.34it/s] 78%|███████▊  | 10024/12776 [1:47:41<12:28,  3.68it/s]                                                        78%|███████▊  | 10024/12776 [1:47:41<12:28,  3.68it/s] 78%|███████▊  | 10025/12776 [1:47:41<11:33,  3.97it/s]                                                        78%|███████▊  | 10025/12776 [1:47:41<11:33,  3.97it/s] 78%|███████▊  | 10026/12776 [1:47:41<10:49,  4.23it/s]                                                        78%|███████▊  | 10026/12776 [1:47:41<10:49,  4.23it/s] 78%|███████▊  | 10027/12776 [1:47:41<10:15,  4.46it/s]                                                        78%|███████▊  | 10027/12776 [1:47:41<10:15,  4.46it/s] 78%|███████▊  | 10028/12776 [1:47:41<11:09,  4.11it/s]                                                        78%|███████▊  | 10028/12776 [1:47:41<11:09,  4.11it/s] 78%|███████▊  | 10029/12776 [1:47:42<10:25,  4.39it/s]                                                        78%|███████▊  | 10029/12776 [1:47:42<10:25,  4.39it/s] 79%|███████▊  | 10030/12776 [1:47:42<09:50,  4.65it/s]                                                        79%|███████▊  | 10030/12776 [1:47:42<09:50,  4.65it/s] 79%|███████▊  | 10031/12776 [1:47:42<09:24,  4.86it/s]                                                        79%|███████▊  | 10031/12776 [1:47:42<09:24,  4.86it/s] 79%|███████▊  | 10032/12776 [1:47:42<09:02,  5.06it/s]                                                        79%|███████▊  | 10032/12776 [1:47:42<09:02,  5.06it/s] 79%|███████▊  | 10033/12776 [1:47:42<10:06,  4.52it/s]                                                        79%|███████▊  | 10033/12776 [1:47:42<10:06,  4.52it/s] 79%|███████▊  | 10034/12776 [1:47:43<09:29,  4.81it/s]                                                        79%|███████▊  | 10034/12776 [1:47:43<09:29,  4.81it/s] 79%|███████▊  | 10035/12776 [1:47:43<09:04,  5.04it/s]                                                        79%|███████▊  | 10035/12776 [1:47:43<09:04,  5.04it/s] 79%|███████▊  | 10036/12776 [1:47:43<08:40,  5.26it/s]                                                        79%|███████▊  | 10036/12776 [1:47:43<08:40,  5.26it/s] 79%|███████▊  | 10037/12776 [1:47:43<08:18,  5.49it/s]                                                        79%|███████▊  | 10037/12776 [1:47:43<08:18,  5.49it/s] 79%|███████▊  | 10038/12776 [1:47:44<15:32,  2.94it/s]                                                        79%|███████▊  | 10038/12776 [1:47:44<15:32,  2.94it/s] 79%|███████▊  | 10039/12776 [1:47:45<31:22,  1.45it/s]                                                        79%|███████▊  | 10039/12776 [1:47:45<31:22,  1.45it/s] 79%|███████▊  | 10040/12776 [1:47:46<36:44,  1.24it/s]                                                        79%|███████▊  | 10040/12776 [1:47:46<36:44,  1.24it/s] 79%|███████▊  | 10041/12776 [1:47:47<38:16,  1.19it/s]                                                        79%|███████▊  | 10041/12776 [1:47:47<38:16,  1.19it/s] 79%|███████▊  | 10042/12776 [1:47:48<38:07,  1.20it/s]                                                        79%|███████▊  | 10042/12776 [1:47:48<38:07,  1.20it/s] 79%|███████▊  | 10043/12776 [1:47:49<36:15,  1.26it/s]                                                        79%|███████▊  | 10043/12776 [1:47:49<36:15,  1.26it/s] 79%|███████▊  | 10044/12776 [1:47:50<34:52,  1.31it/s]                                                        79%|███████▊  | 10044/12776 [1:47:50<34:52,  1.31it/s] 79%|███████▊  | 10045/12776 [1:47:50<32:51,  1.39it/s]                                                        79%|███████▊  | 10045/12776 [1:47:50<32:51,  1.39it/s] 79%|███████▊  | 10046/12776 [1:47:51<31:09,  1.46it/s]                                                        79%|███████▊  | 10046/12776 [1:47:51<31:09,  1.46it/s] 79%|███████▊  | 10047/12776 [1:47:51<29:28,  1.54it/s]                                                        79%|███████▊  | 10047/12776 [1:47:51<29:28,  1.54it/s] 79%|███████▊  | 10048/12776 [1:47:52<27:43,  1.64it/s]                                                        79%|███████▊  | 10048/12776 [1:47:52<27:43,  1.64it/s] 79%|███████▊  | 10049/12776 [1:47:52<26:09,  1.74it/s]                                                        79%|███████▊  | 10049/12776 [1:47:52<26:09,  1.74it/s] 79%|███████▊  | 10050/12776 [1:47:53<25:29,  1.78it/s]                                                        79%|███████▊  | 10050/12776 [1:47:53<25:29,  1.78it/s] 79%|███████▊  | 10051/12776 [1:47:53<24:05,  1.89it/s]                                                        79%|███████▊  | 10051/12776 [1:47:53<24:05,  1.89it/s] 79%|███████▊  | 10052/12776 [1:47:54<24:01,  1.89it/s]                                                        79%|███████▊  | 10052/12776 [1:47:54<24:01,  1.89it/s] 79%|███████▊  | 10053/12776 [1:47:54<22:22,  2.03it/s]                                                        79%|███████▊  | 10053/12776 [1:47:54<22:22,  2.03it/s] 79%|███████▊  | 10054/12776 [1:47:55<20:59,  2.16it/s]                                                        79%|███████▊  | 10054/12776 [1:47:55<20:59,  2.16it/s] 79%|███████▊  | 10055/12776 [1:47:55<22:05,  2.05it/s]                                                        79%|███████▊  | 10055/12776 [1:47:55<22:05,  2.05it/s] 79%|███████▊  | 10056/12776 [1:47:56<20:21,  2.23it/s]                                                        79%|███████▊  | 10056/12776 [1:47:56<20:21,  2.23it/s] 79%|███████▊  | 10057/12776 [1:47:56<19:02,  2.38it/s]                                                        79%|███████▊  | 10057/12776 [1:47:56<19:02,  2.38it/s] 79%|███████▊  | 10058/12776 [1:47:56<18:49,  2.41it/s]                                                        79%|███████▊  | 10058/12776 [1:47:56<18:49,  2.41it/s] 79%|███████▊  | 10059/12776 [1:47:57<17:36,  2.57it/s]                                                        79%|███████▊  | 10059/12776 [1:47:57<17:36,  2.57it/s] 79%|███████▊  | 10060/12776 [1:47:57<16:33,  2.73it/s]                                                        79%|███████▊  | 10060/12776 [1:47:57<16:33,  2.73it/s] 79%|███████▊  | 10061/12776 [1:47:57<16:36,  2.72it/s]                                                        79%|███████▊  | 10061/12776 [1:47:57<16:36,  2.72it/s] 79%|███████▉  | 10062/12776 [1:47:58<15:41,  2.88it/s]                                                        79%|███████▉  | 10062/12776 [1:47:58<15:41,  2.88it/s] 79%|███████▉  | 10063/12776 [1:47:58<14:45,  3.06it/s]                                                        79%|███████▉  | 10063/12776 [1:47:58<14:45,  3.06it/s] 79%|███████▉  | 10064/12776 [1:47:58<14:00,  3.23it/s]                                                        79%|███████▉  | 10064/12776 [1:47:58<14:00,  3.23it/s] 79%|███████▉  | 10065/12776 [1:47:59<13:47,  3.28it/s]                                                        79%|███████▉  | 10065/12776 [1:47:59<13:47,  3.28it/s] 79%|███████▉  | 10066/12776 [1:47:59<13:06,  3.45it/s]                                                        79%|███████▉  | 10066/12776 [1:47:59<13:06,  3.45it/s] 79%|███████▉  | 10067/12776 [1:47:59<12:29,  3.62it/s]                                                        79%|███████▉  | 10067/12776 [1:47:59<12:29,  3.62it/s] 79%|███████▉  | 10068/12776 [1:47:59<11:55,  3.79it/s]                                                        79%|███████▉  | 10068/12776 [1:47:59<11:55,  3.79it/s] 79%|███████▉  | 10069/12776 [1:47:59<11:29,  3.93it/s]                                                        79%|███████▉  | 10069/12776 [1:47:59<11:29,  3.93it/s] 79%|███████▉  | 10070/12776 [1:48:00<11:43,  3.85it/s]                                                        79%|███████▉  | 10070/12776 [1:48:00<11:43,  3.85it/s] 79%|███████▉  | 10071/12776 [1:48:00<11:12,  4.02it/s]                                                        79%|███████▉  | 10071/12776 [1:48:00<11:12,  4.02it/s] 79%|███████▉  | 10072/12776 [1:48:00<10:46,  4.19it/s]                                                        79%|███████▉  | 10072/12776 [1:48:00<10:46,  4.19it/s] 79%|███████▉  | 10073/12776 [1:48:00<10:21,  4.35it/s]                                                        79%|███████▉  | 10073/12776 [1:48:00<10:21,  4.35it/s] 79%|███████▉  | 10074/12776 [1:48:01<09:57,  4.52it/s]                                                        79%|███████▉  | 10074/12776 [1:48:01<09:57,  4.52it/s] 79%|███████▉  | 10075/12776 [1:48:01<10:25,  4.32it/s]                                                        79%|███████▉  | 10075/12776 [1:48:01<10:25,  4.32it/s] 79%|███████▉  | 10076/12776 [1:48:01<09:53,  4.55it/s]                                                        79%|███████▉  | 10076/12776 [1:48:01<09:53,  4.55it/s] 79%|███████▉  | 10077/12776 [1:48:01<09:27,  4.75it/s]                                                       {'eval_loss': 0.5069952011108398, 'eval_wer': 0.3151178358161379, 'eval_runtime': 205.9415, 'eval_samples_per_second': 60.129, 'eval_steps_per_second': 3.758, 'epoch': 1.57}
+{'loss': 0.3372, 'grad_norm': 0.744092583656311, 'learning_rate': 6.810850439882697e-05, 'epoch': 1.57}
+{'loss': 0.3241, 'grad_norm': 2.9647679328918457, 'learning_rate': 6.808406647116324e-05, 'epoch': 1.57}
+{'loss': 0.2338, 'grad_norm': 1.3153102397918701, 'learning_rate': 6.805962854349951e-05, 'epoch': 1.57}
+{'loss': 0.2711, 'grad_norm': 1.2516891956329346, 'learning_rate': 6.803519061583577e-05, 'epoch': 1.57}
+{'loss': 0.2158, 'grad_norm': 0.8349717259407043, 'learning_rate': 6.801075268817204e-05, 'epoch': 1.57}
+{'loss': 0.5239, 'grad_norm': 1.2376327514648438, 'learning_rate': 6.79863147605083e-05, 'epoch': 1.57}
+{'loss': 0.3544, 'grad_norm': 1.2116237878799438, 'learning_rate': 6.796187683284457e-05, 'epoch': 1.57}
+{'loss': 0.478, 'grad_norm': 1.2293322086334229, 'learning_rate': 6.793743890518083e-05, 'epoch': 1.57}
+{'loss': 0.4453, 'grad_norm': 1.6404024362564087, 'learning_rate': 6.791300097751711e-05, 'epoch': 1.57}
+{'loss': 0.4273, 'grad_norm': 1.6513690948486328, 'learning_rate': 6.788856304985336e-05, 'epoch': 1.57}
+{'loss': 0.3235, 'grad_norm': 1.3088352680206299, 'learning_rate': 6.786412512218963e-05, 'epoch': 1.57}
+{'loss': 0.2902, 'grad_norm': 1.2101068496704102, 'learning_rate': 6.78396871945259e-05, 'epoch': 1.57}
+{'loss': 0.6311, 'grad_norm': 2.1694164276123047, 'learning_rate': 6.781524926686216e-05, 'epoch': 1.57}
+{'loss': 0.4426, 'grad_norm': 1.8107420206069946, 'learning_rate': 6.779081133919843e-05, 'epoch': 1.57}
+{'loss': 0.3915, 'grad_norm': 1.5845452547073364, 'learning_rate': 6.77663734115347e-05, 'epoch': 1.57}
+{'loss': 0.7383, 'grad_norm': 4.756518840789795, 'learning_rate': 6.774193548387096e-05, 'epoch': 1.57}
+{'loss': 0.7217, 'grad_norm': 2.210839033126831, 'learning_rate': 6.771749755620723e-05, 'epoch': 1.57}
+{'loss': 0.7003, 'grad_norm': 2.380188226699829, 'learning_rate': 6.76930596285435e-05, 'epoch': 1.57}
+{'loss': 1.0092, 'grad_norm': 3.6436707973480225, 'learning_rate': 6.766862170087976e-05, 'epoch': 1.57}
+{'loss': 0.6157, 'grad_norm': 1.1202057600021362, 'learning_rate': 6.764418377321602e-05, 'epoch': 1.57}
+{'loss': 1.0375, 'grad_norm': 3.5373764038085938, 'learning_rate': 6.761974584555229e-05, 'epoch': 1.57}
+{'loss': 0.6472, 'grad_norm': 2.415952205657959, 'learning_rate': 6.759530791788855e-05, 'epoch': 1.57}
+{'loss': 0.8034, 'grad_norm': 2.597670316696167, 'learning_rate': 6.757086999022482e-05, 'epoch': 1.57}
+{'loss': 0.4365, 'grad_norm': 1.0793561935424805, 'learning_rate': 6.75464320625611e-05, 'epoch': 1.57}
+{'loss': 0.825, 'grad_norm': 4.046724796295166, 'learning_rate': 6.752199413489735e-05, 'epoch': 1.57}
+{'loss': 0.833, 'grad_norm': 2.050435781478882, 'learning_rate': 6.749755620723363e-05, 'epoch': 1.57}
+{'loss': 0.5743, 'grad_norm': 2.141474723815918, 'learning_rate': 6.747311827956989e-05, 'epoch': 1.57}
+{'loss': 0.8483, 'grad_norm': 2.143913507461548, 'learning_rate': 6.744868035190616e-05, 'epoch': 1.57}
+{'loss': 1.5895, 'grad_norm': 3.70343017578125, 'learning_rate': 6.742424242424242e-05, 'epoch': 1.57}
+{'loss': 0.7103, 'grad_norm': 1.8668042421340942, 'learning_rate': 6.739980449657869e-05, 'epoch': 1.57}
+{'loss': 1.2124, 'grad_norm': 3.584521770477295, 'learning_rate': 6.737536656891495e-05, 'epoch': 1.57}
+{'loss': 0.7392, 'grad_norm': 1.4927868843078613, 'learning_rate': 6.735092864125121e-05, 'epoch': 1.57}
+{'loss': 1.2404, 'grad_norm': 2.586317300796509, 'learning_rate': 6.732649071358748e-05, 'epoch': 1.57}
+{'loss': 1.218, 'grad_norm': 1.917419672012329, 'learning_rate': 6.730205278592374e-05, 'epoch': 1.57}
+{'loss': 1.2963, 'grad_norm': 2.0818874835968018, 'learning_rate': 6.727761485826001e-05, 'epoch': 1.57}
+{'loss': 0.6938, 'grad_norm': 2.2003262042999268, 'learning_rate': 6.725317693059629e-05, 'epoch': 1.57}
+{'loss': 0.6664, 'grad_norm': 2.3637146949768066, 'learning_rate': 6.722873900293254e-05, 'epoch': 1.57}
+{'loss': 1.2521, 'grad_norm': 2.3716087341308594, 'learning_rate': 6.720430107526882e-05, 'epoch': 1.57}
+{'loss': 0.1909, 'grad_norm': 0.47229862213134766, 'learning_rate': 6.717986314760507e-05, 'epoch': 1.57}
+{'loss': 0.2017, 'grad_norm': 0.536920428276062, 'learning_rate': 6.715542521994135e-05, 'epoch': 1.57}
+{'loss': 0.1635, 'grad_norm': 0.5747900009155273, 'learning_rate': 6.713098729227761e-05, 'epoch': 1.57}
+{'loss': 0.1387, 'grad_norm': 0.4652441740036011, 'learning_rate': 6.710654936461388e-05, 'epoch': 1.57}
+{'loss': 0.1682, 'grad_norm': 0.32238516211509705, 'learning_rate': 6.708211143695014e-05, 'epoch': 1.57}
+{'loss': 0.2486, 'grad_norm': 0.5064279437065125, 'learning_rate': 6.70576735092864e-05, 'epoch': 1.57}
+{'loss': 0.2012, 'grad_norm': 0.5283740758895874, 'learning_rate': 6.703323558162267e-05, 'epoch': 1.57}
+{'loss': 0.2433, 'grad_norm': 0.55204176902771, 'learning_rate': 6.700879765395894e-05, 'epoch': 1.57}
+{'loss': 0.2243, 'grad_norm': 0.5996967554092407, 'learning_rate': 6.69843597262952e-05, 'epoch': 1.57}
+{'loss': 0.1715, 'grad_norm': 0.5109484791755676, 'learning_rate': 6.695992179863147e-05, 'epoch': 1.57}
+{'loss': 0.2348, 'grad_norm': 0.9667737483978271, 'learning_rate': 6.693548387096773e-05, 'epoch': 1.57}
+{'loss': 0.2966, 'grad_norm': 1.0587562322616577, 'learning_rate': 6.691104594330401e-05, 'epoch': 1.57}
+{'loss': 0.1986, 'grad_norm': 1.2635822296142578, 'learning_rate': 6.688660801564026e-05, 'epoch': 1.57}
+{'loss': 0.2691, 'grad_norm': 0.7891488075256348, 'learning_rate': 6.686217008797654e-05, 'epoch': 1.57}
+{'loss': 0.3324, 'grad_norm': 0.8126229643821716, 'learning_rate': 6.68377321603128e-05, 'epoch': 1.57}
+{'loss': 0.3169, 'grad_norm': 1.0948699712753296, 'learning_rate': 6.681329423264907e-05, 'epoch': 1.57}
+{'loss': 0.5378, 'grad_norm': 1.4431370496749878, 'learning_rate': 6.678885630498533e-05, 'epoch': 1.57}
+{'loss': 0.4985, 'grad_norm': 2.1338889598846436, 'learning_rate': 6.67644183773216e-05, 'epoch': 1.57}
+{'loss': 0.4749, 'grad_norm': 1.3593388795852661, 'learning_rate': 6.673998044965786e-05, 'epoch': 1.57}
+{'loss': 0.431, 'grad_norm': 1.260514259338379, 'learning_rate': 6.671554252199413e-05, 'epoch': 1.57}
+{'loss': 0.4252, 'grad_norm': 1.087146282196045, 'learning_rate': 6.669110459433039e-05, 'epoch': 1.57}
+{'loss': 0.3251, 'grad_norm': 0.7750207185745239, 'learning_rate': 6.666666666666666e-05, 'epoch': 1.57}
+{'loss': 0.3326, 'grad_norm': 1.3115235567092896, 'learning_rate': 6.664222873900292e-05, 'epoch': 1.57}
+{'loss': 0.5962, 'grad_norm': 1.4149150848388672, 'learning_rate': 6.66177908113392e-05, 'epoch': 1.58}
+{'loss': 0.4921, 'grad_norm': 3.154900550842285, 'learning_rate': 6.659335288367545e-05, 'epoch': 1.58}
+{'loss': 0.8742, 'grad_norm': 1.6714290380477905, 'learning_rate': 6.656891495601173e-05, 'epoch': 1.58}
+{'loss': 0.767, 'grad_norm': 2.3452184200286865, 'learning_rate': 6.6544477028348e-05, 'epoch': 1.58}
+{'loss': 0.3151, 'grad_norm': 1.208871603012085, 'learning_rate': 6.652003910068426e-05, 'epoch': 1.58}
+{'loss': 0.8238, 'grad_norm': 3.192185878753662, 'learning_rate': 6.649560117302052e-05, 'epoch': 1.58}
+{'loss': 0.8813, 'grad_norm': 3.4253957271575928, 'learning_rate': 6.647116324535679e-05, 'epoch': 1.58}
+{'loss': 0.6148, 'grad_norm': 1.4279910326004028, 'learning_rate': 6.644672531769305e-05, 'epoch': 1.58}
+{'loss': 0.3901, 'grad_norm': 3.070469856262207, 'learning_rate': 6.642228739002932e-05, 'epoch': 1.58}
+{'loss': 0.4485, 'grad_norm': 2.3599345684051514, 'learning_rate': 6.639784946236558e-05, 'epoch': 1.58}
+{'loss': 0.7455, 'grad_norm': 1.7810369729995728, 'learning_rate': 6.637341153470185e-05, 'epoch': 1.58}
+{'loss': 0.5618, 'grad_norm': 2.3326375484466553, 'learning_rate': 6.634897360703811e-05, 'epoch': 1.58}
+{'loss': 0.9591, 'grad_norm': 2.5533432960510254, 'learning_rate': 6.632453567937439e-05, 'epoch': 1.58}
+{'loss': 0.8351, 'grad_norm': 3.2034530639648438, 'learning_rate': 6.630009775171064e-05, 'epoch': 1.58}
+{'loss': 1.478, 'grad_norm': 2.8962016105651855, 'learning_rate': 6.627565982404692e-05, 'epoch': 1.58}
+ 79%|███████▉  | 10077/12776 [1:48:01<09:27,  4.75it/s] 79%|███████▉  | 10078/12776 [1:48:01<09:13,  4.87it/s]                                                        79%|███████▉  | 10078/12776 [1:48:01<09:13,  4.87it/s] 79%|███████▉  | 10079/12776 [1:48:02<09:11,  4.89it/s]                                                        79%|███████▉  | 10079/12776 [1:48:02<09:11,  4.89it/s] 79%|███████▉  | 10080/12776 [1:48:02<09:51,  4.56it/s]                                                        79%|███████▉  | 10080/12776 [1:48:02<09:51,  4.56it/s] 79%|███████▉  | 10081/12776 [1:48:02<09:34,  4.69it/s]                                                        79%|███████▉  | 10081/12776 [1:48:02<09:34,  4.69it/s] 79%|███████▉  | 10082/12776 [1:48:02<09:21,  4.80it/s]                                                        79%|███████▉  | 10082/12776 [1:48:02<09:21,  4.80it/s] 79%|███████▉  | 10083/12776 [1:48:02<09:10,  4.89it/s]                                                        79%|███████▉  | 10083/12776 [1:48:02<09:10,  4.89it/s] 79%|███████▉  | 10084/12776 [1:48:03<09:00,  4.98it/s]                                                        79%|███████▉  | 10084/12776 [1:48:03<09:00,  4.98it/s] 79%|███████▉  | 10085/12776 [1:48:03<08:48,  5.09it/s]                                                        79%|███████▉  | 10085/12776 [1:48:03<08:48,  5.09it/s] 79%|███████▉  | 10086/12776 [1:48:03<09:49,  4.56it/s]                                                        79%|███████▉  | 10086/12776 [1:48:03<09:49,  4.56it/s] 79%|███████▉  | 10087/12776 [1:48:03<09:19,  4.80it/s]                                                        79%|███████▉  | 10087/12776 [1:48:03<09:19,  4.80it/s] 79%|███████▉  | 10088/12776 [1:48:04<17:51,  2.51it/s]                                                        79%|███████▉  | 10088/12776 [1:48:04<17:51,  2.51it/s] 79%|███████▉  | 10089/12776 [1:48:06<33:45,  1.33it/s]                                                        79%|███████▉  | 10089/12776 [1:48:06<33:45,  1.33it/s] 79%|███████▉  | 10090/12776 [1:48:07<39:02,  1.15it/s]                                                        79%|███████▉  | 10090/12776 [1:48:07<39:02,  1.15it/s] 79%|███████▉  | 10091/12776 [1:48:08<38:41,  1.16it/s]                                                        79%|███████▉  | 10091/12776 [1:48:08<38:41,  1.16it/s] 79%|███████▉  | 10092/12776 [1:48:09<37:45,  1.18it/s]                                                        79%|███████▉  | 10092/12776 [1:48:09<37:45,  1.18it/s] 79%|███████▉  | 10093/12776 [1:48:09<36:14,  1.23it/s]                                                        79%|███████▉  | 10093/12776 [1:48:09<36:14,  1.23it/s] 79%|███████▉  | 10094/12776 [1:48:10<34:41,  1.29it/s]                                                        79%|███████▉  | 10094/12776 [1:48:10<34:41,  1.29it/s] 79%|███████▉  | 10095/12776 [1:48:11<32:59,  1.35it/s]                                                        79%|███████▉  | 10095/12776 [1:48:11<32:59,  1.35it/s] 79%|███████▉  | 10096/12776 [1:48:11<32:54,  1.36it/s]                                                        79%|███████▉  | 10096/12776 [1:48:11<32:54,  1.36it/s] 79%|███████▉  | 10097/12776 [1:48:12<31:00,  1.44it/s]                                                        79%|███████▉  | 10097/12776 [1:48:12<31:00,  1.44it/s] 79%|███████▉  | 10098/12776 [1:48:13<29:52,  1.49it/s]                                                        79%|███████▉  | 10098/12776 [1:48:13<29:52,  1.49it/s] 79%|███████▉  | 10099/12776 [1:48:13<27:51,  1.60it/s]                                                        79%|███████▉  | 10099/12776 [1:48:13<27:51,  1.60it/s] 79%|███████▉  | 10100/12776 [1:48:14<27:08,  1.64it/s]                                                        79%|███████▉  | 10100/12776 [1:48:14<27:08,  1.64it/s] 79%|███████▉  | 10101/12776 [1:48:14<25:07,  1.77it/s]                                                        79%|███████▉  | 10101/12776 [1:48:14<25:07,  1.77it/s] 79%|███████▉  | 10102/12776 [1:48:15<25:01,  1.78it/s]                                                        79%|███████▉  | 10102/12776 [1:48:15<25:01,  1.78it/s] 79%|███████▉  | 10103/12776 [1:48:15<23:25,  1.90it/s]                                                        79%|███████▉  | 10103/12776 [1:48:15<23:25,  1.90it/s] 79%|███████▉  | 10104/12776 [1:48:16<22:57,  1.94it/s]                                                        79%|███████▉  | 10104/12776 [1:48:16<22:57,  1.94it/s] 79%|███████▉  | 10105/12776 [1:48:16<21:24,  2.08it/s]                                                        79%|███████▉  | 10105/12776 [1:48:16<21:24,  2.08it/s] 79%|███████▉  | 10106/12776 [1:48:16<20:13,  2.20it/s]                                                        79%|███████▉  | 10106/12776 [1:48:16<20:13,  2.20it/s] 79%|███████▉  | 10107/12776 [1:48:17<20:52,  2.13it/s]                                                        79%|███████▉  | 10107/12776 [1:48:17<20:52,  2.13it/s] 79%|███████▉  | 10108/12776 [1:48:17<19:30,  2.28it/s]                                                        79%|███████▉  | 10108/12776 [1:48:17<19:30,  2.28it/s] 79%|███████▉  | 10109/12776 [1:48:18<18:20,  2.42it/s]                                                        79%|███████▉  | 10109/12776 [1:48:18<18:20,  2.42it/s] 79%|███████▉  | 10110/12776 [1:48:18<18:21,  2.42it/s]                                                        79%|███████▉  | 10110/12776 [1:48:18<18:21,  2.42it/s] 79%|███████▉  | 10111/12776 [1:48:18<17:15,  2.57it/s]                                                        79%|███████▉  | 10111/12776 [1:48:18<17:15,  2.57it/s] 79%|███████▉  | 10112/12776 [1:48:19<16:23,  2.71it/s]                                                        79%|███████▉  | 10112/12776 [1:48:19<16:23,  2.71it/s] 79%|███████▉  | 10113/12776 [1:48:19<15:49,  2.80it/s]                                                        79%|███████▉  | 10113/12776 [1:48:19<15:49,  2.80it/s] 79%|███████▉  | 10114/12776 [1:48:19<15:02,  2.95it/s]                                                        79%|███████▉  | 10114/12776 [1:48:19<15:02,  2.95it/s] 79%|███████▉  | 10115/12776 [1:48:20<14:25,  3.08it/s]                                                        79%|███████▉  | 10115/12776 [1:48:20<14:25,  3.08it/s] 79%|███████▉  | 10116/12776 [1:48:20<13:51,  3.20it/s]                                                        79%|███████▉  | 10116/12776 [1:48:20<13:51,  3.20it/s] 79%|███████▉  | 10117/12776 [1:48:20<14:53,  2.98it/s]                                                        79%|███████▉  | 10117/12776 [1:48:20<14:53,  2.98it/s] 79%|███████▉  | 10118/12776 [1:48:21<13:57,  3.17it/s]                                                        79%|███████▉  | 10118/12776 [1:48:21<13:57,  3.17it/s] 79%|███████▉  | 10119/12776 [1:48:21<13:09,  3.36it/s]                                                        79%|███████▉  | 10119/12776 [1:48:21<13:09,  3.36it/s] 79%|███████▉  | 10120/12776 [1:48:21<12:34,  3.52it/s]                                                        79%|███████▉  | 10120/12776 [1:48:21<12:34,  3.52it/s] 79%|███████▉  | 10121/12776 [1:48:21<13:18,  3.32it/s]                                                        79%|███████▉  | 10121/12776 [1:48:21<13:18,  3.32it/s] 79%|███████▉  | 10122/12776 [1:48:22<12:31,  3.53it/s]                                                        79%|███████▉  | 10122/12776 [1:48:22<12:31,  3.53it/s] 79%|███████▉  | 10123/12776 [1:48:22<11:54,  3.71it/s]                                                        79%|███████▉  | 10123/12776 [1:48:22<11:54,  3.71it/s] 79%|███████▉  | 10124/12776 [1:48:22<11:22,  3.88it/s]                                                        79%|███████▉  | 10124/12776 [1:48:22<11:22,  3.88it/s] 79%|███████▉  | 10125/12776 [1:48:22<11:45,  3.76it/s]                                                        79%|███████▉  | 10125/12776 [1:48:22<11:45,  3.76it/s] 79%|███████▉  | 10126/12776 [1:48:23<11:04,  3.99it/s]                                                        79%|███████▉  | 10126/12776 [1:48:23<11:04,  3.99it/s] 79%|███████▉  | 10127/12776 [1:48:23<10:30,  4.20it/s]                                                        79%|███████▉  | 10127/12776 [1:48:23<10:30,  4.20it/s] 79%|███████▉  | 10128/12776 [1:48:23<10:08,  4.35it/s]                                                        79%|███████▉  | 10128/12776 [1:48:23<10:08,  4.35it/s] 79%|███████▉  | 10129/12776 [1:48:23<09:51,  4.48it/s]                                                        79%|███████▉  | 10129/12776 [1:48:23<09:51,  4.48it/s] 79%|███████▉  | 10130/12776 [1:48:24<10:54,  4.04it/s]                                                        79%|███████▉  | 10130/12776 [1:48:24<10:54,  4.04it/s] 79%|███████▉  | 10131/12776 [1:48:24<10:17,  4.28it/s]                                                        79%|███████▉  | 10131/12776 [1:48:24<10:17,  4.28it/s] 79%|███████▉  | 10132/12776 [1:48:24<09:48,  4.49it/s]                                                        79%|███████▉  | 10132/12776 [1:48:24<09:48,  4.49it/s] 79%|███████▉  | 10133/12776 [1:48:24<09:27,  4.65it/s]                                                        79%|███████▉  | 10133/12776 [1:48:24<09:27,  4.65it/s] 79%|███████▉  | 10134/12776 [1:48:24<09:10,  4.80it/s]                                                        79%|███████▉  | 10134/12776 [1:48:24<09:10,  4.80it/s] 79%|███████▉  | 10135/12776 [1:48:25<08:55,  4.93it/s]                                                        79%|███████▉  | 10135/12776 [1:48:25<08:55,  4.93it/s] 79%|███████▉  | 10136/12776 [1:48:25<09:57,  4.42it/s]                                                        79%|███████▉  | 10136/12776 [1:48:25<09:57,  4.42it/s] 79%|███████▉  | 10137/12776 [1:48:25<09:24,  4.67it/s]                                                        79%|███████▉  | 10137/12776 [1:48:25<09:24,  4.67it/s] 79%|███████▉  | 10138/12776 [1:48:26<17:35,  2.50it/s]                                                        79%|███████▉  | 10138/12776 [1:48:26<17:35,  2.50it/s] 79%|███████▉  | 10139/12776 [1:48:27<33:10,  1.32it/s]                                                        79%|███████▉  | 10139/12776 [1:48:27<33:10,  1.32it/s] 79%|███████▉  | 10140/12776 [1:48:28<35:36,  1.23it/s]                                                        79%|███████▉  | 10140/12776 [1:48:28<35:36,  1.23it/s] 79%|███████▉  | 10141/12776 [1:48:29<37:13,  1.18it/s]                                                        79%|███████▉  | 10141/12776 [1:48:29<37:13,  1.18it/s] 79%|███████▉  | 10142/12776 [1:48:30<36:04,  1.22it/s]                                                        79%|███████▉  | 10142/12776 [1:48:30<36:04,  1.22it/s] 79%|███████▉  | 10143/12776 [1:48:31<35:45,  1.23it/s]                                                        79%|███████▉  | 10143/12776 [1:48:31<35:45,  1.23it/s] 79%|███████▉  | 10144/12776 [1:48:32<34:54,  1.26it/s]                                                        79%|███████▉  | 10144/12776 [1:48:32<34:54,  1.26it/s] 79%|███████▉  | 10145/12776 [1:48:32<32:52,  1.33it/s]                                                        79%|███████▉  | 10145/12776 [1:48:32<32:52,  1.33it/s] 79%|███████▉  | 10146/12776 [1:48:33<32:57,  1.33it/s]                                                        79%|███████▉  | 10146/12776 [1:48:33<32:57,  1.33it/s] 79%|███████▉  | 10147/12776 [1:48:34<30:32,  1.43it/s]                                                        79%|███████▉  | 10147/12776 [1:48:34<30:32,  1.43it/s] 79%|███████▉  | 10148/12776 [1:48:34<30:07,  1.45it/s]                                                        79%|███████▉  | 10148/12776 [1:48:34<30:07,  1.45it/s] 79%|███████▉  | 10149/12776 [1:48:35<27:57,  1.57it/s]                                                        79%|███████▉  | 10149/12776 [1:48:35<27:57,  1.57it/s] 79%|███████▉  | 10150/12776 [1:48:35<26:28,  1.65it/s]                                                        79%|███████▉  | 10150/12776 [1:48:35<26:28,  1.65it/s] 79%|███████▉  | 10151/12776 [1:48:36<24:29,  1.79it/s]                                                        79%|███████▉  | 10151/12776 [1:48:36<24:29,  1.79it/s] 79%|███████▉  | 10152/12776 [1:48:36<23:29,  1.86it/s]                                                        79%|███████▉  | 10152/12776 [1:48:36<23:29,  1.86it/s] 79%|███████▉  | 10153/12776 [1:48:37<22:03,  1.98it/s]                                                        79%|███████▉  | 10153/12776 [1:48:37<22:03,  1.98it/s] 79%|███████▉  | 10154/12776 [1:48:37<20:51,  2.10it/s]                                                        79%|███████▉  | 10154/12776 [1:48:37<20:51,  2.10it/s] 79%|███████▉  | 10155/12776 [1:48:38<20:57,  2.08it/s]                                                       {'loss': 1.1017, 'grad_norm': 2.253652572631836, 'learning_rate': 6.625122189638319e-05, 'epoch': 1.58}
+{'loss': 1.4581, 'grad_norm': 4.773320198059082, 'learning_rate': 6.622678396871945e-05, 'epoch': 1.58}
+{'loss': 0.7407, 'grad_norm': 4.403872013092041, 'learning_rate': 6.620234604105571e-05, 'epoch': 1.58}
+{'loss': 0.9164, 'grad_norm': 2.4393060207366943, 'learning_rate': 6.617790811339198e-05, 'epoch': 1.58}
+{'loss': 1.3664, 'grad_norm': 3.306149959564209, 'learning_rate': 6.615347018572824e-05, 'epoch': 1.58}
+{'loss': 0.9943, 'grad_norm': 2.284524917602539, 'learning_rate': 6.612903225806451e-05, 'epoch': 1.58}
+{'loss': 0.3365, 'grad_norm': 6.304023265838623, 'learning_rate': 6.610459433040077e-05, 'epoch': 1.58}
+{'loss': 0.3204, 'grad_norm': 1.4609630107879639, 'learning_rate': 6.608015640273704e-05, 'epoch': 1.58}
+{'loss': 0.5147, 'grad_norm': 1.2958881855010986, 'learning_rate': 6.60557184750733e-05, 'epoch': 1.58}
+{'loss': 0.5687, 'grad_norm': 2.1635255813598633, 'learning_rate': 6.603128054740958e-05, 'epoch': 1.58}
+{'loss': 1.3545, 'grad_norm': 3.585665225982666, 'learning_rate': 6.600684261974583e-05, 'epoch': 1.58}
+{'loss': 1.403, 'grad_norm': 4.3810834884643555, 'learning_rate': 6.598240469208211e-05, 'epoch': 1.58}
+{'loss': 0.195, 'grad_norm': 1.2125346660614014, 'learning_rate': 6.595796676441838e-05, 'epoch': 1.58}
+{'loss': 0.303, 'grad_norm': 1.367098331451416, 'learning_rate': 6.593352883675464e-05, 'epoch': 1.58}
+{'loss': 0.3611, 'grad_norm': 1.04794442653656, 'learning_rate': 6.59090909090909e-05, 'epoch': 1.58}
+{'loss': 0.1909, 'grad_norm': 0.518174946308136, 'learning_rate': 6.588465298142717e-05, 'epoch': 1.58}
+{'loss': 0.2269, 'grad_norm': 0.5261414051055908, 'learning_rate': 6.586021505376344e-05, 'epoch': 1.58}
+{'loss': 0.1931, 'grad_norm': 0.4940567910671234, 'learning_rate': 6.58357771260997e-05, 'epoch': 1.58}
+{'loss': 0.3157, 'grad_norm': 0.8084196448326111, 'learning_rate': 6.581133919843596e-05, 'epoch': 1.58}
+{'loss': 0.2145, 'grad_norm': 0.5676218867301941, 'learning_rate': 6.578690127077223e-05, 'epoch': 1.58}
+{'loss': 0.1708, 'grad_norm': 0.6355946660041809, 'learning_rate': 6.57624633431085e-05, 'epoch': 1.58}
+{'loss': 0.3097, 'grad_norm': 0.8469789028167725, 'learning_rate': 6.573802541544477e-05, 'epoch': 1.58}
+{'loss': 0.2384, 'grad_norm': 2.256359338760376, 'learning_rate': 6.571358748778102e-05, 'epoch': 1.58}
+{'loss': 0.2843, 'grad_norm': 0.8068323731422424, 'learning_rate': 6.56891495601173e-05, 'epoch': 1.58}
+{'loss': 0.1872, 'grad_norm': 0.44950971007347107, 'learning_rate': 6.566471163245357e-05, 'epoch': 1.58}
+{'loss': 0.3186, 'grad_norm': 1.0343793630599976, 'learning_rate': 6.564027370478982e-05, 'epoch': 1.58}
+{'loss': 0.5353, 'grad_norm': 2.219566822052002, 'learning_rate': 6.56158357771261e-05, 'epoch': 1.58}
+{'loss': 0.4237, 'grad_norm': 1.2421504259109497, 'learning_rate': 6.559139784946236e-05, 'epoch': 1.58}
+{'loss': 0.4712, 'grad_norm': 0.9300140738487244, 'learning_rate': 6.556695992179863e-05, 'epoch': 1.58}
+{'loss': 0.4134, 'grad_norm': 1.3351165056228638, 'learning_rate': 6.554252199413489e-05, 'epoch': 1.58}
+{'loss': 0.337, 'grad_norm': 1.0536439418792725, 'learning_rate': 6.551808406647116e-05, 'epoch': 1.58}
+{'loss': 0.4153, 'grad_norm': 0.8616892695426941, 'learning_rate': 6.549364613880742e-05, 'epoch': 1.58}
+{'loss': 0.3483, 'grad_norm': 1.845602035522461, 'learning_rate': 6.546920821114369e-05, 'epoch': 1.58}
+{'loss': 0.9881, 'grad_norm': 4.453863143920898, 'learning_rate': 6.544477028347996e-05, 'epoch': 1.58}
+{'loss': 0.5901, 'grad_norm': 1.3871952295303345, 'learning_rate': 6.542033235581622e-05, 'epoch': 1.58}
+{'loss': 0.5883, 'grad_norm': 1.2875733375549316, 'learning_rate': 6.53958944281525e-05, 'epoch': 1.58}
+{'loss': 0.3296, 'grad_norm': 1.413844108581543, 'learning_rate': 6.537145650048876e-05, 'epoch': 1.58}
+{'loss': 0.4407, 'grad_norm': 1.2688345909118652, 'learning_rate': 6.534701857282501e-05, 'epoch': 1.58}
+{'loss': 0.2713, 'grad_norm': 1.8131461143493652, 'learning_rate': 6.532258064516129e-05, 'epoch': 1.58}
+{'loss': 0.5562, 'grad_norm': 1.4916858673095703, 'learning_rate': 6.529814271749755e-05, 'epoch': 1.58}
+{'loss': 0.5763, 'grad_norm': 2.313715696334839, 'learning_rate': 6.527370478983382e-05, 'epoch': 1.58}
+{'loss': 0.562, 'grad_norm': 1.6103403568267822, 'learning_rate': 6.524926686217008e-05, 'epoch': 1.58}
+{'loss': 0.475, 'grad_norm': 0.9578543901443481, 'learning_rate': 6.522482893450635e-05, 'epoch': 1.58}
+{'loss': 0.8251, 'grad_norm': 1.8726333379745483, 'learning_rate': 6.520039100684261e-05, 'epoch': 1.58}
+{'loss': 1.235, 'grad_norm': 1.8902461528778076, 'learning_rate': 6.517595307917888e-05, 'epoch': 1.58}
+{'loss': 0.5179, 'grad_norm': 3.1188371181488037, 'learning_rate': 6.515151515151516e-05, 'epoch': 1.58}
+{'loss': 0.6082, 'grad_norm': 4.811651706695557, 'learning_rate': 6.51270772238514e-05, 'epoch': 1.58}
+{'loss': 0.6315, 'grad_norm': 2.434835910797119, 'learning_rate': 6.510263929618767e-05, 'epoch': 1.58}
+{'loss': 0.2202, 'grad_norm': 1.2945551872253418, 'learning_rate': 6.507820136852395e-05, 'epoch': 1.59}
+{'loss': 0.7926, 'grad_norm': 1.9833664894104004, 'learning_rate': 6.50537634408602e-05, 'epoch': 1.59}
+{'loss': 1.2268, 'grad_norm': 3.4194037914276123, 'learning_rate': 6.502932551319648e-05, 'epoch': 1.59}
+{'loss': 1.0213, 'grad_norm': 2.057821035385132, 'learning_rate': 6.500488758553274e-05, 'epoch': 1.59}
+{'loss': 1.0572, 'grad_norm': 7.126612186431885, 'learning_rate': 6.498044965786901e-05, 'epoch': 1.59}
+{'loss': 1.4866, 'grad_norm': 3.2313385009765625, 'learning_rate': 6.495601173020527e-05, 'epoch': 1.59}
+{'loss': 0.851, 'grad_norm': 2.4898273944854736, 'learning_rate': 6.493157380254154e-05, 'epoch': 1.59}
+{'loss': 1.1641, 'grad_norm': 2.793358087539673, 'learning_rate': 6.49071358748778e-05, 'epoch': 1.59}
+{'loss': 1.1646, 'grad_norm': 2.09865665435791, 'learning_rate': 6.488269794721407e-05, 'epoch': 1.59}
+{'loss': 0.6815, 'grad_norm': 1.7419707775115967, 'learning_rate': 6.485826001955035e-05, 'epoch': 1.59}
+{'loss': 0.5228, 'grad_norm': 1.2431154251098633, 'learning_rate': 6.48338220918866e-05, 'epoch': 1.59}
+{'loss': 0.1294, 'grad_norm': 0.5367014408111572, 'learning_rate': 6.480938416422286e-05, 'epoch': 1.59}
+{'loss': 0.6302, 'grad_norm': 1.9725390672683716, 'learning_rate': 6.478494623655914e-05, 'epoch': 1.59}
+{'loss': 0.6514, 'grad_norm': 2.6568782329559326, 'learning_rate': 6.476050830889539e-05, 'epoch': 1.59}
+{'loss': 0.2569, 'grad_norm': 0.5387908220291138, 'learning_rate': 6.473607038123167e-05, 'epoch': 1.59}
+{'loss': 0.2113, 'grad_norm': 0.6417441964149475, 'learning_rate': 6.471163245356794e-05, 'epoch': 1.59}
+{'loss': 0.2751, 'grad_norm': 0.7953237295150757, 'learning_rate': 6.46871945259042e-05, 'epoch': 1.59}
+{'loss': 0.274, 'grad_norm': 0.5870561003684998, 'learning_rate': 6.466275659824046e-05, 'epoch': 1.59}
+{'loss': 0.2782, 'grad_norm': 0.416892409324646, 'learning_rate': 6.463831867057673e-05, 'epoch': 1.59}
+{'loss': 0.2186, 'grad_norm': 1.2958848476409912, 'learning_rate': 6.4613880742913e-05, 'epoch': 1.59}
+{'loss': 0.3302, 'grad_norm': 0.7092198133468628, 'learning_rate': 6.458944281524926e-05, 'epoch': 1.59}
+{'loss': 0.4434, 'grad_norm': 1.2143514156341553, 'learning_rate': 6.456500488758554e-05, 'epoch': 1.59}
+{'loss': 0.3993, 'grad_norm': 0.6722570657730103, 'learning_rate': 6.454056695992179e-05, 'epoch': 1.59}
+{'loss': 0.4719, 'grad_norm': 0.7443752884864807, 'learning_rate': 6.451612903225805e-05, 'epoch': 1.59}
+{'loss': 0.2445, 'grad_norm': 1.1258270740509033, 'learning_rate': 6.449169110459433e-05, 'epoch': 1.59}
+{'loss': 0.3133, 'grad_norm': 0.5991902351379395, 'learning_rate': 6.446725317693058e-05, 'epoch': 1.59}
+{'loss': 0.2713, 'grad_norm': 1.0533815622329712, 'learning_rate': 6.444281524926686e-05, 'epoch': 1.59}
+{'loss': 0.2822, 'grad_norm': 1.2451895475387573, 'learning_rate': 6.441837732160313e-05, 'epoch': 1.59}
+{'loss': 0.4126, 'grad_norm': 1.2398431301116943, 'learning_rate': 6.439393939393939e-05, 'epoch': 1.59}
+{'loss': 0.4618, 'grad_norm': 7.360801696777344, 'learning_rate': 6.436950146627566e-05, 'epoch': 1.59}
+ 79%|███████▉  | 10155/12776 [1:48:38<20:57,  2.08it/s] 79%|███████▉  | 10156/12776 [1:48:38<19:46,  2.21it/s]                                                        79%|███████▉  | 10156/12776 [1:48:38<19:46,  2.21it/s] 80%|███████▉  | 10157/12776 [1:48:38<18:46,  2.32it/s]                                                        80%|███████▉  | 10157/12776 [1:48:38<18:46,  2.32it/s] 80%|███████▉  | 10158/12776 [1:48:39<18:08,  2.40it/s]                                                        80%|███████▉  | 10158/12776 [1:48:39<18:08,  2.40it/s] 80%|███████▉  | 10159/12776 [1:48:39<17:14,  2.53it/s]                                                        80%|███████▉  | 10159/12776 [1:48:39<17:14,  2.53it/s] 80%|███████▉  | 10160/12776 [1:48:39<16:29,  2.64it/s]                                                        80%|███████▉  | 10160/12776 [1:48:39<16:29,  2.64it/s] 80%|███████▉  | 10161/12776 [1:48:40<16:46,  2.60it/s]                                                        80%|███████▉  | 10161/12776 [1:48:40<16:46,  2.60it/s] 80%|███████▉  | 10162/12776 [1:48:40<15:56,  2.73it/s]                                                        80%|███████▉  | 10162/12776 [1:48:40<15:56,  2.73it/s] 80%|███████▉  | 10163/12776 [1:48:40<15:10,  2.87it/s]                                                        80%|███████▉  | 10163/12776 [1:48:40<15:10,  2.87it/s] 80%|███████▉  | 10164/12776 [1:48:41<16:24,  2.65it/s]                                                        80%|███████▉  | 10164/12776 [1:48:41<16:24,  2.65it/s] 80%|███████▉  | 10165/12776 [1:48:41<15:20,  2.84it/s]                                                        80%|███████▉  | 10165/12776 [1:48:41<15:20,  2.84it/s] 80%|███████▉  | 10166/12776 [1:48:41<14:27,  3.01it/s]                                                        80%|███████▉  | 10166/12776 [1:48:41<14:27,  3.01it/s] 80%|███████▉  | 10167/12776 [1:48:42<15:04,  2.89it/s]                                                        80%|███████▉  | 10167/12776 [1:48:42<15:04,  2.89it/s] 80%|███████▉  | 10168/12776 [1:48:42<14:05,  3.09it/s]                                                        80%|███████▉  | 10168/12776 [1:48:42<14:05,  3.09it/s] 80%|███████▉  | 10169/12776 [1:48:42<13:17,  3.27it/s]                                                        80%|███████▉  | 10169/12776 [1:48:42<13:17,  3.27it/s] 80%|███████▉  | 10170/12776 [1:48:43<12:36,  3.44it/s]                                                        80%|███████▉  | 10170/12776 [1:48:43<12:36,  3.44it/s] 80%|███████▉  | 10171/12776 [1:48:43<13:21,  3.25it/s]                                                        80%|███████▉  | 10171/12776 [1:48:43<13:21,  3.25it/s] 80%|███████▉  | 10172/12776 [1:48:43<12:31,  3.46it/s]                                                        80%|███████▉  | 10172/12776 [1:48:43<12:31,  3.46it/s] 80%|███████▉  | 10173/12776 [1:48:43<11:52,  3.65it/s]                                                        80%|███████▉  | 10173/12776 [1:48:43<11:52,  3.65it/s] 80%|███████▉  | 10174/12776 [1:48:44<11:19,  3.83it/s]                                                        80%|███████▉  | 10174/12776 [1:48:44<11:19,  3.83it/s] 80%|███████▉  | 10175/12776 [1:48:44<10:52,  3.99it/s]                                                        80%|███████▉  | 10175/12776 [1:48:44<10:52,  3.99it/s] 80%|███████▉  | 10176/12776 [1:48:44<11:40,  3.71it/s]                                                        80%|███████▉  | 10176/12776 [1:48:44<11:40,  3.71it/s] 80%|███████▉  | 10177/12776 [1:48:44<10:56,  3.96it/s]                                                        80%|███████▉  | 10177/12776 [1:48:44<10:56,  3.96it/s] 80%|███████▉  | 10178/12776 [1:48:45<10:22,  4.17it/s]                                                        80%|███████▉  | 10178/12776 [1:48:45<10:22,  4.17it/s] 80%|███████▉  | 10179/12776 [1:48:45<10:46,  4.02it/s]                                                        80%|███████▉  | 10179/12776 [1:48:45<10:46,  4.02it/s] 80%|███████▉  | 10180/12776 [1:48:45<11:32,  3.75it/s]                                                        80%|███████▉  | 10180/12776 [1:48:45<11:32,  3.75it/s] 80%|███████▉  | 10181/12776 [1:48:45<10:41,  4.05it/s]                                                        80%|███████▉  | 10181/12776 [1:48:45<10:41,  4.05it/s] 80%|███████▉  | 10182/12776 [1:48:46<10:03,  4.30it/s]                                                        80%|███████▉  | 10182/12776 [1:48:46<10:03,  4.30it/s] 80%|███████▉  | 10183/12776 [1:48:46<09:35,  4.50it/s]                                                        80%|███████▉  | 10183/12776 [1:48:46<09:35,  4.50it/s] 80%|███████▉  | 10184/12776 [1:48:46<09:13,  4.68it/s]                                                        80%|███████▉  | 10184/12776 [1:48:46<09:13,  4.68it/s] 80%|███████▉  | 10185/12776 [1:48:46<08:55,  4.83it/s]                                                        80%|███████▉  | 10185/12776 [1:48:46<08:55,  4.83it/s] 80%|███████▉  | 10186/12776 [1:48:46<09:55,  4.35it/s]                                                        80%|███████▉  | 10186/12776 [1:48:46<09:55,  4.35it/s] 80%|███████▉  | 10187/12776 [1:48:47<09:21,  4.61it/s]                                                        80%|███████▉  | 10187/12776 [1:48:47<09:21,  4.61it/s] 80%|███████▉  | 10188/12776 [1:48:48<17:45,  2.43it/s]                                                        80%|███████▉  | 10188/12776 [1:48:48<17:45,  2.43it/s] 80%|███████▉  | 10189/12776 [1:48:49<32:29,  1.33it/s]                                                        80%|███████▉  | 10189/12776 [1:48:49<32:29,  1.33it/s] 80%|███████▉  | 10190/12776 [1:48:50<35:07,  1.23it/s]                                                        80%|███████▉  | 10190/12776 [1:48:50<35:07,  1.23it/s] 80%|███████▉  | 10191/12776 [1:48:51<37:02,  1.16it/s]                                                        80%|███████▉  | 10191/12776 [1:48:51<37:02,  1.16it/s] 80%|███████▉  | 10192/12776 [1:48:52<36:24,  1.18it/s]                                                        80%|███████▉  | 10192/12776 [1:48:52<36:24,  1.18it/s] 80%|███████▉  | 10193/12776 [1:48:53<35:41,  1.21it/s]                                                        80%|███████▉  | 10193/12776 [1:48:53<35:41,  1.21it/s] 80%|███████▉  | 10194/12776 [1:48:53<34:29,  1.25it/s]                                                        80%|███████▉  | 10194/12776 [1:48:53<34:29,  1.25it/s] 80%|███████▉  | 10195/12776 [1:48:54<32:53,  1.31it/s]                                                        80%|███████▉  | 10195/12776 [1:48:54<32:53,  1.31it/s] 80%|███████▉  | 10196/12776 [1:48:55<32:59,  1.30it/s]                                                        80%|███████▉  | 10196/12776 [1:48:55<32:59,  1.30it/s] 80%|███████▉  | 10197/12776 [1:48:55<30:51,  1.39it/s]                                                        80%|███████▉  | 10197/12776 [1:48:55<30:51,  1.39it/s] 80%|███████▉  | 10198/12776 [1:48:56<29:13,  1.47it/s]                                                        80%|███████▉  | 10198/12776 [1:48:56<29:13,  1.47it/s] 80%|███████▉  | 10199/12776 [1:48:57<27:22,  1.57it/s]                                                        80%|███████▉  | 10199/12776 [1:48:57<27:22,  1.57it/s] 80%|███████▉  | 10200/12776 [1:48:57<26:46,  1.60it/s]                                                        80%|███████▉  | 10200/12776 [1:48:57<26:46,  1.60it/s] 80%|███████▉  | 10201/12776 [1:48:58<25:05,  1.71it/s]                                                        80%|███████▉  | 10201/12776 [1:48:58<25:05,  1.71it/s] 80%|███████▉  | 10202/12776 [1:48:58<24:35,  1.74it/s]                                                        80%|███████▉  | 10202/12776 [1:48:58<24:35,  1.74it/s] 80%|███████▉  | 10203/12776 [1:48:59<22:59,  1.87it/s]                                                        80%|███████▉  | 10203/12776 [1:48:59<22:59,  1.87it/s] 80%|███████▉  | 10204/12776 [1:48:59<22:22,  1.92it/s]                                                        80%|███████▉  | 10204/12776 [1:48:59<22:22,  1.92it/s] 80%|███████▉  | 10205/12776 [1:48:59<21:05,  2.03it/s]                                                        80%|███████▉  | 10205/12776 [1:48:59<21:05,  2.03it/s] 80%|███████▉  | 10206/12776 [1:49:00<19:53,  2.15it/s]                                                        80%|███████▉  | 10206/12776 [1:49:00<19:53,  2.15it/s] 80%|███████▉  | 10207/12776 [1:49:00<20:21,  2.10it/s]                                                        80%|███████▉  | 10207/12776 [1:49:00<20:21,  2.10it/s] 80%|███████▉  | 10208/12776 [1:49:01<18:59,  2.25it/s]                                                        80%|███████▉  | 10208/12776 [1:49:01<18:59,  2.25it/s] 80%|███████▉  | 10209/12776 [1:49:01<17:50,  2.40it/s]                                                        80%|███████▉  | 10209/12776 [1:49:01<17:50,  2.40it/s] 80%|███████▉  | 10210/12776 [1:49:02<17:40,  2.42it/s]                                                        80%|███████▉  | 10210/12776 [1:49:02<17:40,  2.42it/s] 80%|███████▉  | 10211/12776 [1:49:02<16:33,  2.58it/s]                                                        80%|███████▉  | 10211/12776 [1:49:02<16:33,  2.58it/s] 80%|███████▉  | 10212/12776 [1:49:02<15:38,  2.73it/s]                                                        80%|███████▉  | 10212/12776 [1:49:02<15:38,  2.73it/s] 80%|███████▉  | 10213/12776 [1:49:03<15:22,  2.78it/s]                                                        80%|███████▉  | 10213/12776 [1:49:03<15:22,  2.78it/s] 80%|███████▉  | 10214/12776 [1:49:03<14:29,  2.95it/s]                                                        80%|███████▉  | 10214/12776 [1:49:03<14:29,  2.95it/s] 80%|███████▉  | 10215/12776 [1:49:03<13:44,  3.11it/s]                                                        80%|███████▉  | 10215/12776 [1:49:03<13:44,  3.11it/s] 80%|███████▉  | 10216/12776 [1:49:03<13:05,  3.26it/s]                                                        80%|███████▉  | 10216/12776 [1:49:03<13:05,  3.26it/s] 80%|███████▉  | 10217/12776 [1:49:04<13:04,  3.26it/s]                                                        80%|███████▉  | 10217/12776 [1:49:04<13:04,  3.26it/s] 80%|███████▉  | 10218/12776 [1:49:04<12:19,  3.46it/s]                                                        80%|███████▉  | 10218/12776 [1:49:04<12:19,  3.46it/s] 80%|███████▉  | 10219/12776 [1:49:04<11:49,  3.60it/s]                                                        80%|███████▉  | 10219/12776 [1:49:04<11:49,  3.60it/s] 80%|███████▉  | 10220/12776 [1:49:04<11:28,  3.71it/s]                                                        80%|███████▉  | 10220/12776 [1:49:04<11:28,  3.71it/s] 80%|████████  | 10221/12776 [1:49:05<11:07,  3.83it/s]                                                        80%|████████  | 10221/12776 [1:49:05<11:07,  3.83it/s] 80%|████████  | 10222/12776 [1:49:05<11:30,  3.70it/s]                                                        80%|████████  | 10222/12776 [1:49:05<11:30,  3.70it/s] 80%|████████  | 10223/12776 [1:49:05<11:04,  3.84it/s]                                                        80%|████████  | 10223/12776 [1:49:05<11:04,  3.84it/s] 80%|████████  | 10224/12776 [1:49:05<10:38,  3.99it/s]                                                        80%|████████  | 10224/12776 [1:49:05<10:38,  3.99it/s] 80%|████████  | 10225/12776 [1:49:06<10:18,  4.12it/s]                                                        80%|████████  | 10225/12776 [1:49:06<10:18,  4.12it/s] 80%|████████  | 10226/12776 [1:49:06<11:09,  3.81it/s]                                                        80%|████████  | 10226/12776 [1:49:06<11:09,  3.81it/s] 80%|████████  | 10227/12776 [1:49:06<10:29,  4.05it/s]                                                        80%|████████  | 10227/12776 [1:49:06<10:29,  4.05it/s] 80%|████████  | 10228/12776 [1:49:06<10:01,  4.24it/s]                                                        80%|████████  | 10228/12776 [1:49:06<10:01,  4.24it/s] 80%|████████  | 10229/12776 [1:49:07<09:37,  4.41it/s]                                                        80%|████████  | 10229/12776 [1:49:07<09:37,  4.41it/s] 80%|████████  | 10230/12776 [1:49:07<09:19,  4.55it/s]                                                        80%|████████  | 10230/12776 [1:49:07<09:19,  4.55it/s] 80%|████████  | 10231/12776 [1:49:07<10:41,  3.97it/s]                                                        80%|████████  | 10231/12776 [1:49:07<10:41,  3.97it/s] 80%|████████  | 10232/12776 [1:49:07<10:01,  4.23it/s]                                                        80%|████████  | 10232/12776 [1:49:07<10:01,  4.23it/s] 80%|████████  | 10233/12776 [1:49:08<09:32,  4.44it/s]                                                       {'loss': 0.5885, 'grad_norm': 1.0283269882202148, 'learning_rate': 6.434506353861192e-05, 'epoch': 1.59}
+{'loss': 0.4484, 'grad_norm': 2.743352174758911, 'learning_rate': 6.432062561094819e-05, 'epoch': 1.59}
+{'loss': 0.3717, 'grad_norm': 0.7910766005516052, 'learning_rate': 6.429618768328445e-05, 'epoch': 1.59}
+{'loss': 0.372, 'grad_norm': 0.8904502987861633, 'learning_rate': 6.427174975562072e-05, 'epoch': 1.59}
+{'loss': 0.3387, 'grad_norm': 1.1738675832748413, 'learning_rate': 6.424731182795698e-05, 'epoch': 1.59}
+{'loss': 0.4907, 'grad_norm': 1.5055691003799438, 'learning_rate': 6.422287390029324e-05, 'epoch': 1.59}
+{'loss': 0.3105, 'grad_norm': 1.081289291381836, 'learning_rate': 6.419843597262952e-05, 'epoch': 1.59}
+{'loss': 0.3565, 'grad_norm': 1.3567743301391602, 'learning_rate': 6.417399804496577e-05, 'epoch': 1.59}
+{'loss': 0.7406, 'grad_norm': 1.7186039686203003, 'learning_rate': 6.414956011730205e-05, 'epoch': 1.59}
+{'loss': 0.3946, 'grad_norm': 1.8988852500915527, 'learning_rate': 6.412512218963832e-05, 'epoch': 1.59}
+{'loss': 0.5579, 'grad_norm': 2.2571020126342773, 'learning_rate': 6.410068426197458e-05, 'epoch': 1.59}
+{'loss': 0.7901, 'grad_norm': 1.9459624290466309, 'learning_rate': 6.407624633431085e-05, 'epoch': 1.59}
+{'loss': 1.0994, 'grad_norm': 2.4512932300567627, 'learning_rate': 6.405180840664711e-05, 'epoch': 1.59}
+{'loss': 0.4887, 'grad_norm': 1.6560828685760498, 'learning_rate': 6.402737047898338e-05, 'epoch': 1.59}
+{'loss': 0.649, 'grad_norm': 1.7962381839752197, 'learning_rate': 6.400293255131964e-05, 'epoch': 1.59}
+{'loss': 0.7007, 'grad_norm': 2.8789329528808594, 'learning_rate': 6.39784946236559e-05, 'epoch': 1.59}
+{'loss': 1.0201, 'grad_norm': 2.5336639881134033, 'learning_rate': 6.395405669599217e-05, 'epoch': 1.59}
+{'loss': 0.7936, 'grad_norm': 2.5551042556762695, 'learning_rate': 6.392961876832844e-05, 'epoch': 1.59}
+{'loss': 0.6436, 'grad_norm': 1.407386064529419, 'learning_rate': 6.39051808406647e-05, 'epoch': 1.59}
+{'loss': 0.7574, 'grad_norm': 2.690272331237793, 'learning_rate': 6.388074291300097e-05, 'epoch': 1.59}
+{'loss': 0.7914, 'grad_norm': 1.8954501152038574, 'learning_rate': 6.385630498533724e-05, 'epoch': 1.59}
+{'loss': 0.6738, 'grad_norm': 2.0154988765716553, 'learning_rate': 6.38318670576735e-05, 'epoch': 1.59}
+{'loss': 0.5325, 'grad_norm': 2.260967969894409, 'learning_rate': 6.380742913000977e-05, 'epoch': 1.59}
+{'loss': 0.6965, 'grad_norm': 3.078644037246704, 'learning_rate': 6.378299120234604e-05, 'epoch': 1.59}
+{'loss': 0.9808, 'grad_norm': 2.3561620712280273, 'learning_rate': 6.37585532746823e-05, 'epoch': 1.59}
+{'loss': 0.9701, 'grad_norm': 3.07482647895813, 'learning_rate': 6.373411534701857e-05, 'epoch': 1.59}
+{'loss': 0.8996, 'grad_norm': 5.131535530090332, 'learning_rate': 6.370967741935483e-05, 'epoch': 1.59}
+{'loss': 1.0249, 'grad_norm': 2.536735773086548, 'learning_rate': 6.36852394916911e-05, 'epoch': 1.59}
+{'loss': 0.9431, 'grad_norm': 1.7344963550567627, 'learning_rate': 6.366080156402736e-05, 'epoch': 1.59}
+{'loss': 0.3811, 'grad_norm': 1.4525513648986816, 'learning_rate': 6.363636363636363e-05, 'epoch': 1.59}
+{'loss': 0.7413, 'grad_norm': 11.553523063659668, 'learning_rate': 6.361192570869989e-05, 'epoch': 1.59}
+{'loss': 0.6394, 'grad_norm': 2.428168535232544, 'learning_rate': 6.358748778103616e-05, 'epoch': 1.59}
+{'loss': 0.5436, 'grad_norm': 5.859124183654785, 'learning_rate': 6.356304985337244e-05, 'epoch': 1.59}
+{'loss': 1.0681, 'grad_norm': 2.319378614425659, 'learning_rate': 6.353861192570869e-05, 'epoch': 1.59}
+{'loss': 0.2068, 'grad_norm': 0.3972637355327606, 'learning_rate': 6.351417399804496e-05, 'epoch': 1.6}
+{'loss': 0.1627, 'grad_norm': 0.3770846128463745, 'learning_rate': 6.348973607038123e-05, 'epoch': 1.6}
+{'loss': 0.254, 'grad_norm': 1.2007943391799927, 'learning_rate': 6.34652981427175e-05, 'epoch': 1.6}
+{'loss': 0.3272, 'grad_norm': 2.273597002029419, 'learning_rate': 6.344086021505376e-05, 'epoch': 1.6}
+{'loss': 0.2878, 'grad_norm': 0.8018514513969421, 'learning_rate': 6.341642228739002e-05, 'epoch': 1.6}
+{'loss': 0.2637, 'grad_norm': 0.5968719124794006, 'learning_rate': 6.339198435972629e-05, 'epoch': 1.6}
+{'loss': 0.2629, 'grad_norm': 0.6231215000152588, 'learning_rate': 6.336754643206255e-05, 'epoch': 1.6}
+{'loss': 0.207, 'grad_norm': 0.4936104416847229, 'learning_rate': 6.334310850439882e-05, 'epoch': 1.6}
+{'loss': 0.4249, 'grad_norm': 0.6326175928115845, 'learning_rate': 6.331867057673508e-05, 'epoch': 1.6}
+{'loss': 0.2245, 'grad_norm': 0.5844799876213074, 'learning_rate': 6.329423264907135e-05, 'epoch': 1.6}
+{'loss': 0.4301, 'grad_norm': 2.547029495239258, 'learning_rate': 6.326979472140763e-05, 'epoch': 1.6}
+{'loss': 0.4086, 'grad_norm': 1.4274916648864746, 'learning_rate': 6.324535679374388e-05, 'epoch': 1.6}
+{'loss': 0.2683, 'grad_norm': 1.0012903213500977, 'learning_rate': 6.322091886608016e-05, 'epoch': 1.6}
+{'loss': 0.5077, 'grad_norm': 2.6661782264709473, 'learning_rate': 6.319648093841642e-05, 'epoch': 1.6}
+{'loss': 0.3245, 'grad_norm': 0.7351189851760864, 'learning_rate': 6.317204301075269e-05, 'epoch': 1.6}
+{'loss': 0.3436, 'grad_norm': 0.7789361476898193, 'learning_rate': 6.314760508308895e-05, 'epoch': 1.6}
+{'loss': 0.381, 'grad_norm': 1.3661255836486816, 'learning_rate': 6.312316715542522e-05, 'epoch': 1.6}
+{'loss': 0.3987, 'grad_norm': 0.8813636898994446, 'learning_rate': 6.309872922776148e-05, 'epoch': 1.6}
+{'loss': 0.1548, 'grad_norm': 0.6111112236976624, 'learning_rate': 6.307429130009774e-05, 'epoch': 1.6}
+{'loss': 0.5964, 'grad_norm': 1.8574801683425903, 'learning_rate': 6.304985337243401e-05, 'epoch': 1.6}
+{'loss': 0.4009, 'grad_norm': 3.280104160308838, 'learning_rate': 6.302541544477027e-05, 'epoch': 1.6}
+{'loss': 0.4519, 'grad_norm': 4.668222427368164, 'learning_rate': 6.300097751710654e-05, 'epoch': 1.6}
+{'loss': 0.5817, 'grad_norm': 2.0994975566864014, 'learning_rate': 6.297653958944282e-05, 'epoch': 1.6}
+{'loss': 0.5803, 'grad_norm': 2.334909200668335, 'learning_rate': 6.295210166177907e-05, 'epoch': 1.6}
+{'loss': 0.3882, 'grad_norm': 1.9847476482391357, 'learning_rate': 6.292766373411535e-05, 'epoch': 1.6}
+{'loss': 0.477, 'grad_norm': 1.3854622840881348, 'learning_rate': 6.290322580645161e-05, 'epoch': 1.6}
+{'loss': 0.3697, 'grad_norm': 1.641670823097229, 'learning_rate': 6.287878787878788e-05, 'epoch': 1.6}
+{'loss': 0.438, 'grad_norm': 2.7601287364959717, 'learning_rate': 6.285434995112414e-05, 'epoch': 1.6}
+{'loss': 0.8074, 'grad_norm': 2.510101318359375, 'learning_rate': 6.28299120234604e-05, 'epoch': 1.6}
+{'loss': 0.5739, 'grad_norm': 2.204482316970825, 'learning_rate': 6.280547409579667e-05, 'epoch': 1.6}
+{'loss': 0.9905, 'grad_norm': 2.250614643096924, 'learning_rate': 6.278103616813294e-05, 'epoch': 1.6}
+{'loss': 0.7706, 'grad_norm': 2.100642204284668, 'learning_rate': 6.27565982404692e-05, 'epoch': 1.6}
+{'loss': 0.9495, 'grad_norm': 2.480836868286133, 'learning_rate': 6.273216031280547e-05, 'epoch': 1.6}
+{'loss': 0.5965, 'grad_norm': 2.637479305267334, 'learning_rate': 6.270772238514173e-05, 'epoch': 1.6}
+{'loss': 1.0819, 'grad_norm': 5.914449691772461, 'learning_rate': 6.268328445747801e-05, 'epoch': 1.6}
+{'loss': 0.9483, 'grad_norm': 3.239971876144409, 'learning_rate': 6.265884652981426e-05, 'epoch': 1.6}
+{'loss': 0.8619, 'grad_norm': 2.788769483566284, 'learning_rate': 6.263440860215054e-05, 'epoch': 1.6}
+{'loss': 0.8397, 'grad_norm': 3.059751033782959, 'learning_rate': 6.26099706744868e-05, 'epoch': 1.6}
+{'loss': 0.8119, 'grad_norm': 3.003505229949951, 'learning_rate': 6.258553274682307e-05, 'epoch': 1.6}
+{'loss': 0.9308, 'grad_norm': 2.3189423084259033, 'learning_rate': 6.256109481915933e-05, 'epoch': 1.6}
+{'loss': 0.7128, 'grad_norm': 5.253555774688721, 'learning_rate': 6.25366568914956e-05, 'epoch': 1.6}
+{'loss': 1.3895, 'grad_norm': 4.00673770904541, 'learning_rate': 6.251221896383186e-05, 'epoch': 1.6}
+{'loss': 0.7943, 'grad_norm': 4.583085060119629, 'learning_rate': 6.248778103616813e-05, 'epoch': 1.6}
+{'loss': 0.3133, 'grad_norm': 0.6882913112640381, 'learning_rate': 6.246334310850439e-05, 'epoch': 1.6}
+ 80%|████████  | 10233/12776 [1:49:08<09:32,  4.44it/s] 80%|████████  | 10234/12776 [1:49:08<09:10,  4.62it/s]                                                        80%|████████  | 10234/12776 [1:49:08<09:10,  4.62it/s] 80%|████████  | 10235/12776 [1:49:08<08:52,  4.77it/s]                                                        80%|████████  | 10235/12776 [1:49:08<08:52,  4.77it/s] 80%|████████  | 10236/12776 [1:49:08<08:35,  4.92it/s]                                                        80%|████████  | 10236/12776 [1:49:08<08:35,  4.92it/s] 80%|████████  | 10237/12776 [1:49:08<09:36,  4.40it/s]                                                        80%|████████  | 10237/12776 [1:49:08<09:36,  4.40it/s] 80%|████████  | 10238/12776 [1:49:09<15:16,  2.77it/s]                                                        80%|████████  | 10238/12776 [1:49:09<15:16,  2.77it/s] 80%|████████  | 10239/12776 [1:49:10<27:04,  1.56it/s]                                                        80%|████████  | 10239/12776 [1:49:10<27:04,  1.56it/s] 80%|████████  | 10240/12776 [1:49:11<30:38,  1.38it/s]                                                        80%|████████  | 10240/12776 [1:49:11<30:38,  1.38it/s] 80%|████████  | 10241/12776 [1:49:12<32:10,  1.31it/s]                                                        80%|████████  | 10241/12776 [1:49:12<32:10,  1.31it/s] 80%|████████  | 10242/12776 [1:49:13<31:45,  1.33it/s]                                                        80%|████████  | 10242/12776 [1:49:13<31:45,  1.33it/s] 80%|████████  | 10243/12776 [1:49:14<31:02,  1.36it/s]                                                        80%|████████  | 10243/12776 [1:49:14<31:02,  1.36it/s] 80%|████████  | 10244/12776 [1:49:14<30:39,  1.38it/s]                                                        80%|████████  | 10244/12776 [1:49:14<30:39,  1.38it/s] 80%|████████  | 10245/12776 [1:49:15<29:26,  1.43it/s]                                                        80%|████████  | 10245/12776 [1:49:15<29:26,  1.43it/s] 80%|████████  | 10246/12776 [1:49:15<28:04,  1.50it/s]                                                        80%|████████  | 10246/12776 [1:49:15<28:04,  1.50it/s] 80%|████████  | 10247/12776 [1:49:16<26:50,  1.57it/s]                                                        80%|████████  | 10247/12776 [1:49:16<26:50,  1.57it/s] 80%|████████  | 10248/12776 [1:49:17<25:41,  1.64it/s]                                                        80%|████████  | 10248/12776 [1:49:17<25:41,  1.64it/s] 80%|████████  | 10249/12776 [1:49:17<24:36,  1.71it/s]                                                        80%|████████  | 10249/12776 [1:49:17<24:36,  1.71it/s] 80%|████████  | 10250/12776 [1:49:18<24:40,  1.71it/s]                                                        80%|████████  | 10250/12776 [1:49:18<24:40,  1.71it/s] 80%|████████  | 10251/12776 [1:49:18<23:01,  1.83it/s]                                                        80%|████████  | 10251/12776 [1:49:18<23:01,  1.83it/s] 80%|████████  | 10252/12776 [1:49:19<23:13,  1.81it/s]                                                        80%|████████  | 10252/12776 [1:49:19<23:13,  1.81it/s] 80%|████████  | 10253/12776 [1:49:19<21:29,  1.96it/s]                                                        80%|████████  | 10253/12776 [1:49:19<21:29,  1.96it/s] 80%|████████  | 10254/12776 [1:49:20<22:00,  1.91it/s]                                                        80%|████████  | 10254/12776 [1:49:20<22:00,  1.91it/s] 80%|████████  | 10255/12776 [1:49:20<20:19,  2.07it/s]                                                        80%|████████  | 10255/12776 [1:49:20<20:19,  2.07it/s] 80%|████████  | 10256/12776 [1:49:20<19:03,  2.20it/s]                                                        80%|████████  | 10256/12776 [1:49:20<19:03,  2.20it/s] 80%|████████  | 10257/12776 [1:49:21<18:16,  2.30it/s]                                                        80%|████████  | 10257/12776 [1:49:21<18:16,  2.30it/s] 80%|████████  | 10258/12776 [1:49:21<17:11,  2.44it/s]                                                        80%|████████  | 10258/12776 [1:49:21<17:11,  2.44it/s] 80%|████████  | 10259/12776 [1:49:22<16:22,  2.56it/s]                                                        80%|████████  | 10259/12776 [1:49:22<16:22,  2.56it/s] 80%|████████  | 10260/12776 [1:49:22<16:50,  2.49it/s]                                                        80%|████████  | 10260/12776 [1:49:22<16:50,  2.49it/s] 80%|████████  | 10261/12776 [1:49:22<15:56,  2.63it/s]                                                        80%|████████  | 10261/12776 [1:49:22<15:56,  2.63it/s] 80%|████████  | 10262/12776 [1:49:23<14:59,  2.79it/s]                                                        80%|████████  | 10262/12776 [1:49:23<14:59,  2.79it/s] 80%|████████  | 10263/12776 [1:49:23<14:14,  2.94it/s]                                                        80%|████████  | 10263/12776 [1:49:23<14:14,  2.94it/s] 80%|████████  | 10264/12776 [1:49:23<14:14,  2.94it/s]                                                        80%|████████  | 10264/12776 [1:49:23<14:14,  2.94it/s] 80%|████████  | 10265/12776 [1:49:24<13:31,  3.10it/s]                                                        80%|████████  | 10265/12776 [1:49:24<13:31,  3.10it/s] 80%|████████  | 10266/12776 [1:49:24<12:57,  3.23it/s]                                                        80%|████████  | 10266/12776 [1:49:24<12:57,  3.23it/s] 80%|████████  | 10267/12776 [1:49:24<12:27,  3.36it/s]                                                        80%|████████  | 10267/12776 [1:49:24<12:27,  3.36it/s] 80%|████████  | 10268/12776 [1:49:24<12:30,  3.34it/s]                                                        80%|████████  | 10268/12776 [1:49:24<12:30,  3.34it/s] 80%|████████  | 10269/12776 [1:49:25<11:56,  3.50it/s]                                                        80%|████████  | 10269/12776 [1:49:25<11:56,  3.50it/s] 80%|████████  | 10270/12776 [1:49:25<11:27,  3.64it/s]                                                        80%|████████  | 10270/12776 [1:49:25<11:27,  3.64it/s] 80%|████████  | 10271/12776 [1:49:25<11:02,  3.78it/s]                                                        80%|████████  | 10271/12776 [1:49:25<11:02,  3.78it/s] 80%|████████  | 10272/12776 [1:49:25<12:09,  3.43it/s]                                                        80%|████████  | 10272/12776 [1:49:25<12:09,  3.43it/s] 80%|████████  | 10273/12776 [1:49:26<11:23,  3.66it/s]                                                        80%|████████  | 10273/12776 [1:49:26<11:23,  3.66it/s] 80%|████████  | 10274/12776 [1:49:26<10:43,  3.89it/s]                                                        80%|████████  | 10274/12776 [1:49:26<10:43,  3.89it/s] 80%|████████  | 10275/12776 [1:49:26<10:10,  4.10it/s]                                                        80%|████████  | 10275/12776 [1:49:26<10:10,  4.10it/s] 80%|████████  | 10276/12776 [1:49:26<09:43,  4.28it/s]                                                        80%|████████  | 10276/12776 [1:49:26<09:43,  4.28it/s] 80%|████████  | 10277/12776 [1:49:27<10:09,  4.10it/s]                                                        80%|████████  | 10277/12776 [1:49:27<10:09,  4.10it/s] 80%|████████  | 10278/12776 [1:49:27<09:26,  4.41it/s]                                                        80%|████████  | 10278/12776 [1:49:27<09:26,  4.41it/s] 80%|████████  | 10279/12776 [1:49:27<09:10,  4.53it/s]                                                        80%|████████  | 10279/12776 [1:49:27<09:10,  4.53it/s] 80%|████████  | 10280/12776 [1:49:27<09:00,  4.62it/s]                                                        80%|████████  | 10280/12776 [1:49:27<09:00,  4.62it/s] 80%|████████  | 10281/12776 [1:49:27<08:53,  4.68it/s]                                                        80%|████████  | 10281/12776 [1:49:27<08:53,  4.68it/s] 80%|████████  | 10282/12776 [1:49:28<10:30,  3.96it/s]                                                        80%|████████  | 10282/12776 [1:49:28<10:30,  3.96it/s] 80%|████████  | 10283/12776 [1:49:28<09:49,  4.23it/s]                                                        80%|████████  | 10283/12776 [1:49:28<09:49,  4.23it/s] 80%|████████  | 10284/12776 [1:49:28<09:19,  4.45it/s]                                                        80%|████████  | 10284/12776 [1:49:28<09:19,  4.45it/s] 81%|██████��█  | 10285/12776 [1:49:28<08:56,  4.64it/s]                                                        81%|████████  | 10285/12776 [1:49:28<08:56,  4.64it/s] 81%|████████  | 10286/12776 [1:49:29<08:35,  4.83it/s]                                                        81%|████████  | 10286/12776 [1:49:29<08:35,  4.83it/s] 81%|████████  | 10287/12776 [1:49:29<09:02,  4.59it/s]                                                        81%|████████  | 10287/12776 [1:49:29<09:02,  4.59it/s] 81%|████████  | 10288/12776 [1:49:29<15:15,  2.72it/s]                                                        81%|████████  | 10288/12776 [1:49:29<15:15,  2.72it/s] 81%|████████  | 10289/12776 [1:49:31<29:47,  1.39it/s]                                                        81%|████████  | 10289/12776 [1:49:31<29:47,  1.39it/s] 81%|████████  | 10290/12776 [1:49:32<32:25,  1.28it/s]                                                        81%|████████  | 10290/12776 [1:49:32<32:25,  1.28it/s] 81%|████████  | 10291/12776 [1:49:33<32:45,  1.26it/s]                                                        81%|████████  | 10291/12776 [1:49:33<32:45,  1.26it/s] 81%|████████  | 10292/12776 [1:49:34<33:37,  1.23it/s]                                                        81%|████████  | 10292/12776 [1:49:34<33:37,  1.23it/s] 81%|████████  | 10293/12776 [1:49:34<33:42,  1.23it/s]                                                        81%|████████  | 10293/12776 [1:49:34<33:42,  1.23it/s] 81%|████████  | 10294/12776 [1:49:35<32:04,  1.29it/s]                                                        81%|████████  | 10294/12776 [1:49:35<32:04,  1.29it/s] 81%|████████  | 10295/12776 [1:49:36<31:32,  1.31it/s]                                                        81%|████████  | 10295/12776 [1:49:36<31:32,  1.31it/s] 81%|████████  | 10296/12776 [1:49:36<29:46,  1.39it/s]                                                        81%|████████  | 10296/12776 [1:49:37<29:46,  1.39it/s] 81%|████████  | 10297/12776 [1:49:37<28:22,  1.46it/s]                                                        81%|████████  | 10297/12776 [1:49:37<28:22,  1.46it/s] 81%|████████  | 10298/12776 [1:49:38<26:41,  1.55it/s]                                                        81%|████████  | 10298/12776 [1:49:38<26:41,  1.55it/s] 81%|████████  | 10299/12776 [1:49:38<25:44,  1.60it/s]                                                        81%|████████  | 10299/12776 [1:49:38<25:44,  1.60it/s] 81%|████████  | 10300/12776 [1:49:39<24:18,  1.70it/s]                                                        81%|████████  | 10300/12776 [1:49:39<24:18,  1.70it/s] 81%|████████  | 10301/12776 [1:49:39<23:35,  1.75it/s]                                                        81%|████████  | 10301/12776 [1:49:39<23:35,  1.75it/s] 81%|████████  | 10302/12776 [1:49:40<22:08,  1.86it/s]                                                        81%|████████  | 10302/12776 [1:49:40<22:08,  1.86it/s] 81%|████████  | 10303/12776 [1:49:40<21:17,  1.94it/s]                                                        81%|████████  | 10303/12776 [1:49:40<21:17,  1.94it/s] 81%|████████  | 10304/12776 [1:49:41<20:10,  2.04it/s]                                                        81%|████████  | 10304/12776 [1:49:41<20:10,  2.04it/s] 81%|████████  | 10305/12776 [1:49:41<19:05,  2.16it/s]                                                        81%|████████  | 10305/12776 [1:49:41<19:05,  2.16it/s] 81%|████████  | 10306/12776 [1:49:41<19:05,  2.16it/s]                                                        81%|████████  | 10306/12776 [1:49:41<19:05,  2.16it/s] 81%|████████  | 10307/12776 [1:49:42<17:54,  2.30it/s]                                                        81%|████████  | 10307/12776 [1:49:42<17:54,  2.30it/s] 81%|████████  | 10308/12776 [1:49:42<16:50,  2.44it/s]                                                        81%|████████  | 10308/12776 [1:49:42<16:50,  2.44it/s] 81%|████████  | 10309/12776 [1:49:43<16:46,  2.45it/s]                                                        81%|████████  | 10309/12776 [1:49:43<16:46,  2.45it/s] 81%|████████  | 10310/12776 [1:49:43<15:52,  2.59it/s]                                                        81%|████████  | 10310/12776 [1:49:43<15:52,  2.59it/s] 81%|████████  | 10311/12776 [1:49:43<15:09,  2.71it/s]                                                       {'loss': 0.4667, 'grad_norm': 1.9263126850128174, 'learning_rate': 6.243890518084066e-05, 'epoch': 1.6}
+{'loss': 0.7877, 'grad_norm': 13.986974716186523, 'learning_rate': 6.241446725317692e-05, 'epoch': 1.6}
+{'loss': 0.2021, 'grad_norm': 0.6357542872428894, 'learning_rate': 6.23900293255132e-05, 'epoch': 1.6}
+{'loss': 0.728, 'grad_norm': 6.546449661254883, 'learning_rate': 6.236559139784945e-05, 'epoch': 1.6}
+{'loss': 0.843, 'grad_norm': 1.615708351135254, 'learning_rate': 6.234115347018573e-05, 'epoch': 1.6}
+{'loss': 1.1529, 'grad_norm': 2.3978271484375, 'learning_rate': 6.2316715542522e-05, 'epoch': 1.6}
+{'loss': 0.2389, 'grad_norm': 0.8120617866516113, 'learning_rate': 6.229227761485825e-05, 'epoch': 1.6}
+{'loss': 0.2102, 'grad_norm': 0.506904125213623, 'learning_rate': 6.226783968719452e-05, 'epoch': 1.6}
+{'loss': 0.1446, 'grad_norm': 0.43618541955947876, 'learning_rate': 6.224340175953079e-05, 'epoch': 1.6}
+{'loss': 0.3323, 'grad_norm': 0.7896573543548584, 'learning_rate': 6.221896383186705e-05, 'epoch': 1.6}
+{'loss': 0.2047, 'grad_norm': 0.5736374855041504, 'learning_rate': 6.219452590420332e-05, 'epoch': 1.6}
+{'loss': 0.3968, 'grad_norm': 0.6807544231414795, 'learning_rate': 6.217008797653958e-05, 'epoch': 1.6}
+{'loss': 0.3243, 'grad_norm': 0.7900474071502686, 'learning_rate': 6.214565004887585e-05, 'epoch': 1.6}
+{'loss': 0.1766, 'grad_norm': 0.6970692873001099, 'learning_rate': 6.212121212121211e-05, 'epoch': 1.6}
+{'loss': 0.2549, 'grad_norm': 2.053403854370117, 'learning_rate': 6.209677419354839e-05, 'epoch': 1.6}
+{'loss': 0.3394, 'grad_norm': 2.547222375869751, 'learning_rate': 6.207233626588464e-05, 'epoch': 1.6}
+{'loss': 0.2039, 'grad_norm': 0.9715545773506165, 'learning_rate': 6.204789833822092e-05, 'epoch': 1.6}
+{'loss': 0.3098, 'grad_norm': 0.9468798637390137, 'learning_rate': 6.202346041055719e-05, 'epoch': 1.6}
+{'loss': 0.1379, 'grad_norm': 0.8663145303726196, 'learning_rate': 6.199902248289344e-05, 'epoch': 1.6}
+{'loss': 0.3302, 'grad_norm': 0.8937069177627563, 'learning_rate': 6.197458455522971e-05, 'epoch': 1.6}
+{'loss': 0.3953, 'grad_norm': 1.2698081731796265, 'learning_rate': 6.195014662756598e-05, 'epoch': 1.61}
+{'loss': 0.2489, 'grad_norm': 0.827766478061676, 'learning_rate': 6.192570869990224e-05, 'epoch': 1.61}
+{'loss': 0.6099, 'grad_norm': 3.880368947982788, 'learning_rate': 6.190127077223851e-05, 'epoch': 1.61}
+{'loss': 0.5125, 'grad_norm': 2.3780124187469482, 'learning_rate': 6.187683284457477e-05, 'epoch': 1.61}
+{'loss': 0.2951, 'grad_norm': 2.895963191986084, 'learning_rate': 6.185239491691104e-05, 'epoch': 1.61}
+{'loss': 0.397, 'grad_norm': 0.9345550537109375, 'learning_rate': 6.18279569892473e-05, 'epoch': 1.61}
+{'loss': 0.51, 'grad_norm': 1.1102604866027832, 'learning_rate': 6.180351906158358e-05, 'epoch': 1.61}
+{'loss': 0.2253, 'grad_norm': 0.9499253630638123, 'learning_rate': 6.177908113391983e-05, 'epoch': 1.61}
+{'loss': 0.2358, 'grad_norm': 0.8864089250564575, 'learning_rate': 6.17546432062561e-05, 'epoch': 1.61}
+{'loss': 0.5664, 'grad_norm': 1.9460018873214722, 'learning_rate': 6.173020527859238e-05, 'epoch': 1.61}
+{'loss': 0.4159, 'grad_norm': 1.625748634338379, 'learning_rate': 6.170576735092863e-05, 'epoch': 1.61}
+{'loss': 0.7013, 'grad_norm': 2.510746479034424, 'learning_rate': 6.16813294232649e-05, 'epoch': 1.61}
+{'loss': 0.326, 'grad_norm': 3.4259462356567383, 'learning_rate': 6.165689149560117e-05, 'epoch': 1.61}
+{'loss': 0.3353, 'grad_norm': 1.4860423803329468, 'learning_rate': 6.163245356793744e-05, 'epoch': 1.61}
+{'loss': 0.6318, 'grad_norm': 1.9358034133911133, 'learning_rate': 6.16080156402737e-05, 'epoch': 1.61}
+{'loss': 0.5148, 'grad_norm': 2.1862995624542236, 'learning_rate': 6.158357771260997e-05, 'epoch': 1.61}
+{'loss': 0.8787, 'grad_norm': 1.741811990737915, 'learning_rate': 6.155913978494623e-05, 'epoch': 1.61}
+{'loss': 0.4273, 'grad_norm': 1.7040098905563354, 'learning_rate': 6.15347018572825e-05, 'epoch': 1.61}
+{'loss': 0.5667, 'grad_norm': 1.6641523838043213, 'learning_rate': 6.151026392961877e-05, 'epoch': 1.61}
+{'loss': 0.69, 'grad_norm': 1.784555435180664, 'learning_rate': 6.148582600195502e-05, 'epoch': 1.61}
+{'loss': 0.6946, 'grad_norm': 4.743298053741455, 'learning_rate': 6.146138807429129e-05, 'epoch': 1.61}
+{'loss': 1.1488, 'grad_norm': 5.469465732574463, 'learning_rate': 6.143695014662757e-05, 'epoch': 1.61}
+{'loss': 1.2612, 'grad_norm': 4.396662712097168, 'learning_rate': 6.141251221896382e-05, 'epoch': 1.61}
+{'loss': 0.768, 'grad_norm': 2.4530532360076904, 'learning_rate': 6.13880742913001e-05, 'epoch': 1.61}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 6.13880742913001e-05, 'epoch': 1.61}
+{'loss': 0.9859, 'grad_norm': 3.0336153507232666, 'learning_rate': 6.136363636363636e-05, 'epoch': 1.61}
+{'loss': 0.8229, 'grad_norm': 2.3858444690704346, 'learning_rate': 6.133919843597263e-05, 'epoch': 1.61}
+{'loss': 1.1625, 'grad_norm': 3.8104684352874756, 'learning_rate': 6.131476050830889e-05, 'epoch': 1.61}
+{'loss': 0.8723, 'grad_norm': 1.7261332273483276, 'learning_rate': 6.129032258064516e-05, 'epoch': 1.61}
+{'loss': 0.995, 'grad_norm': 1.7524932622909546, 'learning_rate': 6.126588465298142e-05, 'epoch': 1.61}
+{'loss': 1.0345, 'grad_norm': 2.208996057510376, 'learning_rate': 6.124144672531769e-05, 'epoch': 1.61}
+{'loss': 0.5331, 'grad_norm': 9.19688892364502, 'learning_rate': 6.121700879765395e-05, 'epoch': 1.61}
+{'loss': 0.705, 'grad_norm': 4.053257465362549, 'learning_rate': 6.119257086999022e-05, 'epoch': 1.61}
+{'loss': 0.8349, 'grad_norm': 2.6395530700683594, 'learning_rate': 6.116813294232648e-05, 'epoch': 1.61}
+{'loss': 0.9976, 'grad_norm': 5.501692771911621, 'learning_rate': 6.114369501466276e-05, 'epoch': 1.61}
+{'loss': 1.3165, 'grad_norm': 2.47517466545105, 'learning_rate': 6.111925708699901e-05, 'epoch': 1.61}
+{'loss': 0.1867, 'grad_norm': 0.6524291634559631, 'learning_rate': 6.109481915933529e-05, 'epoch': 1.61}
+{'loss': 0.2824, 'grad_norm': 0.988003134727478, 'learning_rate': 6.107038123167155e-05, 'epoch': 1.61}
+{'loss': 0.1858, 'grad_norm': 0.47198671102523804, 'learning_rate': 6.104594330400782e-05, 'epoch': 1.61}
+{'loss': 0.1501, 'grad_norm': 0.9640408754348755, 'learning_rate': 6.102150537634408e-05, 'epoch': 1.61}
+{'loss': 0.2762, 'grad_norm': 1.5908207893371582, 'learning_rate': 6.099706744868035e-05, 'epoch': 1.61}
+{'loss': 0.1985, 'grad_norm': 0.730593740940094, 'learning_rate': 6.097262952101661e-05, 'epoch': 1.61}
+{'loss': 0.2736, 'grad_norm': 0.45398107171058655, 'learning_rate': 6.094819159335288e-05, 'epoch': 1.61}
+{'loss': 0.3241, 'grad_norm': 1.1660237312316895, 'learning_rate': 6.092375366568914e-05, 'epoch': 1.61}
+{'loss': 0.2403, 'grad_norm': 0.7459734678268433, 'learning_rate': 6.089931573802541e-05, 'epoch': 1.61}
+{'loss': 0.3583, 'grad_norm': 1.4114768505096436, 'learning_rate': 6.087487781036168e-05, 'epoch': 1.61}
+{'loss': 0.2025, 'grad_norm': 0.7342216968536377, 'learning_rate': 6.0850439882697936e-05, 'epoch': 1.61}
+{'loss': 0.2146, 'grad_norm': 0.8763503432273865, 'learning_rate': 6.082600195503421e-05, 'epoch': 1.61}
+{'loss': 0.2597, 'grad_norm': 1.2437353134155273, 'learning_rate': 6.080156402737047e-05, 'epoch': 1.61}
+{'loss': 0.1298, 'grad_norm': 0.5917118191719055, 'learning_rate': 6.077712609970674e-05, 'epoch': 1.61}
+{'loss': 0.2692, 'grad_norm': 1.689225196838379, 'learning_rate': 6.0752688172043e-05, 'epoch': 1.61}
+{'loss': 0.3584, 'grad_norm': 1.53922438621521, 'learning_rate': 6.0728250244379274e-05, 'epoch': 1.61}
+{'loss': 0.4676, 'grad_norm': 1.4097189903259277, 'learning_rate': 6.070381231671553e-05, 'epoch': 1.61}
+{'loss': 0.4225, 'grad_norm': 2.491358757019043, 'learning_rate': 6.0679374389051803e-05, 'epoch': 1.61}
+{'loss': 0.5883, 'grad_norm': 2.754809617996216, 'learning_rate': 6.065493646138807e-05, 'epoch': 1.61}
+{'loss': 0.6693, 'grad_norm': 1.1778233051300049, 'learning_rate': 6.063049853372433e-05, 'epoch': 1.61}
+{'loss': 0.2029, 'grad_norm': 0.8539136052131653, 'learning_rate': 6.06060606060606e-05, 'epoch': 1.61}
+{'loss': 0.3794, 'grad_norm': 2.6600699424743652, 'learning_rate': 6.058162267839687e-05, 'epoch': 1.61}
+ 81%|████████  | 10311/12776 [1:49:43<15:09,  2.71it/s] 81%|████████  | 10312/12776 [1:49:44<15:59,  2.57it/s]                                                        81%|████████  | 10312/12776 [1:49:44<15:59,  2.57it/s] 81%|████████  | 10313/12776 [1:49:44<14:56,  2.75it/s]                                                        81%|████████  | 10313/12776 [1:49:44<14:56,  2.75it/s] 81%|████████  | 10314/12776 [1:49:44<14:04,  2.91it/s]                                                        81%|████████  | 10314/12776 [1:49:44<14:04,  2.91it/s] 81%|████████  | 10315/12776 [1:49:45<14:33,  2.82it/s]                                                        81%|████████  | 10315/12776 [1:49:45<14:33,  2.82it/s] 81%|████████  | 10316/12776 [1:49:45<13:40,  3.00it/s]                                                        81%|████████  | 10316/12776 [1:49:45<13:40,  3.00it/s] 81%|████████  | 10317/12776 [1:49:45<12:55,  3.17it/s]                                                        81%|████████  | 10317/12776 [1:49:45<12:55,  3.17it/s] 81%|████████  | 10318/12776 [1:49:46<12:18,  3.33it/s]                                                        81%|████████  | 10318/12776 [1:49:46<12:18,  3.33it/s] 81%|████████  | 10319/12776 [1:49:46<12:34,  3.26it/s]                                                        81%|████████  | 10319/12776 [1:49:46<12:34,  3.26it/s] 81%|████████  | 10320/12776 [1:49:46<11:54,  3.44it/s]                                                        81%|████████  | 10320/12776 [1:49:46<11:54,  3.44it/s] 81%|████████  | 10321/12776 [1:49:46<11:21,  3.60it/s]                                                        81%|████████  | 10321/12776 [1:49:46<11:21,  3.60it/s] 81%|████████  | 10322/12776 [1:49:47<10:55,  3.74it/s]                                                        81%|████████  | 10322/12776 [1:49:47<10:55,  3.74it/s] 81%|████████  | 10323/12776 [1:49:47<10:33,  3.87it/s]                                                        81%|████████  | 10323/12776 [1:49:47<10:33,  3.87it/s] 81%|████████  | 10324/12776 [1:49:47<10:55,  3.74it/s]                                                        81%|████████  | 10324/12776 [1:49:47<10:55,  3.74it/s] 81%|████████  | 10325/12776 [1:49:47<10:24,  3.93it/s]                                                        81%|████████  | 10325/12776 [1:49:47<10:24,  3.93it/s] 81%|████████  | 10326/12776 [1:49:48<09:58,  4.10it/s]                                                        81%|████████  | 10326/12776 [1:49:48<09:58,  4.10it/s] 81%|████████  | 10327/12776 [1:49:48<09:36,  4.25it/s]                                                        81%|████████  | 10327/12776 [1:49:48<09:36,  4.25it/s] 81%|████████  | 10328/12776 [1:49:48<09:17,  4.39it/s]                                                        81%|████████  | 10328/12776 [1:49:48<09:17,  4.39it/s] 81%|████████  | 10329/12776 [1:49:48<10:21,  3.93it/s]                                                        81%|████████  | 10329/12776 [1:49:48<10:21,  3.93it/s] 81%|████████  | 10330/12776 [1:49:48<09:46,  4.17it/s]                                                        81%|████████  | 10330/12776 [1:49:48<09:46,  4.17it/s] 81%|████████  | 10331/12776 [1:49:49<09:22,  4.35it/s]                                                        81%|████████  | 10331/12776 [1:49:49<09:22,  4.35it/s] 81%|████████  | 10332/12776 [1:49:49<08:59,  4.53it/s]                                                        81%|████████  | 10332/12776 [1:49:49<08:59,  4.53it/s] 81%|████████  | 10333/12776 [1:49:49<08:42,  4.67it/s]                                                        81%|████████  | 10333/12776 [1:49:49<08:42,  4.67it/s] 81%|████████  | 10334/12776 [1:49:49<09:29,  4.28it/s]                                                        81%|████████  | 10334/12776 [1:49:49<09:29,  4.28it/s] 81%|████████  | 10335/12776 [1:49:50<08:58,  4.53it/s]                                                        81%|████████  | 10335/12776 [1:49:50<08:58,  4.53it/s] 81%|████████  | 10336/12776 [1:49:50<08:35,  4.73it/s]                                                        81%|████████  | 10336/12776 [1:49:50<08:35,  4.73it/s] 81%|████████  | 10337/12776 [1:49:50<08:17,  4.90it/s]                                                        81%|████████  | 10337/12776 [1:49:50<08:17,  4.90it/s] 81%|████████  | 10338/12776 [1:49:51<13:54,  2.92it/s]                                                        81%|████████  | 10338/12776 [1:49:51<13:54,  2.92it/s] 81%|████████  | 10339/12776 [1:49:52<26:30,  1.53it/s]                                                        81%|████████  | 10339/12776 [1:49:52<26:30,  1.53it/s] 81%|████████  | 10340/12776 [1:49:53<30:02,  1.35it/s]                                                        81%|████████  | 10340/12776 [1:49:53<30:02,  1.35it/s] 81%|████████  | 10341/12776 [1:49:54<32:43,  1.24it/s]                                                        81%|████████  | 10341/12776 [1:49:54<32:43,  1.24it/s] 81%|████████  | 10342/12776 [1:49:55<32:40,  1.24it/s]                                                        81%|████████  | 10342/12776 [1:49:55<32:40,  1.24it/s] 81%|████████  | 10343/12776 [1:49:55<32:12,  1.26it/s]                                                        81%|████████  | 10343/12776 [1:49:55<32:12,  1.26it/s] 81%|████████  | 10344/12776 [1:49:56<31:22,  1.29it/s]                                                        81%|████████  | 10344/12776 [1:49:56<31:22,  1.29it/s] 81%|████████  | 10345/12776 [1:49:57<29:59,  1.35it/s]                                                        81%|████████  | 10345/12776 [1:49:57<29:59,  1.35it/s] 81%|████████  | 10346/12776 [1:49:58<30:16,  1.34it/s]                                                        81%|████████  | 10346/12776 [1:49:58<30:16,  1.34it/s] 81%|████████  | 10347/12776 [1:49:58<28:17,  1.43it/s]                                                        81%|████████  | 10347/12776 [1:49:58<28:17,  1.43it/s] 81%|████████  | 10348/12776 [1:49:59<27:18,  1.48it/s]                                                        81%|████████  | 10348/12776 [1:49:59<27:18,  1.48it/s] 81%|████████  | 10349/12776 [1:49:59<25:37,  1.58it/s]                                                        81%|████████  | 10349/12776 [1:49:59<25:37,  1.58it/s] 81%|████████  | 10350/12776 [1:50:00<24:57,  1.62it/s]                                                        81%|████████  | 10350/12776 [1:50:00<24:57,  1.62it/s] 81%|████████  | 10351/12776 [1:50:00<23:27,  1.72it/s]                                                        81%|████████  | 10351/12776 [1:50:00<23:27,  1.72it/s] 81%|████████  | 10352/12776 [1:50:01<22:45,  1.78it/s]                                                        81%|████████  | 10352/12776 [1:50:01<22:45,  1.78it/s] 81%|████████  | 10353/12776 [1:50:01<21:18,  1.90it/s]                                                        81%|████████  | 10353/12776 [1:50:01<21:18,  1.90it/s] 81%|████████  | 10354/12776 [1:50:02<20:43,  1.95it/s]                                                        81%|████████  | 10354/12776 [1:50:02<20:43,  1.95it/s] 81%|████████  | 10355/12776 [1:50:02<19:22,  2.08it/s]                                                        81%|████████  | 10355/12776 [1:50:02<19:22,  2.08it/s] 81%|████████  | 10356/12776 [1:50:03<18:12,  2.22it/s]                                                        81%|████████  | 10356/12776 [1:50:03<18:12,  2.22it/s] 81%|████████  | 10357/12776 [1:50:03<17:18,  2.33it/s]                                                        81%|████████  | 10357/12776 [1:50:03<17:18,  2.33it/s] 81%|████████  | 10358/12776 [1:50:03<16:17,  2.47it/s]                                                        81%|████████  | 10358/12776 [1:50:03<16:17,  2.47it/s] 81%|████████  | 10359/12776 [1:50:04<15:28,  2.60it/s]                                                        81%|████████  | 10359/12776 [1:50:04<15:28,  2.60it/s] 81%|████████  | 10360/12776 [1:50:04<16:10,  2.49it/s]                                                        81%|████████  | 10360/12776 [1:50:04<16:10,  2.49it/s] 81%|████████  | 10361/12776 [1:50:04<15:14,  2.64it/s]                                                        81%|████████  | 10361/12776 [1:50:05<15:14,  2.64it/s] 81%|████████  | 10362/12776 [1:50:05<14:23,  2.80it/s]                                                        81%|████████  | 10362/12776 [1:50:05<14:23,  2.80it/s] 81%|████████  | 10363/12776 [1:50:05<13:42,  2.93it/s]                                                        81%|████████  | 10363/12776 [1:50:05<13:42,  2.93it/s] 81%|████████  | 10364/12776 [1:50:05<13:43,  2.93it/s]                                                        81%|████████  | 10364/12776 [1:50:05<13:43,  2.93it/s] 81%|████████  | 10365/12776 [1:50:06<13:06,  3.07it/s]                                                        81%|████████  | 10365/12776 [1:50:06<13:06,  3.07it/s] 81%|████████  | 10366/12776 [1:50:06<12:33,  3.20it/s]                                                        81%|████████  | 10366/12776 [1:50:06<12:33,  3.20it/s] 81%|████████  | 10367/12776 [1:50:06<12:07,  3.31it/s]                                                        81%|████████  | 10367/12776 [1:50:06<12:07,  3.31it/s] 81%|████████  | 10368/12776 [1:50:07<11:58,  3.35it/s]                                                        81%|████████  | 10368/12776 [1:50:07<11:58,  3.35it/s] 81%|████████  | 10369/12776 [1:50:07<11:33,  3.47it/s]                                                        81%|████████  | 10369/12776 [1:50:07<11:33,  3.47it/s] 81%|████████  | 10370/12776 [1:50:07<11:03,  3.63it/s]                                                        81%|████████  | 10370/12776 [1:50:07<11:03,  3.63it/s] 81%|████████  | 10371/12776 [1:50:07<10:44,  3.73it/s]                                                        81%|████████  | 10371/12776 [1:50:07<10:44,  3.73it/s] 81%|████████  | 10372/12776 [1:50:08<12:09,  3.29it/s]                                                        81%|████████  | 10372/12776 [1:50:08<12:09,  3.29it/s] 81%|████████  | 10373/12776 [1:50:08<11:24,  3.51it/s]                                                        81%|████████  | 10373/12776 [1:50:08<11:24,  3.51it/s] 81%|████████  | 10374/12776 [1:50:08<10:46,  3.71it/s]                                                        81%|████████  | 10374/12776 [1:50:08<10:46,  3.71it/s] 81%|████████  | 10375/12776 [1:50:08<10:13,  3.92it/s]                                                        81%|████████  | 10375/12776 [1:50:08<10:13,  3.92it/s] 81%|████████  | 10376/12776 [1:50:09<11:01,  3.63it/s]                                                        81%|████████  | 10376/12776 [1:50:09<11:01,  3.63it/s] 81%|████████  | 10377/12776 [1:50:09<10:16,  3.89it/s]                                                        81%|████████  | 10377/12776 [1:50:09<10:16,  3.89it/s] 81%|████████  | 10378/12776 [1:50:09<09:43,  4.11it/s]                                                        81%|████████  | 10378/12776 [1:50:09<09:43,  4.11it/s] 81%|████████  | 10379/12776 [1:50:09<09:18,  4.30it/s]                                                        81%|████████  | 10379/12776 [1:50:09<09:18,  4.30it/s] 81%|████████  | 10380/12776 [1:50:10<08:57,  4.45it/s]                                                        81%|████████  | 10380/12776 [1:50:10<08:57,  4.45it/s] 81%|████████▏ | 10381/12776 [1:50:10<09:44,  4.10it/s]                                                        81%|████████▏ | 10381/12776 [1:50:10<09:44,  4.10it/s] 81%|████████▏ | 10382/12776 [1:50:10<09:14,  4.32it/s]                                                        81%|████████▏ | 10382/12776 [1:50:10<09:14,  4.32it/s] 81%|████████▏ | 10383/12776 [1:50:10<08:50,  4.51it/s]                                                        81%|████████▏ | 10383/12776 [1:50:10<08:50,  4.51it/s] 81%|████████▏ | 10384/12776 [1:50:10<08:33,  4.66it/s]                                                        81%|████████▏ | 10384/12776 [1:50:10<08:33,  4.66it/s] 81%|████████▏ | 10385/12776 [1:50:11<08:18,  4.80it/s]                                                        81%|████████▏ | 10385/12776 [1:50:11<08:18,  4.80it/s] 81%|████████▏ | 10386/12776 [1:50:11<08:03,  4.94it/s]                                                        81%|████████▏ | 10386/12776 [1:50:11<08:03,  4.94it/s] 81%|████████▏ | 10387/12776 [1:50:11<08:37,  4.61it/s]                                                        81%|████████▏ | 10387/12776 [1:50:11<08:37,  4.61it/s] 81%|████████▏ | 10388/12776 [1:50:12<14:12,  2.80it/s]                                                        81%|████████▏ | 10388/12776 [1:50:12<14:12,  2.80it/s] 81%|████████▏ | 10389/12776 [1:50:13<27:34,  1.44it/s]                                                       {'loss': 0.3923, 'grad_norm': 1.350770354270935, 'learning_rate': 6.055718475073313e-05, 'epoch': 1.61}
+{'loss': 0.4274, 'grad_norm': 2.732351303100586, 'learning_rate': 6.05327468230694e-05, 'epoch': 1.61}
+{'loss': 0.5387, 'grad_norm': 1.7970898151397705, 'learning_rate': 6.0508308895405664e-05, 'epoch': 1.61}
+{'loss': 0.4457, 'grad_norm': 2.37106990814209, 'learning_rate': 6.048387096774193e-05, 'epoch': 1.61}
+{'loss': 0.6682, 'grad_norm': 1.6102174520492554, 'learning_rate': 6.0459433040078193e-05, 'epoch': 1.61}
+{'loss': 0.7914, 'grad_norm': 1.9735941886901855, 'learning_rate': 6.0434995112414465e-05, 'epoch': 1.61}
+{'loss': 0.6588, 'grad_norm': 1.6379317045211792, 'learning_rate': 6.041055718475072e-05, 'epoch': 1.62}
+{'loss': 0.6006, 'grad_norm': 1.541721224784851, 'learning_rate': 6.0386119257086995e-05, 'epoch': 1.62}
+{'loss': 0.6219, 'grad_norm': 5.82126522064209, 'learning_rate': 6.036168132942326e-05, 'epoch': 1.62}
+{'loss': 0.4162, 'grad_norm': 3.0988450050354004, 'learning_rate': 6.0337243401759524e-05, 'epoch': 1.62}
+{'loss': 1.2557, 'grad_norm': 3.262086868286133, 'learning_rate': 6.031280547409579e-05, 'epoch': 1.62}
+{'loss': 0.8396, 'grad_norm': 1.9713597297668457, 'learning_rate': 6.028836754643206e-05, 'epoch': 1.62}
+{'loss': 0.5553, 'grad_norm': 2.0286524295806885, 'learning_rate': 6.026392961876832e-05, 'epoch': 1.62}
+{'loss': 0.5284, 'grad_norm': 2.3841817378997803, 'learning_rate': 6.023949169110459e-05, 'epoch': 1.62}
+{'loss': 0.7772, 'grad_norm': 2.634639024734497, 'learning_rate': 6.0215053763440855e-05, 'epoch': 1.62}
+{'loss': 0.5422, 'grad_norm': 3.448430299758911, 'learning_rate': 6.019061583577712e-05, 'epoch': 1.62}
+{'loss': 1.0043, 'grad_norm': 1.8632174730300903, 'learning_rate': 6.0166177908113385e-05, 'epoch': 1.62}
+{'loss': 0.906, 'grad_norm': 3.0576517581939697, 'learning_rate': 6.0141739980449656e-05, 'epoch': 1.62}
+{'loss': 1.7748, 'grad_norm': 6.012516975402832, 'learning_rate': 6.0117302052785914e-05, 'epoch': 1.62}
+{'loss': 0.441, 'grad_norm': 1.4931806325912476, 'learning_rate': 6.0092864125122186e-05, 'epoch': 1.62}
+{'loss': 1.212, 'grad_norm': 2.6197221279144287, 'learning_rate': 6.006842619745845e-05, 'epoch': 1.62}
+{'loss': 0.9955, 'grad_norm': 2.8146133422851562, 'learning_rate': 6.0043988269794715e-05, 'epoch': 1.62}
+{'loss': 1.4292, 'grad_norm': 2.633820056915283, 'learning_rate': 6.001955034213098e-05, 'epoch': 1.62}
+{'loss': 1.3371, 'grad_norm': 5.142744541168213, 'learning_rate': 5.999511241446725e-05, 'epoch': 1.62}
+{'loss': 0.5765, 'grad_norm': 1.9300106763839722, 'learning_rate': 5.997067448680351e-05, 'epoch': 1.62}
+{'loss': 0.6505, 'grad_norm': 3.921834945678711, 'learning_rate': 5.994623655913978e-05, 'epoch': 1.62}
+{'loss': 0.8817, 'grad_norm': 2.213167905807495, 'learning_rate': 5.9921798631476046e-05, 'epoch': 1.62}
+{'loss': 0.6968, 'grad_norm': 2.0147557258605957, 'learning_rate': 5.989736070381231e-05, 'epoch': 1.62}
+{'loss': 0.1943, 'grad_norm': 0.6969068646430969, 'learning_rate': 5.9872922776148576e-05, 'epoch': 1.62}
+{'loss': 0.18, 'grad_norm': 0.5584667921066284, 'learning_rate': 5.984848484848485e-05, 'epoch': 1.62}
+{'loss': 0.2321, 'grad_norm': 0.6967960596084595, 'learning_rate': 5.9824046920821105e-05, 'epoch': 1.62}
+{'loss': 0.135, 'grad_norm': 0.3680100440979004, 'learning_rate': 5.979960899315738e-05, 'epoch': 1.62}
+{'loss': 0.2955, 'grad_norm': 0.755924642086029, 'learning_rate': 5.977517106549364e-05, 'epoch': 1.62}
+{'loss': 0.2681, 'grad_norm': 2.2289376258850098, 'learning_rate': 5.9750733137829907e-05, 'epoch': 1.62}
+{'loss': 0.1985, 'grad_norm': 1.0998948812484741, 'learning_rate': 5.972629521016617e-05, 'epoch': 1.62}
+{'loss': 0.266, 'grad_norm': 0.6350347399711609, 'learning_rate': 5.970185728250244e-05, 'epoch': 1.62}
+{'loss': 0.1386, 'grad_norm': 0.7343830466270447, 'learning_rate': 5.96774193548387e-05, 'epoch': 1.62}
+{'loss': 0.4067, 'grad_norm': 1.0701186656951904, 'learning_rate': 5.965298142717497e-05, 'epoch': 1.62}
+{'loss': 0.5609, 'grad_norm': 1.645283818244934, 'learning_rate': 5.962854349951124e-05, 'epoch': 1.62}
+{'loss': 0.2413, 'grad_norm': 1.122719168663025, 'learning_rate': 5.96041055718475e-05, 'epoch': 1.62}
+{'loss': 0.3426, 'grad_norm': 0.7150799036026001, 'learning_rate': 5.957966764418377e-05, 'epoch': 1.62}
+{'loss': 0.1765, 'grad_norm': 0.598060131072998, 'learning_rate': 5.955522971652004e-05, 'epoch': 1.62}
+{'loss': 0.29, 'grad_norm': 0.8176664710044861, 'learning_rate': 5.9530791788856297e-05, 'epoch': 1.62}
+{'loss': 0.3729, 'grad_norm': 1.5242834091186523, 'learning_rate': 5.950635386119257e-05, 'epoch': 1.62}
+{'loss': 0.7854, 'grad_norm': 1.9225590229034424, 'learning_rate': 5.948191593352883e-05, 'epoch': 1.62}
+{'loss': 0.5056, 'grad_norm': 1.2514513731002808, 'learning_rate': 5.94574780058651e-05, 'epoch': 1.62}
+{'loss': 0.4021, 'grad_norm': 1.7647795677185059, 'learning_rate': 5.943304007820136e-05, 'epoch': 1.62}
+{'loss': 0.3619, 'grad_norm': 0.9567508697509766, 'learning_rate': 5.9408602150537634e-05, 'epoch': 1.62}
+{'loss': 0.4833, 'grad_norm': 1.5597784519195557, 'learning_rate': 5.938416422287389e-05, 'epoch': 1.62}
+{'loss': 0.4786, 'grad_norm': 2.9853146076202393, 'learning_rate': 5.9359726295210164e-05, 'epoch': 1.62}
+{'loss': 0.4453, 'grad_norm': 1.8324689865112305, 'learning_rate': 5.933528836754643e-05, 'epoch': 1.62}
+{'loss': 0.5207, 'grad_norm': 2.3607141971588135, 'learning_rate': 5.931085043988269e-05, 'epoch': 1.62}
+{'loss': 0.5193, 'grad_norm': 1.914950966835022, 'learning_rate': 5.928641251221896e-05, 'epoch': 1.62}
+{'loss': 0.6317, 'grad_norm': 2.1781463623046875, 'learning_rate': 5.926197458455523e-05, 'epoch': 1.62}
+{'loss': 0.7715, 'grad_norm': 4.128535270690918, 'learning_rate': 5.923753665689149e-05, 'epoch': 1.62}
+{'loss': 0.7828, 'grad_norm': 3.1875927448272705, 'learning_rate': 5.921309872922776e-05, 'epoch': 1.62}
+{'loss': 0.6662, 'grad_norm': 1.828016996383667, 'learning_rate': 5.9188660801564024e-05, 'epoch': 1.62}
+{'loss': 0.647, 'grad_norm': 2.3392691612243652, 'learning_rate': 5.916422287390029e-05, 'epoch': 1.62}
+{'loss': 0.8108, 'grad_norm': 2.4134435653686523, 'learning_rate': 5.9139784946236554e-05, 'epoch': 1.62}
+{'loss': 0.326, 'grad_norm': 0.9342465996742249, 'learning_rate': 5.9115347018572825e-05, 'epoch': 1.62}
+{'loss': 0.6187, 'grad_norm': 2.0880491733551025, 'learning_rate': 5.909090909090908e-05, 'epoch': 1.62}
+{'loss': 0.5137, 'grad_norm': 1.288293719291687, 'learning_rate': 5.9066471163245355e-05, 'epoch': 1.62}
+{'loss': 0.7306, 'grad_norm': 2.43259596824646, 'learning_rate': 5.904203323558162e-05, 'epoch': 1.62}
+{'loss': 0.7173, 'grad_norm': 1.6936664581298828, 'learning_rate': 5.901759530791788e-05, 'epoch': 1.62}
+{'loss': 0.3325, 'grad_norm': 1.1513222455978394, 'learning_rate': 5.899315738025415e-05, 'epoch': 1.62}
+{'loss': 0.8307, 'grad_norm': 2.4946534633636475, 'learning_rate': 5.896871945259042e-05, 'epoch': 1.62}
+{'loss': 1.3256, 'grad_norm': 3.35575270652771, 'learning_rate': 5.894428152492668e-05, 'epoch': 1.62}
+{'loss': 0.6945, 'grad_norm': 8.07103157043457, 'learning_rate': 5.891984359726295e-05, 'epoch': 1.62}
+{'loss': 0.9647, 'grad_norm': 2.7447383403778076, 'learning_rate': 5.8895405669599215e-05, 'epoch': 1.62}
+{'loss': 1.3698, 'grad_norm': 2.4406001567840576, 'learning_rate': 5.887096774193547e-05, 'epoch': 1.62}
+{'loss': 0.7408, 'grad_norm': 6.9520978927612305, 'learning_rate': 5.8846529814271745e-05, 'epoch': 1.63}
+{'loss': 1.0021, 'grad_norm': 2.9394330978393555, 'learning_rate': 5.8822091886608016e-05, 'epoch': 1.63}
+{'loss': 1.1361, 'grad_norm': 3.4917502403259277, 'learning_rate': 5.8797653958944274e-05, 'epoch': 1.63}
+{'loss': 0.2525, 'grad_norm': 1.3461923599243164, 'learning_rate': 5.8773216031280546e-05, 'epoch': 1.63}
+{'loss': 0.7553, 'grad_norm': 1.8824841976165771, 'learning_rate': 5.874877810361681e-05, 'epoch': 1.63}
+{'loss': 0.5558, 'grad_norm': 1.4790881872177124, 'learning_rate': 5.872434017595307e-05, 'epoch': 1.63}
+{'loss': 0.4602, 'grad_norm': 1.9646244049072266, 'learning_rate': 5.869990224828934e-05, 'epoch': 1.63}
+{'loss': 0.5251, 'grad_norm': 1.7830438613891602, 'learning_rate': 5.867546432062561e-05, 'epoch': 1.63}
+ 81%|████████▏ | 10389/12776 [1:50:13<27:34,  1.44it/s] 81%|████████▏ | 10390/12776 [1:50:14<30:38,  1.30it/s]                                                        81%|████████▏ | 10390/12776 [1:50:14<30:38,  1.30it/s] 81%|████████▏ | 10391/12776 [1:50:15<31:40,  1.26it/s]                                                        81%|████████▏ | 10391/12776 [1:50:15<31:40,  1.26it/s] 81%|████████▏ | 10392/12776 [1:50:16<32:30,  1.22it/s]                                                        81%|████████▏ | 10392/12776 [1:50:16<32:30,  1.22it/s] 81%|████████▏ | 10393/12776 [1:50:17<32:20,  1.23it/s]                                                        81%|████████▏ | 10393/12776 [1:50:17<32:20,  1.23it/s] 81%|████████▏ | 10394/12776 [1:50:17<31:12,  1.27it/s]                                                        81%|████████▏ | 10394/12776 [1:50:17<31:12,  1.27it/s] 81%|████████▏ | 10395/12776 [1:50:18<30:33,  1.30it/s]                                                        81%|████████▏ | 10395/12776 [1:50:18<30:33,  1.30it/s] 81%|████████▏ | 10396/12776 [1:50:19<29:13,  1.36it/s]                                                        81%|████████▏ | 10396/12776 [1:50:19<29:13,  1.36it/s] 81%|████████▏ | 10397/12776 [1:50:19<27:52,  1.42it/s]                                                        81%|████████▏ | 10397/12776 [1:50:19<27:52,  1.42it/s] 81%|████████▏ | 10398/12776 [1:50:20<26:28,  1.50it/s]                                                        81%|████████▏ | 10398/12776 [1:50:20<26:28,  1.50it/s] 81%|████████▏ | 10399/12776 [1:50:21<25:12,  1.57it/s]                                                        81%|████████▏ | 10399/12776 [1:50:21<25:12,  1.57it/s] 81%|████████▏ | 10400/12776 [1:50:21<24:07,  1.64it/s]                                                        81%|████████▏ | 10400/12776 [1:50:21<24:07,  1.64it/s]Saving model checkpoint to ./checkpoint-10400
+Configuration saved in ./checkpoint-10400/config.json
+Model weights saved in ./checkpoint-10400/model.safetensors
+Feature extractor saved in ./checkpoint-10400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-10400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-10400/special_tokens_map.json
+added tokens file saved in ./checkpoint-10400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-9200] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 81%|████████▏ | 10401/12776 [1:50:27<1:26:56,  2.20s/it]                                                          81%|████████▏ | 10401/12776 [1:50:27<1:26:56,  2.20s/it] 81%|████████▏ | 10402/12776 [1:50:28<1:06:20,  1.68s/it]                                                          81%|████████▏ | 10402/12776 [1:50:28<1:06:20,  1.68s/it] 81%|████████▏ | 10403/12776 [1:50:28<51:28,  1.30s/it]                                                          81%|████████▏ | 10403/12776 [1:50:28<51:28,  1.30s/it] 81%|████████▏ | 10404/12776 [1:50:29<42:18,  1.07s/it]                                                        81%|████████▏ | 10404/12776 [1:50:29<42:18,  1.07s/it] 81%|████████▏ | 10405/12776 [1:50:29<34:15,  1.15it/s]                                                        81%|████████▏ | 10405/12776 [1:50:29<34:15,  1.15it/s] 81%|████████▏ | 10406/12776 [1:50:29<30:35,  1.29it/s]                                                        81%|████████▏ | 10406/12776 [1:50:29<30:35,  1.29it/s] 81%|████████▏ | 10407/12776 [1:50:30<25:40,  1.54it/s]                                                        81%|████████▏ | 10407/12776 [1:50:30<25:40,  1.54it/s] 81%|████████▏ | 10408/12776 [1:50:30<22:07,  1.78it/s]                                                        81%|████████▏ | 10408/12776 [1:50:30<22:07,  1.78it/s] 81%|████████▏ | 10409/12776 [1:50:31<20:12,  1.95it/s]                                                        81%|████████▏ | 10409/12776 [1:50:31<20:12,  1.95it/s] 81%|████████▏ | 10410/12776 [1:50:31<17:55,  2.20it/s]                                                        81%|████████▏ | 10410/12776 [1:50:31<17:55,  2.20it/s] 81%|████████▏ | 10411/12776 [1:50:31<16:12,  2.43it/s]                                                        81%|████████▏ | 10411/12776 [1:50:31<16:12,  2.43it/s] 81%|████████▏ | 10412/12776 [1:50:32<14:53,  2.65it/s]                                                        81%|████████▏ | 10412/12776 [1:50:32<14:53,  2.65it/s] 82%|████████▏ | 10413/12776 [1:50:32<14:12,  2.77it/s]                                                        82%|████████▏ | 10413/12776 [1:50:32<14:12,  2.77it/s] 82%|████████▏ | 10414/12776 [1:50:32<13:12,  2.98it/s]                                                        82%|████████▏ | 10414/12776 [1:50:32<13:12,  2.98it/s] 82%|████████▏ | 10415/12776 [1:50:32<12:23,  3.18it/s]                                                        82%|████████▏ | 10415/12776 [1:50:32<12:23,  3.18it/s] 82%|████████▏ | 10416/12776 [1:50:33<11:42,  3.36it/s]                                                        82%|████████▏ | 10416/12776 [1:50:33<11:42,  3.36it/s] 82%|████████▏ | 10417/12776 [1:50:33<11:49,  3.32it/s]                                                        82%|████████▏ | 10417/12776 [1:50:33<11:49,  3.32it/s] 82%|████████▏ | 10418/12776 [1:50:33<11:09,  3.52it/s]                                                        82%|████████▏ | 10418/12776 [1:50:33<11:09,  3.52it/s] 82%|████████▏ | 10419/12776 [1:50:33<10:37,  3.70it/s]                                                        82%|████████▏ | 10419/12776 [1:50:33<10:37,  3.70it/s] 82%|████████▏ | 10420/12776 [1:50:34<10:08,  3.87it/s]                                                        82%|████████▏ | 10420/12776 [1:50:34<10:08,  3.87it/s] 82%|████████▏ | 10421/12776 [1:50:34<10:38,  3.69it/s]                                                        82%|████████▏ | 10421/12776 [1:50:34<10:38,  3.69it/s] 82%|████████▏ | 10422/12776 [1:50:34<10:02,  3.91it/s]                                                        82%|████████▏ | 10422/12776 [1:50:34<10:02,  3.91it/s] 82%|████████▏ | 10423/12776 [1:50:34<09:34,  4.10it/s]                                                        82%|████████▏ | 10423/12776 [1:50:34<09:34,  4.10it/s] 82%|████████▏ | 10424/12776 [1:50:35<09:09,  4.28it/s]                                                        82%|████████▏ | 10424/12776 [1:50:35<09:09,  4.28it/s] 82%|████████▏ | 10425/12776 [1:50:35<08:48,  4.45it/s]                                                        82%|████████▏ | 10425/12776 [1:50:35<08:48,  4.45it/s] 82%|████████▏ | 10426/12776 [1:50:35<09:22,  4.17it/s]                                                        82%|████████▏ | 10426/12776 [1:50:35<09:22,  4.17it/s] 82%|████████▏ | 10427/12776 [1:50:35<08:56,  4.38it/s]                                                        82%|████████▏ | 10427/12776 [1:50:35<08:56,  4.38it/s] 82%|████████▏ | 10428/12776 [1:50:35<08:30,  4.60it/s]                                                        82%|████████▏ | 10428/12776 [1:50:35<08:30,  4.60it/s] 82%|████████▏ | 10429/12776 [1:50:36<08:09,  4.79it/s]                                                        82%|████████▏ | 10429/12776 [1:50:36<08:09,  4.79it/s] 82%|████████▏ | 10430/12776 [1:50:36<07:51,  4.97it/s]                                                        82%|████████▏ | 10430/12776 [1:50:36<07:51,  4.97it/s] 82%|████████▏ | 10431/12776 [1:50:36<07:37,  5.12it/s]                                                        82%|████████▏ | 10431/12776 [1:50:36<07:37,  5.12it/s] 82%|████████▏ | 10432/12776 [1:50:36<08:54,  4.39it/s]                                                        82%|████████▏ | 10432/12776 [1:50:36<08:54,  4.39it/s] 82%|████████▏ | 10433/12776 [1:50:37<08:17,  4.71it/s]                                                        82%|████████▏ | 10433/12776 [1:50:37<08:17,  4.71it/s] 82%|████████▏ | 10434/12776 [1:50:37<07:49,  4.99it/s]                                                        82%|████████▏ | 10434/12776 [1:50:37<07:49,  4.99it/s] 82%|████████▏ | 10435/12776 [1:50:37<07:28,  5.22it/s]                                                        82%|████████▏ | 10435/12776 [1:50:37<07:28,  5.22it/s] 82%|████████▏ | 10436/12776 [1:50:37<07:08,  5.46it/s]                                                        82%|████████▏ | 10436/12776 [1:50:37<07:08,  5.46it/s] 82%|████████▏ | 10437/12776 [1:50:37<06:56,  5.62it/s]                                                        82%|████████▏ | 10437/12776 [1:50:37<06:56,  5.62it/s] 82%|████████▏ | 10438/12776 [1:50:38<12:37,  3.09it/s]                                                        82%|████████▏ | 10438/12776 [1:50:38<12:37,  3.09it/s] 82%|████████▏ | 10439/12776 [1:50:39<23:26,  1.66it/s]                                                        82%|████████▏ | 10439/12776 [1:50:39<23:26,  1.66it/s] 82%|████████▏ | 10440/12776 [1:50:40<27:54,  1.39it/s]                                                        82%|████████▏ | 10440/12776 [1:50:40<27:54,  1.39it/s] 82%|████████▏ | 10441/12776 [1:50:41<29:41,  1.31it/s]                                                        82%|████████▏ | 10441/12776 [1:50:41<29:41,  1.31it/s] 82%|████████▏ | 10442/12776 [1:50:42<29:23,  1.32it/s]                                                        82%|████████▏ | 10442/12776 [1:50:42<29:23,  1.32it/s] 82%|████████▏ | 10443/12776 [1:50:42<29:14,  1.33it/s]                                                        82%|████████▏ | 10443/12776 [1:50:42<29:14,  1.33it/s] 82%|████████▏ | 10444/12776 [1:50:43<29:09,  1.33it/s]                                                        82%|████████▏ | 10444/12776 [1:50:43<29:09,  1.33it/s] 82%|████████▏ | 10445/12776 [1:50:44<27:29,  1.41it/s]                                                        82%|████████▏ | 10445/12776 [1:50:44<27:29,  1.41it/s] 82%|████████▏ | 10446/12776 [1:50:44<26:02,  1.49it/s]                                                        82%|████████▏ | 10446/12776 [1:50:44<26:02,  1.49it/s] 82%|████████▏ | 10447/12776 [1:50:45<24:28,  1.59it/s]                                                        82%|████████▏ | 10447/12776 [1:50:45<24:28,  1.59it/s] 82%|████████▏ | 10448/12776 [1:50:46<24:36,  1.58it/s]                                                        82%|████████▏ | 10448/12776 [1:50:46<24:36,  1.58it/s] 82%|████████▏ | 10449/12776 [1:50:46<23:03,  1.68it/s]                                                        82%|████████▏ | 10449/12776 [1:50:46<23:03,  1.68it/s] 82%|████████▏ | 10450/12776 [1:50:47<21:39,  1.79it/s]                                                        82%|██���█████▏ | 10450/12776 [1:50:47<21:39,  1.79it/s] 82%|████████▏ | 10451/12776 [1:50:47<20:29,  1.89it/s]                                                        82%|████████▏ | 10451/12776 [1:50:47<20:29,  1.89it/s] 82%|████████▏ | 10452/12776 [1:50:47<19:13,  2.02it/s]                                                        82%|████████▏ | 10452/12776 [1:50:47<19:13,  2.02it/s] 82%|████████▏ | 10453/12776 [1:50:48<19:02,  2.03it/s]                                                        82%|████████▏ | 10453/12776 [1:50:48<19:02,  2.03it/s] 82%|████████▏ | 10454/12776 [1:50:48<17:57,  2.16it/s]                                                        82%|████████▏ | 10454/12776 [1:50:48<17:57,  2.16it/s] 82%|████████▏ | 10455/12776 [1:50:49<16:55,  2.29it/s]                                                        82%|████████▏ | 10455/12776 [1:50:49<16:55,  2.29it/s] 82%|████████▏ | 10456/12776 [1:50:49<16:24,  2.36it/s]                                                        82%|████████▏ | 10456/12776 [1:50:49<16:24,  2.36it/s] 82%|████████▏ | 10457/12776 [1:50:49<15:24,  2.51it/s]                                                        82%|████████▏ | 10457/12776 [1:50:49<15:24,  2.51it/s] 82%|████████▏ | 10458/12776 [1:50:50<14:32,  2.66it/s]                                                        82%|████████▏ | 10458/12776 [1:50:50<14:32,  2.66it/s] 82%|████████▏ | 10459/12776 [1:50:50<13:53,  2.78it/s]                                                        82%|████████▏ | 10459/12776 [1:50:50<13:53,  2.78it/s] 82%|████████▏ | 10460/12776 [1:50:50<13:50,  2.79it/s]                                                        82%|████████▏ | 10460/12776 [1:50:50<13:50,  2.79it/s] 82%|████████▏ | 10461/12776 [1:50:51<13:12,  2.92it/s]                                                        82%|████████▏ | 10461/12776 [1:50:51<13:12,  2.92it/s] 82%|████████▏ | 10462/12776 [1:50:51<12:38,  3.05it/s]                                                        82%|████████▏ | 10462/12776 [1:50:51<12:38,  3.05it/s] 82%|████████▏ | 10463/12776 [1:50:51<12:38,  3.05it/s]                                                        82%|████████▏ | 10463/12776 [1:50:51<12:38,  3.05it/s] 82%|████████▏ | 10464/12776 [1:50:52<12:00,  3.21it/s]                                                        82%|████████▏ | 10464/12776 [1:50:52<12:00,  3.21it/s] 82%|████████▏ | 10465/12776 [1:50:52<11:28,  3.35it/s]                                                        82%|████████▏ | 10465/12776 [1:50:52<11:28,  3.35it/s] 82%|████████▏ | 10466/12776 [1:50:52<11:00,  3.49it/s]                                                        82%|████████▏ | 10466/12776 [1:50:52<11:00,  3.49it/s] 82%|████████▏ | 10467/12776 [1:50:52<11:24,  3.37it/s]                                                       {'loss': 0.2012, 'grad_norm': 0.5768091678619385, 'learning_rate': 5.865102639296187e-05, 'epoch': 1.63}
+{'loss': 0.1778, 'grad_norm': 1.113883137702942, 'learning_rate': 5.862658846529814e-05, 'epoch': 1.63}
+{'loss': 0.1855, 'grad_norm': 0.5060340762138367, 'learning_rate': 5.8602150537634406e-05, 'epoch': 1.63}
+{'loss': 0.1701, 'grad_norm': 0.514045000076294, 'learning_rate': 5.8577712609970664e-05, 'epoch': 1.63}
+{'loss': 0.2594, 'grad_norm': 0.860876202583313, 'learning_rate': 5.8553274682306936e-05, 'epoch': 1.63}
+{'loss': 0.1743, 'grad_norm': 1.2746233940124512, 'learning_rate': 5.852883675464321e-05, 'epoch': 1.63}
+{'loss': 0.203, 'grad_norm': 0.68976891040802, 'learning_rate': 5.8504398826979466e-05, 'epoch': 1.63}
+{'loss': 0.226, 'grad_norm': 0.48356834053993225, 'learning_rate': 5.847996089931573e-05, 'epoch': 1.63}
+{'loss': 0.1366, 'grad_norm': 1.8807826042175293, 'learning_rate': 5.8455522971652e-05, 'epoch': 1.63}
+{'loss': 0.3315, 'grad_norm': 0.7366307377815247, 'learning_rate': 5.843108504398826e-05, 'epoch': 1.63}
+{'loss': 0.2295, 'grad_norm': 0.8928070068359375, 'learning_rate': 5.840664711632453e-05, 'epoch': 1.63}
+{'loss': 0.2468, 'grad_norm': 0.8314988613128662, 'learning_rate': 5.83822091886608e-05, 'epoch': 1.63}
+{'loss': 0.2625, 'grad_norm': 1.0625923871994019, 'learning_rate': 5.835777126099706e-05, 'epoch': 1.63}
+{'loss': 0.4308, 'grad_norm': 3.7548062801361084, 'learning_rate': 5.8333333333333326e-05, 'epoch': 1.63}
+{'loss': 0.3888, 'grad_norm': 2.1772689819335938, 'learning_rate': 5.83088954056696e-05, 'epoch': 1.63}
+{'loss': 0.2004, 'grad_norm': 1.233253002166748, 'learning_rate': 5.8284457478005856e-05, 'epoch': 1.63}
+{'loss': 0.3962, 'grad_norm': 1.6462030410766602, 'learning_rate': 5.826001955034213e-05, 'epoch': 1.63}
+{'loss': 0.3691, 'grad_norm': 7.115745544433594, 'learning_rate': 5.82355816226784e-05, 'epoch': 1.63}
+{'loss': 0.2721, 'grad_norm': 1.0875389575958252, 'learning_rate': 5.821114369501466e-05, 'epoch': 1.63}
+{'loss': 0.4565, 'grad_norm': 1.3013725280761719, 'learning_rate': 5.818670576735092e-05, 'epoch': 1.63}
+{'loss': 0.5847, 'grad_norm': 2.092606782913208, 'learning_rate': 5.816226783968719e-05, 'epoch': 1.63}
+{'loss': 0.32, 'grad_norm': 1.729488492012024, 'learning_rate': 5.813782991202345e-05, 'epoch': 1.63}
+{'loss': 0.2449, 'grad_norm': 1.737029790878296, 'learning_rate': 5.811339198435972e-05, 'epoch': 1.63}
+{'loss': 0.7863, 'grad_norm': 2.1841983795166016, 'learning_rate': 5.8088954056695994e-05, 'epoch': 1.63}
+{'loss': 0.2027, 'grad_norm': 0.9990864992141724, 'learning_rate': 5.806451612903225e-05, 'epoch': 1.63}
+{'loss': 0.5477, 'grad_norm': 2.165163040161133, 'learning_rate': 5.804007820136852e-05, 'epoch': 1.63}
+{'loss': 0.5501, 'grad_norm': 5.074351787567139, 'learning_rate': 5.801564027370479e-05, 'epoch': 1.63}
+{'loss': 0.4008, 'grad_norm': 1.8900859355926514, 'learning_rate': 5.799120234604105e-05, 'epoch': 1.63}
+{'loss': 0.4468, 'grad_norm': 1.9944469928741455, 'learning_rate': 5.796676441837732e-05, 'epoch': 1.63}
+{'loss': 0.7135, 'grad_norm': 2.274146795272827, 'learning_rate': 5.794232649071358e-05, 'epoch': 1.63}
+{'loss': 0.4055, 'grad_norm': 3.8034324645996094, 'learning_rate': 5.791788856304985e-05, 'epoch': 1.63}
+{'loss': 0.8925, 'grad_norm': 2.4638800621032715, 'learning_rate': 5.789345063538611e-05, 'epoch': 1.63}
+{'loss': 0.8966, 'grad_norm': 5.541257858276367, 'learning_rate': 5.7869012707722384e-05, 'epoch': 1.63}
+{'loss': 0.6739, 'grad_norm': 6.215266704559326, 'learning_rate': 5.784457478005864e-05, 'epoch': 1.63}
+{'loss': 0.5802, 'grad_norm': 1.26706862449646, 'learning_rate': 5.7820136852394914e-05, 'epoch': 1.63}
+{'loss': 0.6051, 'grad_norm': 2.1300575733184814, 'learning_rate': 5.779569892473117e-05, 'epoch': 1.63}
+{'loss': 1.0206, 'grad_norm': 4.257119178771973, 'learning_rate': 5.7771260997067443e-05, 'epoch': 1.63}
+{'loss': 0.7068, 'grad_norm': 2.297468662261963, 'learning_rate': 5.774682306940371e-05, 'epoch': 1.63}
+{'loss': 0.7299, 'grad_norm': 3.372777223587036, 'learning_rate': 5.772238514173997e-05, 'epoch': 1.63}
+{'loss': 1.2506, 'grad_norm': 2.6699585914611816, 'learning_rate': 5.769794721407624e-05, 'epoch': 1.63}
+{'loss': 0.8118, 'grad_norm': 2.306000232696533, 'learning_rate': 5.767350928641251e-05, 'epoch': 1.63}
+{'loss': 0.8326, 'grad_norm': 3.001568078994751, 'learning_rate': 5.764907135874877e-05, 'epoch': 1.63}
+{'loss': 1.0572, 'grad_norm': 2.3206942081451416, 'learning_rate': 5.762463343108504e-05, 'epoch': 1.63}
+{'loss': 1.1929, 'grad_norm': 9.964920043945312, 'learning_rate': 5.7600195503421304e-05, 'epoch': 1.63}
+{'loss': 0.7854, 'grad_norm': 2.1716456413269043, 'learning_rate': 5.757575757575757e-05, 'epoch': 1.63}
+{'loss': 0.4832, 'grad_norm': 3.3193044662475586, 'learning_rate': 5.7551319648093833e-05, 'epoch': 1.63}
+{'loss': 0.343, 'grad_norm': 3.8049163818359375, 'learning_rate': 5.7526881720430105e-05, 'epoch': 1.63}
+{'loss': 0.7935, 'grad_norm': 3.3288490772247314, 'learning_rate': 5.750244379276636e-05, 'epoch': 1.63}
+{'loss': 1.2361, 'grad_norm': 1.9719550609588623, 'learning_rate': 5.7478005865102635e-05, 'epoch': 1.63}
+{'loss': 1.1997, 'grad_norm': 2.7374227046966553, 'learning_rate': 5.74535679374389e-05, 'epoch': 1.63}
+{'loss': 0.1796, 'grad_norm': 0.5598339438438416, 'learning_rate': 5.7429130009775164e-05, 'epoch': 1.63}
+{'loss': 0.2172, 'grad_norm': 0.9065269231796265, 'learning_rate': 5.740469208211143e-05, 'epoch': 1.63}
+{'loss': 0.1974, 'grad_norm': 1.6582931280136108, 'learning_rate': 5.73802541544477e-05, 'epoch': 1.63}
+{'loss': 0.2214, 'grad_norm': 0.5348918437957764, 'learning_rate': 5.735581622678396e-05, 'epoch': 1.63}
+{'loss': 0.1988, 'grad_norm': 2.297356367111206, 'learning_rate': 5.733137829912023e-05, 'epoch': 1.63}
+{'loss': 0.1346, 'grad_norm': 0.5752388834953308, 'learning_rate': 5.7306940371456495e-05, 'epoch': 1.63}
+{'loss': 0.2363, 'grad_norm': 0.8799678683280945, 'learning_rate': 5.728250244379276e-05, 'epoch': 1.64}
+{'loss': 0.2131, 'grad_norm': 0.6415383219718933, 'learning_rate': 5.7258064516129025e-05, 'epoch': 1.64}
+{'loss': 0.344, 'grad_norm': 17.69931411743164, 'learning_rate': 5.7233626588465296e-05, 'epoch': 1.64}
+{'loss': 0.2848, 'grad_norm': 1.341117262840271, 'learning_rate': 5.7209188660801554e-05, 'epoch': 1.64}
+{'loss': 0.238, 'grad_norm': 1.4106292724609375, 'learning_rate': 5.7184750733137826e-05, 'epoch': 1.64}
+{'loss': 0.3487, 'grad_norm': 1.0851185321807861, 'learning_rate': 5.716031280547409e-05, 'epoch': 1.64}
+{'loss': 0.4459, 'grad_norm': 1.2558451890945435, 'learning_rate': 5.7135874877810355e-05, 'epoch': 1.64}
+{'loss': 0.3275, 'grad_norm': 2.4315550327301025, 'learning_rate': 5.711143695014662e-05, 'epoch': 1.64}
+{'loss': 0.5321, 'grad_norm': 1.1903713941574097, 'learning_rate': 5.708699902248289e-05, 'epoch': 1.64}
+{'loss': 0.5209, 'grad_norm': 2.3874330520629883, 'learning_rate': 5.706256109481915e-05, 'epoch': 1.64}
+{'loss': 0.4226, 'grad_norm': 2.2533371448516846, 'learning_rate': 5.703812316715542e-05, 'epoch': 1.64}
+{'loss': 0.205, 'grad_norm': 1.0902347564697266, 'learning_rate': 5.7013685239491686e-05, 'epoch': 1.64}
+{'loss': 0.1742, 'grad_norm': 2.033475875854492, 'learning_rate': 5.698924731182795e-05, 'epoch': 1.64}
+{'loss': 0.2724, 'grad_norm': 1.488540530204773, 'learning_rate': 5.6964809384164216e-05, 'epoch': 1.64}
+{'loss': 0.6245, 'grad_norm': 3.625598192214966, 'learning_rate': 5.694037145650049e-05, 'epoch': 1.64}
+{'loss': 0.3794, 'grad_norm': 3.725924015045166, 'learning_rate': 5.6915933528836745e-05, 'epoch': 1.64}
+{'loss': 0.2737, 'grad_norm': 1.230906367301941, 'learning_rate': 5.689149560117302e-05, 'epoch': 1.64}
+{'loss': 0.5026, 'grad_norm': 3.53603196144104, 'learning_rate': 5.686705767350928e-05, 'epoch': 1.64}
+{'loss': 0.2446, 'grad_norm': 1.3804330825805664, 'learning_rate': 5.6842619745845547e-05, 'epoch': 1.64}
+{'loss': 0.4187, 'grad_norm': 2.4402577877044678, 'learning_rate': 5.681818181818181e-05, 'epoch': 1.64}
+{'loss': 0.5199, 'grad_norm': 1.448628306388855, 'learning_rate': 5.679374389051808e-05, 'epoch': 1.64}
+{'loss': 0.7834, 'grad_norm': 3.0513761043548584, 'learning_rate': 5.676930596285434e-05, 'epoch': 1.64}
+ 82%|████████▏ | 10467/12776 [1:50:52<11:24,  3.37it/s] 82%|████████▏ | 10468/12776 [1:50:53<10:50,  3.55it/s]                                                        82%|████████▏ | 10468/12776 [1:50:53<10:50,  3.55it/s] 82%|████████▏ | 10469/12776 [1:50:53<10:16,  3.74it/s]                                                        82%|████████▏ | 10469/12776 [1:50:53<10:16,  3.74it/s] 82%|████████▏ | 10470/12776 [1:50:53<09:49,  3.91it/s]                                                        82%|████████▏ | 10470/12776 [1:50:53<09:49,  3.91it/s] 82%|████████▏ | 10471/12776 [1:50:53<09:28,  4.06it/s]                                                        82%|████████▏ | 10471/12776 [1:50:53<09:28,  4.06it/s] 82%|████████▏ | 10472/12776 [1:50:54<10:20,  3.71it/s]                                                        82%|████████▏ | 10472/12776 [1:50:54<10:20,  3.71it/s] 82%|████████▏ | 10473/12776 [1:50:54<09:44,  3.94it/s]                                                        82%|████████▏ | 10473/12776 [1:50:54<09:44,  3.94it/s] 82%|████████▏ | 10474/12776 [1:50:54<09:14,  4.15it/s]                                                        82%|████████▏ | 10474/12776 [1:50:54<09:14,  4.15it/s] 82%|████████▏ | 10475/12776 [1:50:54<09:01,  4.25it/s]                                                        82%|████████▏ | 10475/12776 [1:50:54<09:01,  4.25it/s] 82%|████████▏ | 10476/12776 [1:50:55<08:49,  4.35it/s]                                                        82%|████████▏ | 10476/12776 [1:50:55<08:49,  4.35it/s] 82%|████████▏ | 10477/12776 [1:50:55<09:19,  4.11it/s]                                                        82%|████████▏ | 10477/12776 [1:50:55<09:19,  4.11it/s] 82%|████████▏ | 10478/12776 [1:50:55<08:54,  4.30it/s]                                                        82%|████████▏ | 10478/12776 [1:50:55<08:54,  4.30it/s] 82%|████████▏ | 10479/12776 [1:50:55<08:37,  4.44it/s]                                                        82%|████████▏ | 10479/12776 [1:50:55<08:37,  4.44it/s] 82%|████████▏ | 10480/12776 [1:50:55<08:23,  4.56it/s]                                                        82%|████████▏ | 10480/12776 [1:50:55<08:23,  4.56it/s] 82%|████████▏ | 10481/12776 [1:50:56<08:12,  4.66it/s]                                                        82%|████████▏ | 10481/12776 [1:50:56<08:12,  4.66it/s] 82%|████████▏ | 10482/12776 [1:50:56<09:21,  4.09it/s]                                                        82%|████████▏ | 10482/12776 [1:50:56<09:21,  4.09it/s] 82%|████████▏ | 10483/12776 [1:50:56<08:47,  4.35it/s]                                                        82%|████████▏ | 10483/12776 [1:50:56<08:47,  4.35it/s] 82%|████████▏ | 10484/12776 [1:50:56<08:22,  4.56it/s]                                                        82%|████████▏ | 10484/12776 [1:50:56<08:22,  4.56it/s] 82%|████████▏ | 10485/12776 [1:50:57<08:04,  4.72it/s]                                                        82%|████████▏ | 10485/12776 [1:50:57<08:04,  4.72it/s] 82%|████████▏ | 10486/12776 [1:50:57<07:37,  5.00it/s]                                                        82%|████████▏ | 10486/12776 [1:50:57<07:37,  5.00it/s] 82%|████████▏ | 10487/12776 [1:50:57<08:15,  4.62it/s]                                                        82%|████████▏ | 10487/12776 [1:50:57<08:15,  4.62it/s] 82%|████████▏ | 10488/12776 [1:50:58<13:36,  2.80it/s]                                                        82%|████████▏ | 10488/12776 [1:50:58<13:36,  2.80it/s] 82%|████████▏ | 10489/12776 [1:50:59<24:19,  1.57it/s]                                                        82%|████████▏ | 10489/12776 [1:50:59<24:19,  1.57it/s] 82%|████████▏ | 10490/12776 [1:51:00<27:30,  1.38it/s]                                                        82%|████████▏ | 10490/12776 [1:51:00<27:30,  1.38it/s] 82%|████████▏ | 10491/12776 [1:51:01<29:03,  1.31it/s]                                                        82%|████████▏ | 10491/12776 [1:51:01<29:03,  1.31it/s] 82%|████████▏ | 10492/12776 [1:51:02<29:54,  1.27it/s]                                                        82%|████████▏ | 10492/12776 [1:51:02<29:54,  1.27it/s] 82%|████████▏ | 10493/12776 [1:51:02<29:10,  1.30it/s]                                                        82%|████████▏ | 10493/12776 [1:51:02<29:10,  1.30it/s] 82%|████████▏ | 10494/12776 [1:51:03<28:41,  1.33it/s]                                                        82%|████████▏ | 10494/12776 [1:51:03<28:41,  1.33it/s] 82%|████████▏ | 10495/12776 [1:51:04<27:15,  1.39it/s]                                                        82%|████████▏ | 10495/12776 [1:51:04<27:15,  1.39it/s] 82%|████████▏ | 10496/12776 [1:51:04<26:06,  1.46it/s]                                                        82%|████████▏ | 10496/12776 [1:51:04<26:06,  1.46it/s] 82%|████████▏ | 10497/12776 [1:51:05<24:55,  1.52it/s]                                                        82%|████████▏ | 10497/12776 [1:51:05<24:55,  1.52it/s] 82%|████████▏ | 10498/12776 [1:51:05<23:50,  1.59it/s]                                                        82%|████████▏ | 10498/12776 [1:51:05<23:50,  1.59it/s] 82%|████████▏ | 10499/12776 [1:51:06<22:49,  1.66it/s]                                                        82%|████████▏ | 10499/12776 [1:51:06<22:49,  1.66it/s] 82%|████████▏ | 10500/12776 [1:51:06<21:46,  1.74it/s]                                                        82%|████████▏ | 10500/12776 [1:51:06<21:46,  1.74it/s] 82%|████████▏ | 10501/12776 [1:51:07<20:46,  1.82it/s]                                                        82%|████████▏ | 10501/12776 [1:51:07<20:46,  1.82it/s] 82%|████████▏ | 10502/12776 [1:51:08<20:44,  1.83it/s]                                                        82%|████████▏ | 10502/12776 [1:51:08<20:44,  1.83it/s] 82%|████████▏ | 10503/12776 [1:51:08<19:31,  1.94it/s]                                                        82%|████████▏ | 10503/12776 [1:51:08<19:31,  1.94it/s] 82%|████████▏ | 10504/12776 [1:51:08<19:25,  1.95it/s]                                                        82%|████████▏ | 10504/12776 [1:51:08<19:25,  1.95it/s] 82%|████████▏ | 10505/12776 [1:51:09<18:07,  2.09it/s]                                                        82%|████████▏ | 10505/12776 [1:51:09<18:07,  2.09it/s] 82%|████████▏ | 10506/12776 [1:51:09<17:04,  2.22it/s]                                                        82%|████████▏ | 10506/12776 [1:51:09<17:04,  2.22it/s] 82%|████████▏ | 10507/12776 [1:51:10<16:18,  2.32it/s]                                                        82%|████████▏ | 10507/12776 [1:51:10<16:18,  2.32it/s] 82%|████████▏ | 10508/12776 [1:51:10<15:21,  2.46it/s]                                                        82%|████████▏ | 10508/12776 [1:51:10<15:21,  2.46it/s] 82%|████████▏ | 10509/12776 [1:51:10<14:36,  2.59it/s]                                                        82%|████████▏ | 10509/12776 [1:51:10<14:36,  2.59it/s] 82%|████████▏ | 10510/12776 [1:51:11<15:13,  2.48it/s]                                                        82%|████████▏ | 10510/12776 [1:51:11<15:13,  2.48it/s] 82%|████████▏ | 10511/12776 [1:51:11<14:18,  2.64it/s]                                                        82%|████████▏ | 10511/12776 [1:51:11<14:18,  2.64it/s] 82%|████████▏ | 10512/12776 [1:51:11<13:29,  2.80it/s]                                                        82%|████████▏ | 10512/12776 [1:51:11<13:29,  2.80it/s] 82%|████████▏ | 10513/12776 [1:51:12<12:51,  2.93it/s]                                                        82%|████████▏ | 10513/12776 [1:51:12<12:51,  2.93it/s] 82%|████████▏ | 10514/12776 [1:51:12<13:01,  2.89it/s]                                                        82%|████████▏ | 10514/12776 [1:51:12<13:01,  2.89it/s] 82%|████████▏ | 10515/12776 [1:51:12<12:22,  3.04it/s]                                                        82%|████████▏ | 10515/12776 [1:51:12<12:22,  3.04it/s] 82%|████████▏ | 10516/12776 [1:51:13<11:47,  3.19it/s]                                                        82%|████████▏ | 10516/12776 [1:51:13<11:47,  3.19it/s] 82%|████████▏ | 10517/12776 [1:51:13<11:18,  3.33it/s]                                                        82%|████████▏ | 10517/12776 [1:51:13<11:18,  3.33it/s] 82%|████████▏ | 10518/12776 [1:51:13<11:17,  3.33it/s]                                                        82%|████████▏ | 10518/12776 [1:51:13<11:17,  3.33it/s] 82%|████████▏ | 10519/12776 [1:51:13<10:47,  3.49it/s]                                                        82%|████████▏ | 10519/12776 [1:51:13<10:47,  3.49it/s] 82%|████████▏ | 10520/12776 [1:51:14<10:24,  3.61it/s]                                                        82%|████████▏ | 10520/12776 [1:51:14<10:24,  3.61it/s] 82%|████████▏ | 10521/12776 [1:51:14<10:04,  3.73it/s]                                                        82%|████████▏ | 10521/12776 [1:51:14<10:04,  3.73it/s] 82%|████████▏ | 10522/12776 [1:51:14<11:04,  3.39it/s]                                                        82%|████████▏ | 10522/12776 [1:51:14<11:04,  3.39it/s] 82%|████████▏ | 10523/12776 [1:51:15<10:25,  3.60it/s]                                                        82%|████████▏ | 10523/12776 [1:51:15<10:25,  3.60it/s] 82%|████████▏ | 10524/12776 [1:51:15<09:54,  3.79it/s]                                                        82%|████████▏ | 10524/12776 [1:51:15<09:54,  3.79it/s] 82%|████████▏ | 10525/12776 [1:51:15<09:28,  3.96it/s]                                                        82%|████████▏ | 10525/12776 [1:51:15<09:28,  3.96it/s] 82%|████████▏ | 10526/12776 [1:51:15<10:04,  3.72it/s]                                                        82%|████████▏ | 10526/12776 [1:51:15<10:04,  3.72it/s] 82%|████████▏ | 10527/12776 [1:51:16<09:26,  3.97it/s]                                                        82%|████████▏ | 10527/12776 [1:51:16<09:26,  3.97it/s] 82%|████████▏ | 10528/12776 [1:51:16<08:57,  4.18it/s]                                                        82%|████████▏ | 10528/12776 [1:51:16<08:57,  4.18it/s] 82%|████████▏ | 10529/12776 [1:51:16<08:39,  4.32it/s]                                                        82%|████████▏ | 10529/12776 [1:51:16<08:39,  4.32it/s] 82%|████████▏ | 10530/12776 [1:51:16<08:23,  4.46it/s]                                                        82%|████████▏ | 10530/12776 [1:51:16<08:23,  4.46it/s] 82%|████████▏ | 10531/12776 [1:51:16<09:10,  4.08it/s]                                                        82%|████████▏ | 10531/12776 [1:51:16<09:10,  4.08it/s] 82%|████████▏ | 10532/12776 [1:51:17<08:40,  4.31it/s]                                                        82%|████████▏ | 10532/12776 [1:51:17<08:40,  4.31it/s] 82%|████████▏ | 10533/12776 [1:51:17<08:18,  4.50it/s]                                                        82%|████████▏ | 10533/12776 [1:51:17<08:18,  4.50it/s] 82%|████████▏ | 10534/12776 [1:51:17<08:10,  4.57it/s]                                                        82%|████████▏ | 10534/12776 [1:51:17<08:10,  4.57it/s] 82%|████████▏ | 10535/12776 [1:51:17<07:52,  4.74it/s]                                                        82%|████████▏ | 10535/12776 [1:51:17<07:52,  4.74it/s] 82%|████████▏ | 10536/12776 [1:51:17<07:36,  4.90it/s]                                                        82%|████████▏ | 10536/12776 [1:51:17<07:36,  4.90it/s] 82%|████████▏ | 10537/12776 [1:51:18<08:03,  4.63it/s]                                                        82%|████████▏ | 10537/12776 [1:51:18<08:03,  4.63it/s] 82%|████████▏ | 10538/12776 [1:51:18<13:38,  2.73it/s]                                                        82%|████████▏ | 10538/12776 [1:51:18<13:38,  2.73it/s] 82%|████████▏ | 10539/12776 [1:51:20<25:32,  1.46it/s]                                                        82%|████████▏ | 10539/12776 [1:51:20<25:32,  1.46it/s] 82%|████████▏ | 10540/12776 [1:51:21<28:58,  1.29it/s]                                                        82%|████████▏ | 10540/12776 [1:51:21<28:58,  1.29it/s] 83%|████████▎ | 10541/12776 [1:51:22<29:54,  1.25it/s]                                                        83%|████████▎ | 10541/12776 [1:51:22<29:54,  1.25it/s] 83%|████████▎ | 10542/12776 [1:51:23<30:22,  1.23it/s]                                                        83%|████████▎ | 10542/12776 [1:51:23<30:22,  1.23it/s] 83%|████████▎ | 10543/12776 [1:51:23<30:14,  1.23it/s]                                                        83%|████████▎ | 10543/12776 [1:51:23<30:14,  1.23it/s] 83%|████████▎ | 10544/12776 [1:51:24<28:55,  1.29it/s]                                                        83%|████████▎ | 10544/12776 [1:51:24<28:55,  1.29it/s] 83%|████████▎ | 10545/12776 [1:51:25<28:14,  1.32it/s]                                                       {'loss': 0.8243, 'grad_norm': 2.2523458003997803, 'learning_rate': 5.674486803519061e-05, 'epoch': 1.64}
+{'loss': 0.3212, 'grad_norm': 2.0097694396972656, 'learning_rate': 5.672043010752688e-05, 'epoch': 1.64}
+{'loss': 0.6163, 'grad_norm': 4.3316874504089355, 'learning_rate': 5.669599217986314e-05, 'epoch': 1.64}
+{'loss': 0.5379, 'grad_norm': 1.4377896785736084, 'learning_rate': 5.667155425219941e-05, 'epoch': 1.64}
+{'loss': 0.8601, 'grad_norm': 3.63924241065979, 'learning_rate': 5.664711632453568e-05, 'epoch': 1.64}
+{'loss': 0.6937, 'grad_norm': 2.5112884044647217, 'learning_rate': 5.6622678396871937e-05, 'epoch': 1.64}
+{'loss': 1.0227, 'grad_norm': 3.881232500076294, 'learning_rate': 5.659824046920821e-05, 'epoch': 1.64}
+{'loss': 0.9926, 'grad_norm': 1.790979027748108, 'learning_rate': 5.657380254154447e-05, 'epoch': 1.64}
+{'loss': 0.9959, 'grad_norm': 4.465099811553955, 'learning_rate': 5.654936461388074e-05, 'epoch': 1.64}
+{'loss': 0.4801, 'grad_norm': 1.5608214139938354, 'learning_rate': 5.6524926686217e-05, 'epoch': 1.64}
+{'loss': 0.6256, 'grad_norm': 3.728848457336426, 'learning_rate': 5.6500488758553274e-05, 'epoch': 1.64}
+{'loss': 1.2323, 'grad_norm': 1.7102699279785156, 'learning_rate': 5.647605083088953e-05, 'epoch': 1.64}
+{'loss': 1.4178, 'grad_norm': 3.236166477203369, 'learning_rate': 5.6451612903225804e-05, 'epoch': 1.64}
+{'loss': 1.1856, 'grad_norm': 6.140420436859131, 'learning_rate': 5.642717497556207e-05, 'epoch': 1.64}
+{'loss': 0.6144, 'grad_norm': 2.2286741733551025, 'learning_rate': 5.640273704789833e-05, 'epoch': 1.64}
+{'loss': 1.2648, 'grad_norm': 2.5856056213378906, 'learning_rate': 5.63782991202346e-05, 'epoch': 1.64}
+{'loss': 0.9104, 'grad_norm': 3.3307247161865234, 'learning_rate': 5.635386119257087e-05, 'epoch': 1.64}
+{'loss': 0.9191, 'grad_norm': 2.219688892364502, 'learning_rate': 5.632942326490713e-05, 'epoch': 1.64}
+{'loss': 0.5855, 'grad_norm': 3.4340312480926514, 'learning_rate': 5.63049853372434e-05, 'epoch': 1.64}
+{'loss': 0.3031, 'grad_norm': 1.018791913986206, 'learning_rate': 5.6280547409579664e-05, 'epoch': 1.64}
+{'loss': 1.0541, 'grad_norm': 3.0899922847747803, 'learning_rate': 5.625610948191593e-05, 'epoch': 1.64}
+{'loss': 1.1517, 'grad_norm': 2.5007576942443848, 'learning_rate': 5.6231671554252194e-05, 'epoch': 1.64}
+{'loss': 0.1584, 'grad_norm': 0.45454585552215576, 'learning_rate': 5.6207233626588465e-05, 'epoch': 1.64}
+{'loss': 0.2598, 'grad_norm': 0.7792069911956787, 'learning_rate': 5.618279569892472e-05, 'epoch': 1.64}
+{'loss': 0.2881, 'grad_norm': 1.4265599250793457, 'learning_rate': 5.6158357771260995e-05, 'epoch': 1.64}
+{'loss': 0.2933, 'grad_norm': 1.610418438911438, 'learning_rate': 5.613391984359726e-05, 'epoch': 1.64}
+{'loss': 0.2019, 'grad_norm': 0.5651922225952148, 'learning_rate': 5.6109481915933524e-05, 'epoch': 1.64}
+{'loss': 0.1858, 'grad_norm': 0.757182240486145, 'learning_rate': 5.608504398826979e-05, 'epoch': 1.64}
+{'loss': 0.1283, 'grad_norm': 0.4835294783115387, 'learning_rate': 5.606060606060606e-05, 'epoch': 1.64}
+{'loss': 0.224, 'grad_norm': 0.6997655034065247, 'learning_rate': 5.603616813294232e-05, 'epoch': 1.64}
+{'loss': 0.4684, 'grad_norm': 1.8372862339019775, 'learning_rate': 5.601173020527859e-05, 'epoch': 1.64}
+{'loss': 0.2866, 'grad_norm': 1.1879760026931763, 'learning_rate': 5.5987292277614855e-05, 'epoch': 1.64}
+{'loss': 0.2133, 'grad_norm': 0.4589191973209381, 'learning_rate': 5.596285434995112e-05, 'epoch': 1.64}
+{'loss': 0.4608, 'grad_norm': 0.8576350212097168, 'learning_rate': 5.5938416422287385e-05, 'epoch': 1.64}
+{'loss': 0.2109, 'grad_norm': 0.8671181201934814, 'learning_rate': 5.5913978494623656e-05, 'epoch': 1.64}
+{'loss': 0.3122, 'grad_norm': 1.038217306137085, 'learning_rate': 5.5889540566959914e-05, 'epoch': 1.64}
+{'loss': 0.5782, 'grad_norm': 2.1841461658477783, 'learning_rate': 5.5865102639296186e-05, 'epoch': 1.64}
+{'loss': 0.4222, 'grad_norm': 1.437780737876892, 'learning_rate': 5.584066471163245e-05, 'epoch': 1.64}
+{'loss': 0.3952, 'grad_norm': 1.98343825340271, 'learning_rate': 5.581622678396871e-05, 'epoch': 1.64}
+{'loss': 0.42, 'grad_norm': 1.1477174758911133, 'learning_rate': 5.579178885630498e-05, 'epoch': 1.64}
+{'loss': 0.4185, 'grad_norm': 1.4632834196090698, 'learning_rate': 5.576735092864125e-05, 'epoch': 1.64}
+{'loss': 0.3385, 'grad_norm': 1.2096794843673706, 'learning_rate': 5.574291300097751e-05, 'epoch': 1.64}
+{'loss': 0.3774, 'grad_norm': 1.2012349367141724, 'learning_rate': 5.571847507331378e-05, 'epoch': 1.65}
+{'loss': 0.7032, 'grad_norm': 1.8027079105377197, 'learning_rate': 5.5694037145650046e-05, 'epoch': 1.65}
+{'loss': 0.7242, 'grad_norm': 2.5102810859680176, 'learning_rate': 5.5669599217986304e-05, 'epoch': 1.65}
+{'loss': 0.5148, 'grad_norm': 2.101835250854492, 'learning_rate': 5.5645161290322576e-05, 'epoch': 1.65}
+{'loss': 0.3692, 'grad_norm': 1.309965968132019, 'learning_rate': 5.562072336265885e-05, 'epoch': 1.65}
+{'loss': 0.6005, 'grad_norm': 2.4885432720184326, 'learning_rate': 5.5596285434995106e-05, 'epoch': 1.65}
+{'loss': 0.6397, 'grad_norm': 2.1940715312957764, 'learning_rate': 5.557184750733138e-05, 'epoch': 1.65}
+{'loss': 0.8497, 'grad_norm': 4.189990520477295, 'learning_rate': 5.554740957966764e-05, 'epoch': 1.65}
+{'loss': 0.3263, 'grad_norm': 1.0179821252822876, 'learning_rate': 5.55229716520039e-05, 'epoch': 1.65}
+{'loss': 0.2546, 'grad_norm': 2.343964099884033, 'learning_rate': 5.549853372434017e-05, 'epoch': 1.65}
+{'loss': 0.4887, 'grad_norm': 2.5637285709381104, 'learning_rate': 5.547409579667644e-05, 'epoch': 1.65}
+{'loss': 0.623, 'grad_norm': 1.819643497467041, 'learning_rate': 5.54496578690127e-05, 'epoch': 1.65}
+{'loss': 0.3924, 'grad_norm': 2.0592472553253174, 'learning_rate': 5.542521994134897e-05, 'epoch': 1.65}
+{'loss': 0.4991, 'grad_norm': 1.9924129247665405, 'learning_rate': 5.540078201368524e-05, 'epoch': 1.65}
+{'loss': 0.857, 'grad_norm': 1.941512942314148, 'learning_rate': 5.5376344086021496e-05, 'epoch': 1.65}
+{'loss': 1.019, 'grad_norm': 1.9502965211868286, 'learning_rate': 5.535190615835777e-05, 'epoch': 1.65}
+{'loss': 0.5463, 'grad_norm': 2.0973851680755615, 'learning_rate': 5.532746823069404e-05, 'epoch': 1.65}
+{'loss': 0.5572, 'grad_norm': 2.2591187953948975, 'learning_rate': 5.53030303030303e-05, 'epoch': 1.65}
+{'loss': 1.1269, 'grad_norm': 2.8180580139160156, 'learning_rate': 5.527859237536656e-05, 'epoch': 1.65}
+{'loss': 1.1379, 'grad_norm': 3.1859171390533447, 'learning_rate': 5.525415444770283e-05, 'epoch': 1.65}
+{'loss': 0.3368, 'grad_norm': 2.108909845352173, 'learning_rate': 5.522971652003909e-05, 'epoch': 1.65}
+{'loss': 1.0132, 'grad_norm': 9.457693099975586, 'learning_rate': 5.520527859237536e-05, 'epoch': 1.65}
+{'loss': 1.6879, 'grad_norm': 4.089739799499512, 'learning_rate': 5.5180840664711634e-05, 'epoch': 1.65}
+{'loss': 1.105, 'grad_norm': 1.7401306629180908, 'learning_rate': 5.515640273704789e-05, 'epoch': 1.65}
+{'loss': 1.4396, 'grad_norm': 2.562828540802002, 'learning_rate': 5.513196480938416e-05, 'epoch': 1.65}
+{'loss': 0.8768, 'grad_norm': 3.0045950412750244, 'learning_rate': 5.510752688172043e-05, 'epoch': 1.65}
+{'loss': 0.6006, 'grad_norm': 5.166981220245361, 'learning_rate': 5.508308895405669e-05, 'epoch': 1.65}
+{'loss': 0.7751, 'grad_norm': 4.5421929359436035, 'learning_rate': 5.505865102639296e-05, 'epoch': 1.65}
+{'loss': 0.5581, 'grad_norm': 1.448534369468689, 'learning_rate': 5.503421309872923e-05, 'epoch': 1.65}
+{'loss': 0.8802, 'grad_norm': 1.7535744905471802, 'learning_rate': 5.500977517106549e-05, 'epoch': 1.65}
+{'loss': 0.1992, 'grad_norm': 0.6873712539672852, 'learning_rate': 5.498533724340175e-05, 'epoch': 1.65}
+{'loss': 0.195, 'grad_norm': 0.7089383006095886, 'learning_rate': 5.4960899315738024e-05, 'epoch': 1.65}
+{'loss': 0.1743, 'grad_norm': 0.3983263671398163, 'learning_rate': 5.493646138807428e-05, 'epoch': 1.65}
+{'loss': 0.1656, 'grad_norm': 0.44565579295158386, 'learning_rate': 5.4912023460410554e-05, 'epoch': 1.65}
+{'loss': 0.245, 'grad_norm': 0.6502916216850281, 'learning_rate': 5.4887585532746825e-05, 'epoch': 1.65}
+{'loss': 0.275, 'grad_norm': 0.8741419315338135, 'learning_rate': 5.4863147605083083e-05, 'epoch': 1.65}
+ 83%|████████▎ | 10545/12776 [1:51:25<28:14,  1.32it/s] 83%|████████▎ | 10546/12776 [1:51:25<26:47,  1.39it/s]                                                        83%|████████▎ | 10546/12776 [1:51:25<26:47,  1.39it/s] 83%|████████▎ | 10547/12776 [1:51:26<25:52,  1.44it/s]                                                        83%|████████▎ | 10547/12776 [1:51:26<25:52,  1.44it/s] 83%|████████▎ | 10548/12776 [1:51:27<24:23,  1.52it/s]                                                        83%|████████▎ | 10548/12776 [1:51:27<24:23,  1.52it/s] 83%|████████▎ | 10549/12776 [1:51:27<23:36,  1.57it/s]                                                        83%|████████▎ | 10549/12776 [1:51:27<23:36,  1.57it/s] 83%|████████▎ | 10550/12776 [1:51:28<22:18,  1.66it/s]                                                        83%|████████▎ | 10550/12776 [1:51:28<22:18,  1.66it/s] 83%|████████▎ | 10551/12776 [1:51:28<21:53,  1.69it/s]                                                        83%|████████▎ | 10551/12776 [1:51:28<21:53,  1.69it/s] 83%|████████▎ | 10552/12776 [1:51:29<20:28,  1.81it/s]                                                        83%|████████▎ | 10552/12776 [1:51:29<20:28,  1.81it/s] 83%|████████▎ | 10553/12776 [1:51:29<20:36,  1.80it/s]                                                        83%|████████▎ | 10553/12776 [1:51:29<20:36,  1.80it/s] 83%|████████▎ | 10554/12776 [1:51:30<19:22,  1.91it/s]                                                        83%|████████▎ | 10554/12776 [1:51:30<19:22,  1.91it/s] 83%|████████▎ | 10555/12776 [1:51:30<19:02,  1.94it/s]                                                        83%|████████▎ | 10555/12776 [1:51:30<19:02,  1.94it/s] 83%|████████▎ | 10556/12776 [1:51:31<17:57,  2.06it/s]                                                        83%|████████▎ | 10556/12776 [1:51:31<17:57,  2.06it/s] 83%|████████▎ | 10557/12776 [1:51:31<17:01,  2.17it/s]                                                        83%|████████▎ | 10557/12776 [1:51:31<17:01,  2.17it/s] 83%|████████▎ | 10558/12776 [1:51:32<17:15,  2.14it/s]                                                        83%|████████▎ | 10558/12776 [1:51:32<17:15,  2.14it/s] 83%|████████▎ | 10559/12776 [1:51:32<16:16,  2.27it/s]                                                        83%|████████▎ | 10559/12776 [1:51:32<16:16,  2.27it/s] 83%|████████▎ | 10560/12776 [1:51:32<15:18,  2.41it/s]                                                        83%|████████▎ | 10560/12776 [1:51:32<15:18,  2.41it/s] 83%|████████▎ | 10561/12776 [1:51:33<15:37,  2.36it/s]                                                        83%|████████▎ | 10561/12776 [1:51:33<15:37,  2.36it/s] 83%|████████▎ | 10562/12776 [1:51:33<14:37,  2.52it/s]                                                        83%|████████▎ | 10562/12776 [1:51:33<14:37,  2.52it/s] 83%|████████▎ | 10563/12776 [1:51:33<13:51,  2.66it/s]                                                        83%|████████▎ | 10563/12776 [1:51:33<13:51,  2.66it/s] 83%|████████▎ | 10564/12776 [1:51:34<13:54,  2.65it/s]                                                        83%|████████▎ | 10564/12776 [1:51:34<13:54,  2.65it/s] 83%|████████▎ | 10565/12776 [1:51:34<12:59,  2.84it/s]                                                        83%|████████▎ | 10565/12776 [1:51:34<12:59,  2.84it/s] 83%|████████▎ | 10566/12776 [1:51:34<12:15,  3.01it/s]                                                        83%|████████▎ | 10566/12776 [1:51:34<12:15,  3.01it/s] 83%|████████▎ | 10567/12776 [1:51:35<13:00,  2.83it/s]                                                        83%|████████▎ | 10567/12776 [1:51:35<13:00,  2.83it/s] 83%|████████▎ | 10568/12776 [1:51:35<12:02,  3.06it/s]                                                        83%|████████▎ | 10568/12776 [1:51:35<12:02,  3.06it/s] 83%|████████▎ | 10569/12776 [1:51:35<11:19,  3.25it/s]                                                        83%|████████▎ | 10569/12776 [1:51:35<11:19,  3.25it/s] 83%|█��██████▎ | 10570/12776 [1:51:35<10:40,  3.44it/s]                                                        83%|████████▎ | 10570/12776 [1:51:35<10:40,  3.44it/s] 83%|████████▎ | 10571/12776 [1:51:36<11:28,  3.20it/s]                                                        83%|████████▎ | 10571/12776 [1:51:36<11:28,  3.20it/s] 83%|████████▎ | 10572/12776 [1:51:36<10:39,  3.44it/s]                                                        83%|████████▎ | 10572/12776 [1:51:36<10:39,  3.44it/s] 83%|████████▎ | 10573/12776 [1:51:36<10:01,  3.66it/s]                                                        83%|████████▎ | 10573/12776 [1:51:36<10:01,  3.66it/s] 83%|████████▎ | 10574/12776 [1:51:37<09:33,  3.84it/s]                                                        83%|████████▎ | 10574/12776 [1:51:37<09:33,  3.84it/s] 83%|████████▎ | 10575/12776 [1:51:37<09:08,  4.01it/s]                                                        83%|████████▎ | 10575/12776 [1:51:37<09:08,  4.01it/s] 83%|████████▎ | 10576/12776 [1:51:37<10:02,  3.65it/s]                                                        83%|████████▎ | 10576/12776 [1:51:37<10:02,  3.65it/s] 83%|████████▎ | 10577/12776 [1:51:37<09:22,  3.91it/s]                                                        83%|████████▎ | 10577/12776 [1:51:37<09:22,  3.91it/s] 83%|████████▎ | 10578/12776 [1:51:38<08:53,  4.12it/s]                                                        83%|████████▎ | 10578/12776 [1:51:38<08:53,  4.12it/s] 83%|████████▎ | 10579/12776 [1:51:38<08:30,  4.30it/s]                                                        83%|████████▎ | 10579/12776 [1:51:38<08:30,  4.30it/s] 83%|████████▎ | 10580/12776 [1:51:38<08:12,  4.46it/s]                                                        83%|████████▎ | 10580/12776 [1:51:38<08:12,  4.46it/s] 83%|████████▎ | 10581/12776 [1:51:38<08:38,  4.23it/s]                                                        83%|████████▎ | 10581/12776 [1:51:38<08:38,  4.23it/s] 83%|████████▎ | 10582/12776 [1:51:38<08:15,  4.43it/s]                                                        83%|████████▎ | 10582/12776 [1:51:38<08:15,  4.43it/s] 83%|████████▎ | 10583/12776 [1:51:39<07:57,  4.60it/s]                                                        83%|████████▎ | 10583/12776 [1:51:39<07:57,  4.60it/s] 83%|████████▎ | 10584/12776 [1:51:39<07:14,  5.04it/s]                                                        83%|████████▎ | 10584/12776 [1:51:39<07:14,  5.04it/s] 83%|████████▎ | 10585/12776 [1:51:39<06:57,  5.24it/s]                                                        83%|████████▎ | 10585/12776 [1:51:39<06:57,  5.24it/s] 83%|████████▎ | 10586/12776 [1:51:39<07:43,  4.72it/s]                                                        83%|████████▎ | 10586/12776 [1:51:39<07:43,  4.72it/s] 83%|████████▎ | 10587/12776 [1:51:39<07:25,  4.91it/s]                                                        83%|████████▎ | 10587/12776 [1:51:39<07:25,  4.91it/s] 83%|████████▎ | 10588/12776 [1:51:40<12:40,  2.88it/s]                                                        83%|████████▎ | 10588/12776 [1:51:40<12:40,  2.88it/s] 83%|████████▎ | 10589/12776 [1:51:42<26:01,  1.40it/s]                                                        83%|████████▎ | 10589/12776 [1:51:42<26:01,  1.40it/s] 83%|████████▎ | 10590/12776 [1:51:43<28:41,  1.27it/s]                                                        83%|████████▎ | 10590/12776 [1:51:43<28:41,  1.27it/s] 83%|████████▎ | 10591/12776 [1:51:43<28:50,  1.26it/s]                                                        83%|████████▎ | 10591/12776 [1:51:43<28:50,  1.26it/s] 83%|████████▎ | 10592/12776 [1:51:44<28:33,  1.27it/s]                                                        83%|████████▎ | 10592/12776 [1:51:44<28:33,  1.27it/s] 83%|████████▎ | 10593/12776 [1:51:45<27:54,  1.30it/s]                                                        83%|████████▎ | 10593/12776 [1:51:45<27:54,  1.30it/s] 83%|████████▎ | 10594/12776 [1:51:46<26:38,  1.37it/s]                                                        83%|████████▎ | 10594/12776 [1:51:46<26:38,  1.37it/s] 83%|████████▎ | 10595/12776 [1:51:46<26:46,  1.36it/s]                                                        83%|████████▎ | 10595/12776 [1:51:46<26:46,  1.36it/s] 83%|████████▎ | 10596/12776 [1:51:47<25:25,  1.43it/s]                                                        83%|████████▎ | 10596/12776 [1:51:47<25:25,  1.43it/s] 83%|████████▎ | 10597/12776 [1:51:48<24:24,  1.49it/s]                                                        83%|████████▎ | 10597/12776 [1:51:48<24:24,  1.49it/s] 83%|████████▎ | 10598/12776 [1:51:48<23:02,  1.58it/s]                                                        83%|████████▎ | 10598/12776 [1:51:48<23:02,  1.58it/s] 83%|████████▎ | 10599/12776 [1:51:49<22:17,  1.63it/s]                                                        83%|████████▎ | 10599/12776 [1:51:49<22:17,  1.63it/s] 83%|████████▎ | 10600/12776 [1:51:49<20:56,  1.73it/s]                                                        83%|████████▎ | 10600/12776 [1:51:49<20:56,  1.73it/s] 83%|████████▎ | 10601/12776 [1:51:50<20:08,  1.80it/s]                                                        83%|████████▎ | 10601/12776 [1:51:50<20:08,  1.80it/s] 83%|████████▎ | 10602/12776 [1:51:50<18:57,  1.91it/s]                                                        83%|████████▎ | 10602/12776 [1:51:50<18:57,  1.91it/s] 83%|████████▎ | 10603/12776 [1:51:51<18:34,  1.95it/s]                                                        83%|████████▎ | 10603/12776 [1:51:51<18:34,  1.95it/s] 83%|████████▎ | 10604/12776 [1:51:51<17:35,  2.06it/s]                                                        83%|████████▎ | 10604/12776 [1:51:51<17:35,  2.06it/s] 83%|████████▎ | 10605/12776 [1:51:51<16:37,  2.18it/s]                                                        83%|████████▎ | 10605/12776 [1:51:51<16:37,  2.18it/s] 83%|████████▎ | 10606/12776 [1:51:52<17:22,  2.08it/s]                                                        83%|████████▎ | 10606/12776 [1:51:52<17:22,  2.08it/s] 83%|████████▎ | 10607/12776 [1:51:52<16:13,  2.23it/s]                                                        83%|████████▎ | 10607/12776 [1:51:52<16:13,  2.23it/s] 83%|████████▎ | 10608/12776 [1:51:53<15:14,  2.37it/s]                                                        83%|████████▎ | 10608/12776 [1:51:53<15:14,  2.37it/s] 83%|████████▎ | 10609/12776 [1:51:53<15:35,  2.32it/s]                                                        83%|████████▎ | 10609/12776 [1:51:53<15:35,  2.32it/s] 83%|████████▎ | 10610/12776 [1:51:53<14:36,  2.47it/s]                                                        83%|████████▎ | 10610/12776 [1:51:53<14:36,  2.47it/s] 83%|████████▎ | 10611/12776 [1:51:54<13:46,  2.62it/s]                                                        83%|████████▎ | 10611/12776 [1:51:54<13:46,  2.62it/s] 83%|████████▎ | 10612/12776 [1:51:54<13:52,  2.60it/s]                                                        83%|████████▎ | 10612/12776 [1:51:54<13:52,  2.60it/s] 83%|████████▎ | 10613/12776 [1:51:54<13:01,  2.77it/s]                                                        83%|████████▎ | 10613/12776 [1:51:54<13:01,  2.77it/s] 83%|████████▎ | 10614/12776 [1:51:55<12:20,  2.92it/s]                                                        83%|████████▎ | 10614/12776 [1:51:55<12:20,  2.92it/s] 83%|████████▎ | 10615/12776 [1:51:55<12:41,  2.84it/s]                                                        83%|████████▎ | 10615/12776 [1:51:55<12:41,  2.84it/s] 83%|████████▎ | 10616/12776 [1:51:55<11:55,  3.02it/s]                                                        83%|████████▎ | 10616/12776 [1:51:55<11:55,  3.02it/s] 83%|████████▎ | 10617/12776 [1:51:56<11:17,  3.19it/s]                                                        83%|████████▎ | 10617/12776 [1:51:56<11:17,  3.19it/s] 83%|████████▎ | 10618/12776 [1:51:56<10:46,  3.34it/s]                                                        83%|████████▎ | 10618/12776 [1:51:56<10:46,  3.34it/s] 83%|████████▎ | 10619/12776 [1:51:56<11:05,  3.24it/s]                                                        83%|████████▎ | 10619/12776 [1:51:56<11:05,  3.24it/s] 83%|████████▎ | 10620/12776 [1:51:57<10:31,  3.41it/s]                                                        83%|████████▎ | 10620/12776 [1:51:57<10:31,  3.41it/s] 83%|████████▎ | 10621/12776 [1:51:57<10:02,  3.58it/s]                                                        83%|████████▎ | 10621/12776 [1:51:57<10:02,  3.58it/s] 83%|████████▎ | 10622/12776 [1:51:57<09:39,  3.72it/s]                                                        83%|████████▎ | 10622/12776 [1:51:57<09:39,  3.72it/s] 83%|████████▎ | 10623/12776 [1:51:57<09:19,  3.85it/s]                                                       {'loss': 0.2486, 'grad_norm': 0.5584045052528381, 'learning_rate': 5.483870967741935e-05, 'epoch': 1.65}
+{'loss': 0.3131, 'grad_norm': 0.7461156845092773, 'learning_rate': 5.481427174975562e-05, 'epoch': 1.65}
+{'loss': 0.1941, 'grad_norm': 0.6761148571968079, 'learning_rate': 5.478983382209188e-05, 'epoch': 1.65}
+{'loss': 0.2663, 'grad_norm': 1.4514672756195068, 'learning_rate': 5.476539589442815e-05, 'epoch': 1.65}
+{'loss': 0.2844, 'grad_norm': 0.8751680850982666, 'learning_rate': 5.474095796676442e-05, 'epoch': 1.65}
+{'loss': 0.2972, 'grad_norm': 1.7713919878005981, 'learning_rate': 5.471652003910068e-05, 'epoch': 1.65}
+{'loss': 0.3436, 'grad_norm': 1.0611058473587036, 'learning_rate': 5.4692082111436944e-05, 'epoch': 1.65}
+{'loss': 0.2195, 'grad_norm': 0.7453222274780273, 'learning_rate': 5.466764418377321e-05, 'epoch': 1.65}
+{'loss': 0.3868, 'grad_norm': 2.404334783554077, 'learning_rate': 5.4643206256109473e-05, 'epoch': 1.65}
+{'loss': 0.21, 'grad_norm': 0.9054138660430908, 'learning_rate': 5.4618768328445745e-05, 'epoch': 1.65}
+{'loss': 0.3469, 'grad_norm': 2.539930820465088, 'learning_rate': 5.4594330400782e-05, 'epoch': 1.65}
+{'loss': 0.3711, 'grad_norm': 1.4532172679901123, 'learning_rate': 5.4569892473118275e-05, 'epoch': 1.65}
+{'loss': 0.2647, 'grad_norm': 1.0003594160079956, 'learning_rate': 5.454545454545454e-05, 'epoch': 1.65}
+{'loss': 0.3655, 'grad_norm': 1.0202223062515259, 'learning_rate': 5.4521016617790804e-05, 'epoch': 1.65}
+{'loss': 0.4682, 'grad_norm': 2.1398379802703857, 'learning_rate': 5.449657869012707e-05, 'epoch': 1.65}
+{'loss': 0.273, 'grad_norm': 1.0124636888504028, 'learning_rate': 5.447214076246334e-05, 'epoch': 1.65}
+{'loss': 0.3883, 'grad_norm': 1.2546247243881226, 'learning_rate': 5.44477028347996e-05, 'epoch': 1.65}
+{'loss': 0.397, 'grad_norm': 1.4561783075332642, 'learning_rate': 5.442326490713587e-05, 'epoch': 1.65}
+{'loss': 0.3122, 'grad_norm': 0.9833201169967651, 'learning_rate': 5.4398826979472135e-05, 'epoch': 1.65}
+{'loss': 0.4702, 'grad_norm': 10.254325866699219, 'learning_rate': 5.43743890518084e-05, 'epoch': 1.65}
+{'loss': 0.7521, 'grad_norm': 1.9881863594055176, 'learning_rate': 5.4349951124144665e-05, 'epoch': 1.65}
+{'loss': 0.5512, 'grad_norm': 3.919692277908325, 'learning_rate': 5.4325513196480936e-05, 'epoch': 1.65}
+{'loss': 0.4922, 'grad_norm': 2.473768711090088, 'learning_rate': 5.4301075268817194e-05, 'epoch': 1.65}
+{'loss': 0.5504, 'grad_norm': 2.2449424266815186, 'learning_rate': 5.4276637341153466e-05, 'epoch': 1.65}
+{'loss': 0.8247, 'grad_norm': 2.9586992263793945, 'learning_rate': 5.425219941348973e-05, 'epoch': 1.65}
+{'loss': 0.461, 'grad_norm': 1.8599929809570312, 'learning_rate': 5.4227761485825995e-05, 'epoch': 1.65}
+{'loss': 0.958, 'grad_norm': 1.9860762357711792, 'learning_rate': 5.420332355816226e-05, 'epoch': 1.65}
+{'loss': 1.1638, 'grad_norm': 3.286160707473755, 'learning_rate': 5.417888563049853e-05, 'epoch': 1.65}
+{'loss': 0.4416, 'grad_norm': 1.1726208925247192, 'learning_rate': 5.415444770283479e-05, 'epoch': 1.66}
+{'loss': 0.8697, 'grad_norm': 2.7897346019744873, 'learning_rate': 5.413000977517106e-05, 'epoch': 1.66}
+{'loss': 0.8906, 'grad_norm': 4.233922958374023, 'learning_rate': 5.4105571847507326e-05, 'epoch': 1.66}
+{'loss': 0.6652, 'grad_norm': 1.2990089654922485, 'learning_rate': 5.408113391984359e-05, 'epoch': 1.66}
+{'loss': 1.022, 'grad_norm': 3.66686749458313, 'learning_rate': 5.4056695992179856e-05, 'epoch': 1.66}
+{'loss': 0.9791, 'grad_norm': 4.231387615203857, 'learning_rate': 5.403225806451613e-05, 'epoch': 1.66}
+{'loss': 1.2046, 'grad_norm': 2.5381076335906982, 'learning_rate': 5.4007820136852385e-05, 'epoch': 1.66}
+{'loss': 1.5159, 'grad_norm': 3.6664905548095703, 'learning_rate': 5.398338220918866e-05, 'epoch': 1.66}
+{'loss': 1.4945, 'grad_norm': 2.639780282974243, 'learning_rate': 5.395894428152492e-05, 'epoch': 1.66}
+{'loss': 0.7653, 'grad_norm': 1.567789912223816, 'learning_rate': 5.3934506353861187e-05, 'epoch': 1.66}
+{'loss': 1.0403, 'grad_norm': 4.89553689956665, 'learning_rate': 5.391006842619745e-05, 'epoch': 1.66}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5.391006842619745e-05, 'epoch': 1.66}
+{'loss': 0.5282, 'grad_norm': 3.5072648525238037, 'learning_rate': 5.388563049853372e-05, 'epoch': 1.66}
+{'loss': 0.6242, 'grad_norm': 3.3043320178985596, 'learning_rate': 5.386119257086998e-05, 'epoch': 1.66}
+{'loss': 1.0082, 'grad_norm': 2.609529733657837, 'learning_rate': 5.383675464320625e-05, 'epoch': 1.66}
+{'loss': 0.9866, 'grad_norm': 1.4475938081741333, 'learning_rate': 5.381231671554252e-05, 'epoch': 1.66}
+{'loss': 0.2334, 'grad_norm': 0.5289618372917175, 'learning_rate': 5.378787878787878e-05, 'epoch': 1.66}
+{'loss': 0.2082, 'grad_norm': 0.5513238310813904, 'learning_rate': 5.376344086021505e-05, 'epoch': 1.66}
+{'loss': 0.2386, 'grad_norm': 0.7842631936073303, 'learning_rate': 5.373900293255132e-05, 'epoch': 1.66}
+{'loss': 0.1653, 'grad_norm': 0.7243189811706543, 'learning_rate': 5.3714565004887577e-05, 'epoch': 1.66}
+{'loss': 0.208, 'grad_norm': 2.165306329727173, 'learning_rate': 5.369012707722385e-05, 'epoch': 1.66}
+{'loss': 0.2278, 'grad_norm': 0.661629319190979, 'learning_rate': 5.366568914956011e-05, 'epoch': 1.66}
+{'loss': 0.3416, 'grad_norm': 0.931580662727356, 'learning_rate': 5.364125122189638e-05, 'epoch': 1.66}
+{'loss': 0.3323, 'grad_norm': 0.9427894949913025, 'learning_rate': 5.361681329423264e-05, 'epoch': 1.66}
+{'loss': 0.2057, 'grad_norm': 0.6458982229232788, 'learning_rate': 5.3592375366568914e-05, 'epoch': 1.66}
+{'loss': 0.3066, 'grad_norm': 1.2524142265319824, 'learning_rate': 5.356793743890517e-05, 'epoch': 1.66}
+{'loss': 0.4275, 'grad_norm': 1.944617509841919, 'learning_rate': 5.3543499511241444e-05, 'epoch': 1.66}
+{'loss': 0.1748, 'grad_norm': 3.7578859329223633, 'learning_rate': 5.351906158357771e-05, 'epoch': 1.66}
+{'loss': 0.2722, 'grad_norm': 0.9285878539085388, 'learning_rate': 5.349462365591397e-05, 'epoch': 1.66}
+{'loss': 0.295, 'grad_norm': 3.483341693878174, 'learning_rate': 5.347018572825024e-05, 'epoch': 1.66}
+{'loss': 0.2591, 'grad_norm': 1.6603342294692993, 'learning_rate': 5.344574780058651e-05, 'epoch': 1.66}
+{'loss': 0.3515, 'grad_norm': 2.1465036869049072, 'learning_rate': 5.342130987292277e-05, 'epoch': 1.66}
+{'loss': 0.4868, 'grad_norm': 1.9516934156417847, 'learning_rate': 5.339687194525904e-05, 'epoch': 1.66}
+{'loss': 0.2049, 'grad_norm': 0.8558928966522217, 'learning_rate': 5.3372434017595304e-05, 'epoch': 1.66}
+{'loss': 0.2129, 'grad_norm': 1.204277515411377, 'learning_rate': 5.334799608993157e-05, 'epoch': 1.66}
+{'loss': 0.2187, 'grad_norm': 1.400564193725586, 'learning_rate': 5.3323558162267834e-05, 'epoch': 1.66}
+{'loss': 0.3519, 'grad_norm': 1.5953516960144043, 'learning_rate': 5.3299120234604105e-05, 'epoch': 1.66}
+{'loss': 0.4418, 'grad_norm': 1.5663790702819824, 'learning_rate': 5.327468230694036e-05, 'epoch': 1.66}
+{'loss': 0.4294, 'grad_norm': 2.3011529445648193, 'learning_rate': 5.3250244379276635e-05, 'epoch': 1.66}
+{'loss': 0.6262, 'grad_norm': 2.8383572101593018, 'learning_rate': 5.32258064516129e-05, 'epoch': 1.66}
+{'loss': 0.5056, 'grad_norm': 2.6488254070281982, 'learning_rate': 5.3201368523949164e-05, 'epoch': 1.66}
+{'loss': 0.3669, 'grad_norm': 0.9980300068855286, 'learning_rate': 5.317693059628543e-05, 'epoch': 1.66}
+{'loss': 0.5514, 'grad_norm': 1.887178897857666, 'learning_rate': 5.31524926686217e-05, 'epoch': 1.66}
+{'loss': 0.8874, 'grad_norm': 2.6637489795684814, 'learning_rate': 5.312805474095796e-05, 'epoch': 1.66}
+{'loss': 0.5448, 'grad_norm': 2.64481520652771, 'learning_rate': 5.310361681329423e-05, 'epoch': 1.66}
+{'loss': 0.9028, 'grad_norm': 2.7849998474121094, 'learning_rate': 5.3079178885630495e-05, 'epoch': 1.66}
+{'loss': 0.9225, 'grad_norm': 3.2674617767333984, 'learning_rate': 5.305474095796676e-05, 'epoch': 1.66}
+{'loss': 0.9553, 'grad_norm': 2.544890880584717, 'learning_rate': 5.3030303030303025e-05, 'epoch': 1.66}
+{'loss': 0.3589, 'grad_norm': 1.3422220945358276, 'learning_rate': 5.3005865102639296e-05, 'epoch': 1.66}
+{'loss': 0.6147, 'grad_norm': 1.9164516925811768, 'learning_rate': 5.2981427174975554e-05, 'epoch': 1.66}
+ 83%|████████▎ | 10623/12776 [1:51:57<09:19,  3.85it/s] 83%|████████▎ | 10624/12776 [1:51:58<09:39,  3.71it/s]                                                        83%|████████▎ | 10624/12776 [1:51:58<09:39,  3.71it/s] 83%|████████▎ | 10625/12776 [1:51:58<09:15,  3.87it/s]                                                        83%|████████▎ | 10625/12776 [1:51:58<09:15,  3.87it/s] 83%|████████▎ | 10626/12776 [1:51:58<08:53,  4.03it/s]                                                        83%|████████▎ | 10626/12776 [1:51:58<08:53,  4.03it/s] 83%|████████▎ | 10627/12776 [1:51:58<08:34,  4.17it/s]                                                        83%|████████▎ | 10627/12776 [1:51:58<08:34,  4.17it/s] 83%|████████▎ | 10628/12776 [1:51:59<09:17,  3.86it/s]                                                        83%|████████▎ | 10628/12776 [1:51:59<09:17,  3.86it/s] 83%|████████▎ | 10629/12776 [1:51:59<08:44,  4.09it/s]                                                        83%|███████���▎ | 10629/12776 [1:51:59<08:44,  4.09it/s] 83%|████████▎ | 10630/12776 [1:51:59<08:23,  4.26it/s]                                                        83%|████████▎ | 10630/12776 [1:51:59<08:23,  4.26it/s] 83%|████████▎ | 10631/12776 [1:51:59<08:06,  4.41it/s]                                                        83%|████████▎ | 10631/12776 [1:51:59<08:06,  4.41it/s] 83%|████████▎ | 10632/12776 [1:51:59<07:50,  4.56it/s]                                                        83%|████████▎ | 10632/12776 [1:51:59<07:50,  4.56it/s] 83%|████████▎ | 10633/12776 [1:52:00<08:44,  4.08it/s]                                                        83%|████████▎ | 10633/12776 [1:52:00<08:44,  4.08it/s] 83%|████████▎ | 10634/12776 [1:52:00<08:14,  4.33it/s]                                                        83%|████████▎ | 10634/12776 [1:52:00<08:14,  4.33it/s] 83%|████████▎ | 10635/12776 [1:52:00<07:50,  4.55it/s]                                                        83%|████████▎ | 10635/12776 [1:52:00<07:50,  4.55it/s] 83%|████████▎ | 10636/12776 [1:52:00<07:33,  4.72it/s]                                                        83%|████████▎ | 10636/12776 [1:52:00<07:33,  4.72it/s] 83%|████████▎ | 10637/12776 [1:52:00<07:17,  4.89it/s]                                                        83%|████████▎ | 10637/12776 [1:52:00<07:17,  4.89it/s] 83%|████████▎ | 10638/12776 [1:52:01<13:23,  2.66it/s]                                                        83%|████████▎ | 10638/12776 [1:52:01<13:23,  2.66it/s] 83%|████████▎ | 10639/12776 [1:52:03<25:57,  1.37it/s]                                                        83%|████████▎ | 10639/12776 [1:52:03<25:57,  1.37it/s] 83%|████████▎ | 10640/12776 [1:52:04<28:34,  1.25it/s]                                                        83%|████████▎ | 10640/12776 [1:52:04<28:34,  1.25it/s] 83%|████████▎ | 10641/12776 [1:52:05<29:26,  1.21it/s]                                                        83%|████████▎ | 10641/12776 [1:52:05<29:26,  1.21it/s] 83%|████████▎ | 10642/12776 [1:52:05<29:12,  1.22it/s]                                                        83%|████████▎ | 10642/12776 [1:52:05<29:12,  1.22it/s] 83%|████████▎ | 10643/12776 [1:52:06<28:18,  1.26it/s]                                                        83%|████████▎ | 10643/12776 [1:52:06<28:18,  1.26it/s] 83%|████████▎ | 10644/12776 [1:52:07<28:12,  1.26it/s]                                                        83%|████████▎ | 10644/12776 [1:52:07<28:12,  1.26it/s] 83%|████████▎ | 10645/12776 [1:52:08<27:58,  1.27it/s]                                                        83%|████████▎ | 10645/12776 [1:52:08<27:58,  1.27it/s] 83%|████████▎ | 10646/12776 [1:52:08<26:14,  1.35it/s]                                                        83%|████████▎ | 10646/12776 [1:52:08<26:14,  1.35it/s] 83%|████████▎ | 10647/12776 [1:52:09<24:26,  1.45it/s]                                                        83%|████████▎ | 10647/12776 [1:52:09<24:26,  1.45it/s] 83%|████████▎ | 10648/12776 [1:52:10<23:00,  1.54it/s]                                                        83%|████████▎ | 10648/12776 [1:52:10<23:00,  1.54it/s] 83%|████████▎ | 10649/12776 [1:52:10<23:00,  1.54it/s]                                                        83%|████████▎ | 10649/12776 [1:52:10<23:00,  1.54it/s] 83%|████████▎ | 10650/12776 [1:52:11<21:30,  1.65it/s]                                                        83%|████████▎ | 10650/12776 [1:52:11<21:30,  1.65it/s] 83%|████████▎ | 10651/12776 [1:52:11<21:11,  1.67it/s]                                                        83%|████████▎ | 10651/12776 [1:52:11<21:11,  1.67it/s] 83%|████████▎ | 10652/12776 [1:52:12<19:35,  1.81it/s]                                                        83%|████████▎ | 10652/12776 [1:52:12<19:35,  1.81it/s] 83%|████████▎ | 10653/12776 [1:52:12<18:20,  1.93it/s]                                                        83%|████████▎ | 10653/12776 [1:52:12<18:20,  1.93it/s] 83%|████████▎ | 10654/12776 [1:52:13<18:04,  1.96it/s]                                                        83%|█████��██▎ | 10654/12776 [1:52:13<18:04,  1.96it/s] 83%|████████▎ | 10655/12776 [1:52:13<16:55,  2.09it/s]                                                        83%|████████▎ | 10655/12776 [1:52:13<16:55,  2.09it/s] 83%|████████▎ | 10656/12776 [1:52:14<16:55,  2.09it/s]                                                        83%|████████▎ | 10656/12776 [1:52:14<16:55,  2.09it/s] 83%|████████▎ | 10657/12776 [1:52:14<15:47,  2.24it/s]                                                        83%|████████▎ | 10657/12776 [1:52:14<15:47,  2.24it/s] 83%|████████▎ | 10658/12776 [1:52:14<14:49,  2.38it/s]                                                        83%|████████▎ | 10658/12776 [1:52:14<14:49,  2.38it/s] 83%|████████▎ | 10659/12776 [1:52:15<14:46,  2.39it/s]                                                        83%|████████▎ | 10659/12776 [1:52:15<14:46,  2.39it/s] 83%|████████▎ | 10660/12776 [1:52:15<13:54,  2.54it/s]                                                        83%|████████▎ | 10660/12776 [1:52:15<13:54,  2.54it/s] 83%|████████▎ | 10661/12776 [1:52:15<13:13,  2.67it/s]                                                        83%|████████▎ | 10661/12776 [1:52:15<13:13,  2.67it/s] 83%|████████▎ | 10662/12776 [1:52:16<12:36,  2.80it/s]                                                        83%|████████▎ | 10662/12776 [1:52:16<12:36,  2.80it/s] 83%|████████▎ | 10663/12776 [1:52:16<12:22,  2.84it/s]                                                        83%|████████▎ | 10663/12776 [1:52:16<12:22,  2.84it/s] 83%|████████▎ | 10664/12776 [1:52:16<11:47,  2.98it/s]                                                        83%|████████▎ | 10664/12776 [1:52:16<11:47,  2.98it/s] 83%|████████▎ | 10665/12776 [1:52:17<11:20,  3.10it/s]                                                        83%|████████▎ | 10665/12776 [1:52:17<11:20,  3.10it/s] 83%|████████▎ | 10666/12776 [1:52:17<11:59,  2.93it/s]                                                        83%|████████▎ | 10666/12776 [1:52:17<11:59,  2.93it/s] 83%|████████▎ | 10667/12776 [1:52:17<11:17,  3.11it/s]                                                        83%|████████▎ | 10667/12776 [1:52:17<11:17,  3.11it/s] 84%|████████▎ | 10668/12776 [1:52:17<10:43,  3.28it/s]                                                        84%|████████▎ | 10668/12776 [1:52:17<10:43,  3.28it/s] 84%|████████▎ | 10669/12776 [1:52:18<10:08,  3.46it/s]                                                        84%|████████▎ | 10669/12776 [1:52:18<10:08,  3.46it/s] 84%|████████▎ | 10670/12776 [1:52:18<11:00,  3.19it/s]                                                        84%|████████▎ | 10670/12776 [1:52:18<11:00,  3.19it/s] 84%|████████▎ | 10671/12776 [1:52:18<10:17,  3.41it/s]                                                        84%|████████▎ | 10671/12776 [1:52:18<10:17,  3.41it/s] 84%|████████▎ | 10672/12776 [1:52:19<09:46,  3.59it/s]                                                        84%|████████▎ | 10672/12776 [1:52:19<09:46,  3.59it/s] 84%|████████▎ | 10673/12776 [1:52:19<09:18,  3.77it/s]                                                        84%|████████▎ | 10673/12776 [1:52:19<09:18,  3.77it/s] 84%|████████▎ | 10674/12776 [1:52:19<08:56,  3.92it/s]                                                        84%|████████▎ | 10674/12776 [1:52:19<08:56,  3.92it/s] 84%|████████▎ | 10675/12776 [1:52:19<09:18,  3.76it/s]                                                        84%|████████▎ | 10675/12776 [1:52:19<09:18,  3.76it/s] 84%|████████▎ | 10676/12776 [1:52:20<08:50,  3.96it/s]                                                        84%|████████▎ | 10676/12776 [1:52:20<08:50,  3.96it/s] 84%|████████▎ | 10677/12776 [1:52:20<08:30,  4.11it/s]                                                        84%|████████▎ | 10677/12776 [1:52:20<08:30,  4.11it/s] 84%|████████▎ | 10678/12776 [1:52:20<08:11,  4.27it/s]                                                        84%|████████▎ | 10678/12776 [1:52:20<08:11,  4.27it/s] 84%|████████▎ | 10679/12776 [1:52:20<07:55,  4.41it/s]                                                        84%|████████▎ | 10679/12776 [1:52:20<07:55,  4.41it/s] 84%|████████▎ | 10680/12776 [1:52:21<08:36,  4.06it/s]                                                        84%|████████▎ | 10680/12776 [1:52:21<08:36,  4.06it/s] 84%|████████▎ | 10681/12776 [1:52:21<08:11,  4.26it/s]                                                        84%|████████▎ | 10681/12776 [1:52:21<08:11,  4.26it/s] 84%|████████▎ | 10682/12776 [1:52:21<07:50,  4.45it/s]                                                        84%|████████▎ | 10682/12776 [1:52:21<07:50,  4.45it/s] 84%|████████▎ | 10683/12776 [1:52:21<07:34,  4.61it/s]                                                        84%|████████▎ | 10683/12776 [1:52:21<07:34,  4.61it/s] 84%|████████▎ | 10684/12776 [1:52:21<07:22,  4.73it/s]                                                        84%|████████▎ | 10684/12776 [1:52:21<07:22,  4.73it/s] 84%|████████▎ | 10685/12776 [1:52:22<08:20,  4.18it/s]                                                        84%|████████▎ | 10685/12776 [1:52:22<08:20,  4.18it/s] 84%|████████▎ | 10686/12776 [1:52:22<07:50,  4.45it/s]                                                        84%|████████▎ | 10686/12776 [1:52:22<07:50,  4.45it/s] 84%|████████▎ | 10687/12776 [1:52:22<07:24,  4.70it/s]                                                        84%|████████▎ | 10687/12776 [1:52:22<07:24,  4.70it/s] 84%|████████▎ | 10688/12776 [1:52:23<13:06,  2.66it/s]                                                        84%|████████▎ | 10688/12776 [1:52:23<13:06,  2.66it/s] 84%|████████▎ | 10689/12776 [1:52:24<24:12,  1.44it/s]                                                        84%|████████▎ | 10689/12776 [1:52:24<24:12,  1.44it/s] 84%|████████▎ | 10690/12776 [1:52:25<26:53,  1.29it/s]                                                        84%|████████▎ | 10690/12776 [1:52:25<26:53,  1.29it/s] 84%|████████▎ | 10691/12776 [1:52:26<29:11,  1.19it/s]                                                        84%|████████▎ | 10691/12776 [1:52:26<29:11,  1.19it/s] 84%|████████▎ | 10692/12776 [1:52:27<28:39,  1.21it/s]                                                        84%|████████▎ | 10692/12776 [1:52:27<28:39,  1.21it/s] 84%|████████▎ | 10693/12776 [1:52:28<28:16,  1.23it/s]                                                        84%|████████▎ | 10693/12776 [1:52:28<28:16,  1.23it/s] 84%|████████▎ | 10694/12776 [1:52:28<27:29,  1.26it/s]                                                        84%|████████▎ | 10694/12776 [1:52:28<27:29,  1.26it/s] 84%|████████▎ | 10695/12776 [1:52:29<25:56,  1.34it/s]                                                        84%|████████▎ | 10695/12776 [1:52:29<25:56,  1.34it/s] 84%|████████▎ | 10696/12776 [1:52:30<26:24,  1.31it/s]                                                        84%|████████▎ | 10696/12776 [1:52:30<26:24,  1.31it/s] 84%|████████▎ | 10697/12776 [1:52:30<24:22,  1.42it/s]                                                        84%|████████▎ | 10697/12776 [1:52:30<24:22,  1.42it/s] 84%|████████▎ | 10698/12776 [1:52:31<23:12,  1.49it/s]                                                        84%|████████▎ | 10698/12776 [1:52:31<23:12,  1.49it/s] 84%|████████▎ | 10699/12776 [1:52:32<21:28,  1.61it/s]                                                        84%|████████▎ | 10699/12776 [1:52:32<21:28,  1.61it/s] 84%|████████▍ | 10700/12776 [1:52:32<21:03,  1.64it/s]                                                        84%|████████▍ | 10700/12776 [1:52:32<21:03,  1.64it/s] 84%|████████▍ | 10701/12776 [1:52:33<19:17,  1.79it/s]                                                       {'loss': 0.6961, 'grad_norm': 5.187346458435059, 'learning_rate': 5.2956989247311826e-05, 'epoch': 1.66}
+{'loss': 0.8906, 'grad_norm': 2.5986785888671875, 'learning_rate': 5.293255131964809e-05, 'epoch': 1.66}
+{'loss': 1.0949, 'grad_norm': 3.034003734588623, 'learning_rate': 5.2908113391984356e-05, 'epoch': 1.66}
+{'loss': 0.5176, 'grad_norm': 2.1828017234802246, 'learning_rate': 5.288367546432062e-05, 'epoch': 1.66}
+{'loss': 0.899, 'grad_norm': 2.277587890625, 'learning_rate': 5.285923753665689e-05, 'epoch': 1.66}
+{'loss': 0.6992, 'grad_norm': 3.5474190711975098, 'learning_rate': 5.283479960899315e-05, 'epoch': 1.66}
+{'loss': 1.1746, 'grad_norm': 4.0805888175964355, 'learning_rate': 5.281036168132942e-05, 'epoch': 1.66}
+{'loss': 0.922, 'grad_norm': 3.748154640197754, 'learning_rate': 5.2785923753665686e-05, 'epoch': 1.66}
+{'loss': 1.751, 'grad_norm': 4.198906898498535, 'learning_rate': 5.276148582600195e-05, 'epoch': 1.66}
+{'loss': 1.1014, 'grad_norm': 4.466783046722412, 'learning_rate': 5.2737047898338216e-05, 'epoch': 1.66}
+{'loss': 0.8603, 'grad_norm': 1.4127699136734009, 'learning_rate': 5.271260997067449e-05, 'epoch': 1.66}
+{'loss': 1.191, 'grad_norm': 2.8340721130371094, 'learning_rate': 5.2688172043010746e-05, 'epoch': 1.66}
+{'loss': 0.5875, 'grad_norm': 2.287118673324585, 'learning_rate': 5.266373411534702e-05, 'epoch': 1.66}
+{'loss': 0.7849, 'grad_norm': 2.4103381633758545, 'learning_rate': 5.263929618768328e-05, 'epoch': 1.66}
+{'loss': 0.8766, 'grad_norm': 2.8598053455352783, 'learning_rate': 5.261485826001955e-05, 'epoch': 1.67}
+{'loss': 1.239, 'grad_norm': 3.413485288619995, 'learning_rate': 5.259042033235581e-05, 'epoch': 1.67}
+{'loss': 0.6029, 'grad_norm': 1.0679550170898438, 'learning_rate': 5.256598240469208e-05, 'epoch': 1.67}
+{'loss': 0.1905, 'grad_norm': 0.44509389996528625, 'learning_rate': 5.254154447702834e-05, 'epoch': 1.67}
+{'loss': 0.269, 'grad_norm': 1.0144532918930054, 'learning_rate': 5.251710654936461e-05, 'epoch': 1.67}
+{'loss': 0.2504, 'grad_norm': 1.4316686391830444, 'learning_rate': 5.249266862170088e-05, 'epoch': 1.67}
+{'loss': 0.3228, 'grad_norm': 0.9818958044052124, 'learning_rate': 5.2468230694037136e-05, 'epoch': 1.67}
+{'loss': 0.3437, 'grad_norm': 1.124721884727478, 'learning_rate': 5.244379276637341e-05, 'epoch': 1.67}
+{'loss': 0.1469, 'grad_norm': 0.6059370040893555, 'learning_rate': 5.241935483870968e-05, 'epoch': 1.67}
+{'loss': 0.2796, 'grad_norm': 1.5121008157730103, 'learning_rate': 5.239491691104594e-05, 'epoch': 1.67}
+{'loss': 0.2226, 'grad_norm': 0.9613409638404846, 'learning_rate': 5.237047898338221e-05, 'epoch': 1.67}
+{'loss': 0.3174, 'grad_norm': 1.1612027883529663, 'learning_rate': 5.234604105571847e-05, 'epoch': 1.67}
+{'loss': 0.3316, 'grad_norm': 2.4976742267608643, 'learning_rate': 5.232160312805473e-05, 'epoch': 1.67}
+{'loss': 0.4277, 'grad_norm': 1.2498027086257935, 'learning_rate': 5.2297165200391e-05, 'epoch': 1.67}
+{'loss': 0.2936, 'grad_norm': 1.24776291847229, 'learning_rate': 5.2272727272727274e-05, 'epoch': 1.67}
+{'loss': 0.4268, 'grad_norm': 1.3939722776412964, 'learning_rate': 5.224828934506353e-05, 'epoch': 1.67}
+{'loss': 0.4775, 'grad_norm': 1.675716519355774, 'learning_rate': 5.2223851417399804e-05, 'epoch': 1.67}
+{'loss': 0.3869, 'grad_norm': 1.3907341957092285, 'learning_rate': 5.219941348973607e-05, 'epoch': 1.67}
+{'loss': 0.3018, 'grad_norm': 1.2454923391342163, 'learning_rate': 5.217497556207233e-05, 'epoch': 1.67}
+{'loss': 0.3795, 'grad_norm': 3.0145490169525146, 'learning_rate': 5.21505376344086e-05, 'epoch': 1.67}
+{'loss': 0.3482, 'grad_norm': 1.352417230606079, 'learning_rate': 5.212609970674487e-05, 'epoch': 1.67}
+{'loss': 0.5697, 'grad_norm': 1.3013248443603516, 'learning_rate': 5.210166177908113e-05, 'epoch': 1.67}
+{'loss': 0.8786, 'grad_norm': 3.581904888153076, 'learning_rate': 5.20772238514174e-05, 'epoch': 1.67}
+{'loss': 0.4929, 'grad_norm': 1.4342955350875854, 'learning_rate': 5.2052785923753664e-05, 'epoch': 1.67}
+{'loss': 0.4167, 'grad_norm': 1.4663060903549194, 'learning_rate': 5.202834799608992e-05, 'epoch': 1.67}
+{'loss': 0.44, 'grad_norm': 1.2736845016479492, 'learning_rate': 5.2003910068426194e-05, 'epoch': 1.67}
+{'loss': 0.406, 'grad_norm': 3.1659364700317383, 'learning_rate': 5.1979472140762465e-05, 'epoch': 1.67}
+{'loss': 1.0747, 'grad_norm': 4.501870155334473, 'learning_rate': 5.1955034213098723e-05, 'epoch': 1.67}
+{'loss': 0.5949, 'grad_norm': 2.149975299835205, 'learning_rate': 5.193059628543499e-05, 'epoch': 1.67}
+{'loss': 0.4715, 'grad_norm': 1.4252465963363647, 'learning_rate': 5.190615835777126e-05, 'epoch': 1.67}
+{'loss': 0.3242, 'grad_norm': 1.7069036960601807, 'learning_rate': 5.188172043010752e-05, 'epoch': 1.67}
+{'loss': 0.4015, 'grad_norm': 2.188441753387451, 'learning_rate': 5.185728250244379e-05, 'epoch': 1.67}
+{'loss': 0.4151, 'grad_norm': 1.5735689401626587, 'learning_rate': 5.183284457478006e-05, 'epoch': 1.67}
+{'loss': 0.6561, 'grad_norm': 4.588582515716553, 'learning_rate': 5.180840664711632e-05, 'epoch': 1.67}
+{'loss': 0.48, 'grad_norm': 2.366029739379883, 'learning_rate': 5.1783968719452584e-05, 'epoch': 1.67}
+{'loss': 0.5077, 'grad_norm': 1.6738721132278442, 'learning_rate': 5.1759530791788855e-05, 'epoch': 1.67}
+{'loss': 0.6517, 'grad_norm': 2.501746892929077, 'learning_rate': 5.1735092864125113e-05, 'epoch': 1.67}
+{'loss': 1.1346, 'grad_norm': 3.7587740421295166, 'learning_rate': 5.1710654936461385e-05, 'epoch': 1.67}
+{'loss': 0.6876, 'grad_norm': 15.480533599853516, 'learning_rate': 5.168621700879766e-05, 'epoch': 1.67}
+{'loss': 0.6573, 'grad_norm': 2.9390463829040527, 'learning_rate': 5.1661779081133915e-05, 'epoch': 1.67}
+{'loss': 0.788, 'grad_norm': 5.4329304695129395, 'learning_rate': 5.163734115347018e-05, 'epoch': 1.67}
+{'loss': 1.1849, 'grad_norm': 9.223847389221191, 'learning_rate': 5.161290322580645e-05, 'epoch': 1.67}
+{'loss': 0.621, 'grad_norm': 2.515810966491699, 'learning_rate': 5.158846529814271e-05, 'epoch': 1.67}
+{'loss': 1.1932, 'grad_norm': 3.640105962753296, 'learning_rate': 5.156402737047898e-05, 'epoch': 1.67}
+{'loss': 0.586, 'grad_norm': 5.372037887573242, 'learning_rate': 5.153958944281524e-05, 'epoch': 1.67}
+{'loss': 1.0401, 'grad_norm': 1.5914926528930664, 'learning_rate': 5.151515151515151e-05, 'epoch': 1.67}
+{'loss': 1.3516, 'grad_norm': 3.534508466720581, 'learning_rate': 5.1490713587487775e-05, 'epoch': 1.67}
+{'loss': 0.6955, 'grad_norm': 1.6528741121292114, 'learning_rate': 5.146627565982404e-05, 'epoch': 1.67}
+{'loss': 0.8107, 'grad_norm': 2.1529715061187744, 'learning_rate': 5.1441837732160305e-05, 'epoch': 1.67}
+{'loss': 0.3083, 'grad_norm': 2.8191683292388916, 'learning_rate': 5.1417399804496576e-05, 'epoch': 1.67}
+{'loss': 0.6602, 'grad_norm': 1.5679564476013184, 'learning_rate': 5.1392961876832834e-05, 'epoch': 1.67}
+{'loss': 0.7978, 'grad_norm': 1.8020987510681152, 'learning_rate': 5.1368523949169106e-05, 'epoch': 1.67}
+{'loss': 0.1784, 'grad_norm': 0.6797225475311279, 'learning_rate': 5.134408602150537e-05, 'epoch': 1.67}
+{'loss': 0.1494, 'grad_norm': 1.2861469984054565, 'learning_rate': 5.1319648093841635e-05, 'epoch': 1.67}
+{'loss': 0.2241, 'grad_norm': 0.7776259183883667, 'learning_rate': 5.12952101661779e-05, 'epoch': 1.67}
+{'loss': 0.1937, 'grad_norm': 0.7680147290229797, 'learning_rate': 5.127077223851417e-05, 'epoch': 1.67}
+{'loss': 0.1732, 'grad_norm': 0.7081788182258606, 'learning_rate': 5.124633431085043e-05, 'epoch': 1.67}
+{'loss': 0.2248, 'grad_norm': 0.6900548338890076, 'learning_rate': 5.12218963831867e-05, 'epoch': 1.67}
+{'loss': 0.3333, 'grad_norm': 1.274192452430725, 'learning_rate': 5.1197458455522966e-05, 'epoch': 1.67}
+{'loss': 0.3009, 'grad_norm': 0.7436234354972839, 'learning_rate': 5.117302052785923e-05, 'epoch': 1.67}
+{'loss': 0.3789, 'grad_norm': 1.424438714981079, 'learning_rate': 5.1148582600195496e-05, 'epoch': 1.67}
+{'loss': 0.3426, 'grad_norm': 0.85284423828125, 'learning_rate': 5.112414467253177e-05, 'epoch': 1.67}
+{'loss': 0.2127, 'grad_norm': 1.165490984916687, 'learning_rate': 5.1099706744868025e-05, 'epoch': 1.67}
+{'loss': 0.3583, 'grad_norm': 1.005594253540039, 'learning_rate': 5.10752688172043e-05, 'epoch': 1.68}
+ 84%|████████▍ | 10701/12776 [1:52:33<19:17,  1.79it/s] 84%|████████▍ | 10702/12776 [1:52:33<18:49,  1.84it/s]                                                        84%|████████▍ | 10702/12776 [1:52:33<18:49,  1.84it/s] 84%|████████▍ | 10703/12776 [1:52:34<17:24,  1.98it/s]                                                        84%|████████▍ | 10703/12776 [1:52:34<17:24,  1.98it/s] 84%|████████▍ | 10704/12776 [1:52:34<16:18,  2.12it/s]                                                        84%|████████▍ | 10704/12776 [1:52:34<16:18,  2.12it/s] 84%|████████▍ | 10705/12776 [1:52:34<16:30,  2.09it/s]                                                        84%|████████▍ | 10705/12776 [1:52:34<16:30,  2.09it/s] 84%|████████▍ | 10706/12776 [1:52:35<15:22,  2.24it/s]                                                        84%|████████▍ | 10706/12776 [1:52:35<15:22,  2.24it/s] 84%|████████▍ | 10707/12776 [1:52:35<14:25,  2.39it/s]                                                        84%|████████▍ | 10707/12776 [1:52:35<14:25,  2.39it/s] 84%|████████▍ | 10708/12776 [1:52:36<14:19,  2.41it/s]                                                        84%|████████▍ | 10708/12776 [1:52:36<14:19,  2.41it/s] 84%|████████▍ | 10709/12776 [1:52:36<13:29,  2.55it/s]                                                        84%|████████▍ | 10709/12776 [1:52:36<13:29,  2.55it/s] 84%|████████▍ | 10710/12776 [1:52:36<12:51,  2.68it/s]                                                        84%|████████▍ | 10710/12776 [1:52:36<12:51,  2.68it/s] 84%|████████▍ | 10711/12776 [1:52:37<13:35,  2.53it/s]                                                        84%|████████▍ | 10711/12776 [1:52:37<13:35,  2.53it/s] 84%|████████▍ | 10712/12776 [1:52:37<12:38,  2.72it/s]                                                        84%|████████▍ | 10712/12776 [1:52:37<12:38,  2.72it/s] 84%|████████▍ | 10713/12776 [1:52:37<11:53,  2.89it/s]                                                        84%|████████▍ | 10713/12776 [1:52:37<11:53,  2.89it/s] 84%|████████▍ | 10714/12776 [1:52:38<12:16,  2.80it/s]                                                        84%|████████▍ | 10714/12776 [1:52:38<12:16,  2.80it/s] 84%|████████▍ | 10715/12776 [1:52:38<11:28,  2.99it/s]                                                        84%|████████▍ | 10715/12776 [1:52:38<11:28,  2.99it/s] 84%|████████▍ | 10716/12776 [1:52:38<10:49,  3.17it/s]                                                        84%|████████▍ | 10716/12776 [1:52:38<10:49,  3.17it/s] 84%|████████▍ | 10717/12776 [1:52:38<10:20,  3.32it/s]                                                        84%|████████▍ | 10717/12776 [1:52:38<10:20,  3.32it/s] 84%|████████▍ | 10718/12776 [1:52:39<10:20,  3.32it/s]                                                        84%|████████▍ | 10718/12776 [1:52:39<10:20,  3.32it/s] 84%|████████▍ | 10719/12776 [1:52:39<09:53,  3.47it/s]                                                        84%|████████▍ | 10719/12776 [1:52:39<09:53,  3.47it/s] 84%|████████▍ | 10720/12776 [1:52:39<09:28,  3.62it/s]                                                        84%|████████▍ | 10720/12776 [1:52:39<09:28,  3.62it/s] 84%|████████▍ | 10721/12776 [1:52:40<09:07,  3.75it/s]                                                        84%|████████▍ | 10721/12776 [1:52:40<09:07,  3.75it/s] 84%|████████▍ | 10722/12776 [1:52:40<08:52,  3.86it/s]                                                        84%|████████▍ | 10722/12776 [1:52:40<08:52,  3.86it/s] 84%|████████▍ | 10723/12776 [1:52:40<08:50,  3.87it/s]                                                        84%|████████▍ | 10723/12776 [1:52:40<08:50,  3.87it/s] 84%|████████▍ | 10724/12776 [1:52:40<08:31,  4.01it/s]                                                        84%|████████▍ | 10724/12776 [1:52:40<08:31,  4.01it/s] 84%|████████▍ | 10725/12776 [1:52:40<08:12,  4.17it/s]                                                        84%|████████▍ | 10725/12776 [1:52:40<08:12,  4.17it/s] 84%|████████▍ | 10726/12776 [1:52:41<07:56,  4.30it/s]                                                        84%|████████▍ | 10726/12776 [1:52:41<07:56,  4.30it/s] 84%|████████▍ | 10727/12776 [1:52:41<08:55,  3.83it/s]                                                        84%|████████▍ | 10727/12776 [1:52:41<08:55,  3.83it/s] 84%|████████▍ | 10728/12776 [1:52:41<08:27,  4.04it/s]                                                        84%|████████▍ | 10728/12776 [1:52:41<08:27,  4.04it/s] 84%|████████▍ | 10729/12776 [1:52:41<08:03,  4.24it/s]                                                        84%|████████▍ | 10729/12776 [1:52:41<08:03,  4.24it/s] 84%|████████▍ | 10730/12776 [1:52:42<07:43,  4.41it/s]                                                        84%|████████▍ | 10730/12776 [1:52:42<07:43,  4.41it/s] 84%|████████▍ | 10731/12776 [1:52:42<07:26,  4.58it/s]                                                        84%|████████▍ | 10731/12776 [1:52:42<07:26,  4.58it/s] 84%|████████▍ | 10732/12776 [1:52:42<08:10,  4.17it/s]                                                        84%|████████▍ | 10732/12776 [1:52:42<08:10,  4.17it/s] 84%|████████▍ | 10733/12776 [1:52:42<07:43,  4.41it/s]                                                        84%|████████▍ | 10733/12776 [1:52:42<07:43,  4.41it/s] 84%|████████▍ | 10734/12776 [1:52:43<07:22,  4.61it/s]                                                        84%|████████▍ | 10734/12776 [1:52:43<07:22,  4.61it/s] 84%|████████▍ | 10735/12776 [1:52:43<07:06,  4.79it/s]                                                        84%|████████▍ | 10735/12776 [1:52:43<07:06,  4.79it/s] 84%|████████▍ | 10736/12776 [1:52:43<06:52,  4.94it/s]                                                        84%|████████▍ | 10736/12776 [1:52:43<06:52,  4.94it/s] 84%|████████▍ | 10737/12776 [1:52:43<06:41,  5.08it/s]                                                        84%|████████▍ | 10737/12776 [1:52:43<06:41,  5.08it/s] 84%|████████▍ | 10738/12776 [1:52:44<12:24,  2.74it/s]                                                        84%|████████▍ | 10738/12776 [1:52:44<12:24,  2.74it/s] 84%|████████▍ | 10739/12776 [1:52:45<24:00,  1.41it/s]                                                        84%|████████▍ | 10739/12776 [1:52:45<24:00,  1.41it/s] 84%|████████▍ | 10740/12776 [1:52:46<26:58,  1.26it/s]                                                        84%|████████▍ | 10740/12776 [1:52:46<26:58,  1.26it/s] 84%|████████▍ | 10741/12776 [1:52:47<27:35,  1.23it/s]                                                        84%|████████▍ | 10741/12776 [1:52:47<27:35,  1.23it/s] 84%|████████▍ | 10742/12776 [1:52:48<27:02,  1.25it/s]                                                        84%|████████▍ | 10742/12776 [1:52:48<27:02,  1.25it/s] 84%|████████▍ | 10743/12776 [1:52:49<26:25,  1.28it/s]                                                        84%|████████▍ | 10743/12776 [1:52:49<26:25,  1.28it/s] 84%|████████▍ | 10744/12776 [1:52:49<25:51,  1.31it/s]                                                        84%|████████▍ | 10744/12776 [1:52:49<25:51,  1.31it/s] 84%|████████▍ | 10745/12776 [1:52:50<25:32,  1.33it/s]                                                        84%|████████▍ | 10745/12776 [1:52:50<25:32,  1.33it/s] 84%|████████▍ | 10746/12776 [1:52:51<24:19,  1.39it/s]                                                        84%|████████▍ | 10746/12776 [1:52:51<24:19,  1.39it/s] 84%|████████▍ | 10747/12776 [1:52:51<23:16,  1.45it/s]                                                        84%|████████▍ | 10747/12776 [1:52:51<23:16,  1.45it/s] 84%|████████▍ | 10748/12776 [1:52:52<21:58,  1.54it/s]                                                        84%|████████▍ | 10748/12776 [1:52:52<21:58,  1.54it/s] 84%|████████▍ | 10749/12776 [1:52:53<20:48,  1.62it/s]                                                        84%|████████▍ | 10749/12776 [1:52:53<20:48,  1.62it/s] 84%|████████▍ | 10750/12776 [1:52:53<19:50,  1.70it/s]                                                        84%|████████▍ | 10750/12776 [1:52:53<19:50,  1.70it/s] 84%|████████▍ | 10751/12776 [1:52:54<20:13,  1.67it/s]                                                        84%|████████▍ | 10751/12776 [1:52:54<20:13,  1.67it/s] 84%|████████▍ | 10752/12776 [1:52:54<18:41,  1.80it/s]                                                        84%|████████▍ | 10752/12776 [1:52:54<18:41,  1.80it/s] 84%|████████▍ | 10753/12776 [1:52:55<17:27,  1.93it/s]                                                        84%|████████▍ | 10753/12776 [1:52:55<17:27,  1.93it/s] 84%|████████▍ | 10754/12776 [1:52:55<17:21,  1.94it/s]                                                        84%|████████▍ | 10754/12776 [1:52:55<17:21,  1.94it/s] 84%|████████▍ | 10755/12776 [1:52:55<16:11,  2.08it/s]                                                        84%|████████▍ | 10755/12776 [1:52:55<16:11,  2.08it/s] 84%|████████▍ | 10756/12776 [1:52:56<16:07,  2.09it/s]                                                        84%|████████▍ | 10756/12776 [1:52:56<16:07,  2.09it/s] 84%|████████▍ | 10757/12776 [1:52:56<15:04,  2.23it/s]                                                        84%|████████▍ | 10757/12776 [1:52:56<15:04,  2.23it/s] 84%|████████▍ | 10758/12776 [1:52:57<14:04,  2.39it/s]                                                        84%|████████▍ | 10758/12776 [1:52:57<14:04,  2.39it/s] 84%|████████▍ | 10759/12776 [1:52:57<14:04,  2.39it/s]                                                        84%|████████▍ | 10759/12776 [1:52:57<14:04,  2.39it/s] 84%|████████▍ | 10760/12776 [1:52:57<13:13,  2.54it/s]                                                        84%|████████▍ | 10760/12776 [1:52:57<13:13,  2.54it/s] 84%|████████▍ | 10761/12776 [1:52:58<12:30,  2.69it/s]                                                        84%|████████▍ | 10761/12776 [1:52:58<12:30,  2.69it/s] 84%|████████▍ | 10762/12776 [1:52:58<11:51,  2.83it/s]                                                        84%|████████▍ | 10762/12776 [1:52:58<11:51,  2.83it/s] 84%|████████▍ | 10763/12776 [1:52:58<11:40,  2.87it/s]                                                        84%|████████▍ | 10763/12776 [1:52:58<11:40,  2.87it/s] 84%|████████▍ | 10764/12776 [1:52:59<11:06,  3.02it/s]                                                        84%|████████▍ | 10764/12776 [1:52:59<11:06,  3.02it/s] 84%|████████▍ | 10765/12776 [1:52:59<10:38,  3.15it/s]                                                        84%|████████▍ | 10765/12776 [1:52:59<10:38,  3.15it/s] 84%|████████▍ | 10766/12776 [1:52:59<11:12,  2.99it/s]                                                        84%|████████▍ | 10766/12776 [1:52:59<11:12,  2.99it/s] 84%|████████▍ | 10767/12776 [1:53:00<10:32,  3.18it/s]                                                        84%|████████▍ | 10767/12776 [1:53:00<10:32,  3.18it/s] 84%|████████▍ | 10768/12776 [1:53:00<10:00,  3.34it/s]                                                        84%|████████▍ | 10768/12776 [1:53:00<10:00,  3.34it/s] 84%|████████▍ | 10769/12776 [1:53:00<09:32,  3.51it/s]                                                        84%|████████▍ | 10769/12776 [1:53:00<09:32,  3.51it/s] 84%|████████▍ | 10770/12776 [1:53:00<10:08,  3.30it/s]                                                        84%|████████▍ | 10770/12776 [1:53:00<10:08,  3.30it/s] 84%|████████▍ | 10771/12776 [1:53:01<09:34,  3.49it/s]                                                        84%|████████▍ | 10771/12776 [1:53:01<09:34,  3.49it/s] 84%|████████▍ | 10772/12776 [1:53:01<09:06,  3.67it/s]                                                        84%|████████▍ | 10772/12776 [1:53:01<09:06,  3.67it/s] 84%|████████▍ | 10773/12776 [1:53:01<08:42,  3.84it/s]                                                        84%|████████▍ | 10773/12776 [1:53:01<08:42,  3.84it/s] 84%|████████▍ | 10774/12776 [1:53:01<08:26,  3.95it/s]                                                        84%|████████▍ | 10774/12776 [1:53:01<08:26,  3.95it/s] 84%|████████▍ | 10775/12776 [1:53:02<09:04,  3.67it/s]                                                        84%|████████▍ | 10775/12776 [1:53:02<09:04,  3.67it/s] 84%|████████▍ | 10776/12776 [1:53:02<08:29,  3.92it/s]                                                        84%|████████▍ | 10776/12776 [1:53:02<08:29,  3.92it/s] 84%|████████▍ | 10777/12776 [1:53:02<08:03,  4.13it/s]                                                        84%|████████▍ | 10777/12776 [1:53:02<08:03,  4.13it/s] 84%|████████▍ | 10778/12776 [1:53:02<07:44,  4.30it/s]                                                        84%|████████▍ | 10778/12776 [1:53:02<07:44,  4.30it/s] 84%|████████▍ | 10779/12776 [1:53:03<07:30,  4.44it/s]                                                       {'loss': 0.3293, 'grad_norm': 2.8059756755828857, 'learning_rate': 5.105083088954056e-05, 'epoch': 1.68}
+{'loss': 0.2471, 'grad_norm': 0.9310165643692017, 'learning_rate': 5.1026392961876827e-05, 'epoch': 1.68}
+{'loss': 0.5662, 'grad_norm': 1.2037992477416992, 'learning_rate': 5.100195503421309e-05, 'epoch': 1.68}
+{'loss': 0.3375, 'grad_norm': 2.170868396759033, 'learning_rate': 5.097751710654936e-05, 'epoch': 1.68}
+{'loss': 0.3144, 'grad_norm': 1.1550534963607788, 'learning_rate': 5.095307917888562e-05, 'epoch': 1.68}
+{'loss': 0.3797, 'grad_norm': 2.387791633605957, 'learning_rate': 5.092864125122189e-05, 'epoch': 1.68}
+{'loss': 0.2326, 'grad_norm': 1.3175456523895264, 'learning_rate': 5.090420332355816e-05, 'epoch': 1.68}
+{'loss': 0.3776, 'grad_norm': 1.391885757446289, 'learning_rate': 5.087976539589442e-05, 'epoch': 1.68}
+{'loss': 0.254, 'grad_norm': 3.515990734100342, 'learning_rate': 5.085532746823069e-05, 'epoch': 1.68}
+{'loss': 0.3833, 'grad_norm': 2.301178455352783, 'learning_rate': 5.083088954056696e-05, 'epoch': 1.68}
+{'loss': 0.3777, 'grad_norm': 0.8507611751556396, 'learning_rate': 5.0806451612903217e-05, 'epoch': 1.68}
+{'loss': 0.4903, 'grad_norm': 1.7067499160766602, 'learning_rate': 5.078201368523949e-05, 'epoch': 1.68}
+{'loss': 0.7006, 'grad_norm': 4.127223014831543, 'learning_rate': 5.075757575757575e-05, 'epoch': 1.68}
+{'loss': 0.3373, 'grad_norm': 1.4126865863800049, 'learning_rate': 5.073313782991202e-05, 'epoch': 1.68}
+{'loss': 0.9498, 'grad_norm': 1.455224871635437, 'learning_rate': 5.070869990224828e-05, 'epoch': 1.68}
+{'loss': 0.8621, 'grad_norm': 4.076616287231445, 'learning_rate': 5.0684261974584554e-05, 'epoch': 1.68}
+{'loss': 0.461, 'grad_norm': 2.2663486003875732, 'learning_rate': 5.065982404692081e-05, 'epoch': 1.68}
+{'loss': 0.7204, 'grad_norm': 2.508690118789673, 'learning_rate': 5.0635386119257084e-05, 'epoch': 1.68}
+{'loss': 0.6898, 'grad_norm': 1.673777461051941, 'learning_rate': 5.061094819159335e-05, 'epoch': 1.68}
+{'loss': 0.5936, 'grad_norm': 1.676172137260437, 'learning_rate': 5.058651026392961e-05, 'epoch': 1.68}
+{'loss': 1.0243, 'grad_norm': 7.18748664855957, 'learning_rate': 5.056207233626588e-05, 'epoch': 1.68}
+{'loss': 0.5048, 'grad_norm': 2.1801810264587402, 'learning_rate': 5.053763440860215e-05, 'epoch': 1.68}
+{'loss': 0.8976, 'grad_norm': 2.0111351013183594, 'learning_rate': 5.051319648093841e-05, 'epoch': 1.68}
+{'loss': 1.0209, 'grad_norm': 5.509698867797852, 'learning_rate': 5.048875855327468e-05, 'epoch': 1.68}
+{'loss': 1.3053, 'grad_norm': 2.151179552078247, 'learning_rate': 5.0464320625610944e-05, 'epoch': 1.68}
+{'loss': 1.1283, 'grad_norm': 3.7033746242523193, 'learning_rate': 5.043988269794721e-05, 'epoch': 1.68}
+{'loss': 0.6676, 'grad_norm': 2.599567174911499, 'learning_rate': 5.0415444770283474e-05, 'epoch': 1.68}
+{'loss': 0.5331, 'grad_norm': 2.8762729167938232, 'learning_rate': 5.0391006842619745e-05, 'epoch': 1.68}
+{'loss': 0.6954, 'grad_norm': 4.5754265785217285, 'learning_rate': 5.0366568914956e-05, 'epoch': 1.68}
+{'loss': 0.8285, 'grad_norm': 4.394883155822754, 'learning_rate': 5.0342130987292275e-05, 'epoch': 1.68}
+{'loss': 1.6435, 'grad_norm': 3.905444383621216, 'learning_rate': 5.031769305962854e-05, 'epoch': 1.68}
+{'loss': 0.625, 'grad_norm': 2.222259521484375, 'learning_rate': 5.0293255131964804e-05, 'epoch': 1.68}
+{'loss': 1.2802, 'grad_norm': 2.369340658187866, 'learning_rate': 5.026881720430107e-05, 'epoch': 1.68}
+{'loss': 0.7457, 'grad_norm': 8.684860229492188, 'learning_rate': 5.024437927663734e-05, 'epoch': 1.68}
+{'loss': 0.3532, 'grad_norm': 0.9671575427055359, 'learning_rate': 5.02199413489736e-05, 'epoch': 1.68}
+{'loss': 1.1237, 'grad_norm': 3.1766295433044434, 'learning_rate': 5.019550342130987e-05, 'epoch': 1.68}
+{'loss': 0.5706, 'grad_norm': 4.267437934875488, 'learning_rate': 5.0171065493646135e-05, 'epoch': 1.68}
+{'loss': 0.8392, 'grad_norm': 3.1225411891937256, 'learning_rate': 5.01466275659824e-05, 'epoch': 1.68}
+{'loss': 0.2465, 'grad_norm': 1.9683570861816406, 'learning_rate': 5.0122189638318665e-05, 'epoch': 1.68}
+{'loss': 0.2046, 'grad_norm': 0.6025456190109253, 'learning_rate': 5.0097751710654936e-05, 'epoch': 1.68}
+{'loss': 0.1829, 'grad_norm': 0.7359471321105957, 'learning_rate': 5.0073313782991194e-05, 'epoch': 1.68}
+{'loss': 0.2044, 'grad_norm': 0.568925142288208, 'learning_rate': 5.0048875855327466e-05, 'epoch': 1.68}
+{'loss': 0.3406, 'grad_norm': 0.593213677406311, 'learning_rate': 5.002443792766373e-05, 'epoch': 1.68}
+{'loss': 0.2111, 'grad_norm': 0.6833570599555969, 'learning_rate': 4.9999999999999996e-05, 'epoch': 1.68}
+{'loss': 0.3462, 'grad_norm': 1.2374072074890137, 'learning_rate': 4.997556207233626e-05, 'epoch': 1.68}
+{'loss': 0.3209, 'grad_norm': 0.7850791811943054, 'learning_rate': 4.995112414467253e-05, 'epoch': 1.68}
+{'loss': 0.3351, 'grad_norm': 0.6952258944511414, 'learning_rate': 4.992668621700879e-05, 'epoch': 1.68}
+{'loss': 0.2355, 'grad_norm': 1.0042572021484375, 'learning_rate': 4.990224828934506e-05, 'epoch': 1.68}
+{'loss': 0.2027, 'grad_norm': 1.752544641494751, 'learning_rate': 4.9877810361681326e-05, 'epoch': 1.68}
+{'loss': 0.2959, 'grad_norm': 1.0626980066299438, 'learning_rate': 4.985337243401759e-05, 'epoch': 1.68}
+{'loss': 0.3076, 'grad_norm': 1.8501675128936768, 'learning_rate': 4.9828934506353856e-05, 'epoch': 1.68}
+{'loss': 0.4469, 'grad_norm': 1.9613275527954102, 'learning_rate': 4.980449657869013e-05, 'epoch': 1.68}
+{'loss': 0.3296, 'grad_norm': 0.9938980340957642, 'learning_rate': 4.9780058651026386e-05, 'epoch': 1.68}
+{'loss': 0.3944, 'grad_norm': 3.928609609603882, 'learning_rate': 4.975562072336266e-05, 'epoch': 1.68}
+{'loss': 0.4749, 'grad_norm': 1.8537261486053467, 'learning_rate': 4.973118279569892e-05, 'epoch': 1.68}
+{'loss': 0.3248, 'grad_norm': 1.3114187717437744, 'learning_rate': 4.970674486803519e-05, 'epoch': 1.68}
+{'loss': 0.3514, 'grad_norm': 2.2068116664886475, 'learning_rate': 4.968230694037145e-05, 'epoch': 1.68}
+{'loss': 0.4841, 'grad_norm': 1.8806259632110596, 'learning_rate': 4.965786901270772e-05, 'epoch': 1.68}
+{'loss': 0.7084, 'grad_norm': 2.068279504776001, 'learning_rate': 4.963343108504398e-05, 'epoch': 1.68}
+{'loss': 0.5013, 'grad_norm': 2.2462806701660156, 'learning_rate': 4.960899315738025e-05, 'epoch': 1.68}
+{'loss': 0.4172, 'grad_norm': 7.449122905731201, 'learning_rate': 4.958455522971652e-05, 'epoch': 1.68}
+{'loss': 0.5593, 'grad_norm': 2.3347396850585938, 'learning_rate': 4.956011730205278e-05, 'epoch': 1.68}
+{'loss': 0.3513, 'grad_norm': 1.909669280052185, 'learning_rate': 4.953567937438905e-05, 'epoch': 1.68}
+{'loss': 0.641, 'grad_norm': 2.685105085372925, 'learning_rate': 4.951124144672532e-05, 'epoch': 1.69}
+{'loss': 0.5653, 'grad_norm': 1.1838382482528687, 'learning_rate': 4.948680351906158e-05, 'epoch': 1.69}
+{'loss': 0.5987, 'grad_norm': 1.5641990900039673, 'learning_rate': 4.946236559139785e-05, 'epoch': 1.69}
+{'loss': 0.651, 'grad_norm': 2.0659899711608887, 'learning_rate': 4.943792766373411e-05, 'epoch': 1.69}
+{'loss': 0.7361, 'grad_norm': 2.0896155834198, 'learning_rate': 4.941348973607038e-05, 'epoch': 1.69}
+{'loss': 0.5407, 'grad_norm': 4.1929426193237305, 'learning_rate': 4.938905180840664e-05, 'epoch': 1.69}
+{'loss': 0.6625, 'grad_norm': 1.9943493604660034, 'learning_rate': 4.9364613880742914e-05, 'epoch': 1.69}
+{'loss': 1.2895, 'grad_norm': 2.7732839584350586, 'learning_rate': 4.934017595307917e-05, 'epoch': 1.69}
+{'loss': 1.0281, 'grad_norm': 2.526482582092285, 'learning_rate': 4.9315738025415444e-05, 'epoch': 1.69}
+{'loss': 1.0371, 'grad_norm': 1.9940377473831177, 'learning_rate': 4.929130009775171e-05, 'epoch': 1.69}
+{'loss': 0.6451, 'grad_norm': 2.744659662246704, 'learning_rate': 4.926686217008797e-05, 'epoch': 1.69}
+{'loss': 1.0116, 'grad_norm': 4.247622013092041, 'learning_rate': 4.924242424242424e-05, 'epoch': 1.69}
+{'loss': 0.9087, 'grad_norm': 2.5660741329193115, 'learning_rate': 4.921798631476051e-05, 'epoch': 1.69}
+{'loss': 0.9292, 'grad_norm': 3.1664459705352783, 'learning_rate': 4.919354838709677e-05, 'epoch': 1.69}
+{'loss': 0.9163, 'grad_norm': 1.8890631198883057, 'learning_rate': 4.916911045943304e-05, 'epoch': 1.69}
+ 84%|████████▍ | 10779/12776 [1:53:03<07:30,  4.44it/s] 84%|████████▍ | 10780/12776 [1:53:03<07:56,  4.19it/s]                                                        84%|████████▍ | 10780/12776 [1:53:03<07:56,  4.19it/s] 84%|████████▍ | 10781/12776 [1:53:03<07:33,  4.40it/s]                                                        84%|████████▍ | 10781/12776 [1:53:03<07:33,  4.40it/s] 84%|████████▍ | 10782/12776 [1:53:03<07:10,  4.63it/s]                                                        84%|████████▍ | 10782/12776 [1:53:03<07:10,  4.63it/s] 84%|████████▍ | 10783/12776 [1:53:03<06:59,  4.75it/s]                                                        84%|████████▍ | 10783/12776 [1:53:03<06:59,  4.75it/s] 84%|████████▍ | 10784/12776 [1:53:04<06:48,  4.87it/s]                                                        84%|████████▍ | 10784/12776 [1:53:04<06:48,  4.87it/s] 84%|████████▍ | 10785/12776 [1:53:04<07:15,  4.57it/s]                                                        84%|████████▍ | 10785/12776 [1:53:04<07:15,  4.57it/s] 84%|████████▍ | 10786/12776 [1:53:04<06:55,  4.79it/s]                                                        84%|████████▍ | 10786/12776 [1:53:04<06:55,  4.79it/s] 84%|████████▍ | 10787/12776 [1:53:04<06:41,  4.95it/s]                                                        84%|████████▍ | 10787/12776 [1:53:04<06:41,  4.95it/s] 84%|████████▍ | 10788/12776 [1:53:05<13:19,  2.49it/s]                                                        84%|████████▍ | 10788/12776 [1:53:05<13:19,  2.49it/s] 84%|████████▍ | 10789/12776 [1:53:07<24:55,  1.33it/s]                                                        84%|████████▍ | 10789/12776 [1:53:07<24:55,  1.33it/s] 84%|████████▍ | 10790/12776 [1:53:08<27:56,  1.18it/s]                                                        84%|████████▍ | 10790/12776 [1:53:08<27:56,  1.18it/s] 84%|████████▍ | 10791/12776 [1:53:09<28:15,  1.17it/s]                                                        84%|████████▍ | 10791/12776 [1:53:09<28:15,  1.17it/s] 84%|████████▍ | 10792/12776 [1:53:09<27:35,  1.20it/s]                                                        84%|████████▍ | 10792/12776 [1:53:09<27:35,  1.20it/s] 84%|████████▍ | 10793/12776 [1:53:10<26:49,  1.23it/s]                                                        84%|████████▍ | 10793/12776 [1:53:10<26:49,  1.23it/s] 84%|████████▍ | 10794/12776 [1:53:11<25:56,  1.27it/s]                                                        84%|████████▍ | 10794/12776 [1:53:11<25:56,  1.27it/s] 84%|████████▍ | 10795/12776 [1:53:12<24:40,  1.34it/s]                                                        84%|████████▍ | 10795/12776 [1:53:12<24:40,  1.34it/s] 85%|████████▍ | 10796/12776 [1:53:12<24:14,  1.36it/s]                                                        85%|████████▍ | 10796/12776 [1:53:12<24:14,  1.36it/s] 85%|████████▍ | 10797/12776 [1:53:13<22:47,  1.45it/s]                                                        85%|████████▍ | 10797/12776 [1:53:13<22:47,  1.45it/s] 85%|████████▍ | 10798/12776 [1:53:13<21:56,  1.50it/s]                                                        85%|████████▍ | 10798/12776 [1:53:13<21:56,  1.50it/s] 85%|██████��█▍ | 10799/12776 [1:53:14<20:41,  1.59it/s]                                                        85%|████████▍ | 10799/12776 [1:53:14<20:41,  1.59it/s] 85%|████████▍ | 10800/12776 [1:53:15<20:01,  1.64it/s]                                                        85%|████████▍ | 10800/12776 [1:53:15<20:01,  1.64it/s]Saving model checkpoint to ./checkpoint-10800
+Configuration saved in ./checkpoint-10800/config.json
+Model weights saved in ./checkpoint-10800/model.safetensors
+Feature extractor saved in ./checkpoint-10800/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-10800/tokenizer_config.json
+Special tokens file saved in ./checkpoint-10800/special_tokens_map.json
+added tokens file saved in ./checkpoint-10800/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-9600] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 85%|████████▍ | 10801/12776 [1:53:20<1:10:53,  2.15s/it]                                                          85%|████████▍ | 10801/12776 [1:53:20<1:10:53,  2.15s/it] 85%|████████▍ | 10802/12776 [1:53:21<53:49,  1.64s/it]                                                          85%|████████▍ | 10802/12776 [1:53:21<53:49,  1.64s/it] 85%|████████▍ | 10803/12776 [1:53:21<42:51,  1.30s/it]                                                        85%|████████▍ | 10803/12776 [1:53:21<42:51,  1.30s/it] 85%|████████▍ | 10804/12776 [1:53:22<33:54,  1.03s/it]                                                        85%|████████▍ | 10804/12776 [1:53:22<33:54,  1.03s/it] 85%|████████▍ | 10805/12776 [1:53:22<27:27,  1.20it/s]                                                        85%|████████▍ | 10805/12776 [1:53:22<27:27,  1.20it/s] 85%|████████▍ | 10806/12776 [1:53:22<23:05,  1.42it/s]                                                        85%|████████▍ | 10806/12776 [1:53:22<23:05,  1.42it/s] 85%|████████▍ | 10807/12776 [1:53:23<19:36,  1.67it/s]                                                        85%|████████▍ | 10807/12776 [1:53:23<19:36,  1.67it/s] 85%|████████▍ | 10808/12776 [1:53:23<17:03,  1.92it/s]                                                        85%|████████▍ | 10808/12776 [1:53:23<17:03,  1.92it/s] 85%|████████▍ | 10809/12776 [1:53:24<15:58,  2.05it/s]                                                        85%|████████▍ | 10809/12776 [1:53:24<15:58,  2.05it/s] 85%|████████▍ | 10810/12776 [1:53:24<14:14,  2.30it/s]                                                        85%|████████▍ | 10810/12776 [1:53:24<14:14,  2.30it/s] 85%|████████▍ | 10811/12776 [1:53:24<12:56,  2.53it/s]                                                        85%|████████▍ | 10811/12776 [1:53:24<12:56,  2.53it/s] 85%|████████▍ | 10812/12776 [1:53:24<12:23,  2.64it/s]                                                        85%|████████▍ | 10812/12776 [1:53:24<12:23,  2.64it/s] 85%|████████▍ | 10813/12776 [1:53:25<11:24,  2.87it/s]                                                        85%|████████▍ | 10813/12776 [1:53:25<11:24,  2.87it/s] 85%|████████▍ | 10814/12776 [1:53:25<10:37,  3.08it/s]                                                        85%|████████▍ | 10814/12776 [1:53:25<10:37,  3.08it/s] 85%|████████▍ | 10815/12776 [1:53:25<10:01,  3.26it/s]                                                        85%|████████▍ | 10815/12776 [1:53:25<10:01,  3.26it/s] 85%|████████▍ | 10816/12776 [1:53:26<10:11,  3.21it/s]                                                        85%|████████▍ | 10816/12776 [1:53:26<10:11,  3.21it/s] 85%|████████▍ | 10817/12776 [1:53:26<09:35,  3.40it/s]                                                        85%|████████▍ | 10817/12776 [1:53:26<09:35,  3.40it/s] 85%|████████▍ | 10818/12776 [1:53:26<09:06,  3.58it/s]                                                        85%|████████▍ | 10818/12776 [1:53:26<09:06,  3.58it/s] 85%|████████▍ | 10819/12776 [1:53:26<08:37,  3.78it/s]                                                        85%|████████▍ | 10819/12776 [1:53:26<08:37,  3.78it/s] 85%|████████▍ | 10820/12776 [1:53:27<08:14,  3.96it/s]                                                        85%|████████▍ | 10820/12776 [1:53:27<08:14,  3.96it/s] 85%|████████▍ | 10821/12776 [1:53:27<08:36,  3.79it/s]                                                        85%|████████▍ | 10821/12776 [1:53:27<08:36,  3.79it/s] 85%|████████▍ | 10822/12776 [1:53:27<08:08,  4.00it/s]                                                        85%|████████▍ | 10822/12776 [1:53:27<08:08,  4.00it/s] 85%|████████▍ | 10823/12776 [1:53:27<07:44,  4.20it/s]                                                        85%|████████▍ | 10823/12776 [1:53:27<07:44,  4.20it/s] 85%|████████▍ | 10824/12776 [1:53:27<07:26,  4.37it/s]                                                        85%|████████▍ | 10824/12776 [1:53:27<07:26,  4.37it/s] 85%|████████▍ | 10825/12776 [1:53:28<07:10,  4.53it/s]                                                        85%|████████▍ | 10825/12776 [1:53:28<07:10,  4.53it/s] 85%|████████▍ | 10826/12776 [1:53:28<07:54,  4.11it/s]                                                        85%|████████▍ | 10826/12776 [1:53:28<07:54,  4.11it/s] 85%|████████▍ | 10827/12776 [1:53:28<07:26,  4.37it/s]                                                        85%|████████▍ | 10827/12776 [1:53:28<07:26,  4.37it/s] 85%|████████▍ | 10828/12776 [1:53:28<07:06,  4.57it/s]                                                        85%|████████▍ | 10828/12776 [1:53:28<07:06,  4.57it/s] 85%|████████▍ | 10829/12776 [1:53:29<06:49,  4.76it/s]                                                        85%|████████▍ | 10829/12776 [1:53:29<06:49,  4.76it/s] 85%|████████▍ | 10830/12776 [1:53:29<06:34,  4.94it/s]                                                        85%|████████▍ | 10830/12776 [1:53:29<06:34,  4.94it/s] 85%|████████▍ | 10831/12776 [1:53:29<07:00,  4.62it/s]                                                        85%|████████▍ | 10831/12776 [1:53:29<07:00,  4.62it/s] 85%|████████▍ | 10832/12776 [1:53:29<06:38,  4.87it/s]                                                        85%|████████▍ | 10832/12776 [1:53:29<06:38,  4.87it/s] 85%|████████▍ | 10833/12776 [1:53:29<06:25,  5.04it/s]                                                        85%|████████▍ | 10833/12776 [1:53:29<06:25,  5.04it/s] 85%|████████▍ | 10834/12776 [1:53:30<06:13,  5.19it/s]                                                        85%|████████▍ | 10834/12776 [1:53:30<06:13,  5.19it/s] 85%|████████▍ | 10835/12776 [1:53:30<06:02,  5.36it/s]                                                        85%|████████▍ | 10835/12776 [1:53:30<06:02,  5.36it/s] 85%|████████▍ | 10836/12776 [1:53:30<05:50,  5.53it/s]                                                        85%|████████▍ | 10836/12776 [1:53:30<05:50,  5.53it/s] 85%|████████▍ | 10837/12776 [1:53:30<06:34,  4.92it/s]                                                        85%|████████▍ | 10837/12776 [1:53:30<06:34,  4.92it/s] 85%|████████▍ | 10838/12776 [1:53:31<11:15,  2.87it/s]                                                        85%|████████▍ | 10838/12776 [1:53:31<11:15,  2.87it/s] 85%|████████▍ | 10839/12776 [1:53:32<20:49,  1.55it/s]                                                        85%|████████▍ | 10839/12776 [1:53:32<20:49,  1.55it/s] 85%|████████▍ | 10840/12776 [1:53:33<23:49,  1.35it/s]                                                        85%|████████▍ | 10840/12776 [1:53:33<23:49,  1.35it/s] 85%|████████▍ | 10841/12776 [1:53:34<25:27,  1.27it/s]                                                        85%|████████▍ | 10841/12776 [1:53:34<25:27,  1.27it/s] 85%|████████▍ | 10842/12776 [1:53:35<25:38,  1.26it/s]                                                        85%|████████▍ | 10842/12776 [1:53:35<25:38,  1.26it/s] 85%|████████▍ | 10843/12776 [1:53:36<24:44,  1.30it/s]                                                        85%|████████▍ | 10843/12776 [1:53:36<24:44,  1.30it/s] 85%|████████▍ | 10844/12776 [1:53:36<23:44,  1.36it/s]                                                        85%|████████▍ | 10844/12776 [1:53:36<23:44,  1.36it/s] 85%|████████▍ | 10845/12776 [1:53:37<23:53,  1.35it/s]                                                        85%|████████▍ | 10845/12776 [1:53:37<23:53,  1.35it/s] 85%|████████▍ | 10846/12776 [1:53:38<22:32,  1.43it/s]                                                        85%|████████▍ | 10846/12776 [1:53:38<22:32,  1.43it/s] 85%|████████▍ | 10847/12776 [1:53:38<21:24,  1.50it/s]                                                        85%|████████▍ | 10847/12776 [1:53:38<21:24,  1.50it/s] 85%|████████▍ | 10848/12776 [1:53:39<20:12,  1.59it/s]                                                        85%|████████▍ | 10848/12776 [1:53:39<20:12,  1.59it/s] 85%|████████▍ | 10849/12776 [1:53:39<19:32,  1.64it/s]                                                        85%|████████▍ | 10849/12776 [1:53:39<19:32,  1.64it/s] 85%|████████▍ | 10850/12776 [1:53:40<18:26,  1.74it/s]                                                        85%|████████▍ | 10850/12776 [1:53:40<18:26,  1.74it/s] 85%|████████▍ | 10851/12776 [1:53:40<17:26,  1.84it/s]                                                        85%|████████▍ | 10851/12776 [1:53:40<17:26,  1.84it/s] 85%|████████▍ | 10852/12776 [1:53:41<16:28,  1.95it/s]                                                        85%|████████▍ | 10852/12776 [1:53:41<16:28,  1.95it/s] 85%|████████▍ | 10853/12776 [1:53:41<15:32,  2.06it/s]                                                        85%|████████▍ | 10853/12776 [1:53:41<15:32,  2.06it/s] 85%|████████▍ | 10854/12776 [1:53:42<15:04,  2.12it/s]                                                        85%|████████▍ | 10854/12776 [1:53:42<15:04,  2.12it/s] 85%|████████▍ | 10855/12776 [1:53:42<14:15,  2.25it/s]                                                        85%|████████▍ | 10855/12776 [1:53:42<14:15,  2.25it/s] 85%|████████▍ | 10856/12776 [1:53:42<13:34,  2.36it/s]                                                        85%|████████▍ | 10856/12776 [1:53:42<13:34,  2.36it/s] 85%|████████▍ | 10857/12776 [1:53:43<13:26,  2.38it/s]                                                       {'loss': 0.7879, 'grad_norm': 3.557894706726074, 'learning_rate': 4.9144672531769304e-05, 'epoch': 1.69}
+{'loss': 0.7324, 'grad_norm': 3.2021689414978027, 'learning_rate': 4.912023460410556e-05, 'epoch': 1.69}
+{'loss': 0.7799, 'grad_norm': 3.014575242996216, 'learning_rate': 4.9095796676441834e-05, 'epoch': 1.69}
+{'loss': 1.1459, 'grad_norm': 3.13476824760437, 'learning_rate': 4.9071358748778105e-05, 'epoch': 1.69}
+{'loss': 1.0614, 'grad_norm': 2.8570799827575684, 'learning_rate': 4.9046920821114363e-05, 'epoch': 1.69}
+{'loss': 0.5261, 'grad_norm': 1.858405351638794, 'learning_rate': 4.9022482893450635e-05, 'epoch': 1.69}
+{'loss': 0.8285, 'grad_norm': 2.642915964126587, 'learning_rate': 4.89980449657869e-05, 'epoch': 1.69}
+{'loss': 0.6748, 'grad_norm': 6.898956775665283, 'learning_rate': 4.897360703812316e-05, 'epoch': 1.69}
+{'loss': 0.7534, 'grad_norm': 2.4896328449249268, 'learning_rate': 4.894916911045943e-05, 'epoch': 1.69}
+{'loss': 1.3809, 'grad_norm': 7.32418155670166, 'learning_rate': 4.89247311827957e-05, 'epoch': 1.69}
+{'loss': 0.2564, 'grad_norm': 1.3984144926071167, 'learning_rate': 4.890029325513196e-05, 'epoch': 1.69}
+{'loss': 0.2788, 'grad_norm': 0.826632559299469, 'learning_rate': 4.887585532746823e-05, 'epoch': 1.69}
+{'loss': 0.1724, 'grad_norm': 0.395866721868515, 'learning_rate': 4.8851417399804495e-05, 'epoch': 1.69}
+{'loss': 0.2389, 'grad_norm': 0.7708562016487122, 'learning_rate': 4.8826979472140753e-05, 'epoch': 1.69}
+{'loss': 0.2478, 'grad_norm': 0.6404639482498169, 'learning_rate': 4.8802541544477025e-05, 'epoch': 1.69}
+{'loss': 0.3287, 'grad_norm': 0.717633068561554, 'learning_rate': 4.87781036168133e-05, 'epoch': 1.69}
+{'loss': 0.3062, 'grad_norm': 1.3549281358718872, 'learning_rate': 4.8753665689149555e-05, 'epoch': 1.69}
+{'loss': 0.3737, 'grad_norm': 1.1032696962356567, 'learning_rate': 4.8729227761485826e-05, 'epoch': 1.69}
+{'loss': 0.3077, 'grad_norm': 1.0608580112457275, 'learning_rate': 4.870478983382209e-05, 'epoch': 1.69}
+{'loss': 0.3382, 'grad_norm': 1.941976547241211, 'learning_rate': 4.868035190615835e-05, 'epoch': 1.69}
+{'loss': 0.3143, 'grad_norm': 1.254552960395813, 'learning_rate': 4.865591397849462e-05, 'epoch': 1.69}
+{'loss': 0.2226, 'grad_norm': 0.9301863312721252, 'learning_rate': 4.863147605083089e-05, 'epoch': 1.69}
+{'loss': 0.4288, 'grad_norm': 0.8874600529670715, 'learning_rate': 4.860703812316715e-05, 'epoch': 1.69}
+{'loss': 0.3001, 'grad_norm': 1.8583930730819702, 'learning_rate': 4.8582600195503415e-05, 'epoch': 1.69}
+{'loss': 0.2913, 'grad_norm': 1.5552978515625, 'learning_rate': 4.8558162267839687e-05, 'epoch': 1.69}
+{'loss': 0.2821, 'grad_norm': 0.9442487359046936, 'learning_rate': 4.8533724340175945e-05, 'epoch': 1.69}
+{'loss': 0.4196, 'grad_norm': 1.6942837238311768, 'learning_rate': 4.8509286412512216e-05, 'epoch': 1.69}
+{'loss': 0.3276, 'grad_norm': 1.1128238439559937, 'learning_rate': 4.848484848484849e-05, 'epoch': 1.69}
+{'loss': 0.3271, 'grad_norm': 1.4044594764709473, 'learning_rate': 4.8460410557184746e-05, 'epoch': 1.69}
+{'loss': 0.5816, 'grad_norm': 2.1907525062561035, 'learning_rate': 4.843597262952101e-05, 'epoch': 1.69}
+{'loss': 0.3358, 'grad_norm': 1.5539323091506958, 'learning_rate': 4.8411534701857275e-05, 'epoch': 1.69}
+{'loss': 0.53, 'grad_norm': 2.5822982788085938, 'learning_rate': 4.838709677419354e-05, 'epoch': 1.69}
+{'loss': 0.8552, 'grad_norm': 4.984518527984619, 'learning_rate': 4.836265884652981e-05, 'epoch': 1.69}
+{'loss': 0.5728, 'grad_norm': 1.6038222312927246, 'learning_rate': 4.833822091886607e-05, 'epoch': 1.69}
+{'loss': 0.6206, 'grad_norm': 2.4173617362976074, 'learning_rate': 4.831378299120234e-05, 'epoch': 1.69}
+{'loss': 0.6327, 'grad_norm': 3.724741220474243, 'learning_rate': 4.8289345063538606e-05, 'epoch': 1.69}
+{'loss': 0.6322, 'grad_norm': 1.6614710092544556, 'learning_rate': 4.826490713587487e-05, 'epoch': 1.69}
+{'loss': 0.6787, 'grad_norm': 2.336488723754883, 'learning_rate': 4.8240469208211136e-05, 'epoch': 1.69}
+{'loss': 0.5463, 'grad_norm': 1.9805657863616943, 'learning_rate': 4.821603128054741e-05, 'epoch': 1.69}
+{'loss': 0.2454, 'grad_norm': 0.8958491086959839, 'learning_rate': 4.8191593352883665e-05, 'epoch': 1.69}
+{'loss': 0.7812, 'grad_norm': 4.5509562492370605, 'learning_rate': 4.816715542521994e-05, 'epoch': 1.69}
+{'loss': 0.7259, 'grad_norm': 2.46221923828125, 'learning_rate': 4.81427174975562e-05, 'epoch': 1.69}
+{'loss': 0.6677, 'grad_norm': 2.2861831188201904, 'learning_rate': 4.8118279569892467e-05, 'epoch': 1.69}
+{'loss': 0.4636, 'grad_norm': 2.7029225826263428, 'learning_rate': 4.809384164222873e-05, 'epoch': 1.69}
+{'loss': 0.5958, 'grad_norm': 1.2040103673934937, 'learning_rate': 4.8069403714565e-05, 'epoch': 1.69}
+{'loss': 0.7894, 'grad_norm': 5.391923904418945, 'learning_rate': 4.804496578690126e-05, 'epoch': 1.69}
+{'loss': 1.516, 'grad_norm': 4.4691972732543945, 'learning_rate': 4.802052785923753e-05, 'epoch': 1.69}
+{'loss': 1.0442, 'grad_norm': 2.4495632648468018, 'learning_rate': 4.79960899315738e-05, 'epoch': 1.69}
+{'loss': 0.9353, 'grad_norm': 6.339310169219971, 'learning_rate': 4.797165200391006e-05, 'epoch': 1.69}
+{'loss': 1.0026, 'grad_norm': 3.8489506244659424, 'learning_rate': 4.794721407624633e-05, 'epoch': 1.7}
+{'loss': 0.6062, 'grad_norm': 2.871527671813965, 'learning_rate': 4.79227761485826e-05, 'epoch': 1.7}
+{'loss': 0.4849, 'grad_norm': 1.528554081916809, 'learning_rate': 4.7898338220918857e-05, 'epoch': 1.7}
+{'loss': 1.0743, 'grad_norm': 4.001394748687744, 'learning_rate': 4.787390029325513e-05, 'epoch': 1.7}
+{'loss': 1.147, 'grad_norm': 3.781374454498291, 'learning_rate': 4.784946236559139e-05, 'epoch': 1.7}
+{'loss': 0.9814, 'grad_norm': 4.306796550750732, 'learning_rate': 4.782502443792766e-05, 'epoch': 1.7}
+{'loss': 0.9608, 'grad_norm': 2.652988910675049, 'learning_rate': 4.780058651026392e-05, 'epoch': 1.7}
+{'loss': 0.694, 'grad_norm': 2.375042676925659, 'learning_rate': 4.7776148582600194e-05, 'epoch': 1.7}
+{'loss': 0.4705, 'grad_norm': 1.6284605264663696, 'learning_rate': 4.775171065493645e-05, 'epoch': 1.7}
+{'loss': 0.7331, 'grad_norm': 2.3897979259490967, 'learning_rate': 4.7727272727272724e-05, 'epoch': 1.7}
+{'loss': 0.6741, 'grad_norm': 1.4052478075027466, 'learning_rate': 4.770283479960899e-05, 'epoch': 1.7}
+{'loss': 0.2161, 'grad_norm': 0.5717368125915527, 'learning_rate': 4.767839687194525e-05, 'epoch': 1.7}
+{'loss': 0.2037, 'grad_norm': 0.46546775102615356, 'learning_rate': 4.765395894428152e-05, 'epoch': 1.7}
+{'loss': 0.2561, 'grad_norm': 0.6907462477684021, 'learning_rate': 4.762952101661779e-05, 'epoch': 1.7}
+{'loss': 0.2443, 'grad_norm': 0.85224848985672, 'learning_rate': 4.760508308895405e-05, 'epoch': 1.7}
+{'loss': 0.1929, 'grad_norm': 0.8416429162025452, 'learning_rate': 4.758064516129032e-05, 'epoch': 1.7}
+{'loss': 0.2167, 'grad_norm': 0.6139851808547974, 'learning_rate': 4.7556207233626584e-05, 'epoch': 1.7}
+{'loss': 0.2006, 'grad_norm': 0.6043062210083008, 'learning_rate': 4.753176930596285e-05, 'epoch': 1.7}
+{'loss': 0.1848, 'grad_norm': 0.43999460339546204, 'learning_rate': 4.7507331378299114e-05, 'epoch': 1.7}
+{'loss': 0.2923, 'grad_norm': 2.4954354763031006, 'learning_rate': 4.7482893450635385e-05, 'epoch': 1.7}
+{'loss': 0.365, 'grad_norm': 1.6755168437957764, 'learning_rate': 4.745845552297164e-05, 'epoch': 1.7}
+{'loss': 0.1334, 'grad_norm': 0.7627212405204773, 'learning_rate': 4.7434017595307915e-05, 'epoch': 1.7}
+{'loss': 0.206, 'grad_norm': 0.7243126034736633, 'learning_rate': 4.740957966764418e-05, 'epoch': 1.7}
+{'loss': 0.3525, 'grad_norm': 2.446509599685669, 'learning_rate': 4.7385141739980444e-05, 'epoch': 1.7}
+{'loss': 0.3959, 'grad_norm': 2.082062005996704, 'learning_rate': 4.736070381231671e-05, 'epoch': 1.7}
+{'loss': 0.36, 'grad_norm': 1.239433765411377, 'learning_rate': 4.733626588465298e-05, 'epoch': 1.7}
+{'loss': 0.3264, 'grad_norm': 1.6544456481933594, 'learning_rate': 4.731182795698924e-05, 'epoch': 1.7}
+{'loss': 0.4697, 'grad_norm': 16.526798248291016, 'learning_rate': 4.728739002932551e-05, 'epoch': 1.7}
+{'loss': 0.4604, 'grad_norm': 1.5879408121109009, 'learning_rate': 4.7262952101661775e-05, 'epoch': 1.7}
+ 85%|████████▍ | 10857/12776 [1:53:43<13:26,  2.38it/s] 85%|████████▍ | 10858/12776 [1:53:43<12:45,  2.50it/s]                                                        85%|████████▍ | 10858/12776 [1:53:43<12:45,  2.50it/s] 85%|████████▍ | 10859/12776 [1:53:43<12:06,  2.64it/s]                                                        85%|████████▍ | 10859/12776 [1:53:43<12:06,  2.64it/s] 85%|████████▌ | 10860/12776 [1:53:44<12:32,  2.55it/s]                                                        85%|████████▌ | 10860/12776 [1:53:44<12:32,  2.55it/s] 85%|████████▌ | 10861/12776 [1:53:44<11:42,  2.73it/s]                                                        85%|████████▌ | 10861/12776 [1:53:44<11:42,  2.73it/s] 85%|████████▌ | 10862/12776 [1:53:44<11:05,  2.88it/s]                                                        85%|████████▌ | 10862/12776 [1:53:44<11:05,  2.88it/s] 85%|████████▌ | 10863/12776 [1:53:45<10:27,  3.05it/s]                                                        85%|████████▌ | 10863/12776 [1:53:45<10:27,  3.05it/s] 85%|████████▌ | 10864/12776 [1:53:45<10:57,  2.91it/s]                                                        85%|████████▌ | 10864/12776 [1:53:45<10:57,  2.91it/s] 85%|████████▌ | 10865/12776 [1:53:45<10:14,  3.11it/s]                                                        85%|████████▌ | 10865/12776 [1:53:45<10:14,  3.11it/s] 85%|████████▌ | 10866/12776 [1:53:46<09:39,  3.29it/s]                                                        85%|████████▌ | 10866/12776 [1:53:46<09:39,  3.29it/s] 85%|████████▌ | 10867/12776 [1:53:46<09:10,  3.47it/s]                                                        85%|████████▌ | 10867/12776 [1:53:46<09:10,  3.47it/s] 85%|████████▌ | 10868/12776 [1:53:46<09:44,  3.26it/s]                                                        85%|████████▌ | 10868/12776 [1:53:46<09:44,  3.26it/s] 85%|████████▌ | 10869/12776 [1:53:46<09:01,  3.52it/s]                                                        85%|████████▌ | 10869/12776 [1:53:46<09:01,  3.52it/s] 85%|████████▌ | 10870/12776 [1:53:47<08:30,  3.73it/s]                                                        85%|████████▌ | 10870/12776 [1:53:47<08:30,  3.73it/s] 85%|████████▌ | 10871/12776 [1:53:47<08:04,  3.93it/s]                                                        85%|████████▌ | 10871/12776 [1:53:47<08:04,  3.93it/s] 85%|████████▌ | 10872/12776 [1:53:47<08:43,  3.64it/s]                                                        85%|████████▌ | 10872/12776 [1:53:47<08:43,  3.64it/s] 85%|████████▌ | 10873/12776 [1:53:47<08:05,  3.92it/s]                                                        85%|████████▌ | 10873/12776 [1:53:47<08:05,  3.92it/s] 85%|████████▌ | 10874/12776 [1:53:48<07:33,  4.19it/s]                                                        85%|████████▌ | 10874/12776 [1:53:48<07:33,  4.19it/s] 85%|████████▌ | 10875/12776 [1:53:48<07:10,  4.42it/s]                                                        85%|████████▌ | 10875/12776 [1:53:48<07:10,  4.42it/s] 85%|████████▌ | 10876/12776 [1:53:48<06:50,  4.63it/s]                                                        85%|████████▌ | 10876/12776 [1:53:48<06:50,  4.63it/s] 85%|████████▌ | 10877/12776 [1:53:48<06:56,  4.56it/s]                                                        85%|████████▌ | 10877/12776 [1:53:48<06:56,  4.56it/s] 85%|█���██████▌ | 10878/12776 [1:53:48<07:04,  4.47it/s]                                                        85%|████████▌ | 10878/12776 [1:53:48<07:04,  4.47it/s] 85%|████████▌ | 10879/12776 [1:53:49<06:42,  4.71it/s]                                                        85%|████████▌ | 10879/12776 [1:53:49<06:42,  4.71it/s] 85%|████████▌ | 10880/12776 [1:53:49<06:25,  4.92it/s]                                                        85%|████████▌ | 10880/12776 [1:53:49<06:25,  4.92it/s] 85%|████████▌ | 10881/12776 [1:53:49<06:11,  5.10it/s]                                                        85%|████████▌ | 10881/12776 [1:53:49<06:11,  5.10it/s] 85%|████████▌ | 10882/12776 [1:53:49<05:59,  5.27it/s]                                                        85%|████████▌ | 10882/12776 [1:53:49<05:59,  5.27it/s] 85%|████████▌ | 10883/12776 [1:53:49<06:46,  4.66it/s]                                                        85%|████████▌ | 10883/12776 [1:53:49<06:46,  4.66it/s] 85%|████████▌ | 10884/12776 [1:53:50<06:19,  4.98it/s]                                                        85%|████████▌ | 10884/12776 [1:53:50<06:19,  4.98it/s] 85%|████████▌ | 10885/12776 [1:53:50<05:59,  5.26it/s]                                                        85%|████████▌ | 10885/12776 [1:53:50<05:59,  5.26it/s] 85%|████████▌ | 10886/12776 [1:53:50<05:43,  5.49it/s]                                                        85%|████████▌ | 10886/12776 [1:53:50<05:43,  5.49it/s] 85%|████████▌ | 10887/12776 [1:53:50<05:31,  5.71it/s]                                                        85%|████████▌ | 10887/12776 [1:53:50<05:31,  5.71it/s] 85%|████████▌ | 10888/12776 [1:53:51<10:35,  2.97it/s]                                                        85%|████████▌ | 10888/12776 [1:53:51<10:35,  2.97it/s] 85%|████████▌ | 10889/12776 [1:53:52<21:54,  1.44it/s]                                                        85%|████████▌ | 10889/12776 [1:53:52<21:54,  1.44it/s] 85%|████████▌ | 10890/12776 [1:53:53<24:12,  1.30it/s]                                                        85%|████████▌ | 10890/12776 [1:53:53<24:12,  1.30it/s] 85%|████████▌ | 10891/12776 [1:53:54<24:37,  1.28it/s]                                                        85%|████████▌ | 10891/12776 [1:53:54<24:37,  1.28it/s] 85%|████████▌ | 10892/12776 [1:53:55<24:03,  1.31it/s]                                                        85%|████████▌ | 10892/12776 [1:53:55<24:03,  1.31it/s] 85%|████████▌ | 10893/12776 [1:53:56<24:23,  1.29it/s]                                                        85%|████████▌ | 10893/12776 [1:53:56<24:23,  1.29it/s] 85%|████████▌ | 10894/12776 [1:53:56<24:07,  1.30it/s]                                                        85%|████████▌ | 10894/12776 [1:53:56<24:07,  1.30it/s] 85%|████████▌ | 10895/12776 [1:53:57<22:54,  1.37it/s]                                                        85%|████████▌ | 10895/12776 [1:53:57<22:54,  1.37it/s] 85%|████████▌ | 10896/12776 [1:53:58<23:26,  1.34it/s]                                                        85%|████████▌ | 10896/12776 [1:53:58<23:26,  1.34it/s] 85%|████████▌ | 10897/12776 [1:53:58<21:51,  1.43it/s]                                                        85%|████████▌ | 10897/12776 [1:53:58<21:51,  1.43it/s] 85%|████████▌ | 10898/12776 [1:53:59<20:54,  1.50it/s]                                                        85%|████████▌ | 10898/12776 [1:53:59<20:54,  1.50it/s] 85%|████████▌ | 10899/12776 [1:54:00<19:35,  1.60it/s]                                                        85%|████████▌ | 10899/12776 [1:54:00<19:35,  1.60it/s] 85%|████████▌ | 10900/12776 [1:54:00<19:20,  1.62it/s]                                                        85%|████████▌ | 10900/12776 [1:54:00<19:20,  1.62it/s] 85%|████████▌ | 10901/12776 [1:54:01<17:51,  1.75it/s]                                                        85%|████████▌ | 10901/12776 [1:54:01<17:51,  1.75it/s] 85%|████████▌ | 10902/12776 [1:54:01<17:31,  1.78it/s]                                                        85%|████████▌ | 10902/12776 [1:54:01<17:31,  1.78it/s] 85%|████████▌ | 10903/12776 [1:54:02<16:21,  1.91it/s]                                                        85%|████████▌ | 10903/12776 [1:54:02<16:21,  1.91it/s] 85%|████████▌ | 10904/12776 [1:54:02<16:30,  1.89it/s]                                                        85%|████████▌ | 10904/12776 [1:54:02<16:30,  1.89it/s] 85%|████████▌ | 10905/12776 [1:54:03<15:18,  2.04it/s]                                                        85%|████████▌ | 10905/12776 [1:54:03<15:18,  2.04it/s] 85%|████████▌ | 10906/12776 [1:54:03<14:20,  2.17it/s]                                                        85%|████████▌ | 10906/12776 [1:54:03<14:20,  2.17it/s] 85%|████████▌ | 10907/12776 [1:54:03<14:43,  2.12it/s]                                                        85%|████████▌ | 10907/12776 [1:54:03<14:43,  2.12it/s] 85%|████████▌ | 10908/12776 [1:54:04<13:38,  2.28it/s]                                                        85%|████████▌ | 10908/12776 [1:54:04<13:38,  2.28it/s] 85%|████████▌ | 10909/12776 [1:54:04<12:49,  2.43it/s]                                                        85%|████████▌ | 10909/12776 [1:54:04<12:49,  2.43it/s] 85%|████████▌ | 10910/12776 [1:54:05<12:50,  2.42it/s]                                                        85%|████████▌ | 10910/12776 [1:54:05<12:50,  2.42it/s] 85%|████████▌ | 10911/12776 [1:54:05<12:05,  2.57it/s]                                                        85%|████████▌ | 10911/12776 [1:54:05<12:05,  2.57it/s] 85%|████████▌ | 10912/12776 [1:54:05<11:34,  2.68it/s]                                                        85%|████████▌ | 10912/12776 [1:54:05<11:34,  2.68it/s] 85%|████████▌ | 10913/12776 [1:54:06<12:07,  2.56it/s]                                                        85%|████████▌ | 10913/12776 [1:54:06<12:07,  2.56it/s] 85%|████████▌ | 10914/12776 [1:54:06<11:18,  2.75it/s]                                                        85%|████████▌ | 10914/12776 [1:54:06<11:18,  2.75it/s] 85%|████████▌ | 10915/12776 [1:54:06<10:42,  2.90it/s]                                                        85%|████████▌ | 10915/12776 [1:54:06<10:42,  2.90it/s] 85%|████████▌ | 10916/12776 [1:54:07<11:08,  2.78it/s]                                                        85%|████████▌ | 10916/12776 [1:54:07<11:08,  2.78it/s] 85%|████████▌ | 10917/12776 [1:54:07<10:27,  2.96it/s]                                                        85%|████████▌ | 10917/12776 [1:54:07<10:27,  2.96it/s] 85%|████████▌ | 10918/12776 [1:54:07<09:52,  3.14it/s]                                                        85%|████████▌ | 10918/12776 [1:54:07<09:52,  3.14it/s] 85%|████████▌ | 10919/12776 [1:54:07<09:24,  3.29it/s]                                                        85%|████████▌ | 10919/12776 [1:54:07<09:24,  3.29it/s] 85%|████████▌ | 10920/12776 [1:54:08<09:35,  3.23it/s]                                                        85%|████████▌ | 10920/12776 [1:54:08<09:35,  3.23it/s] 85%|████████▌ | 10921/12776 [1:54:08<09:02,  3.42it/s]                                                        85%|████████▌ | 10921/12776 [1:54:08<09:02,  3.42it/s] 85%|████████▌ | 10922/12776 [1:54:08<08:38,  3.58it/s]                                                        85%|████████▌ | 10922/12776 [1:54:08<08:38,  3.58it/s] 85%|████████▌ | 10923/12776 [1:54:09<08:16,  3.73it/s]                                                        85%|████████▌ | 10923/12776 [1:54:09<08:16,  3.73it/s] 86%|████████▌ | 10924/12776 [1:54:09<08:04,  3.82it/s]                                                        86%|████████▌ | 10924/12776 [1:54:09<08:04,  3.82it/s] 86%|████████▌ | 10925/12776 [1:54:09<08:19,  3.71it/s]                                                        86%|████████▌ | 10925/12776 [1:54:09<08:19,  3.71it/s] 86%|████████▌ | 10926/12776 [1:54:09<07:53,  3.90it/s]                                                        86%|████████▌ | 10926/12776 [1:54:09<07:53,  3.90it/s] 86%|████████▌ | 10927/12776 [1:54:10<07:34,  4.06it/s]                                                        86%|████████▌ | 10927/12776 [1:54:10<07:34,  4.06it/s] 86%|████████▌ | 10928/12776 [1:54:10<07:17,  4.23it/s]                                                        86%|████████▌ | 10928/12776 [1:54:10<07:17,  4.23it/s] 86%|████████▌ | 10929/12776 [1:54:10<07:59,  3.85it/s]                                                        86%|████████▌ | 10929/12776 [1:54:10<07:59,  3.85it/s] 86%|████████▌ | 10930/12776 [1:54:10<07:38,  4.03it/s]                                                        86%|████████▌ | 10930/12776 [1:54:10<07:38,  4.03it/s] 86%|████████▌ | 10931/12776 [1:54:10<07:14,  4.24it/s]                                                        86%|████████▌ | 10931/12776 [1:54:10<07:14,  4.24it/s] 86%|████████▌ | 10932/12776 [1:54:11<06:58,  4.41it/s]                                                        86%|████████▌ | 10932/12776 [1:54:11<06:58,  4.41it/s] 86%|████████▌ | 10933/12776 [1:54:11<06:46,  4.53it/s]                                                        86%|████████▌ | 10933/12776 [1:54:11<06:46,  4.53it/s] 86%|████████▌ | 10934/12776 [1:54:11<07:44,  3.97it/s]                                                        86%|████████▌ | 10934/12776 [1:54:11<07:44,  3.97it/s] 86%|████████▌ | 10935/12776 [1:54:11<07:16,  4.22it/s]                                                       {'loss': 0.2591, 'grad_norm': 1.414926528930664, 'learning_rate': 4.723851417399804e-05, 'epoch': 1.7}
+{'loss': 0.2829, 'grad_norm': 1.0178431272506714, 'learning_rate': 4.7214076246334305e-05, 'epoch': 1.7}
+{'loss': 0.2904, 'grad_norm': 1.5842969417572021, 'learning_rate': 4.7189638318670576e-05, 'epoch': 1.7}
+{'loss': 0.3683, 'grad_norm': 1.6290944814682007, 'learning_rate': 4.7165200391006834e-05, 'epoch': 1.7}
+{'loss': 0.4134, 'grad_norm': 1.115478754043579, 'learning_rate': 4.7140762463343106e-05, 'epoch': 1.7}
+{'loss': 0.3749, 'grad_norm': 1.0377562046051025, 'learning_rate': 4.711632453567937e-05, 'epoch': 1.7}
+{'loss': 0.6543, 'grad_norm': 3.567615032196045, 'learning_rate': 4.7091886608015636e-05, 'epoch': 1.7}
+{'loss': 0.4259, 'grad_norm': 1.049473524093628, 'learning_rate': 4.70674486803519e-05, 'epoch': 1.7}
+{'loss': 0.2459, 'grad_norm': 0.785376250743866, 'learning_rate': 4.704301075268817e-05, 'epoch': 1.7}
+{'loss': 0.7491, 'grad_norm': 3.4430458545684814, 'learning_rate': 4.701857282502443e-05, 'epoch': 1.7}
+{'loss': 0.4102, 'grad_norm': 0.9417991638183594, 'learning_rate': 4.69941348973607e-05, 'epoch': 1.7}
+{'loss': 0.6608, 'grad_norm': 1.8139441013336182, 'learning_rate': 4.6969696969696966e-05, 'epoch': 1.7}
+{'loss': 0.6622, 'grad_norm': 6.10275411605835, 'learning_rate': 4.694525904203323e-05, 'epoch': 1.7}
+{'loss': 0.5263, 'grad_norm': 1.916246771812439, 'learning_rate': 4.6920821114369496e-05, 'epoch': 1.7}
+{'loss': 0.7636, 'grad_norm': 2.4708452224731445, 'learning_rate': 4.689638318670577e-05, 'epoch': 1.7}
+{'loss': 0.3469, 'grad_norm': 1.3258744478225708, 'learning_rate': 4.6871945259042026e-05, 'epoch': 1.7}
+{'loss': 0.7012, 'grad_norm': 1.8512464761734009, 'learning_rate': 4.68475073313783e-05, 'epoch': 1.7}
+{'loss': 1.1852, 'grad_norm': 3.0667710304260254, 'learning_rate': 4.682306940371456e-05, 'epoch': 1.7}
+{'loss': 0.9591, 'grad_norm': 7.509494781494141, 'learning_rate': 4.679863147605083e-05, 'epoch': 1.7}
+{'loss': 1.3427, 'grad_norm': 2.958660364151001, 'learning_rate': 4.677419354838709e-05, 'epoch': 1.7}
+{'loss': 0.9676, 'grad_norm': 2.816685914993286, 'learning_rate': 4.674975562072336e-05, 'epoch': 1.7}
+{'loss': 0.7556, 'grad_norm': 3.361295700073242, 'learning_rate': 4.672531769305962e-05, 'epoch': 1.7}
+{'loss': 1.2321, 'grad_norm': 1.7162786722183228, 'learning_rate': 4.670087976539589e-05, 'epoch': 1.7}
+{'loss': 0.9282, 'grad_norm': 9.585728645324707, 'learning_rate': 4.667644183773216e-05, 'epoch': 1.7}
+{'loss': 0.8398, 'grad_norm': 1.799238681793213, 'learning_rate': 4.665200391006842e-05, 'epoch': 1.7}
+{'loss': 1.5273, 'grad_norm': 5.488026142120361, 'learning_rate': 4.662756598240469e-05, 'epoch': 1.7}
+{'loss': 0.9112, 'grad_norm': 3.029956817626953, 'learning_rate': 4.660312805474096e-05, 'epoch': 1.7}
+{'loss': 1.0213, 'grad_norm': 3.336357831954956, 'learning_rate': 4.657869012707722e-05, 'epoch': 1.7}
+{'loss': 0.6799, 'grad_norm': 1.5135464668273926, 'learning_rate': 4.655425219941349e-05, 'epoch': 1.7}
+{'loss': 0.7304, 'grad_norm': 2.607713460922241, 'learning_rate': 4.652981427174975e-05, 'epoch': 1.7}
+{'loss': 1.1594, 'grad_norm': 4.478157043457031, 'learning_rate': 4.650537634408602e-05, 'epoch': 1.7}
+{'loss': 0.6976, 'grad_norm': 1.5213872194290161, 'learning_rate': 4.648093841642228e-05, 'epoch': 1.7}
+{'loss': 0.3431, 'grad_norm': 1.294400930404663, 'learning_rate': 4.6456500488758554e-05, 'epoch': 1.7}
+{'loss': 0.2044, 'grad_norm': 0.9387235641479492, 'learning_rate': 4.643206256109481e-05, 'epoch': 1.7}
+{'loss': 0.2973, 'grad_norm': 0.8334453105926514, 'learning_rate': 4.6407624633431084e-05, 'epoch': 1.7}
+{'loss': 0.3179, 'grad_norm': 0.7536923289299011, 'learning_rate': 4.638318670576735e-05, 'epoch': 1.71}
+{'loss': 0.1864, 'grad_norm': 1.139009714126587, 'learning_rate': 4.6358748778103614e-05, 'epoch': 1.71}
+{'loss': 0.1953, 'grad_norm': 0.825655996799469, 'learning_rate': 4.633431085043988e-05, 'epoch': 1.71}
+{'loss': 0.2487, 'grad_norm': 1.221582055091858, 'learning_rate': 4.630987292277615e-05, 'epoch': 1.71}
+{'loss': 0.2307, 'grad_norm': 0.9275577068328857, 'learning_rate': 4.628543499511241e-05, 'epoch': 1.71}
+{'loss': 0.3036, 'grad_norm': 0.8654758930206299, 'learning_rate': 4.626099706744868e-05, 'epoch': 1.71}
+{'loss': 0.2112, 'grad_norm': 5.584901809692383, 'learning_rate': 4.6236559139784944e-05, 'epoch': 1.71}
+{'loss': 0.257, 'grad_norm': 1.3160961866378784, 'learning_rate': 4.621212121212121e-05, 'epoch': 1.71}
+{'loss': 0.1725, 'grad_norm': 0.8160209655761719, 'learning_rate': 4.6187683284457474e-05, 'epoch': 1.71}
+{'loss': 0.3657, 'grad_norm': 1.1387146711349487, 'learning_rate': 4.6163245356793745e-05, 'epoch': 1.71}
+{'loss': 0.5964, 'grad_norm': 2.705634117126465, 'learning_rate': 4.6138807429130004e-05, 'epoch': 1.71}
+{'loss': 0.3722, 'grad_norm': 1.0726594924926758, 'learning_rate': 4.6114369501466275e-05, 'epoch': 1.71}
+{'loss': 0.6121, 'grad_norm': 2.15543794631958, 'learning_rate': 4.608993157380254e-05, 'epoch': 1.71}
+{'loss': 0.3009, 'grad_norm': 2.3426249027252197, 'learning_rate': 4.6065493646138805e-05, 'epoch': 1.71}
+{'loss': 0.366, 'grad_norm': 1.6663450002670288, 'learning_rate': 4.604105571847507e-05, 'epoch': 1.71}
+{'loss': 0.392, 'grad_norm': 4.950168609619141, 'learning_rate': 4.601661779081134e-05, 'epoch': 1.71}
+{'loss': 0.3112, 'grad_norm': 2.3819642066955566, 'learning_rate': 4.59921798631476e-05, 'epoch': 1.71}
+{'loss': 0.4806, 'grad_norm': 1.7315618991851807, 'learning_rate': 4.596774193548387e-05, 'epoch': 1.71}
+{'loss': 0.4802, 'grad_norm': 1.356067419052124, 'learning_rate': 4.5943304007820135e-05, 'epoch': 1.71}
+{'loss': 0.282, 'grad_norm': 1.5837355852127075, 'learning_rate': 4.5918866080156393e-05, 'epoch': 1.71}
+{'loss': 0.3698, 'grad_norm': 1.9646797180175781, 'learning_rate': 4.5894428152492665e-05, 'epoch': 1.71}
+{'loss': 0.7863, 'grad_norm': 3.7682149410247803, 'learning_rate': 4.586999022482894e-05, 'epoch': 1.71}
+{'loss': 0.5196, 'grad_norm': 2.856004476547241, 'learning_rate': 4.5845552297165195e-05, 'epoch': 1.71}
+{'loss': 0.6147, 'grad_norm': 4.119771957397461, 'learning_rate': 4.5821114369501466e-05, 'epoch': 1.71}
+{'loss': 0.5624, 'grad_norm': 3.8047308921813965, 'learning_rate': 4.579667644183773e-05, 'epoch': 1.71}
+{'loss': 1.135, 'grad_norm': 1.4393452405929565, 'learning_rate': 4.577223851417399e-05, 'epoch': 1.71}
+{'loss': 0.7294, 'grad_norm': 2.7042551040649414, 'learning_rate': 4.574780058651026e-05, 'epoch': 1.71}
+{'loss': 0.2943, 'grad_norm': 1.8399097919464111, 'learning_rate': 4.572336265884653e-05, 'epoch': 1.71}
+{'loss': 0.5965, 'grad_norm': 3.5511062145233154, 'learning_rate': 4.569892473118279e-05, 'epoch': 1.71}
+{'loss': 0.2229, 'grad_norm': 1.2243882417678833, 'learning_rate': 4.567448680351906e-05, 'epoch': 1.71}
+{'loss': 0.9775, 'grad_norm': 2.0039992332458496, 'learning_rate': 4.565004887585533e-05, 'epoch': 1.71}
+{'loss': 0.749, 'grad_norm': 2.668609857559204, 'learning_rate': 4.5625610948191585e-05, 'epoch': 1.71}
+{'loss': 0.6062, 'grad_norm': 3.119973659515381, 'learning_rate': 4.5601173020527856e-05, 'epoch': 1.71}
+{'loss': 1.0235, 'grad_norm': 4.085299491882324, 'learning_rate': 4.557673509286413e-05, 'epoch': 1.71}
+{'loss': 0.746, 'grad_norm': 1.9524751901626587, 'learning_rate': 4.5552297165200386e-05, 'epoch': 1.71}
+{'loss': 0.8188, 'grad_norm': 2.2046310901641846, 'learning_rate': 4.552785923753666e-05, 'epoch': 1.71}
+{'loss': 1.0935, 'grad_norm': 3.634552001953125, 'learning_rate': 4.550342130987292e-05, 'epoch': 1.71}
+{'loss': 1.4068, 'grad_norm': 4.985217571258545, 'learning_rate': 4.547898338220918e-05, 'epoch': 1.71}
+{'loss': 0.9341, 'grad_norm': 4.0953288078308105, 'learning_rate': 4.545454545454545e-05, 'epoch': 1.71}
+{'loss': 0.7642, 'grad_norm': 2.7812812328338623, 'learning_rate': 4.543010752688172e-05, 'epoch': 1.71}
+{'loss': 1.6983, 'grad_norm': 2.699666738510132, 'learning_rate': 4.540566959921798e-05, 'epoch': 1.71}
+{'loss': 1.0511, 'grad_norm': 2.4366767406463623, 'learning_rate': 4.5381231671554246e-05, 'epoch': 1.71}
+{'loss': 0.9385, 'grad_norm': 1.5458954572677612, 'learning_rate': 4.535679374389052e-05, 'epoch': 1.71}
+ 86%|████████▌ | 10935/12776 [1:54:11<07:16,  4.22it/s] 86%|████████▌ | 10936/12776 [1:54:12<06:57,  4.40it/s]                                                        86%|████████▌ | 10936/12776 [1:54:12<06:57,  4.40it/s] 86%|████████▌ | 10937/12776 [1:54:12<06:36,  4.64it/s]                                                        86%|████████▌ | 10937/12776 [1:54:12<06:36,  4.64it/s] 86%|████████▌ | 10938/12776 [1:54:12<10:57,  2.80it/s]                                                        86%|████████▌ | 10938/12776 [1:54:12<10:57,  2.80it/s] 86%|████████▌ | 10939/12776 [1:54:14<20:22,  1.50it/s]                                                        86%|████████▌ | 10939/12776 [1:54:14<20:22,  1.50it/s] 86%|████████▌ | 10940/12776 [1:54:15<22:40,  1.35it/s]                                                        86%|████████▌ | 10940/12776 [1:54:15<22:40,  1.35it/s] 86%|████████▌ | 10941/12776 [1:54:16<23:36,  1.29it/s]                                                        86%|████████▌ | 10941/12776 [1:54:16<23:36,  1.29it/s] 86%|████████▌ | 10942/12776 [1:54:16<23:41,  1.29it/s]                                                        86%|████████▌ | 10942/12776 [1:54:16<23:41,  1.29it/s] 86%|████████▌ | 10943/12776 [1:54:17<24:46,  1.23it/s]                                                        86%|████████▌ | 10943/12776 [1:54:17<24:46,  1.23it/s] 86%|████████▌ | 10944/12776 [1:54:18<23:26,  1.30it/s]                                                        86%|████████▌ | 10944/12776 [1:54:18<23:26,  1.30it/s] 86%|████████▌ | 10945/12776 [1:54:19<22:15,  1.37it/s]                                                        86%|████████▌ | 10945/12776 [1:54:19<22:15,  1.37it/s] 86%|████████▌ | 10946/12776 [1:54:19<21:52,  1.39it/s]                                                        86%|████████▌ | 10946/12776 [1:54:19<21:52,  1.39it/s] 86%|████████▌ | 10947/12776 [1:54:20<20:34,  1.48it/s]                                                        86%|████████▌ | 10947/12776 [1:54:20<20:34,  1.48it/s] 86%|████████▌ | 10948/12776 [1:54:20<19:36,  1.55it/s]                                                        86%|████████▌ | 10948/12776 [1:54:20<19:36,  1.55it/s] 86%|████████▌ | 10949/12776 [1:54:21<18:36,  1.64it/s]                                                        86%|████████▌ | 10949/12776 [1:54:21<18:36,  1.64it/s] 86%|████████▌ | 10950/12776 [1:54:22<18:29,  1.65it/s]                                                        86%|████████▌ | 10950/12776 [1:54:22<18:29,  1.65it/s] 86%|████████▌ | 10951/12776 [1:54:22<17:22,  1.75it/s]                                                        86%|████████▌ | 10951/12776 [1:54:22<17:22,  1.75it/s] 86%|████████▌ | 10952/12776 [1:54:23<16:17,  1.87it/s]                                                        86%|████████▌ | 10952/12776 [1:54:23<16:17,  1.87it/s] 86%|████████▌ | 10953/12776 [1:54:23<15:48,  1.92it/s]                                                        86%|████████▌ | 10953/12776 [1:54:23<15:48,  1.92it/s] 86%|████████▌ | 10954/12776 [1:54:23<14:53,  2.04it/s]                                                        86%|████████▌ | 10954/12776 [1:54:23<14:53,  2.04it/s] 86%|████████▌ | 10955/12776 [1:54:24<14:37,  2.08it/s]                                                        86%|████████▌ | 10955/12776 [1:54:24<14:37,  2.08it/s] 86%|████████▌ | 10956/12776 [1:54:24<13:46,  2.20it/s]                                                        86%|████████▌ | 10956/12776 [1:54:24<13:46,  2.20it/s] 86%|████████▌ | 10957/12776 [1:54:25<13:07,  2.31it/s]                                                        86%|████████▌ | 10957/12776 [1:54:25<13:07,  2.31it/s] 86%|████████▌ | 10958/12776 [1:54:25<12:48,  2.37it/s]                                                        86%|████████▌ | 10958/12776 [1:54:25<12:48,  2.37it/s] 86%|████████▌ | 10959/12776 [1:54:25<12:10,  2.49it/s]                                                        86%|████████▌ | 10959/12776 [1:54:25<12:10,  2.49it/s] 86%|████████▌ | 10960/12776 [1:54:26<11:40,  2.59it/s]                                                        86%|████████▌ | 10960/12776 [1:54:26<11:40,  2.59it/s] 86%|████████▌ | 10961/12776 [1:54:26<12:02,  2.51it/s]                                                        86%|████████▌ | 10961/12776 [1:54:26<12:02,  2.51it/s] 86%|████████▌ | 10962/12776 [1:54:27<11:23,  2.65it/s]                                                        86%|████████▌ | 10962/12776 [1:54:27<11:23,  2.65it/s] 86%|████████▌ | 10963/12776 [1:54:27<10:43,  2.82it/s]                                                        86%|████████▌ | 10963/12776 [1:54:27<10:43,  2.82it/s] 86%|████████▌ | 10964/12776 [1:54:27<10:12,  2.96it/s]                                                        86%|████████▌ | 10964/12776 [1:54:27<10:12,  2.96it/s] 86%|████████▌ | 10965/12776 [1:54:27<10:14,  2.95it/s]                                                        86%|████████▌ | 10965/12776 [1:54:27<10:14,  2.95it/s] 86%|████████▌ | 10966/12776 [1:54:28<09:40,  3.12it/s]                                                        86%|████████▌ | 10966/12776 [1:54:28<09:40,  3.12it/s] 86%|████████▌ | 10967/12776 [1:54:28<09:13,  3.27it/s]                                                        86%|████████▌ | 10967/12776 [1:54:28<09:13,  3.27it/s] 86%|████████▌ | 10968/12776 [1:54:28<08:51,  3.40it/s]                                                        86%|████████▌ | 10968/12776 [1:54:28<08:51,  3.40it/s] 86%|████████▌ | 10969/12776 [1:54:29<09:13,  3.26it/s]                                                        86%|████████▌ | 10969/12776 [1:54:29<09:13,  3.26it/s] 86%|████████▌ | 10970/12776 [1:54:29<08:42,  3.45it/s]                                                        86%|████████▌ | 10970/12776 [1:54:29<08:42,  3.45it/s] 86%|████████▌ | 10971/12776 [1:54:29<08:18,  3.62it/s]                                                        86%|████████▌ | 10971/12776 [1:54:29<08:18,  3.62it/s] 86%|████████▌ | 10972/12776 [1:54:29<07:59,  3.76it/s]                                                        86%|████████▌ | 10972/12776 [1:54:29<07:59,  3.76it/s] 86%|████████▌ | 10973/12776 [1:54:30<08:44,  3.44it/s]                                                        86%|████████▌ | 10973/12776 [1:54:30<08:44,  3.44it/s] 86%|████████▌ | 10974/12776 [1:54:30<08:09,  3.68it/s]                                                        86%|████████▌ | 10974/12776 [1:54:30<08:09,  3.68it/s] 86%|████████▌ | 10975/12776 [1:54:30<07:40,  3.91it/s]                                                        86%|████████▌ | 10975/12776 [1:54:30<07:40,  3.91it/s] 86%|████████▌ | 10976/12776 [1:54:30<07:16,  4.12it/s]                                                        86%|████████▌ | 10976/12776 [1:54:30<07:16,  4.12it/s] 86%|████████▌ | 10977/12776 [1:54:31<06:58,  4.30it/s]                                                        86%|████████▌ | 10977/12776 [1:54:31<06:58,  4.30it/s] 86%|████████▌ | 10978/12776 [1:54:31<07:36,  3.94it/s]                                                        86%|████████▌ | 10978/12776 [1:54:31<07:36,  3.94it/s] 86%|████████▌ | 10979/12776 [1:54:31<07:10,  4.17it/s]                                                        86%|████████▌ | 10979/12776 [1:54:31<07:10,  4.17it/s] 86%|████████▌ | 10980/12776 [1:54:31<06:50,  4.37it/s]                                                        86%|████████▌ | 10980/12776 [1:54:31<06:50,  4.37it/s] 86%|████████▌ | 10981/12776 [1:54:31<06:34,  4.55it/s]                                                        86%|████████▌ | 10981/12776 [1:54:31<06:34,  4.55it/s] 86%|████████▌ | 10982/12776 [1:54:32<06:22,  4.69it/s]                                                        86%|████████▌ | 10982/12776 [1:54:32<06:22,  4.69it/s] 86%|████████▌ | 10983/12776 [1:54:32<07:05,  4.22it/s]                                                        86%|████████▌ | 10983/12776 [1:54:32<07:05,  4.22it/s] 86%|████████▌ | 10984/12776 [1:54:32<06:41,  4.47it/s]                                                        86%|████████▌ | 10984/12776 [1:54:32<06:41,  4.47it/s] 86%|████████▌ | 10985/12776 [1:54:32<06:23,  4.67it/s]                                                        86%|████████▌ | 10985/12776 [1:54:32<06:23,  4.67it/s] 86%|████████▌ | 10986/12776 [1:54:33<06:08,  4.86it/s]                                                        86%|████████▌ | 10986/12776 [1:54:33<06:08,  4.86it/s] 86%|████████▌ | 10987/12776 [1:54:33<05:57,  5.00it/s]                                                        86%|████████▌ | 10987/12776 [1:54:33<05:57,  5.00it/s] 86%|████████▌ | 10988/12776 [1:54:33<10:37,  2.80it/s]                                                        86%|████████▌ | 10988/12776 [1:54:33<10:37,  2.80it/s] 86%|████████▌ | 10989/12776 [1:54:35<21:01,  1.42it/s]                                                        86%|████████▌ | 10989/12776 [1:54:35<21:01,  1.42it/s] 86%|████████▌ | 10990/12776 [1:54:36<23:09,  1.29it/s]                                                        86%|████████▌ | 10990/12776 [1:54:36<23:09,  1.29it/s] 86%|████████▌ | 10991/12776 [1:54:37<24:15,  1.23it/s]                                                        86%|████████▌ | 10991/12776 [1:54:37<24:15,  1.23it/s] 86%|████████▌ | 10992/12776 [1:54:38<24:23,  1.22it/s]                                                        86%|████████▌ | 10992/12776 [1:54:38<24:23,  1.22it/s] 86%|████████▌ | 10993/12776 [1:54:38<23:40,  1.26it/s]                                                        86%|████████▌ | 10993/12776 [1:54:38<23:40,  1.26it/s] 86%|████████▌ | 10994/12776 [1:54:39<23:02,  1.29it/s]                                                        86%|████████▌ | 10994/12776 [1:54:39<23:02,  1.29it/s] 86%|████████▌ | 10995/12776 [1:54:40<21:53,  1.36it/s]                                                        86%|████████▌ | 10995/12776 [1:54:40<21:53,  1.36it/s] 86%|████████▌ | 10996/12776 [1:54:40<20:49,  1.42it/s]                                                        86%|████████▌ | 10996/12776 [1:54:40<20:49,  1.42it/s] 86%|████████▌ | 10997/12776 [1:54:41<20:48,  1.42it/s]                                                        86%|████████▌ | 10997/12776 [1:54:41<20:48,  1.42it/s] 86%|████████▌ | 10998/12776 [1:54:42<19:28,  1.52it/s]                                                        86%|████████▌ | 10998/12776 [1:54:42<19:28,  1.52it/s] 86%|████████▌ | 10999/12776 [1:54:42<18:45,  1.58it/s]                                                        86%|████████▌ | 10999/12776 [1:54:42<18:45,  1.58it/s] 86%|████████▌ | 11000/12776 [1:54:43<17:38,  1.68it/s]                                                        86%|████████▌ | 11000/12776 [1:54:43<17:38,  1.68it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.1675, 'grad_norm': 0.7985953688621521, 'learning_rate': 4.5332355816226776e-05, 'epoch': 1.71}
+{'loss': 0.7345, 'grad_norm': 2.136017084121704, 'learning_rate': 4.530791788856305e-05, 'epoch': 1.71}
+{'loss': 0.3092, 'grad_norm': 1.3035306930541992, 'learning_rate': 4.5283479960899305e-05, 'epoch': 1.71}
+{'loss': 1.2992, 'grad_norm': 2.315929412841797, 'learning_rate': 4.525904203323558e-05, 'epoch': 1.71}
+{'loss': 0.2215, 'grad_norm': 0.8758809566497803, 'learning_rate': 4.523460410557184e-05, 'epoch': 1.71}
+{'loss': 0.2701, 'grad_norm': 0.9055795669555664, 'learning_rate': 4.5210166177908107e-05, 'epoch': 1.71}
+{'loss': 0.2285, 'grad_norm': 0.5900222659111023, 'learning_rate': 4.518572825024437e-05, 'epoch': 1.71}
+{'loss': 0.2342, 'grad_norm': 0.6624120473861694, 'learning_rate': 4.516129032258064e-05, 'epoch': 1.71}
+{'loss': 0.258, 'grad_norm': 0.567440927028656, 'learning_rate': 4.51368523949169e-05, 'epoch': 1.71}
+{'loss': 0.1772, 'grad_norm': 0.45192110538482666, 'learning_rate': 4.511241446725317e-05, 'epoch': 1.71}
+{'loss': 0.2843, 'grad_norm': 1.0923038721084595, 'learning_rate': 4.508797653958944e-05, 'epoch': 1.71}
+{'loss': 0.2207, 'grad_norm': 0.739136278629303, 'learning_rate': 4.50635386119257e-05, 'epoch': 1.71}
+{'loss': 0.1736, 'grad_norm': 0.9049036502838135, 'learning_rate': 4.503910068426197e-05, 'epoch': 1.71}
+{'loss': 0.2189, 'grad_norm': 1.2949659824371338, 'learning_rate': 4.501466275659824e-05, 'epoch': 1.71}
+{'loss': 0.2894, 'grad_norm': 0.8264017701148987, 'learning_rate': 4.4990224828934497e-05, 'epoch': 1.71}
+{'loss': 0.2483, 'grad_norm': 0.9105465412139893, 'learning_rate': 4.496578690127077e-05, 'epoch': 1.71}
+{'loss': 0.2536, 'grad_norm': 0.6218265891075134, 'learning_rate': 4.494134897360703e-05, 'epoch': 1.71}
+{'loss': 0.464, 'grad_norm': 1.0677177906036377, 'learning_rate': 4.49169110459433e-05, 'epoch': 1.71}
+{'loss': 0.4199, 'grad_norm': 1.5835673809051514, 'learning_rate': 4.489247311827956e-05, 'epoch': 1.71}
+{'loss': 0.4786, 'grad_norm': 3.1192257404327393, 'learning_rate': 4.4868035190615834e-05, 'epoch': 1.71}
+{'loss': 0.3151, 'grad_norm': 2.4409453868865967, 'learning_rate': 4.484359726295209e-05, 'epoch': 1.71}
+{'loss': 0.4947, 'grad_norm': 1.3552123308181763, 'learning_rate': 4.4819159335288364e-05, 'epoch': 1.72}
+{'loss': 0.6383, 'grad_norm': 1.606695294380188, 'learning_rate': 4.479472140762463e-05, 'epoch': 1.72}
+{'loss': 0.5953, 'grad_norm': 3.5459699630737305, 'learning_rate': 4.477028347996089e-05, 'epoch': 1.72}
+{'loss': 0.5194, 'grad_norm': 1.4051164388656616, 'learning_rate': 4.474584555229716e-05, 'epoch': 1.72}
+{'loss': 0.3916, 'grad_norm': 1.265329122543335, 'learning_rate': 4.472140762463343e-05, 'epoch': 1.72}
+{'loss': 0.5158, 'grad_norm': 2.5221071243286133, 'learning_rate': 4.469696969696969e-05, 'epoch': 1.72}
+{'loss': 0.3095, 'grad_norm': 2.5884549617767334, 'learning_rate': 4.467253176930596e-05, 'epoch': 1.72}
+{'loss': 0.6869, 'grad_norm': 2.0343172550201416, 'learning_rate': 4.4648093841642224e-05, 'epoch': 1.72}
+{'loss': 0.7452, 'grad_norm': 2.2614247798919678, 'learning_rate': 4.462365591397849e-05, 'epoch': 1.72}
+{'loss': 0.4323, 'grad_norm': 1.4859260320663452, 'learning_rate': 4.4599217986314754e-05, 'epoch': 1.72}
+{'loss': 0.6757, 'grad_norm': 2.1878414154052734, 'learning_rate': 4.4574780058651025e-05, 'epoch': 1.72}
+{'loss': 0.6629, 'grad_norm': 2.8746695518493652, 'learning_rate': 4.455034213098728e-05, 'epoch': 1.72}
+{'loss': 0.526, 'grad_norm': 2.4773285388946533, 'learning_rate': 4.4525904203323555e-05, 'epoch': 1.72}
+{'loss': 0.735, 'grad_norm': 4.917586803436279, 'learning_rate': 4.450146627565982e-05, 'epoch': 1.72}
+{'loss': 0.9001, 'grad_norm': 3.6827588081359863, 'learning_rate': 4.4477028347996084e-05, 'epoch': 1.72}
+{'loss': 0.8196, 'grad_norm': 3.3840487003326416, 'learning_rate': 4.445259042033235e-05, 'epoch': 1.72}
+{'loss': 0.8158, 'grad_norm': 3.665562391281128, 'learning_rate': 4.442815249266862e-05, 'epoch': 1.72}
+{'loss': 0.8043, 'grad_norm': 1.8126392364501953, 'learning_rate': 4.440371456500488e-05, 'epoch': 1.72}
+{'loss': 0.9601, 'grad_norm': 2.7795164585113525, 'learning_rate': 4.437927663734115e-05, 'epoch': 1.72}
+{'loss': 0.6358, 'grad_norm': 3.174985647201538, 'learning_rate': 4.4354838709677415e-05, 'epoch': 1.72}
+{'loss': 0.6868, 'grad_norm': 2.6935548782348633, 'learning_rate': 4.433040078201368e-05, 'epoch': 1.72}
+{'loss': 1.1501, 'grad_norm': 3.725315809249878, 'learning_rate': 4.4305962854349945e-05, 'epoch': 1.72}
+{'loss': 1.0729, 'grad_norm': 3.672032356262207, 'learning_rate': 4.4281524926686216e-05, 'epoch': 1.72}
+{'loss': 0.9444, 'grad_norm': 3.4297823905944824, 'learning_rate': 4.4257086999022474e-05, 'epoch': 1.72}
+{'loss': 0.7806, 'grad_norm': 2.7439005374908447, 'learning_rate': 4.4232649071358746e-05, 'epoch': 1.72}
+{'loss': 0.8998, 'grad_norm': 1.5056949853897095, 'learning_rate': 4.420821114369501e-05, 'epoch': 1.72}
+{'loss': 0.5133, 'grad_norm': 4.065816879272461, 'learning_rate': 4.4183773216031276e-05, 'epoch': 1.72}
+{'loss': 1.242, 'grad_norm': 2.949634552001953, 'learning_rate': 4.415933528836754e-05, 'epoch': 1.72}
+{'loss': 0.8277, 'grad_norm': 3.2532920837402344, 'learning_rate': 4.413489736070381e-05, 'epoch': 1.72}
+{'loss': 0.4927, 'grad_norm': 2.431811809539795, 'learning_rate': 4.411045943304007e-05, 'epoch': 1.72}
+{'loss': 0.6917, 'grad_norm': 1.749243140220642, 'learning_rate': 4.408602150537634e-05, 'epoch': 1.72}
+{'loss': 0.5891, 'grad_norm': 3.768820285797119, 'learning_rate': 4.4061583577712606e-05, 'epoch': 1.72}
+{'loss': 0.6472, 'grad_norm': 1.9826332330703735, 'learning_rate': 4.403714565004887e-05, 'epoch': 1.72}
+{'loss': 0.4265, 'grad_norm': 1.206260323524475, 'learning_rate': 4.4012707722385136e-05, 'epoch': 1.72}
+{'loss': 0.1923, 'grad_norm': 0.634954571723938, 'learning_rate': 4.398826979472141e-05, 'epoch': 1.72}
+{'loss': 0.1833, 'grad_norm': 0.9732500910758972, 'learning_rate': 4.3963831867057666e-05, 'epoch': 1.72}
+{'loss': 0.2087, 'grad_norm': 0.934144139289856, 'learning_rate': 4.393939393939394e-05, 'epoch': 1.72}
+{'loss': 0.3764, 'grad_norm': 1.1597187519073486, 'learning_rate': 4.39149560117302e-05, 'epoch': 1.72}
+{'loss': 0.1977, 'grad_norm': 0.7562986612319946, 'learning_rate': 4.389051808406647e-05, 'epoch': 1.72}
+{'loss': 0.3025, 'grad_norm': 0.9375834465026855, 'learning_rate': 4.386608015640273e-05, 'epoch': 1.72}
+{'loss': 0.2857, 'grad_norm': 0.608613908290863, 'learning_rate': 4.3841642228739e-05, 'epoch': 1.72}
+{'loss': 0.2124, 'grad_norm': 0.8662186861038208, 'learning_rate': 4.381720430107526e-05, 'epoch': 1.72}
+{'loss': 0.25, 'grad_norm': 0.936297595500946, 'learning_rate': 4.379276637341153e-05, 'epoch': 1.72}
+{'loss': 0.3277, 'grad_norm': 3.5861268043518066, 'learning_rate': 4.37683284457478e-05, 'epoch': 1.72}
+{'loss': 0.3815, 'grad_norm': 1.0144635438919067, 'learning_rate': 4.374389051808406e-05, 'epoch': 1.72}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:45,  4.66it/s][A
+  0%|          | 3/774 [00:00<03:13,  3.99it/s][A
+  1%|          | 4/774 [00:01<03:30,  3.65it/s][A
+  1%|          | 5/774 [00:01<03:25,  3.74it/s][A
+  1%|          | 6/774 [00:01<03:35,  3.56it/s][A
+  1%|          | 7/774 [00:01<03:32,  3.62it/s][A
+  1%|          | 8/774 [00:02<03:33,  3.59it/s][A
+  1%|          | 9/774 [00:02<03:19,  3.83it/s][A
+  1%|▏         | 10/774 [00:02<03:18,  3.84it/s][A
+  1%|▏         | 11/774 [00:02<03:33,  3.58it/s][A
+  2%|▏         | 12/774 [00:03<03:20,  3.81it/s][A
+  2%|▏         | 13/774 [00:03<03:13,  3.94it/s][A
+  2%|▏         | 14/774 [00:03<03:23,  3.74it/s][A
+  2%|▏         | 15/774 [00:04<03:41,  3.42it/s][A
+  2%|▏         | 16/774 [00:04<03:39,  3.45it/s][A
+  2%|▏         | 17/774 [00:04<03:16,  3.85it/s][A
+  2%|▏         | 18/774 [00:04<03:08,  4.02it/s][A
+  2%|▏         | 19/774 [00:05<03:17,  3.82it/s][A
+  3%|▎         | 20/774 [00:05<03:14,  3.88it/s][A
+  3%|▎         | 21/774 [00:05<03:18,  3.79it/s][A
+  3%|▎         | 22/774 [00:05<03:23,  3.70it/s][A
+  3%|▎         | 23/774 [00:06<03:33,  3.51it/s][A
+  3%|▎         | 24/774 [00:06<03:32,  3.53it/s][A
+  3%|▎         | 25/774 [00:06<03:38,  3.43it/s][A
+  3%|▎         | 26/774 [00:07<03:36,  3.45it/s][A
+  3%|▎         | 27/774 [00:07<03:33,  3.50it/s][A
+  4%|▎         | 28/774 [00:07<03:41,  3.37it/s][A
+  4%|▎         | 29/774 [00:07<03:45,  3.30it/s][A
+  4%|▍         | 30/774 [00:08<03:33,  3.48it/s][A
+  4%|▍         | 31/774 [00:08<03:31,  3.51it/s][A
+  4%|▍         | 32/774 [00:08<04:05,  3.02it/s][A
+  4%|▍         | 33/774 [00:09<03:53,  3.17it/s][A
+  4%|▍         | 34/774 [00:09<03:40,  3.35it/s][A
+  5%|▍         | 35/774 [00:09<03:47,  3.25it/s][A
+  5%|▍         | 36/774 [00:10<03:48,  3.23it/s][A
+  5%|▍         | 37/774 [00:10<03:48,  3.22it/s][A
+  5%|▍         | 38/774 [00:10<03:39,  3.36it/s][A
+  5%|▌         | 39/774 [00:10<03:22,  3.63it/s][A
+  5%|▌         | 40/774 [00:11<03:27,  3.54it/s][A
+  5%|▌         | 41/774 [00:11<03:24,  3.59it/s][A
+  5%|▌         | 42/774 [00:11<03:13,  3.78it/s][A
+  6%|▌         | 43/774 [00:12<03:25,  3.55it/s][A
+  6%|▌         | 44/774 [00:12<03:29,  3.48it/s][A
+  6%|▌         | 45/774 [00:12<03:18,  3.68it/s][A
+  6%|▌         | 46/774 [00:12<03:02,  3.99it/s][A
+  6%|▌         | 47/774 [00:12<02:50,  4.26it/s][A
+  6%|▌         | 48/774 [00:13<02:51,  4.23it/s][A
+  6%|▋         | 49/774 [00:13<02:53,  4.18it/s][A
+  6%|▋         | 50/774 [00:13<02:56,  4.11it/s][A
+  7%|▋         | 51/774 [00:13<02:57,  4.08it/s][A
+  7%|▋         | 52/774 [00:14<02:56,  4.09it/s][A
+  7%|▋         | 53/774 [00:14<03:04,  3.90it/s][A
+  7%|▋         | 54/774 [00:14<03:09,  3.81it/s][A
+  7%|▋         | 55/774 [00:15<03:18,  3.63it/s][A
+  7%|▋         | 56/774 [00:15<03:17,  3.63it/s][A
+  7%|▋         | 57/774 [00:15<03:24,  3.51it/s][A
+  7%|▋         | 58/774 [00:15<03:23,  3.52it/s][A
+  8%|▊         | 59/774 [00:16<03:07,  3.81it/s][A
+  8%|▊         | 60/774 [00:16<02:53,  4.11it/s][A
+  8%|▊         | 61/774 [00:16<02:32,  4.69it/s][A
+  8%|▊         | 62/774 [00:16<02:29,  4.75it/s][A
+  8%|▊         | 63/774 [00:17<02:55,  4.06it/s][A
+  8%|▊         | 64/774 [00:17<02:46,  4.27it/s][A
+  8%|▊         | 65/774 [00:17<02:48,  4.21it/s][A
+  9%|▊         | 66/774 [00:17<02:45,  4.27it/s][A
+  9%|▊         | 67/774 [00:17<02:38,  4.46it/s][A
+  9%|▉         | 68/774 [00:18<02:35,  4.55it/s][A
+  9%|▉         | 69/774 [00:18<02:27,  4.79it/s][A
+  9%|▉         | 70/774 [00:18<02:35,  4.53it/s][A
+  9%|▉         | 71/774 [00:18<02:30,  4.68it/s][A
+  9%|▉         | 72/774 [00:19<02:41,  4.35it/s][A
+  9%|▉         | 73/774 [00:19<02:51,  4.10it/s][A
+ 10%|▉         | 74/774 [00:19<02:58,  3.93it/s][A
+ 10%|▉         | 75/774 [00:19<03:04,  3.79it/s][A
+ 10%|▉         | 76/774 [00:20<03:00,  3.87it/s][A
+ 10%|▉         | 77/774 [00:20<03:11,  3.63it/s][A
+ 10%|█         | 78/774 [00:20<02:53,  4.02it/s][A
+ 10%|█         | 79/774 [00:20<02:41,  4.30it/s][A
+ 10%|█         | 80/774 [00:21<02:38,  4.38it/s][A
+ 10%|█         | 81/774 [00:21<02:17,  5.05it/s][A
+ 11%|█         | 82/774 [00:21<02:17,  5.02it/s][A
+ 11%|█         | 83/774 [00:21<02:20,  4.93it/s][A
+ 11%|█         | 84/774 [00:21<02:26,  4.72it/s][A
+ 11%|█         | 85/774 [00:22<02:35,  4.44it/s][A
+ 11%|█         | 86/774 [00:22<02:41,  4.25it/s][A
+ 11%|█         | 87/774 [00:22<02:43,  4.21it/s][A
+ 11%|█▏        | 88/774 [00:22<02:31,  4.53it/s][A
+ 11%|█▏        | 89/774 [00:22<02:25,  4.72it/s][A
+ 12%|█▏        | 90/774 [00:23<02:33,  4.45it/s][A
+ 12%|█▏        | 91/774 [00:23<02:48,  4.05it/s][A
+ 12%|█▏        | 92/774 [00:23<03:01,  3.76it/s][A
+ 12%|█▏        | 93/774 [00:24<02:57,  3.83it/s][A
+ 12%|█▏        | 94/774 [00:24<03:01,  3.74it/s][A
+ 12%|█▏        | 95/774 [00:24<03:00,  3.77it/s][A
+ 12%|█▏        | 96/774 [00:24<02:55,  3.87it/s][A
+ 13%|█▎        | 97/774 [00:25<02:39,  4.25it/s][A
+ 13%|█▎        | 98/774 [00:25<02:32,  4.43it/s][A
+ 13%|█▎        | 99/774 [00:25<02:45,  4.09it/s][A
+ 13%|█▎        | 100/774 [00:25<02:56,  3.81it/s][A
+ 13%|█▎        | 101/774 [00:26<03:02,  3.69it/s][A
+ 13%|█▎        | 102/774 [00:26<03:15,  3.44it/s][A
+ 13%|█▎        | 103/774 [00:26<03:18,  3.38it/s][A
+ 13%|█▎        | 104/774 [00:27<03:16,  3.41it/s][A
+ 14%|█▎        | 105/774 [00:27<03:14,  3.44it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.12it/s][A
+ 14%|█▍        | 107/774 [00:28<03:45,  2.95it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.07it/s][A
+ 14%|█▍        | 109/774 [00:28<03:34,  3.10it/s][A
+ 14%|█▍        | 110/774 [00:28<03:23,  3.26it/s][A
+ 14%|█▍        | 111/774 [00:29<03:23,  3.26it/s][A
+ 14%|█▍        | 112/774 [00:29<03:13,  3.43it/s][A
+ 15%|█▍        | 113/774 [00:29<03:18,  3.34it/s][A
+ 15%|█▍        | 114/774 [00:30<03:22,  3.26it/s][A
+ 15%|█▍        | 115/774 [00:30<03:15,  3.37it/s][A
+ 15%|█▍        | 116/774 [00:30<03:00,  3.64it/s][A
+ 15%|█▌        | 117/774 [00:30<03:07,  3.51it/s][A
+ 15%|█▌        | 118/774 [00:31<03:05,  3.53it/s][A
+ 15%|█▌        | 119/774 [00:31<02:57,  3.69it/s][A
+ 16%|█▌        | 120/774 [00:31<03:07,  3.48it/s][A
+ 16%|█▌        | 121/774 [00:32<03:04,  3.55it/s][A
+ 16%|█▌        | 122/774 [00:32<03:06,  3.49it/s][A
+ 16%|█▌        | 123/774 [00:32<02:58,  3.66it/s][A
+ 16%|█▌        | 124/774 [00:32<02:59,  3.62it/s][A
+ 16%|█▌        | 125/774 [00:33<02:59,  3.61it/s][A
+ 16%|█▋        | 126/774 [00:33<03:08,  3.44it/s][A
+ 16%|█▋        | 127/774 [00:33<03:18,  3.26it/s][A
+ 17%|█▋        | 128/774 [00:34<03:09,  3.41it/s][A
+ 17%|█▋        | 129/774 [00:34<03:09,  3.40it/s][A
+ 17%|█▋        | 130/774 [00:34<03:16,  3.27it/s][A
+ 17%|█▋        | 131/774 [00:35<03:07,  3.43it/s][A
+ 17%|█▋        | 132/774 [00:35<03:08,  3.41it/s][A
+ 17%|█▋        | 133/774 [00:35<03:03,  3.50it/s][A
+ 17%|█▋        | 134/774 [00:35<03:02,  3.50it/s][A
+ 17%|█▋        | 135/774 [00:36<03:19,  3.20it/s][A
+ 18%|█▊        | 136/774 [00:36<03:27,  3.08it/s][A
+ 18%|█▊        | 137/774 [00:36<03:28,  3.06it/s][A
+ 18%|█▊        | 138/774 [00:37<03:24,  3.12it/s][A
+ 18%|█▊        | 139/774 [00:37<03:24,  3.11it/s][A
+ 18%|█▊        | 140/774 [00:37<03:20,  3.17it/s][A
+ 18%|█▊        | 141/774 [00:38<03:12,  3.29it/s][A
+ 18%|█▊        | 142/774 [00:38<03:21,  3.14it/s][A
+ 18%|█▊        | 143/774 [00:38<03:17,  3.19it/s][A
+ 19%|█▊        | 144/774 [00:39<03:06,  3.38it/s][A
+ 19%|█▊        | 145/774 [00:39<02:59,  3.50it/s][A
+ 19%|█▉        | 146/774 [00:39<02:48,  3.72it/s][A
+ 19%|█▉        | 147/774 [00:39<02:40,  3.92it/s][A
+ 19%|█▉        | 148/774 [00:40<02:49,  3.69it/s][A
+ 19%|█▉        | 149/774 [00:40<03:02,  3.42it/s][A
+ 19%|█▉        | 150/774 [00:40<03:05,  3.37it/s][A
+ 20%|█▉        | 151/774 [00:40<02:55,  3.55it/s][A
+ 20%|█▉        | 152/774 [00:41<02:47,  3.72it/s][A
+ 20%|█▉        | 153/774 [00:41<02:53,  3.58it/s][A
+ 20%|█▉        | 154/774 [00:41<02:48,  3.67it/s][A
+ 20%|██        | 155/774 [00:42<02:45,  3.73it/s][A
+ 20%|██        | 156/774 [00:42<02:40,  3.85it/s][A
+ 20%|██        | 157/774 [00:42<02:34,  3.99it/s][A
+ 20%|██        | 158/774 [00:42<02:38,  3.90it/s][A
+ 21%|██        | 159/774 [00:43<02:40,  3.84it/s][A
+ 21%|██        | 160/774 [00:43<02:31,  4.04it/s][A
+ 21%|██        | 161/774 [00:43<02:41,  3.80it/s][A
+ 21%|██        | 162/774 [00:43<02:46,  3.68it/s][A
+ 21%|██        | 163/774 [00:44<02:45,  3.69it/s][A
+ 21%|██        | 164/774 [00:44<02:39,  3.82it/s][A
+ 21%|██▏       | 165/774 [00:44<02:37,  3.86it/s][A
+ 21%|██▏       | 166/774 [00:44<02:42,  3.74it/s][A
+ 22%|██▏       | 167/774 [00:45<02:44,  3.68it/s][A
+ 22%|██▏       | 168/774 [00:45<02:36,  3.87it/s][A
+ 22%|██▏       | 169/774 [00:45<02:29,  4.05it/s][A
+ 22%|██▏       | 170/774 [00:45<02:37,  3.83it/s][A
+ 22%|██▏       | 171/774 [00:46<02:48,  3.59it/s][A
+ 22%|██▏       | 172/774 [00:46<02:55,  3.43it/s][A
+ 22%|██▏       | 173/774 [00:46<02:51,  3.50it/s][A
+ 22%|██▏       | 174/774 [00:47<02:44,  3.66it/s][A
+ 23%|██▎       | 175/774 [00:47<02:44,  3.65it/s][A
+ 23%|██▎       | 176/774 [00:47<02:37,  3.79it/s][A
+ 23%|██▎       | 177/774 [00:47<02:51,  3.48it/s][A
+ 23%|██▎       | 178/774 [00:48<02:35,  3.82it/s][A
+ 23%|██▎       | 179/774 [00:48<02:22,  4.17it/s][A
+ 23%|██▎       | 180/774 [00:48<02:16,  4.34it/s][A
+ 23%|██▎       | 181/774 [00:48<02:20,  4.23it/s][A
+ 24%|██▎       | 182/774 [00:49<02:24,  4.11it/s][A
+ 24%|██��       | 183/774 [00:49<02:24,  4.08it/s][A
+ 24%|██▍       | 184/774 [00:49<02:34,  3.81it/s][A
+ 24%|██▍       | 185/774 [00:49<02:43,  3.60it/s][A
+ 24%|██▍       | 186/774 [00:50<02:42,  3.63it/s][A
+ 24%|██▍       | 187/774 [00:50<02:36,  3.75it/s][A
+ 24%|██▍       | 188/774 [00:50<02:35,  3.76it/s][A
+ 24%|██▍       | 189/774 [00:50<02:32,  3.83it/s][A
+ 25%|██▍       | 190/774 [00:51<02:27,  3.95it/s][A
+ 25%|██▍       | 191/774 [00:51<02:32,  3.82it/s][A
+ 25%|██▍       | 192/774 [00:51<02:36,  3.71it/s][A
+ 25%|██▍       | 193/774 [00:52<02:38,  3.67it/s][A
+ 25%|██▌       | 194/774 [00:52<02:47,  3.45it/s][A
+ 25%|██▌       | 195/774 [00:52<02:56,  3.28it/s][A
+ 25%|██▌       | 196/774 [00:52<02:56,  3.27it/s][A
+ 25%|██▌       | 197/774 [00:53<02:55,  3.28it/s][A
+ 26%|██▌       | 198/774 [00:53<02:45,  3.47it/s][A
+ 26%|██▌       | 199/774 [00:53<02:46,  3.46it/s][A
+ 26%|██▌       | 200/774 [00:54<02:40,  3.57it/s][A
+ 26%|██▌       | 201/774 [00:54<02:35,  3.68it/s][A
+ 26%|██▌       | 202/774 [00:54<02:33,  3.73it/s][A
+ 26%|██▌       | 203/774 [00:54<02:26,  3.91it/s][A
+ 26%|██▋       | 204/774 [00:55<02:29,  3.81it/s][A
+ 26%|██▋       | 205/774 [00:55<02:39,  3.56it/s][A
+ 27%|██▋       | 206/774 [00:55<02:35,  3.65it/s][A
+ 27%|██▋       | 207/774 [00:55<02:33,  3.70it/s][A
+ 27%|██▋       | 208/774 [00:56<02:34,  3.67it/s][A
+ 27%|██▋       | 209/774 [00:56<02:32,  3.70it/s][A
+ 27%|██▋       | 210/774 [00:56<02:30,  3.75it/s][A
+ 27%|██▋       | 211/774 [00:57<02:27,  3.82it/s][A
+ 27%|██▋       | 212/774 [00:57<02:17,  4.10it/s][A
+ 28%|██▊       | 213/774 [00:57<02:02,  4.59it/s][A
+ 28%|██▊       | 214/774 [00:57<02:04,  4.51it/s][A
+ 28%|██▊       | 215/774 [00:57<02:03,  4.52it/s][A
+ 28%|██▊       | 216/774 [00:58<02:02,  4.55it/s][A
+ 28%|██▊       | 217/774 [00:58<02:05,  4.42it/s][A
+ 28%|██▊       | 218/774 [00:58<02:12,  4.21it/s][A
+ 28%|██▊       | 219/774 [00:58<02:21,  3.93it/s][A
+ 28%|██▊       | 220/774 [00:59<02:18,  3.99it/s][A
+ 29%|██▊       | 221/774 [00:59<02:24,  3.83it/s][A
+ 29%|██▊       | 222/774 [00:59<02:34,  3.58it/s][A
+ 29%|██▉       | 223/774 [01:00<02:51,  3.22it/s][A
+ 29%|██▉       | 224/774 [01:00<03:00,  3.05it/s][A
+ 29%|██▉       | 225/774 [01:00<03:11,  2.86it/s][A
+ 29%|██▉       | 226/774 [01:01<03:16,  2.79it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.85it/s][A
+ 29%|██▉       | 228/774 [01:01<03:05,  2.94it/s][A
+ 30%|██▉       | 229/774 [01:02<03:20,  2.72it/s][A
+ 30%|██▉       | 230/774 [01:02<03:05,  2.94it/s][A
+ 30%|██▉       | 231/774 [01:02<03:01,  2.99it/s][A
+ 30%|██▉       | 232/774 [01:03<02:53,  3.13it/s][A
+ 30%|███       | 233/774 [01:03<03:08,  2.87it/s][A
+ 30%|███       | 234/774 [01:03<03:10,  2.83it/s][A
+ 30%|███       | 235/774 [01:04<03:09,  2.84it/s][A
+ 30%|███       | 236/774 [01:04<03:13,  2.78it/s][A
+ 31%|███       | 237/774 [01:05<03:09,  2.83it/s][A
+ 31%|███       | 238/774 [01:05<03:00,  2.97it/s][A
+ 31%|███       | 239/774 [01:05<02:58,  2.99it/s][A
+ 31%|███       | 240/774 [01:05<02:59,  2.98it/s][A
+ 31%|███       | 241/774 [01:06<03:01,  2.94it/s][A
+ 31%|███▏      | 242/774 [01:06<03:11,  2.77it/s][A
+ 31%|███▏      | 243/774 [01:07<03:21,  2.63it/s][A
+ 32%|███▏      | 244/774 [01:07<03:14,  2.72it/s][A
+ 32%|███▏      | 245/774 [01:07<03:07,  2.82it/s][A
+ 32%|███▏      | 246/774 [01:08<03:07,  2.81it/s][A
+ 32%|███▏      | 247/774 [01:08<03:48,  2.31it/s][A
+ 32%|███▏      | 248/774 [01:09<03:53,  2.26it/s][A
+ 32%|███▏      | 249/774 [01:09<03:29,  2.51it/s][A
+ 32%|███▏      | 250/774 [01:09<03:22,  2.59it/s][A
+ 32%|███▏      | 251/774 [01:10<03:19,  2.62it/s][A
+ 33%|███▎      | 252/774 [01:10<03:15,  2.67it/s][A
+ 33%|███▎      | 253/774 [01:11<03:12,  2.70it/s][A
+ 33%|███▎      | 254/774 [01:11<03:08,  2.76it/s][A
+ 33%|███▎      | 255/774 [01:11<03:03,  2.82it/s][A
+ 33%|███▎      | 256/774 [01:12<02:58,  2.91it/s][A
+ 33%|███▎      | 257/774 [01:12<02:56,  2.93it/s][A
+ 33%|███▎      | 258/774 [01:12<02:42,  3.18it/s][A
+ 33%|███▎      | 259/774 [01:12<02:24,  3.56it/s][A
+ 34%|███▎      | 260/774 [01:13<02:24,  3.57it/s][A
+ 34%|███▎      | 261/774 [01:13<02:28,  3.45it/s][A
+ 34%|███▍      | 262/774 [01:13<02:13,  3.83it/s][A
+ 34%|███▍      | 263/774 [01:13<02:06,  4.04it/s][A
+ 34%|███▍      | 264/774 [01:14<02:15,  3.76it/s][A
+ 34%|███▍      | 265/774 [01:14<02:09,  3.94it/s][A
+ 34%|███▍      | 266/774 [01:14<02:03,  4.10it/s][A
+ 34%|███▍      | 267/774 [01:14<02:02,  4.14it/s][A
+ 35%|███▍      | 268/774 [01:15<02:09,  3.92it/s][A
+ 35%|███▍      | 269/774 [01:15<02:14,  3.75it/s][A
+ 35%|███▍      | 270/774 [01:15<02:20,  3.60it/s][A
+ 35%|███▌      | 271/774 [01:15<02:16,  3.69it/s][A
+ 35%|███▌      | 272/774 [01:16<02:05,  4.01it/s][A
+ 35%|███▌      | 273/774 [01:16<02:00,  4.14it/s][A
+ 35%|███▌      | 274/774 [01:16<02:05,  3.99it/s][A
+ 36%|███▌      | 275/774 [01:16<01:59,  4.19it/s][A
+ 36%|███▌      | 276/774 [01:17<01:53,  4.40it/s][A
+ 36%|███▌      | 277/774 [01:17<01:57,  4.25it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.16it/s][A
+ 36%|███▌      | 279/774 [01:17<01:53,  4.37it/s][A
+ 36%|███▌      | 280/774 [01:17<01:54,  4.30it/s][A
+ 36%|███▋      | 281/774 [01:18<02:05,  3.93it/s][A
+ 36%|███▋      | 282/774 [01:18<02:16,  3.60it/s][A
+ 37%|███▋      | 283/774 [01:18<02:12,  3.71it/s][A
+ 37%|███▋      | 284/774 [01:19<02:13,  3.68it/s][A
+ 37%|███▋      | 285/774 [01:19<02:05,  3.89it/s][A
+ 37%|███▋      | 286/774 [01:19<02:00,  4.04it/s][A
+ 37%|███▋      | 287/774 [01:19<02:12,  3.69it/s][A
+ 37%|███▋      | 288/774 [01:20<02:15,  3.60it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.65it/s][A
+ 37%|███▋      | 290/774 [01:20<02:09,  3.75it/s][A
+ 38%|███▊      | 291/774 [01:20<02:08,  3.76it/s][A
+ 38%|███▊      | 292/774 [01:21<02:05,  3.85it/s][A
+ 38%|███▊      | 293/774 [01:21<01:54,  4.18it/s][A
+ 38%|███▊      | 294/774 [01:21<01:51,  4.31it/s][A
+ 38%|███▊      | 295/774 [01:21<01:49,  4.36it/s][A
+ 38%|███▊      | 296/774 [01:22<01:44,  4.56it/s][A
+ 38%|███▊      | 297/774 [01:22<01:39,  4.81it/s][A
+ 39%|███▊      | 298/774 [01:22<01:42,  4.63it/s][A
+ 39%|███▊      | 299/774 [01:22<01:47,  4.42it/s][A
+ 39%|███▉      | 300/774 [01:23<01:54,  4.15it/s][A
+ 39%|███▉      | 301/774 [01:23<01:47,  4.41it/s][A
+ 39%|███▉      | 302/774 [01:23<01:41,  4.64it/s][A
+ 39%|███▉      | 303/774 [01:23<01:37,  4.82it/s][A
+ 39%|███▉      | 304/774 [01:23<01:25,  5.52it/s][A
+ 39%|███▉      | 305/774 [01:23<01:24,  5.52it/s][A
+ 40%|███▉      | 306/774 [01:24<01:37,  4.81it/s][A
+ 40%|███▉      | 307/774 [01:24<01:42,  4.57it/s][A
+ 40%|███▉      | 308/774 [01:24<01:38,  4.75it/s][A
+ 40%|███▉      | 309/774 [01:24<01:38,  4.74it/s][A
+ 40%|████      | 310/774 [01:25<01:43,  4.46it/s][A
+ 40%|████      | 311/774 [01:25<01:42,  4.53it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.65it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.64it/s][A
+ 41%|████      | 314/774 [01:25<01:40,  4.57it/s][A
+ 41%|████      | 315/774 [01:26<01:49,  4.20it/s][A
+ 41%|████      | 316/774 [01:26<01:40,  4.56it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.90it/s][A
+ 41%|████      | 318/774 [01:26<01:37,  4.70it/s][A
+ 41%|████      | 319/774 [01:27<01:39,  4.58it/s][A
+ 41%|████▏     | 320/774 [01:27<01:40,  4.51it/s][A
+ 41%|████▏     | 321/774 [01:27<01:32,  4.91it/s][A
+ 42%|████▏     | 322/774 [01:27<01:26,  5.22it/s][A
+ 42%|████▏     | 323/774 [01:27<01:17,  5.79it/s][A
+ 42%|████▏     | 324/774 [01:27<01:24,  5.30it/s][A
+ 42%|████▏     | 325/774 [01:28<01:29,  5.04it/s][A
+ 42%|████▏     | 326/774 [01:28<01:24,  5.32it/s][A
+ 42%|████▏     | 327/774 [01:28<01:27,  5.08it/s][A
+ 42%|████▏     | 328/774 [01:28<01:25,  5.21it/s][A
+ 43%|████▎     | 329/774 [01:28<01:34,  4.73it/s][A
+ 43%|████▎     | 330/774 [01:29<01:30,  4.92it/s][A
+ 43%|████▎     | 331/774 [01:29<01:21,  5.41it/s][A
+ 43%|████▎     | 332/774 [01:29<01:18,  5.60it/s][A
+ 43%|████▎     | 333/774 [01:29<01:22,  5.35it/s][A
+ 43%|████▎     | 334/774 [01:29<01:26,  5.07it/s][A
+ 43%|████▎     | 335/774 [01:30<01:27,  5.01it/s][A
+ 43%|████▎     | 336/774 [01:30<01:27,  5.03it/s][A
+ 44%|████▎     | 337/774 [01:30<01:20,  5.42it/s][A
+ 44%|████▎     | 338/774 [01:30<01:15,  5.81it/s][A
+ 44%|████▍     | 339/774 [01:30<01:10,  6.18it/s][A
+ 44%|████▍     | 340/774 [01:30<01:10,  6.17it/s][A
+ 44%|████▍     | 341/774 [01:31<01:28,  4.91it/s][A
+ 44%|████▍     | 342/774 [01:31<01:37,  4.43it/s][A
+ 44%|████▍     | 343/774 [01:31<01:38,  4.38it/s][A
+ 44%|████▍     | 344/774 [01:31<01:42,  4.22it/s][A
+ 45%|████▍     | 345/774 [01:32<01:45,  4.08it/s][A
+ 45%|████▍     | 346/774 [01:32<01:46,  4.00it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.11it/s][A
+ 45%|████▍     | 348/774 [01:32<01:39,  4.29it/s][A
+ 45%|████▌     | 349/774 [01:33<01:35,  4.46it/s][A
+ 45%|████▌     | 350/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 351/774 [01:33<01:38,  4.31it/s][A
+ 45%|████▌     | 352/774 [01:33<01:34,  4.48it/s][A
+ 46%|████▌     | 353/774 [01:34<01:33,  4.48it/s][A
+ 46%|████▌     | 354/774 [01:34<01:33,  4.49it/s][A
+ 46%|████▌     | 355/774 [01:34<01:38,  4.25it/s][A
+ 46%|████▌     | 356/774 [01:34<01:48,  3.86it/s][A
+ 46%|████▌     | 357/774 [01:35<02:04,  3.35it/s][A
+ 46%|████▋     | 358/774 [01:35<02:08,  3.23it/s][A
+ 46%|████▋     | 359/774 [01:35<02:07,  3.26it/s][A
+ 47%|████▋     | 360/774 [01:36<02:07,  3.25it/s][A
+ 47%|████▋     | 361/774 [01:36<02:00,  3.43it/s][A
+ 47%|████▋     | 362/774 [01:36<02:07,  3.23it/s][A
+ 47%|████▋     | 363/774 [01:37<02:06,  3.25it/s][A
+ 47%|████▋     | 364/774 [01:37<02:08,  3.20it/s][A
+ 47%|████▋     | 365/774 [01:37<02:04,  3.30it/s][A
+ 47%|████▋     | 366/774 [01:37<01:55,  3.55it/s][A
+ 47%|████▋     | 367/774 [01:38<01:49,  3.71it/s][A
+ 48%|████▊     | 368/774 [01:38<01:47,  3.78it/s][A
+ 48%|████▊     | 369/774 [01:38<01:54,  3.55it/s][A
+ 48%|████▊     | 370/774 [01:39<02:08,  3.15it/s][A
+ 48%|████▊     | 371/774 [01:39<01:59,  3.36it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.35it/s][A
+ 48%|████▊     | 373/774 [01:39<01:57,  3.41it/s][A
+ 48%|████▊     | 374/774 [01:40<01:54,  3.48it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.47it/s][A
+ 49%|████▊     | 376/774 [01:40<01:59,  3.34it/s][A
+ 49%|████▊     | 377/774 [01:41<02:11,  3.03it/s][A
+ 49%|████▉     | 378/774 [01:41<02:12,  2.99it/s][A
+ 49%|████▉     | 379/774 [01:41<02:03,  3.21it/s][A
+ 49%|████▉     | 380/774 [01:42<01:52,  3.49it/s][A
+ 49%|████▉     | 381/774 [01:42<01:44,  3.76it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.87it/s][A
+ 49%|████▉     | 383/774 [01:42<01:39,  3.93it/s][A
+ 50%|████▉     | 384/774 [01:43<01:47,  3.64it/s][A
+ 50%|████▉     | 385/774 [01:43<01:55,  3.36it/s][A
+ 50%|████▉     | 386/774 [01:43<01:48,  3.58it/s][A
+ 50%|█████     | 387/774 [01:43<01:41,  3.81it/s][A
+ 50%|█████     | 388/774 [01:44<01:46,  3.61it/s][A
+ 50%|█████     | 389/774 [01:44<01:43,  3.72it/s][A
+ 50%|█████     | 390/774 [01:44<01:56,  3.30it/s][A
+ 51%|█████     | 391/774 [01:45<01:58,  3.24it/s][A
+ 51%|█████     | 392/774 [01:45<01:48,  3.52it/s][A
+ 51%|█████     | 393/774 [01:45<01:40,  3.79it/s][A
+ 51%|█████     | 394/774 [01:45<01:39,  3.81it/s][A
+ 51%|█████     | 395/774 [01:46<01:47,  3.52it/s][A
+ 51%|█████     | 396/774 [01:46<01:44,  3.60it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:48,  3.49it/s][A
+ 51%|█████▏    | 398/774 [01:47<01:42,  3.66it/s][A
+ 52%|█████▏    | 399/774 [01:47<01:41,  3.70it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:34,  3.97it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:31,  4.10it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:30,  4.11it/s][A
+ 52%|█████▏    | 403/774 [01:48<01:34,  3.91it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:40,  3.67it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:37,  3.79it/s][A
+ 52%|█████▏    | 406/774 [01:49<01:40,  3.68it/s][A
+ 53%|█████▎    | 407/774 [01:49<01:45,  3.49it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:41,  3.60it/s][A
+ 53%|█████▎    | 409/774 [01:49<01:38,  3.70it/s][A
+ 53%|█████▎    | 410/774 [01:50<01:39,  3.65it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:39,  3.65it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:41,  3.57it/s][A
+ 53%|█████▎    | 413/774 [01:51<01:39,  3.63it/s][A
+ 53%|█████▎    | 414/774 [01:51<01:36,  3.73it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:25,  4.22it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:25,  4.21it/s][A
+ 54%|█████▍    | 417/774 [01:51<01:23,  4.27it/s][A
+ 54%|█████▍    | 418/774 [01:52<01:17,  4.60it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:32,  3.86it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:36,  3.67it/s][A
+ 54%|█████▍    | 421/774 [01:53<01:36,  3.65it/s][A
+ 55%|█████▍    | 422/774 [01:53<01:36,  3.66it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:37,  3.61it/s][A
+ 55%|█████▍    | 424/774 [01:53<01:34,  3.69it/s][A
+ 55%|█████▍    | 425/774 [01:54<01:23,  4.17it/s][A
+ 55%|█████▌    | 426/774 [01:54<01:17,  4.48it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:14,  4.67it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:16,  4.55it/s][A
+ 55%|█████▌    | 429/774 [01:54<01:18,  4.38it/s][A
+ 56%|█████▌    | 430/774 [01:55<01:23,  4.14it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:35,  3.58it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:34,  3.61it/s][A
+ 56%|█████▌    | 433/774 [01:55<01:27,  3.89it/s][A
+ 56%|█████▌    | 434/774 [01:56<01:22,  4.11it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:21,  4.17it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:23,  4.06it/s][A
+ 56%|█████▋    | 437/774 [01:56<01:20,  4.20it/s][A
+ 57%|█████▋    | 438/774 [01:57<01:16,  4.40it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:19,  4.20it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:23,  4.00it/s][A
+ 57%|█████▋    | 441/774 [01:57<01:27,  3.80it/s][A
+ 57%|█████▋    | 442/774 [01:58<01:29,  3.72it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:27,  3.80it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:24,  3.89it/s][A
+ 57%|█████▋    | 445/774 [01:58<01:25,  3.86it/s][A
+ 58%|█████▊    | 446/774 [01:59<01:22,  3.95it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:21,  4.01it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:14,  4.38it/s][A
+ 58%|█████▊    | 449/774 [01:59<01:14,  4.35it/s][A
+ 58%|█████▊    | 450/774 [02:00<01:17,  4.16it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.26it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:11,  4.48it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:10,  4.55it/s][A
+ 59%|█████▊    | 454/774 [02:01<01:16,  4.21it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:20,  3.94it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:24,  3.74it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:19,  4.01it/s][A
+ 59%|█████▉    | 458/774 [02:02<01:18,  4.02it/s][A
+ 59%|█████▉    | 459/774 [02:02<01:16,  4.09it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:22,  3.82it/s][A
+ 60%|█████▉    | 461/774 [02:02<01:29,  3.51it/s][A
+ 60%|█████▉    | 462/774 [02:03<01:26,  3.61it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:23,  3.73it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:23,  3.71it/s][A
+ 60%|██████    | 465/774 [02:03<01:15,  4.10it/s][A
+ 60%|██████    | 466/774 [02:04<01:12,  4.24it/s][A
+ 60%|██████    | 467/774 [02:04<01:08,  4.48it/s][A
+ 60%|██████    | 468/774 [02:04<01:08,  4.46it/s][A
+ 61%|██████    | 469/774 [02:04<01:02,  4.85it/s][A
+ 61%|██████    | 470/774 [02:04<01:00,  5.04it/s][A
+ 61%|██████    | 471/774 [02:05<01:02,  4.85it/s][A
+ 61%|██████    | 472/774 [02:05<01:07,  4.50it/s][A
+ 61%|██████    | 473/774 [02:05<01:10,  4.29it/s][A
+ 61%|██████    | 474/774 [02:05<01:08,  4.38it/s][A
+ 61%|██████▏   | 475/774 [02:06<01:09,  4.29it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.83it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:32,  3.22it/s][A
+ 62%|██████▏   | 478/774 [02:07<01:33,  3.17it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:31,  3.24it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:27,  3.35it/s][A
+ 62%|██████▏   | 481/774 [02:08<01:28,  3.29it/s][A
+ 62%|██████▏   | 482/774 [02:08<01:26,  3.36it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:24,  3.44it/s][A
+ 63%|██████▎   | 484/774 [02:08<01:25,  3.37it/s][A
+ 63%|██████▎   | 485/774 [02:09<01:27,  3.29it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:24,  3.42it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:25,  3.36it/s][A
+ 63%|██████▎   | 488/774 [02:10<01:23,  3.45it/s][A
+ 63%|██████▎   | 489/774 [02:10<01:17,  3.66it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:18,  3.64it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:16,  3.68it/s][A
+ 64%|██████▎   | 492/774 [02:11<01:18,  3.59it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:19,  3.56it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:17,  3.62it/s][A
+ 64%|██████▍   | 495/774 [02:12<01:17,  3.61it/s][A
+ 64%|██████▍   | 496/774 [02:12<01:22,  3.37it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.33it/s][A
+ 64%|██████▍   | 498/774 [02:12<01:21,  3.38it/s][A
+ 64%|██████▍   | 499/774 [02:13<01:20,  3.43it/s][A
+ 65%|██████���   | 500/774 [02:13<01:17,  3.51it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:14,  3.64it/s][A
+ 65%|██████▍   | 502/774 [02:14<01:13,  3.68it/s][A
+ 65%|██████▍   | 503/774 [02:14<01:20,  3.39it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:22,  3.28it/s][A
+ 65%|██████▌   | 505/774 [02:14<01:19,  3.39it/s][A
+ 65%|██████▌   | 506/774 [02:15<01:18,  3.42it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:23,  3.19it/s][A
+ 66%|██████▌   | 508/774 [02:15<01:21,  3.26it/s][A
+ 66%|██████▌   | 509/774 [02:16<01:20,  3.31it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:16,  3.44it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:12,  3.61it/s][A
+ 66%|██████▌   | 512/774 [02:16<01:12,  3.62it/s][A
+ 66%|██████▋   | 513/774 [02:17<01:15,  3.46it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:17<01:22,  3.13it/s][A
+ 67%|██████▋   | 516/774 [02:18<01:17,  3.34it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.64it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:07,  3.77it/s][A
+ 67%|██████▋   | 519/774 [02:19<01:10,  3.61it/s][A
+ 67%|██████▋   | 520/774 [02:19<01:09,  3.63it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:07,  3.73it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:05,  3.83it/s][A
+ 68%|██████▊   | 523/774 [02:20<01:03,  3.93it/s][A
+ 68%|██████▊   | 524/774 [02:20<01:07,  3.71it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:08,  3.66it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:10,  3.52it/s][A
+ 68%|██████▊   | 527/774 [02:21<01:11,  3.44it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:10,  3.47it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:07,  3.65it/s][A
+ 68%|██████▊   | 530/774 [02:22<01:05,  3.71it/s][A
+ 69%|██████▊   | 531/774 [02:22<01:04,  3.75it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:03,  3.84it/s][A
+ 69%|██████▉   | 533/774 [02:22<00:59,  4.04it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:56,  4.22it/s][A
+ 69%|██████▉   | 535/774 [02:23<00:59,  4.03it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:01,  3.90it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:01,  3.83it/s][A
+ 70%|██████▉   | 538/774 [02:24<01:05,  3.58it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:05,  3.60it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:04,  3.64it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:02,  3.74it/s][A
+ 70%|███████   | 542/774 [02:25<01:02,  3.73it/s][A
+ 70%|███████   | 543/774 [02:25<01:03,  3.64it/s][A
+ 70%|███████   | 544/774 [02:25<01:02,  3.65it/s][A
+ 70%|███████   | 545/774 [02:25<01:01,  3.75it/s][A
+ 71%|███████   | 546/774 [02:26<00:57,  3.94it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.07it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.13it/s][A
+ 71%|███████   | 549/774 [02:26<00:54,  4.10it/s][A
+ 71%|███████   | 550/774 [02:27<00:58,  3.84it/s][A
+ 71%|███████   | 551/774 [02:27<01:00,  3.66it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:04,  3.43it/s][A
+ 71%|███████▏  | 553/774 [02:28<01:08,  3.24it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:07,  3.28it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:06,  3.28it/s][A
+ 72%|███████▏  | 556/774 [02:29<01:03,  3.45it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:06,  3.26it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:00,  3.55it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:56,  3.83it/s][A
+ 72%|███████▏  | 560/774 [02:30<01:00,  3.52it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.73it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.06it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:50,  4.20it/s][A
+ 73%|███████▎  | 564/774 [02:31<00:52,  4.03it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:54,  3.86it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:50,  4.16it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:45,  4.55it/s][A
+ 73%|███████▎  | 568/774 [02:31<00:47,  4.35it/s][A
+ 74%|███████▎  | 569/774 [02:32<00:48,  4.25it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.22it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:52,  3.87it/s][A
+ 74%|███████▍  | 572/774 [02:33<00:54,  3.71it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.73it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:52,  3.83it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:51,  3.84it/s][A
+ 74%|███████▍  | 576/774 [02:34<00:56,  3.48it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:54,  3.59it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:53,  3.63it/s][A
+ 75%|███████▍  | 579/774 [02:35<00:56,  3.46it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.48it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:55,  3.51it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:53,  3.60it/s][A
+ 75%|███████▌  | 583/774 [02:36<00:51,  3.73it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.78it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:52,  3.61it/s][A
+ 76%|███████▌  | 586/774 [02:36<00:52,  3.55it/s][A
+ 76%|███████▌  | 587/774 [02:37<00:51,  3.62it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:50,  3.70it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:48,  3.80it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.04it/s][A
+ 76%|███████▋  | 591/774 [02:38<00:46,  3.90it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.66it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:50,  3.61it/s][A
+ 77%|███████▋  | 594/774 [02:39<00:50,  3.58it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:54,  3.31it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:56,  3.15it/s][A
+ 77%|███████▋  | 597/774 [02:40<00:56,  3.14it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.06it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:58,  3.02it/s][A
+ 78%|███████▊  | 600/774 [02:41<00:57,  3.01it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:57,  2.99it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:58,  2.96it/s][A
+ 78%|███████▊  | 603/774 [02:42<00:56,  3.00it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.96it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:56,  3.01it/s][A
+ 78%|███████▊  | 606/774 [02:43<00:57,  2.91it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:44<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.97it/s][A
+ 79%|███████▉  | 611/774 [02:44<00:59,  2.73it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.62it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:57,  2.82it/s][A
+ 79%|███████▉  | 614/774 [02:45<00:55,  2.90it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.04it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.09it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:50,  3.10it/s][A
+ 80%|███████▉  | 618/774 [02:47<00:47,  3.27it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.42it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.45it/s][A
+ 80%|████████  | 621/774 [02:47<00:41,  3.72it/s][A
+ 80%|████████  | 622/774 [02:48<00:38,  3.98it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.94it/s][A
+ 81%|████████  | 624/774 [02:48<00:41,  3.61it/s][A
+ 81%|████████  | 625/774 [02:49<00:41,  3.56it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.30it/s][A
+ 81%|████████  | 627/774 [02:49<00:45,  3.21it/s][A
+ 81%|████████  | 628/774 [02:50<00:45,  3.20it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.31it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.55it/s][A
+ 82%|████████▏ | 631/774 [02:50<00:38,  3.74it/s][A
+ 82%|████████▏ | 632/774 [02:51<00:38,  3.74it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.56it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:40,  3.47it/s][A
+ 82%|████████▏ | 635/774 [02:51<00:39,  3.56it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:39,  3.48it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:38,  3.53it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:38,  3.49it/s][A
+ 83%|████████▎ | 639/774 [02:53<00:43,  3.08it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:50,  2.67it/s][A
+ 83%|████████▎ | 641/774 [02:54<00:49,  2.69it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.88it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.90it/s][A
+ 83%|████████▎ | 644/774 [02:54<00:41,  3.11it/s][A
+ 83%|████████▎ | 645/774 [02:55<00:37,  3.40it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.63it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.89it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:31,  4.04it/s][A
+ 84%|████████▍ | 649/774 [02:56<00:30,  4.07it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.29it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:28,  4.25it/s][A
+ 84%|████████▍ | 652/774 [02:56<00:29,  4.10it/s][A
+ 84%|████████▍ | 653/774 [02:57<00:31,  3.82it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.07it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:27,  4.37it/s][A
+ 85%|████████▍ | 656/774 [02:57<00:27,  4.24it/s][A
+ 85%|████████▍ | 657/774 [02:57<00:26,  4.43it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.20it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.86it/s][A
+ 85%|████████▌ | 660/774 [02:58<00:30,  3.73it/s][A
+ 85%|████████▌ | 661/774 [02:59<00:30,  3.67it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:29,  3.85it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.64it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.62it/s][A
+ 86%|████████▌ | 665/774 [03:00<00:27,  3.90it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:25,  4.31it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.58it/s][A
+ 86%|████████▋ | 668/774 [03:00<00:24,  4.40it/s][A
+ 86%|████████▋ | 669/774 [03:01<00:25,  4.15it/s][A
+ 87%|████████▋ | 670/774 [03:01<00:23,  4.33it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.93it/s][A
+ 87%|████████▋ | 672/774 [03:01<00:25,  4.01it/s][A
+ 87%|████████▋ | 673/774 [03:02<00:24,  4.09it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.03it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.29it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:22,  4.45it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:21,  4.42it/s][A
+ 88%|████████▊ | 678/774 [03:03<00:21,  4.45it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.18it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.19it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:20,  4.45it/s][A
+ 88%|████████▊ | 682/774 [03:04<00:20,  4.49it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:22,  4.13it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:23,  3.86it/s][A
+ 89%|████████▊ | 685/774 [03:04<00:24,  3.70it/s][A
+ 89%|████████▊ | 686/774 [03:05<00:23,  3.80it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.01it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  4.01it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:20,  4.17it/s][A
+ 89%|████████▉ | 690/774 [03:06<00:19,  4.31it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.40it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.45it/s][A
+ 90%|████████▉ | 693/774 [03:06<00:18,  4.45it/s][A
+ 90%|████████▉ | 694/774 [03:07<00:19,  4.16it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.83it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.94it/s][A
+ 90%|█████████ | 697/774 [03:07<00:19,  3.92it/s][A
+ 90%|█████████ | 698/774 [03:07<00:17,  4.31it/s][A
+ 90%|█████████ | 699/774 [03:08<00:16,  4.65it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.29it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.35it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.33it/s][A
+ 91%|█████████ | 703/774 [03:09<00:16,  4.34it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.21it/s][A
+ 91%|█████████ | 705/774 [03:09<00:14,  4.61it/s][A
+ 91%|█████████ | 706/774 [03:09<00:14,  4.76it/s][A
+ 91%|█████████▏| 707/774 [03:09<00:14,  4.65it/s][A
+ 91%|█████████▏| 708/774 [03:10<00:13,  4.92it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.75it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.66it/s][A
+ 92%|█████████▏| 711/774 [03:10<00:12,  4.86it/s][A
+ 92%|█████████▏| 712/774 [03:10<00:12,  5.07it/s][A
+ 92%|█████████▏| 713/774 [03:11<00:12,  4.91it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.63it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.73it/s][A
+ 93%|█████████▎| 716/774 [03:11<00:11,  5.24it/s][A
+ 93%|█████████▎| 717/774 [03:11<00:10,  5.32it/s][A
+ 93%|█████████▎| 718/774 [03:12<00:11,  4.75it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.62it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.95it/s][A
+ 93%|█████████▎| 721/774 [03:12<00:10,  5.22it/s][A
+ 93%|█████████▎| 722/774 [03:12<00:09,  5.67it/s][A
+ 93%|█████████▎| 723/774 [03:13<00:09,  5.43it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.37it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.52it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.56it/s][A
+ 94%|█████████▍| 727/774 [03:13<00:08,  5.35it/s][A
+ 94%|█████████▍| 728/774 [03:14<00:09,  4.83it/s][A
+ 94%|█████████▍| 729/774 [03:14<00:08,  5.12it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.40it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:07,  5.39it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.54it/s][A
+ 95%|█████████▍| 733/774 [03:14<00:07,  5.52it/s][A
+ 95%|█████████▍| 734/774 [03:15<00:07,  5.58it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.80it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.85it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.75it/s][A
+ 95%|█████████▌| 738/774 [03:15<00:06,  5.56it/s][A
+ 95%|█████████▌| 739/774 [03:16<00:06,  5.50it/s][A
+ 96%|█████████▌| 740/774 [03:16<00:06,  5.41it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.14it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:06,  5.32it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.63it/s][A
+ 96%|█████████▌| 744/774 [03:16<00:05,  5.41it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.50it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.90it/s][A
+ 97%|█████████▋| 747/774 [03:17<00:06,  4.10it/s][A
+ 97%|█████████▋| 748/774 [03:18<00:06,  4.29it/s][A
+ 97%|█████████▋| 749/774 [03:18<00:05,  4.58it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.27it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.48it/s][A
+ 97%|█████████▋| 752/774 [03:18<00:04,  4.43it/s][A
+ 97%|█████████▋| 753/774 [03:19<00:04,  4.71it/s][A
+ 97%|█████████▋| 754/774 [03:19<00:03,  5.33it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.60it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.44it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.26it/s][A
+ 98%|█████████▊| 758/774 [03:19<00:03,  5.21it/s][A
+ 98%|█████████▊| 759/774 [03:20<00:02,  5.44it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.42it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.89it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:02,  5.98it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.17it/s][A
+ 99%|█████████▊| 764/774 [03:20<00:01,  6.29it/s][A
+ 99%|█████████▉| 765/774 [03:21<00:01,  6.28it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.35it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.50it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.46it/s][A
+ 99%|█████████▉| 769/774 [03:21<00:00,  5.15it/s][A
+ 99%|█████████▉| 770/774 [03:22<00:00,  5.01it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.20it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  4.96it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.82it/s][A                                                       
+                                                 [A 86%|████████▌ | 11000/12776 [1:58:08<17:38,  1.68it/s]
+100%|██████████| 774/774 [03:25<00:00,  4.82it/s][A
+                                                 [A 86%|████████▌ | 11001/12776 [1:58:09<30:43:15, 62.31s/it]                                                           86%|████████▌ | 11001/12776 [1:58:09<30:43:15, 62.31s/it] 86%|████████▌ | 11002/12776 [1:58:09<21:33:34, 43.75s/it]                                                           86%|████████▌ | 11002/12776 [1:58:09<21:33:34, 43.75s/it] 86%|████████▌ | 11003/12776 [1:58:10<15:09:15, 30.77s/it]                                                           86%|████████▌ | 11003/12776 [1:58:10<15:09:15, 30.77s/it] 86%|████████▌ | 11004/12776 [1:58:10<10:39:43, 21.66s/it]                                                           86%|████████▌ | 11004/12776 [1:58:10<10:39:43, 21.66s/it] 86%|████████▌ | 11005/12776 [1:58:11<7:31:05, 15.28s/it]                                                           86%|████████▌ | 11005/12776 [1:58:11<7:31:05, 15.28s/it] 86%|████████▌ | 11006/12776 [1:58:11<5:19:50, 10.84s/it]                                                          86%|████████▌ | 11006/12776 [1:58:11<5:19:50, 10.84s/it] 86%|████████▌ | 11007/12776 [1:58:12<3:47:02,  7.70s/it]                                                          86%|████████▌ | 11007/12776 [1:58:12<3:47:02,  7.70s/it] 86%|████████▌ | 11008/12776 [1:58:12<2:41:56,  5.50s/it]                                                          86%|████████▌ | 11008/12776 [1:58:12<2:41:56,  5.50s/it] 86%|████████▌ | 11009/12776 [1:58:12<1:56:50,  3.97s/it]                                                          86%|████████▌ | 11009/12776 [1:58:12<1:56:50,  3.97s/it] 86%|████████▌ | 11010/12776 [1:58:13<1:24:43,  2.88s/it]                                                          86%|████████▌ | 11010/12776 [1:58:13<1:24:43,  2.88s/it] 86%|████████▌ | 11011/12776 [1:58:13<1:02:11,  2.11s/it]                                                          86%|████████▌ | 11011/12776 [1:58:13<1:02:11,  2.11s/it] 86%|████████▌ | 11012/12776 [1:58:14<47:29,  1.62s/it]                                                          86%|████████▌ | 11012/12776 [1:58:14<47:29,  1.62s/it] 86%|████████▌ | 11013/12776 [1:58:14<35:54,  1.22s/it]                                                        86%|████████▌ | 11013/12776 [1:58:14<35:54,  1.22s/it] 86%|████████▌ | 11014/12776 [1:58:14<27:42,  1.06it/s]                                                        86%|████████▌ | 11014/12776 [1:58:14<27:42,  1.06it/s] 86%|████████▌ | 11015/12776 [1:58:14<22:49,  1.29it/s]                                                        86%|████████▌ | 11015/12776 [1:58:14<22:49,  1.29it/s] 86%|████████▌ | 11016/12776 [1:58:15<18:24,  1.59it/s]                                                        86%|████████▌ | 11016/12776 [1:58:15<18:24,  1.59it/s] 86%|████████▌ | 11017/12776 [1:58:15<15:14,  1.92it/s]                                                        86%|████████▌ | 11017/12776 [1:58:15<15:14,  1.92it/s] 86%|████████▌ | 11018/12776 [1:58:15<12:58,  2.26it/s]                                                        86%|████████▌ | 11018/12776 [1:58:15<12:58,  2.26it/s] 86%|████████▌ | 11019/12776 [1:58:16<11:52,  2.47it/s]                                                        86%|████████▌ | 11019/12776 [1:58:16<11:52,  2.47it/s] 86%|████████▋ | 11020/12776 [1:58:16<10:28,  2.79it/s]                                                        86%|████████▋ | 11020/12776 [1:58:16<10:28,  2.79it/s] 86%|████████▋ | 11021/12776 [1:58:16<09:28,  3.09it/s]                                                        86%|████████▋ | 11021/12776 [1:58:16<09:28,  3.09it/s] 86%|████████▋ | 11022/12776 [1:58:16<08:43,  3.35it/s]                                                        86%|████████▋ | 11022/12776 [1:58:16<08:43,  3.35it/s] 86%|████████▋ | 11023/12776 [1:58:17<08:08,  3.59it/s]                                                        86%|████████▋ | 11023/12776 [1:58:17<08:08,  3.59it/s] 86%|████████▋ | 11024/12776 [1:58:17<08:28,  3.45it/s]                                                        86%|████████▋ | 11024/12776 [1:58:17<08:28,  3.45it/s] 86%|████████▋ | 11025/12776 [1:58:17<07:53,  3.70it/s]                                                        86%|████████▋ | 11025/12776 [1:58:17<07:53,  3.70it/s] 86%|████████▋ | 11026/12776 [1:58:17<07:25,  3.93it/s]                                                        86%|████████▋ | 11026/12776 [1:58:17<07:25,  3.93it/s] 86%|████████▋ | 11027/12776 [1:58:18<07:03,  4.13it/s]                                                        86%|████████▋ | 11027/12776 [1:58:18<07:03,  4.13it/s] 86%|████████▋ | 11028/12776 [1:58:18<06:45,  4.31it/s]                                                        86%|████████▋ | 11028/12776 [1:58:18<06:45,  4.31it/s] 86%|████████▋ | 11029/12776 [1:58:18<07:20,  3.96it/s]                                                        86%|████████▋ | 11029/12776 [1:58:18<07:20,  3.96it/s] 86%|████████▋ | 11030/12776 [1:58:18<06:54,  4.21it/s]                                                        86%|████████▋ | 11030/12776 [1:58:18<06:54,  4.21it/s] 86%|████████▋ | 11031/12776 [1:58:18<06:36,  4.40it/s]                                                        86%|████████▋ | 11031/12776 [1:58:18<06:36,  4.40it/s] 86%|████████▋ | 11032/12776 [1:58:19<06:21,  4.57it/s]                                                        86%|████████▋ | 11032/12776 [1:58:19<06:21,  4.57it/s] 86%|████████▋ | 11033/12776 [1:58:19<06:10,  4.71it/s]                                                        86%|████████▋ | 11033/12776 [1:58:19<06:10,  4.71it/s] 86%|████████▋ | 11034/12776 [1:58:19<06:55,  4.19it/s]                                                        86%|████████▋ | 11034/12776 [1:58:19<06:55,  4.19it/s] 86%|████████▋ | 11035/12776 [1:58:19<06:30,  4.46it/s]                                                        86%|████████▋ | 11035/12776 [1:58:19<06:30,  4.46it/s] 86%|████████▋ | 11036/12776 [1:58:20<06:11,  4.69it/s]                                                        86%|████████▋ | 11036/12776 [1:58:20<06:11,  4.69it/s] 86%|████████▋ | 11037/12776 [1:58:20<05:57,  4.87it/s]                                                        86%|████████▋ | 11037/12776 [1:58:20<05:57,  4.87it/s] 86%|████████▋ | 11038/12776 [1:58:20<10:01,  2.89it/s]                                                        86%|████████▋ | 11038/12776 [1:58:20<10:01,  2.89it/s] 86%|████████▋ | 11039/12776 [1:58:22<19:54,  1.45it/s]                                                        86%|████████▋ | 11039/12776 [1:58:22<19:54,  1.45it/s] 86%|████████▋ | 11040/12776 [1:58:23<22:24,  1.29it/s]                                                        86%|████████▋ | 11040/12776 [1:58:23<22:24,  1.29it/s] 86%|████████▋ | 11041/12776 [1:58:24<23:06,  1.25it/s]                                                        86%|████████▋ | 11041/12776 [1:58:24<23:06,  1.25it/s] 86%|████████▋ | 11042/12776 [1:58:25<23:06,  1.25it/s]                                                        86%|████████▋ | 11042/12776 [1:58:25<23:06,  1.25it/s] 86%|████████▋ | 11043/12776 [1:58:25<22:36,  1.28it/s]                                                        86%|████████▋ | 11043/12776 [1:58:25<22:36,  1.28it/s] 86%|████████▋ | 11044/12776 [1:58:26<21:48,  1.32it/s]                                                        86%|████████▋ | 11044/12776 [1:58:26<21:48,  1.32it/s] 86%|████████▋ | 11045/12776 [1:58:27<20:57,  1.38it/s]                                                        86%|████████▋ | 11045/12776 [1:58:27<20:57,  1.38it/s] 86%|████████▋ | 11046/12776 [1:58:27<20:59,  1.37it/s]                                                        86%|████████▋ | 11046/12776 [1:58:27<20:59,  1.37it/s] 86%|████████▋ | 11047/12776 [1:58:28<19:43,  1.46it/s]                                                        86%|████████▋ | 11047/12776 [1:58:28<19:43,  1.46it/s] 86%|████████▋ | 11048/12776 [1:58:29<19:04,  1.51it/s]                                                        86%|████████▋ | 11048/12776 [1:58:29<19:04,  1.51it/s] 86%|████████▋ | 11049/12776 [1:58:29<17:53,  1.61it/s]                                                        86%|████████▋ | 11049/12776 [1:58:29<17:53,  1.61it/s] 86%|████████▋ | 11050/12776 [1:58:30<17:31,  1.64it/s]                                                        86%|████████▋ | 11050/12776 [1:58:30<17:31,  1.64it/s] 86%|████████▋ | 11051/12776 [1:58:30<16:23,  1.75it/s]                                                        86%|████████▋ | 11051/12776 [1:58:30<16:23,  1.75it/s] 87%|████████▋ | 11052/12776 [1:58:31<16:32,  1.74it/s]                                                        87%|████████▋ | 11052/12776 [1:58:31<16:32,  1.74it/s] 87%|████████▋ | 11053/12776 [1:58:31<15:18,  1.88it/s]                                                        87%|████████▋ | 11053/12776 [1:58:31<15:18,  1.88it/s] 87%|████████▋ | 11054/12776 [1:58:32<15:20,  1.87it/s]                                                        87%|████████▋ | 11054/12776 [1:58:32<15:20,  1.87it/s] 87%|████████▋ | 11055/12776 [1:58:32<14:14,  2.01it/s]                                                        87%|████████▋ | 11055/12776 [1:58:32<14:14,  2.01it/s] 87%|████████▋ | 11056/12776 [1:58:32<13:23,  2.14it/s]                                                        87%|████████▋ | 11056/12776 [1:58:32<13:23,  2.14it/s] 87%|████████▋ | 11057/12776 [1:58:33<13:51,  2.07it/s]                                                        87%|████████▋ | 11057/12776 [1:58:33<13:51,  2.07it/s] 87%|████████▋ | 11058/12776 [1:58:33<12:55,  2.21it/s]                                                        87%|████████▋ | 11058/12776 [1:58:33<12:55,  2.21it/s] 87%|████████▋ | 11059/12776 [1:58:34<12:07,  2.36it/s]                                                        87%|████████▋ | 11059/12776 [1:58:34<12:07,  2.36it/s] 87%|████████▋ | 11060/12776 [1:58:34<12:20,  2.32it/s]                                                        87%|████████▋ | 11060/12776 [1:58:34<12:20,  2.32it/s] 87%|████████▋ | 11061/12776 [1:58:35<11:36,  2.46it/s]                                                        87%|████████▋ | 11061/12776 [1:58:35<11:36,  2.46it/s] 87%|████████▋ | 11062/12776 [1:58:35<10:58,  2.60it/s]                                                        87%|████████▋ | 11062/12776 [1:58:35<10:58,  2.60it/s] 87%|████████▋ | 11063/12776 [1:58:35<11:00,  2.59it/s]                                                        87%|████████▋ | 11063/12776 [1:58:35<11:00,  2.59it/s] 87%|████████▋ | 11064/12776 [1:58:36<10:19,  2.76it/s]                                                        87%|████████▋ | 11064/12776 [1:58:36<10:19,  2.76it/s] 87%|████████▋ | 11065/12776 [1:58:36<09:46,  2.92it/s]                                                        87%|████████▋ | 11065/12776 [1:58:36<09:46,  2.92it/s] 87%|████████▋ | 11066/12776 [1:58:36<09:53,  2.88it/s]                                                        87%|████████▋ | 11066/12776 [1:58:36<09:53,  2.88it/s] 87%|████████▋ | 11067/12776 [1:58:37<09:20,  3.05it/s]                                                        87%|████████▋ | 11067/12776 [1:58:37<09:20,  3.05it/s] 87%|████████▋ | 11068/12776 [1:58:37<08:52,  3.21it/s]                                                        87%|████████▋ | 11068/12776 [1:58:37<08:52,  3.21it/s] 87%|████████▋ | 11069/12776 [1:58:37<08:28,  3.36it/s]                                                        87%|████████▋ | 11069/12776 [1:58:37<08:28,  3.36it/s] 87%|████████▋ | 11070/12776 [1:58:37<08:42,  3.26it/s]                                                        87%|████████▋ | 11070/12776 [1:58:37<08:42,  3.26it/s] 87%|████████▋ | 11071/12776 [1:58:38<08:14,  3.45it/s]                                                        87%|████████▋ | 11071/12776 [1:58:38<08:14,  3.45it/s] 87%|████████▋ | 11072/12776 [1:58:38<07:50,  3.62it/s]                                                        87%|████████▋ | 11072/12776 [1:58:38<07:50,  3.62it/s] 87%|████████▋ | 11073/12776 [1:58:38<07:32,  3.76it/s]                                                        87%|████████▋ | 11073/12776 [1:58:38<07:32,  3.76it/s] 87%|████████▋ | 11074/12776 [1:58:38<07:15,  3.91it/s]                                                        87%|████████▋ | 11074/12776 [1:58:38<07:15,  3.91it/s] 87%|████████▋ | 11075/12776 [1:58:39<07:38,  3.71it/s]                                                        87%|████████▋ | 11075/12776 [1:58:39<07:38,  3.71it/s] 87%|████████▋ | 11076/12776 [1:58:39<07:13,  3.92it/s]                                                        87%|████████▋ | 11076/12776 [1:58:39<07:13,  3.92it/s] 87%|████████▋ | 11077/12776 [1:58:39<06:51,  4.13it/s]                                                       {'eval_loss': 0.49481523036956787, 'eval_wer': 0.31803982404318165, 'eval_runtime': 205.6846, 'eval_samples_per_second': 60.204, 'eval_steps_per_second': 3.763, 'epoch': 1.72}
+{'loss': 0.4074, 'grad_norm': 1.0101224184036255, 'learning_rate': 4.371945259042033e-05, 'epoch': 1.72}
+{'loss': 0.3936, 'grad_norm': 1.773930549621582, 'learning_rate': 4.36950146627566e-05, 'epoch': 1.72}
+{'loss': 0.4899, 'grad_norm': 1.4694632291793823, 'learning_rate': 4.367057673509286e-05, 'epoch': 1.72}
+{'loss': 0.4263, 'grad_norm': 2.8984897136688232, 'learning_rate': 4.364613880742913e-05, 'epoch': 1.72}
+{'loss': 0.4388, 'grad_norm': 1.7322542667388916, 'learning_rate': 4.362170087976539e-05, 'epoch': 1.72}
+{'loss': 0.441, 'grad_norm': 1.0889065265655518, 'learning_rate': 4.359726295210166e-05, 'epoch': 1.72}
+{'loss': 0.3078, 'grad_norm': 0.9822616577148438, 'learning_rate': 4.357282502443792e-05, 'epoch': 1.72}
+{'loss': 0.3078, 'grad_norm': 0.8187744617462158, 'learning_rate': 4.3548387096774194e-05, 'epoch': 1.72}
+{'loss': 0.5249, 'grad_norm': 2.007962465286255, 'learning_rate': 4.352394916911045e-05, 'epoch': 1.72}
+{'loss': 0.7433, 'grad_norm': 1.7840535640716553, 'learning_rate': 4.3499511241446724e-05, 'epoch': 1.72}
+{'loss': 0.5195, 'grad_norm': 1.7337597608566284, 'learning_rate': 4.347507331378299e-05, 'epoch': 1.72}
+{'loss': 0.3144, 'grad_norm': 1.91448175907135, 'learning_rate': 4.3450635386119254e-05, 'epoch': 1.72}
+{'loss': 0.5275, 'grad_norm': 2.5847277641296387, 'learning_rate': 4.342619745845552e-05, 'epoch': 1.72}
+{'loss': 0.6911, 'grad_norm': 8.203601837158203, 'learning_rate': 4.340175953079179e-05, 'epoch': 1.72}
+{'loss': 0.6947, 'grad_norm': 2.416170835494995, 'learning_rate': 4.337732160312805e-05, 'epoch': 1.72}
+{'loss': 0.4616, 'grad_norm': 2.0026931762695312, 'learning_rate': 4.335288367546432e-05, 'epoch': 1.72}
+{'loss': 0.4845, 'grad_norm': 2.050572633743286, 'learning_rate': 4.3328445747800584e-05, 'epoch': 1.72}
+{'loss': 0.469, 'grad_norm': 2.2032740116119385, 'learning_rate': 4.330400782013685e-05, 'epoch': 1.72}
+{'loss': 0.8241, 'grad_norm': 2.604659080505371, 'learning_rate': 4.3279569892473114e-05, 'epoch': 1.72}
+{'loss': 0.5109, 'grad_norm': 1.5613411664962769, 'learning_rate': 4.3255131964809385e-05, 'epoch': 1.73}
+{'loss': 0.5093, 'grad_norm': 1.821577548980713, 'learning_rate': 4.3230694037145644e-05, 'epoch': 1.73}
+{'loss': 0.5367, 'grad_norm': 2.0643904209136963, 'learning_rate': 4.3206256109481915e-05, 'epoch': 1.73}
+{'loss': 1.0387, 'grad_norm': 2.7602663040161133, 'learning_rate': 4.318181818181818e-05, 'epoch': 1.73}
+{'loss': 1.1233, 'grad_norm': 5.380650520324707, 'learning_rate': 4.3157380254154445e-05, 'epoch': 1.73}
+{'loss': 1.0676, 'grad_norm': 3.1562740802764893, 'learning_rate': 4.313294232649071e-05, 'epoch': 1.73}
+{'loss': 0.8957, 'grad_norm': 3.140995502471924, 'learning_rate': 4.310850439882698e-05, 'epoch': 1.73}
+{'loss': 0.5783, 'grad_norm': 2.0646467208862305, 'learning_rate': 4.308406647116324e-05, 'epoch': 1.73}
+{'loss': 1.0697, 'grad_norm': 3.1891887187957764, 'learning_rate': 4.305962854349951e-05, 'epoch': 1.73}
+{'loss': 1.599, 'grad_norm': 4.30599308013916, 'learning_rate': 4.3035190615835775e-05, 'epoch': 1.73}
+{'loss': 0.8014, 'grad_norm': 1.5007469654083252, 'learning_rate': 4.301075268817204e-05, 'epoch': 1.73}
+{'loss': 1.201, 'grad_norm': 3.0806431770324707, 'learning_rate': 4.2986314760508305e-05, 'epoch': 1.73}
+{'loss': 0.8214, 'grad_norm': 3.5166988372802734, 'learning_rate': 4.296187683284458e-05, 'epoch': 1.73}
+{'loss': 1.237, 'grad_norm': 2.085771083831787, 'learning_rate': 4.2937438905180835e-05, 'epoch': 1.73}
+{'loss': 0.512, 'grad_norm': 1.5638984441757202, 'learning_rate': 4.2913000977517106e-05, 'epoch': 1.73}
+{'loss': 0.7843, 'grad_norm': 1.8894531726837158, 'learning_rate': 4.288856304985337e-05, 'epoch': 1.73}
+{'loss': 0.777, 'grad_norm': 3.0840094089508057, 'learning_rate': 4.2864125122189636e-05, 'epoch': 1.73}
+{'loss': 0.9629, 'grad_norm': 3.5215682983398438, 'learning_rate': 4.28396871945259e-05, 'epoch': 1.73}
+{'loss': 0.923, 'grad_norm': 2.8310680389404297, 'learning_rate': 4.281524926686217e-05, 'epoch': 1.73}
+{'loss': 0.2002, 'grad_norm': 1.4804656505584717, 'learning_rate': 4.279081133919843e-05, 'epoch': 1.73}
+{'loss': 0.2244, 'grad_norm': 0.7842070460319519, 'learning_rate': 4.27663734115347e-05, 'epoch': 1.73}
+{'loss': 0.277, 'grad_norm': 0.7354947328567505, 'learning_rate': 4.274193548387097e-05, 'epoch': 1.73}
+{'loss': 0.1575, 'grad_norm': 0.7232000827789307, 'learning_rate': 4.2717497556207225e-05, 'epoch': 1.73}
+{'loss': 0.2009, 'grad_norm': 2.255171537399292, 'learning_rate': 4.2693059628543496e-05, 'epoch': 1.73}
+{'loss': 0.3623, 'grad_norm': 1.2567336559295654, 'learning_rate': 4.266862170087977e-05, 'epoch': 1.73}
+{'loss': 0.1847, 'grad_norm': 0.5677285194396973, 'learning_rate': 4.2644183773216026e-05, 'epoch': 1.73}
+{'loss': 0.2151, 'grad_norm': 4.918628215789795, 'learning_rate': 4.26197458455523e-05, 'epoch': 1.73}
+{'loss': 0.3459, 'grad_norm': 0.9489184617996216, 'learning_rate': 4.259530791788856e-05, 'epoch': 1.73}
+{'loss': 0.3196, 'grad_norm': 3.799226999282837, 'learning_rate': 4.257086999022482e-05, 'epoch': 1.73}
+{'loss': 0.2697, 'grad_norm': 0.8148688673973083, 'learning_rate': 4.254643206256109e-05, 'epoch': 1.73}
+{'loss': 0.3325, 'grad_norm': 0.8470799922943115, 'learning_rate': 4.252199413489736e-05, 'epoch': 1.73}
+{'loss': 0.3461, 'grad_norm': 0.811160147190094, 'learning_rate': 4.249755620723362e-05, 'epoch': 1.73}
+{'loss': 0.5058, 'grad_norm': 1.2158924341201782, 'learning_rate': 4.247311827956989e-05, 'epoch': 1.73}
+{'loss': 0.3067, 'grad_norm': 7.7890825271606445, 'learning_rate': 4.244868035190616e-05, 'epoch': 1.73}
+{'loss': 0.6897, 'grad_norm': 2.8912837505340576, 'learning_rate': 4.2424242424242416e-05, 'epoch': 1.73}
+{'loss': 0.533, 'grad_norm': 1.3283599615097046, 'learning_rate': 4.239980449657869e-05, 'epoch': 1.73}
+{'loss': 0.2678, 'grad_norm': 0.8418616056442261, 'learning_rate': 4.237536656891496e-05, 'epoch': 1.73}
+{'loss': 0.2165, 'grad_norm': 1.1012041568756104, 'learning_rate': 4.235092864125122e-05, 'epoch': 1.73}
+{'loss': 0.4197, 'grad_norm': 1.7594144344329834, 'learning_rate': 4.232649071358749e-05, 'epoch': 1.73}
+{'loss': 0.7293, 'grad_norm': 2.9779410362243652, 'learning_rate': 4.230205278592375e-05, 'epoch': 1.73}
+{'loss': 0.3366, 'grad_norm': 2.907015085220337, 'learning_rate': 4.227761485826001e-05, 'epoch': 1.73}
+{'loss': 0.3136, 'grad_norm': 1.4411675930023193, 'learning_rate': 4.225317693059628e-05, 'epoch': 1.73}
+{'loss': 0.6951, 'grad_norm': 1.9844144582748413, 'learning_rate': 4.2228739002932555e-05, 'epoch': 1.73}
+{'loss': 0.4267, 'grad_norm': 1.4029967784881592, 'learning_rate': 4.220430107526881e-05, 'epoch': 1.73}
+{'loss': 0.7369, 'grad_norm': 2.417787551879883, 'learning_rate': 4.2179863147605084e-05, 'epoch': 1.73}
+{'loss': 0.7088, 'grad_norm': 2.0894277095794678, 'learning_rate': 4.215542521994134e-05, 'epoch': 1.73}
+{'loss': 0.5431, 'grad_norm': 3.367473840713501, 'learning_rate': 4.213098729227761e-05, 'epoch': 1.73}
+{'loss': 0.3418, 'grad_norm': 1.4022886753082275, 'learning_rate': 4.210654936461388e-05, 'epoch': 1.73}
+{'loss': 0.7644, 'grad_norm': 3.337224245071411, 'learning_rate': 4.2082111436950137e-05, 'epoch': 1.73}
+{'loss': 0.7558, 'grad_norm': 3.8841137886047363, 'learning_rate': 4.205767350928641e-05, 'epoch': 1.73}
+{'loss': 0.4611, 'grad_norm': 1.932528018951416, 'learning_rate': 4.203323558162267e-05, 'epoch': 1.73}
+{'loss': 0.3621, 'grad_norm': 2.1921749114990234, 'learning_rate': 4.200879765395894e-05, 'epoch': 1.73}
+{'loss': 0.8474, 'grad_norm': 1.8706334829330444, 'learning_rate': 4.19843597262952e-05, 'epoch': 1.73}
+{'loss': 0.6197, 'grad_norm': 2.5191290378570557, 'learning_rate': 4.1959921798631474e-05, 'epoch': 1.73}
+{'loss': 0.5236, 'grad_norm': 2.9892024993896484, 'learning_rate': 4.193548387096773e-05, 'epoch': 1.73}
+{'loss': 1.1508, 'grad_norm': 18.517911911010742, 'learning_rate': 4.1911045943304004e-05, 'epoch': 1.73}
+{'loss': 0.9638, 'grad_norm': 2.092811346054077, 'learning_rate': 4.188660801564027e-05, 'epoch': 1.73}
+ 87%|████████▋ | 11077/12776 [1:58:39<06:51,  4.13it/s] 87%|████████▋ | 11078/12776 [1:58:39<06:34,  4.31it/s]                                                        87%|████████▋ | 11078/12776 [1:58:39<06:34,  4.31it/s] 87%|████████▋ | 11079/12776 [1:58:40<06:22,  4.43it/s]                                                        87%|████████▋ | 11079/12776 [1:58:40<06:22,  4.43it/s] 87%|████████▋ | 11080/12776 [1:58:40<07:01,  4.02it/s]                                                        87%|████████▋ | 11080/12776 [1:58:40<07:01,  4.02it/s] 87%|████████▋ | 11081/12776 [1:58:40<06:36,  4.28it/s]                                                        87%|████████▋ | 11081/12776 [1:58:40<06:36,  4.28it/s] 87%|████████▋ | 11082/12776 [1:58:40<06:17,  4.48it/s]                                                        87%|████████▋ | 11082/12776 [1:58:40<06:17,  4.48it/s] 87%|████████▋ | 11083/12776 [1:58:40<06:03,  4.65it/s]                                                        87%|████████▋ | 11083/12776 [1:58:40<06:03,  4.65it/s] 87%|████████▋ | 11084/12776 [1:58:41<05:53,  4.79it/s]                                                        87%|████████▋ | 11084/12776 [1:58:41<05:53,  4.79it/s] 87%|████████▋ | 11085/12776 [1:58:41<06:54,  4.08it/s]                                                        87%|████████▋ | 11085/12776 [1:58:41<06:54,  4.08it/s] 87%|████████▋ | 11086/12776 [1:58:41<06:24,  4.39it/s]                                                        87%|████████▋ | 11086/12776 [1:58:41<06:24,  4.39it/s] 87%|████████▋ | 11087/12776 [1:58:41<06:03,  4.65it/s]                                                        87%|████████▋ | 11087/12776 [1:58:41<06:03,  4.65it/s] 87%|████████▋ | 11088/12776 [1:58:42<10:45,  2.61it/s]                                                        87%|████████▋ | 11088/12776 [1:58:42<10:45,  2.61it/s] 87%|████████▋ | 11089/12776 [1:58:43<19:13,  1.46it/s]                                                        87%|████████▋ | 11089/12776 [1:58:43<19:13,  1.46it/s] 87%|████████▋ | 11090/12776 [1:58:44<21:29,  1.31it/s]                                                        87%|████████▋ | 11090/12776 [1:58:44<21:29,  1.31it/s] 87%|████████▋ | 11091/12776 [1:58:45<22:13,  1.26it/s]                                                        87%|████████▋ | 11091/12776 [1:58:45<22:13,  1.26it/s] 87%|████████▋ | 11092/12776 [1:58:46<21:59,  1.28it/s]                                                        87%|████████▋ | 11092/12776 [1:58:46<21:59,  1.28it/s] 87%|████████▋ | 11093/12776 [1:58:47<22:41,  1.24it/s]                                                        87%|████████▋ | 11093/12776 [1:58:47<22:41,  1.24it/s] 87%|████████▋ | 11094/12776 [1:58:48<21:42,  1.29it/s]                                                        87%|████████▋ | 11094/12776 [1:58:48<21:42,  1.29it/s] 87%|████████▋ | 11095/12776 [1:58:48<20:47,  1.35it/s]                                                        87%|████████▋ | 11095/12776 [1:58:48<20:47,  1.35it/s] 87%|████████▋ | 11096/12776 [1:58:49<20:03,  1.40it/s]                                                        87%|████████▋ | 11096/12776 [1:58:49<20:03,  1.40it/s] 87%|████████▋ | 11097/12776 [1:58:50<19:16,  1.45it/s]                                                        87%|████████▋ | 11097/12776 [1:58:50<19:16,  1.45it/s] 87%|████████▋ | 11098/12776 [1:58:50<18:19,  1.53it/s]                                                        87%|████████▋ | 11098/12776 [1:58:50<18:19,  1.53it/s] 87%|████████▋ | 11099/12776 [1:58:51<17:29,  1.60it/s]                                                        87%|████████▋ | 11099/12776 [1:58:51<17:29,  1.60it/s] 87%|████████▋ | 11100/12776 [1:58:51<17:17,  1.62it/s]                                                        87%|████████▋ | 11100/12776 [1:58:51<17:17,  1.62it/s] 87%|████████▋ | 11101/12776 [1:58:52<16:17,  1.71it/s]                                                        87%|████████▋ | 11101/12776 [1:58:52<16:17,  1.71it/s] 87%|████████▋ | 11102/12776 [1:58:52<15:20,  1.82it/s]                                                        87%|████████▋ | 11102/12776 [1:58:52<15:20,  1.82it/s] 87%|████████▋ | 11103/12776 [1:58:53<14:45,  1.89it/s]                                                        87%|████████▋ | 11103/12776 [1:58:53<14:45,  1.89it/s] 87%|████████▋ | 11104/12776 [1:58:53<13:56,  2.00it/s]                                                        87%|████████▋ | 11104/12776 [1:58:53<13:56,  2.00it/s] 87%|████████▋ | 11105/12776 [1:58:54<13:31,  2.06it/s]                                                        87%|████████▋ | 11105/12776 [1:58:54<13:31,  2.06it/s] 87%|████████▋ | 11106/12776 [1:58:54<12:47,  2.18it/s]                                                        87%|████████▋ | 11106/12776 [1:58:54<12:47,  2.18it/s] 87%|████████▋ | 11107/12776 [1:58:54<12:11,  2.28it/s]                                                        87%|████████▋ | 11107/12776 [1:58:54<12:11,  2.28it/s] 87%|████████▋ | 11108/12776 [1:58:55<11:49,  2.35it/s]                                                        87%|████████▋ | 11108/12776 [1:58:55<11:49,  2.35it/s] 87%|████████▋ | 11109/12776 [1:58:55<11:13,  2.47it/s]                                                        87%|████████▋ | 11109/12776 [1:58:55<11:13,  2.47it/s] 87%|████████▋ | 11110/12776 [1:58:55<10:44,  2.59it/s]                                                        87%|████████▋ | 11110/12776 [1:58:55<10:44,  2.59it/s] 87%|████████▋ | 11111/12776 [1:58:56<11:03,  2.51it/s]                                                        87%|████████▋ | 11111/12776 [1:58:56<11:03,  2.51it/s] 87%|████████▋ | 11112/12776 [1:58:56<10:29,  2.64it/s]                                                        87%|████████▋ | 11112/12776 [1:58:56<10:29,  2.64it/s] 87%|████████▋ | 11113/12776 [1:58:57<10:03,  2.76it/s]                                                        87%|████████▋ | 11113/12776 [1:58:57<10:03,  2.76it/s] 87%|████████▋ | 11114/12776 [1:58:57<09:35,  2.89it/s]                                                        87%|████████▋ | 11114/12776 [1:58:57<09:35,  2.89it/s] 87%|████████▋ | 11115/12776 [1:58:57<09:34,  2.89it/s]                                                        87%|████████▋ | 11115/12776 [1:58:57<09:34,  2.89it/s] 87%|████████▋ | 11116/12776 [1:58:58<09:08,  3.03it/s]                                                        87%|████████▋ | 11116/12776 [1:58:58<09:08,  3.03it/s] 87%|████████▋ | 11117/12776 [1:58:58<08:45,  3.16it/s]                                                        87%|████████▋ | 11117/12776 [1:58:58<08:45,  3.16it/s] 87%|████████▋ | 11118/12776 [1:58:58<09:20,  2.96it/s]                                                        87%|████████▋ | 11118/12776 [1:58:58<09:20,  2.96it/s] 87%|████████▋ | 11119/12776 [1:58:58<08:44,  3.16it/s]                                                        87%|████████▋ | 11119/12776 [1:58:58<08:44,  3.16it/s] 87%|████████▋ | 11120/12776 [1:58:59<08:16,  3.34it/s]                                                        87%|████████▋ | 11120/12776 [1:58:59<08:16,  3.34it/s] 87%|████████▋ | 11121/12776 [1:58:59<07:51,  3.51it/s]                                                        87%|████████▋ | 11121/12776 [1:58:59<07:51,  3.51it/s] 87%|████████▋ | 11122/12776 [1:58:59<08:21,  3.30it/s]                                                        87%|████████▋ | 11122/12776 [1:58:59<08:21,  3.30it/s] 87%|████████▋ | 11123/12776 [1:59:00<07:49,  3.52it/s]                                                        87%|████████▋ | 11123/12776 [1:59:00<07:49,  3.52it/s] 87%|████████▋ | 11124/12776 [1:59:00<07:25,  3.71it/s]                                                        87%|████████▋ | 11124/12776 [1:59:00<07:25,  3.71it/s] 87%|████████▋ | 11125/12776 [1:59:00<07:02,  3.91it/s]                                                        87%|████████▋ | 11125/12776 [1:59:00<07:02,  3.91it/s] 87%|████████▋ | 11126/12776 [1:59:00<06:43,  4.09it/s]                                                        87%|████████▋ | 11126/12776 [1:59:00<06:43,  4.09it/s] 87%|████████▋ | 11127/12776 [1:59:01<07:15,  3.79it/s]                                                        87%|████████▋ | 11127/12776 [1:59:01<07:15,  3.79it/s] 87%|████████▋ | 11128/12776 [1:59:01<06:49,  4.03it/s]                                                        87%|████████▋ | 11128/12776 [1:59:01<06:49,  4.03it/s] 87%|████████▋ | 11129/12776 [1:59:01<06:29,  4.23it/s]                                                        87%|████████▋ | 11129/12776 [1:59:01<06:29,  4.23it/s] 87%|████████▋ | 11130/12776 [1:59:01<06:13,  4.41it/s]                                                        87%|████████▋ | 11130/12776 [1:59:01<06:13,  4.41it/s] 87%|████████▋ | 11131/12776 [1:59:01<05:58,  4.58it/s]                                                        87%|████████▋ | 11131/12776 [1:59:01<05:58,  4.58it/s] 87%|████████▋ | 11132/12776 [1:59:02<06:43,  4.08it/s]                                                        87%|████████▋ | 11132/12776 [1:59:02<06:43,  4.08it/s] 87%|████████▋ | 11133/12776 [1:59:02<06:19,  4.33it/s]                                                        87%|████████▋ | 11133/12776 [1:59:02<06:19,  4.33it/s] 87%|████████▋ | 11134/12776 [1:59:02<06:00,  4.55it/s]                                                        87%|████████▋ | 11134/12776 [1:59:02<06:00,  4.55it/s] 87%|████████▋ | 11135/12776 [1:59:02<05:26,  5.02it/s]                                                        87%|████████▋ | 11135/12776 [1:59:02<05:26,  5.02it/s] 87%|████████▋ | 11136/12776 [1:59:02<05:10,  5.29it/s]                                                        87%|████████▋ | 11136/12776 [1:59:02<05:10,  5.29it/s] 87%|████████▋ | 11137/12776 [1:59:03<06:03,  4.50it/s]                                                        87%|████████▋ | 11137/12776 [1:59:03<06:03,  4.50it/s] 87%|████████▋ | 11138/12776 [1:59:03<09:37,  2.84it/s]                                                        87%|████████▋ | 11138/12776 [1:59:03<09:37,  2.84it/s] 87%|████████▋ | 11139/12776 [1:59:05<19:52,  1.37it/s]                                                        87%|████████▋ | 11139/12776 [1:59:05<19:52,  1.37it/s] 87%|████████▋ | 11140/12776 [1:59:06<21:05,  1.29it/s]                                                        87%|████████▋ | 11140/12776 [1:59:06<21:05,  1.29it/s] 87%|████████▋ | 11141/12776 [1:59:07<21:27,  1.27it/s]                                                        87%|████████▋ | 11141/12776 [1:59:07<21:27,  1.27it/s] 87%|████████▋ | 11142/12776 [1:59:08<22:20,  1.22it/s]                                                        87%|████████▋ | 11142/12776 [1:59:08<22:20,  1.22it/s] 87%|████████▋ | 11143/12776 [1:59:08<22:35,  1.20it/s]                                                        87%|████████▋ | 11143/12776 [1:59:08<22:35,  1.20it/s] 87%|████████▋ | 11144/12776 [1:59:09<21:17,  1.28it/s]                                                        87%|████████▋ | 11144/12776 [1:59:09<21:17,  1.28it/s] 87%|████████▋ | 11145/12776 [1:59:10<21:18,  1.28it/s]                                                        87%|████████▋ | 11145/12776 [1:59:10<21:18,  1.28it/s] 87%|████████▋ | 11146/12776 [1:59:10<20:01,  1.36it/s]                                                        87%|████████▋ | 11146/12776 [1:59:10<20:01,  1.36it/s] 87%|████████▋ | 11147/12776 [1:59:11<18:50,  1.44it/s]                                                        87%|████████▋ | 11147/12776 [1:59:11<18:50,  1.44it/s] 87%|████████▋ | 11148/12776 [1:59:12<17:46,  1.53it/s]                                                        87%|████████▋ | 11148/12776 [1:59:12<17:46,  1.53it/s] 87%|████████▋ | 11149/12776 [1:59:12<17:09,  1.58it/s]                                                        87%|████████▋ | 11149/12776 [1:59:12<17:09,  1.58it/s] 87%|████████▋ | 11150/12776 [1:59:13<16:08,  1.68it/s]                                                        87%|████████▋ | 11150/12776 [1:59:13<16:08,  1.68it/s] 87%|████████▋ | 11151/12776 [1:59:13<15:37,  1.73it/s]                                                        87%|████████▋ | 11151/12776 [1:59:13<15:37,  1.73it/s] 87%|████████▋ | 11152/12776 [1:59:14<14:35,  1.86it/s]                                                        87%|████████▋ | 11152/12776 [1:59:14<14:35,  1.86it/s] 87%|████████▋ | 11153/12776 [1:59:14<14:15,  1.90it/s]                                                        87%|████████▋ | 11153/12776 [1:59:14<14:15,  1.90it/s] 87%|████████▋ | 11154/12776 [1:59:15<13:23,  2.02it/s]                                                        87%|████████▋ | 11154/12776 [1:59:15<13:23,  2.02it/s] 87%|████████▋ | 11155/12776 [1:59:15<12:36,  2.14it/s]                                                       {'loss': 0.8178, 'grad_norm': 2.0851621627807617, 'learning_rate': 4.186217008797653e-05, 'epoch': 1.73}
+{'loss': 0.7663, 'grad_norm': 2.2361598014831543, 'learning_rate': 4.18377321603128e-05, 'epoch': 1.73}
+{'loss': 1.1463, 'grad_norm': 2.4493370056152344, 'learning_rate': 4.181329423264907e-05, 'epoch': 1.73}
+{'loss': 1.4037, 'grad_norm': 2.9259376525878906, 'learning_rate': 4.178885630498533e-05, 'epoch': 1.73}
+{'loss': 1.0048, 'grad_norm': 2.476630926132202, 'learning_rate': 4.17644183773216e-05, 'epoch': 1.73}
+{'loss': 1.3103, 'grad_norm': 2.6673030853271484, 'learning_rate': 4.1739980449657864e-05, 'epoch': 1.73}
+{'loss': 0.9964, 'grad_norm': 2.157806873321533, 'learning_rate': 4.171554252199413e-05, 'epoch': 1.73}
+{'loss': 0.4646, 'grad_norm': 1.6744898557662964, 'learning_rate': 4.1691104594330394e-05, 'epoch': 1.74}
+{'loss': 0.6487, 'grad_norm': 2.1040098667144775, 'learning_rate': 4.1666666666666665e-05, 'epoch': 1.74}
+{'loss': 0.4765, 'grad_norm': 1.4487780332565308, 'learning_rate': 4.164222873900292e-05, 'epoch': 1.74}
+{'loss': 0.5098, 'grad_norm': 1.442169189453125, 'learning_rate': 4.1617790811339195e-05, 'epoch': 1.74}
+{'loss': 1.1097, 'grad_norm': 1.7279036045074463, 'learning_rate': 4.159335288367546e-05, 'epoch': 1.74}
+{'loss': 0.2139, 'grad_norm': 0.6806561350822449, 'learning_rate': 4.1568914956011724e-05, 'epoch': 1.74}
+{'loss': 0.201, 'grad_norm': 0.7717947959899902, 'learning_rate': 4.154447702834799e-05, 'epoch': 1.74}
+{'loss': 0.2105, 'grad_norm': 1.283336877822876, 'learning_rate': 4.152003910068426e-05, 'epoch': 1.74}
+{'loss': 0.2479, 'grad_norm': 0.7973412275314331, 'learning_rate': 4.149560117302052e-05, 'epoch': 1.74}
+{'loss': 0.2806, 'grad_norm': 1.4070724248886108, 'learning_rate': 4.147116324535679e-05, 'epoch': 1.74}
+{'loss': 0.1904, 'grad_norm': 0.8974490761756897, 'learning_rate': 4.1446725317693055e-05, 'epoch': 1.74}
+{'loss': 0.3231, 'grad_norm': 1.0805352926254272, 'learning_rate': 4.142228739002932e-05, 'epoch': 1.74}
+{'loss': 0.2135, 'grad_norm': 0.7345172762870789, 'learning_rate': 4.1397849462365585e-05, 'epoch': 1.74}
+{'loss': 0.3516, 'grad_norm': 1.2740095853805542, 'learning_rate': 4.1373411534701856e-05, 'epoch': 1.74}
+{'loss': 0.362, 'grad_norm': 0.7698444724082947, 'learning_rate': 4.1348973607038114e-05, 'epoch': 1.74}
+{'loss': 0.4615, 'grad_norm': 1.8207945823669434, 'learning_rate': 4.1324535679374386e-05, 'epoch': 1.74}
+{'loss': 0.3479, 'grad_norm': 2.351137638092041, 'learning_rate': 4.130009775171065e-05, 'epoch': 1.74}
+{'loss': 0.2351, 'grad_norm': 0.9402967095375061, 'learning_rate': 4.1275659824046916e-05, 'epoch': 1.74}
+{'loss': 0.2323, 'grad_norm': 0.9055925011634827, 'learning_rate': 4.125122189638318e-05, 'epoch': 1.74}
+{'loss': 0.3386, 'grad_norm': 1.6312155723571777, 'learning_rate': 4.122678396871945e-05, 'epoch': 1.74}
+{'loss': 0.4023, 'grad_norm': 1.4077454805374146, 'learning_rate': 4.120234604105571e-05, 'epoch': 1.74}
+{'loss': 0.3056, 'grad_norm': 1.1839890480041504, 'learning_rate': 4.117790811339198e-05, 'epoch': 1.74}
+{'loss': 0.4321, 'grad_norm': 1.0270860195159912, 'learning_rate': 4.1153470185728246e-05, 'epoch': 1.74}
+{'loss': 0.3378, 'grad_norm': 1.704751968383789, 'learning_rate': 4.112903225806451e-05, 'epoch': 1.74}
+{'loss': 0.3107, 'grad_norm': 1.4541360139846802, 'learning_rate': 4.1104594330400776e-05, 'epoch': 1.74}
+{'loss': 0.5536, 'grad_norm': 2.142528772354126, 'learning_rate': 4.108015640273705e-05, 'epoch': 1.74}
+{'loss': 0.5362, 'grad_norm': 1.4981656074523926, 'learning_rate': 4.1055718475073306e-05, 'epoch': 1.74}
+{'loss': 0.5691, 'grad_norm': 2.808462381362915, 'learning_rate': 4.103128054740958e-05, 'epoch': 1.74}
+{'loss': 0.6461, 'grad_norm': 1.8647723197937012, 'learning_rate': 4.100684261974584e-05, 'epoch': 1.74}
+{'loss': 0.3173, 'grad_norm': 1.9372211694717407, 'learning_rate': 4.098240469208211e-05, 'epoch': 1.74}
+{'loss': 0.3582, 'grad_norm': 1.7311444282531738, 'learning_rate': 4.095796676441837e-05, 'epoch': 1.74}
+{'loss': 0.7594, 'grad_norm': 3.7500319480895996, 'learning_rate': 4.093352883675464e-05, 'epoch': 1.74}
+{'loss': 0.6123, 'grad_norm': 1.9873162508010864, 'learning_rate': 4.09090909090909e-05, 'epoch': 1.74}
+{'loss': 0.719, 'grad_norm': 15.171030044555664, 'learning_rate': 4.088465298142717e-05, 'epoch': 1.74}
+{'loss': 0.5115, 'grad_norm': 6.604526996612549, 'learning_rate': 4.086021505376344e-05, 'epoch': 1.74}
+{'loss': 0.4616, 'grad_norm': 1.5956652164459229, 'learning_rate': 4.08357771260997e-05, 'epoch': 1.74}
+{'loss': 0.4544, 'grad_norm': 2.011253833770752, 'learning_rate': 4.081133919843597e-05, 'epoch': 1.74}
+{'loss': 0.6543, 'grad_norm': 1.8588051795959473, 'learning_rate': 4.078690127077224e-05, 'epoch': 1.74}
+{'loss': 0.8809, 'grad_norm': 3.617205858230591, 'learning_rate': 4.07624633431085e-05, 'epoch': 1.74}
+{'loss': 0.3365, 'grad_norm': 5.1503825187683105, 'learning_rate': 4.073802541544477e-05, 'epoch': 1.74}
+{'loss': 0.8173, 'grad_norm': 4.6813154220581055, 'learning_rate': 4.071358748778103e-05, 'epoch': 1.74}
+{'loss': 0.7053, 'grad_norm': 2.9903922080993652, 'learning_rate': 4.06891495601173e-05, 'epoch': 1.74}
+{'loss': 0.7405, 'grad_norm': 3.1063003540039062, 'learning_rate': 4.066471163245356e-05, 'epoch': 1.74}
+{'loss': 0.837, 'grad_norm': 2.323392868041992, 'learning_rate': 4.0640273704789834e-05, 'epoch': 1.74}
+{'loss': 1.2962, 'grad_norm': 4.709715843200684, 'learning_rate': 4.061583577712609e-05, 'epoch': 1.74}
+{'loss': 1.254, 'grad_norm': 3.221708059310913, 'learning_rate': 4.0591397849462364e-05, 'epoch': 1.74}
+{'loss': 1.2712, 'grad_norm': 2.867579936981201, 'learning_rate': 4.056695992179863e-05, 'epoch': 1.74}
+{'loss': 1.0635, 'grad_norm': 2.7349681854248047, 'learning_rate': 4.0542521994134894e-05, 'epoch': 1.74}
+{'loss': 0.8331, 'grad_norm': 1.872054100036621, 'learning_rate': 4.051808406647116e-05, 'epoch': 1.74}
+{'loss': 1.7437, 'grad_norm': 4.403456211090088, 'learning_rate': 4.049364613880743e-05, 'epoch': 1.74}
+{'loss': 0.5395, 'grad_norm': 2.841325283050537, 'learning_rate': 4.046920821114369e-05, 'epoch': 1.74}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 4.046920821114369e-05, 'epoch': 1.74}
+{'loss': 0.3682, 'grad_norm': 1.3489454984664917, 'learning_rate': 4.044477028347996e-05, 'epoch': 1.74}
+{'loss': 0.7724, 'grad_norm': 2.201267719268799, 'learning_rate': 4.0420332355816224e-05, 'epoch': 1.74}
+{'loss': 1.2445, 'grad_norm': 2.0831868648529053, 'learning_rate': 4.039589442815249e-05, 'epoch': 1.74}
+{'loss': 0.2516, 'grad_norm': 1.2250694036483765, 'learning_rate': 4.0371456500488754e-05, 'epoch': 1.74}
+{'loss': 0.3832, 'grad_norm': 1.6923754215240479, 'learning_rate': 4.0347018572825025e-05, 'epoch': 1.74}
+{'loss': 0.1989, 'grad_norm': 1.450895071029663, 'learning_rate': 4.0322580645161284e-05, 'epoch': 1.74}
+{'loss': 0.1985, 'grad_norm': 2.482814311981201, 'learning_rate': 4.0298142717497555e-05, 'epoch': 1.74}
+{'loss': 0.2036, 'grad_norm': 0.9089357852935791, 'learning_rate': 4.027370478983382e-05, 'epoch': 1.74}
+{'loss': 0.2729, 'grad_norm': 0.5683241486549377, 'learning_rate': 4.0249266862170085e-05, 'epoch': 1.74}
+{'loss': 0.279, 'grad_norm': 1.6889972686767578, 'learning_rate': 4.022482893450635e-05, 'epoch': 1.74}
+{'loss': 0.2574, 'grad_norm': 0.7691648602485657, 'learning_rate': 4.020039100684262e-05, 'epoch': 1.74}
+{'loss': 0.2245, 'grad_norm': 3.3021767139434814, 'learning_rate': 4.017595307917888e-05, 'epoch': 1.74}
+{'loss': 0.327, 'grad_norm': 1.7656270265579224, 'learning_rate': 4.015151515151515e-05, 'epoch': 1.75}
+{'loss': 0.2726, 'grad_norm': 1.371148705482483, 'learning_rate': 4.0127077223851415e-05, 'epoch': 1.75}
+{'loss': 0.2061, 'grad_norm': 0.8461998701095581, 'learning_rate': 4.010263929618768e-05, 'epoch': 1.75}
+{'loss': 0.1667, 'grad_norm': 1.068411111831665, 'learning_rate': 4.0078201368523945e-05, 'epoch': 1.75}
+{'loss': 0.3076, 'grad_norm': 1.8549538850784302, 'learning_rate': 4.005376344086022e-05, 'epoch': 1.75}
+{'loss': 0.4524, 'grad_norm': 1.2925175428390503, 'learning_rate': 4.0029325513196475e-05, 'epoch': 1.75}
+{'loss': 0.2918, 'grad_norm': 1.1557775735855103, 'learning_rate': 4.0004887585532746e-05, 'epoch': 1.75}
+ 87%|████████▋ | 11155/12776 [1:59:15<12:36,  2.14it/s] 87%|████████▋ | 11156/12776 [1:59:16<13:12,  2.04it/s]                                                        87%|████████▋ | 11156/12776 [1:59:16<13:12,  2.04it/s] 87%|████████▋ | 11157/12776 [1:59:16<12:06,  2.23it/s]                                                        87%|████████▋ | 11157/12776 [1:59:16<12:06,  2.23it/s] 87%|████████▋ | 11158/12776 [1:59:16<11:17,  2.39it/s]                                                        87%|████████▋ | 11158/12776 [1:59:16<11:17,  2.39it/s] 87%|████████▋ | 11159/12776 [1:59:17<11:12,  2.40it/s]                                                        87%|████████▋ | 11159/12776 [1:59:17<11:12,  2.40it/s] 87%|████████▋ | 11160/12776 [1:59:17<10:31,  2.56it/s]                                                        87%|████████▋ | 11160/12776 [1:59:17<10:31,  2.56it/s] 87%|████████▋ | 11161/12776 [1:59:17<09:54,  2.71it/s]                                                        87%|████████▋ | 11161/12776 [1:59:17<09:54,  2.71it/s] 87%|████████▋ | 11162/12776 [1:59:18<09:44,  2.76it/s]                                                        87%|████████▋ | 11162/12776 [1:59:18<09:44,  2.76it/s] 87%|████████▋ | 11163/12776 [1:59:18<09:13,  2.92it/s]                                                        87%|████████▋ | 11163/12776 [1:59:18<09:13,  2.92it/s] 87%|████████▋ | 11164/12776 [1:59:18<08:47,  3.06it/s]                                                        87%|████████▋ | 11164/12776 [1:59:18<08:47,  3.06it/s] 87%|████████▋ | 11165/12776 [1:59:19<08:26,  3.18it/s]                                                        87%|████████▋ | 11165/12776 [1:59:19<08:26,  3.18it/s] 87%|████████▋ | 11166/12776 [1:59:19<09:01,  2.97it/s]                                                        87%|████████▋ | 11166/12776 [1:59:19<09:01,  2.97it/s] 87%|████████▋ | 11167/12776 [1:59:19<08:28,  3.16it/s]                                                        87%|████████▋ | 11167/12776 [1:59:19<08:28,  3.16it/s] 87%|████████▋ | 11168/12776 [1:59:19<07:56,  3.38it/s]                                                        87%|████████▋ | 11168/12776 [1:59:19<07:56,  3.38it/s] 87%|████████▋ | 11169/12776 [1:59:20<07:34,  3.53it/s]                                                        87%|████████▋ | 11169/12776 [1:59:20<07:34,  3.53it/s] 87%|████████▋ | 11170/12776 [1:59:20<08:07,  3.29it/s]                                                        87%|████████▋ | 11170/12776 [1:59:20<08:07,  3.29it/s] 87%|████████▋ | 11171/12776 [1:59:20<07:38,  3.50it/s]                                                        87%|████████▋ | 11171/12776 [1:59:20<07:38,  3.50it/s] 87%|█████���██▋ | 11172/12776 [1:59:21<07:14,  3.69it/s]                                                        87%|████████▋ | 11172/12776 [1:59:21<07:14,  3.69it/s] 87%|████████▋ | 11173/12776 [1:59:21<06:54,  3.86it/s]                                                        87%|████████▋ | 11173/12776 [1:59:21<06:54,  3.86it/s] 87%|████████▋ | 11174/12776 [1:59:21<07:11,  3.71it/s]                                                        87%|████████▋ | 11174/12776 [1:59:21<07:11,  3.71it/s] 87%|████████▋ | 11175/12776 [1:59:21<06:47,  3.93it/s]                                                        87%|████████▋ | 11175/12776 [1:59:21<06:47,  3.93it/s] 87%|████████▋ | 11176/12776 [1:59:22<06:27,  4.13it/s]                                                        87%|████████▋ | 11176/12776 [1:59:22<06:27,  4.13it/s] 87%|████████▋ | 11177/12776 [1:59:22<06:11,  4.30it/s]                                                        87%|████████▋ | 11177/12776 [1:59:22<06:11,  4.30it/s] 87%|████████▋ | 11178/12776 [1:59:22<06:01,  4.42it/s]                                                        87%|████████▋ | 11178/12776 [1:59:22<06:01,  4.42it/s] 88%|████████▊ | 11179/12776 [1:59:22<06:27,  4.12it/s]                                                        88%|████████▊ | 11179/12776 [1:59:22<06:27,  4.12it/s] 88%|████████▊ | 11180/12776 [1:59:22<06:08,  4.33it/s]                                                        88%|████████▊ | 11180/12776 [1:59:22<06:08,  4.33it/s] 88%|████████▊ | 11181/12776 [1:59:23<05:54,  4.50it/s]                                                        88%|████████▊ | 11181/12776 [1:59:23<05:54,  4.50it/s] 88%|████████▊ | 11182/12776 [1:59:23<05:43,  4.64it/s]                                                        88%|████████▊ | 11182/12776 [1:59:23<05:43,  4.64it/s] 88%|████████▊ | 11183/12776 [1:59:23<05:34,  4.76it/s]                                                        88%|████████▊ | 11183/12776 [1:59:23<05:34,  4.76it/s] 88%|████████▊ | 11184/12776 [1:59:23<05:26,  4.88it/s]                                                        88%|████████▊ | 11184/12776 [1:59:23<05:26,  4.88it/s] 88%|████████▊ | 11185/12776 [1:59:23<05:48,  4.56it/s]                                                        88%|████████▊ | 11185/12776 [1:59:23<05:48,  4.56it/s] 88%|████████▊ | 11186/12776 [1:59:24<05:33,  4.77it/s]                                                        88%|████████▊ | 11186/12776 [1:59:24<05:33,  4.77it/s] 88%|████████▊ | 11187/12776 [1:59:24<05:21,  4.95it/s]                                                        88%|████████▊ | 11187/12776 [1:59:24<05:21,  4.95it/s] 88%|████████▊ | 11188/12776 [1:59:25<09:51,  2.69it/s]                                                        88%|████████▊ | 11188/12776 [1:59:25<09:51,  2.69it/s] 88%|████████▊ | 11189/12776 [1:59:26<19:08,  1.38it/s]                                                        88%|████████▊ | 11189/12776 [1:59:26<19:08,  1.38it/s] 88%|████████▊ | 11190/12776 [1:59:27<21:54,  1.21it/s]                                                        88%|████████▊ | 11190/12776 [1:59:27<21:54,  1.21it/s] 88%|████████▊ | 11191/12776 [1:59:28<22:04,  1.20it/s]                                                        88%|████████▊ | 11191/12776 [1:59:28<22:04,  1.20it/s] 88%|████████▊ | 11192/12776 [1:59:29<21:52,  1.21it/s]                                                        88%|████████▊ | 11192/12776 [1:59:29<21:52,  1.21it/s] 88%|████████▊ | 11193/12776 [1:59:30<21:06,  1.25it/s]                                                        88%|████████▊ | 11193/12776 [1:59:30<21:06,  1.25it/s] 88%|████████▊ | 11194/12776 [1:59:30<20:18,  1.30it/s]                                                        88%|████████▊ | 11194/12776 [1:59:30<20:18,  1.30it/s] 88%|████████▊ | 11195/12776 [1:59:31<19:21,  1.36it/s]                                                        88%|████████▊ | 11195/12776 [1:59:31<19:21,  1.36it/s] 88%|████████▊ | 11196/12776 [1:59:32<19:12,  1.37it/s]                                                        88%|████████▊ | 11196/12776 [1:59:32<19:12,  1.37it/s] 88%|███��████▊ | 11197/12776 [1:59:32<18:05,  1.45it/s]                                                        88%|████████▊ | 11197/12776 [1:59:32<18:05,  1.45it/s] 88%|████████▊ | 11198/12776 [1:59:33<17:28,  1.51it/s]                                                        88%|████████▊ | 11198/12776 [1:59:33<17:28,  1.51it/s] 88%|████████▊ | 11199/12776 [1:59:33<16:34,  1.58it/s]                                                        88%|████████▊ | 11199/12776 [1:59:33<16:34,  1.58it/s] 88%|████████▊ | 11200/12776 [1:59:34<16:00,  1.64it/s]                                                        88%|████████▊ | 11200/12776 [1:59:34<16:00,  1.64it/s]Saving model checkpoint to ./checkpoint-11200
+Configuration saved in ./checkpoint-11200/config.json
+Model weights saved in ./checkpoint-11200/model.safetensors
+Feature extractor saved in ./checkpoint-11200/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-11200/tokenizer_config.json
+Special tokens file saved in ./checkpoint-11200/special_tokens_map.json
+added tokens file saved in ./checkpoint-11200/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-10000] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 88%|████████▊ | 11201/12776 [1:59:40<56:48,  2.16s/it]                                                        88%|████████▊ | 11201/12776 [1:59:40<56:48,  2.16s/it] 88%|████████▊ | 11202/12776 [1:59:40<43:10,  1.65s/it]                                                        88%|████████▊ | 11202/12776 [1:59:40<43:10,  1.65s/it] 88%|████████▊ | 11203/12776 [1:59:41<33:55,  1.29s/it]                                                        88%|████████▊ | 11203/12776 [1:59:41<33:55,  1.29s/it] 88%|████████▊ | 11204/12776 [1:59:41<26:52,  1.03s/it]                                                        88%|████████▊ | 11204/12776 [1:59:41<26:52,  1.03s/it] 88%|████████▊ | 11205/12776 [1:59:41<21:47,  1.20it/s]                                                        88%|████████▊ | 11205/12776 [1:59:41<21:47,  1.20it/s] 88%|████████▊ | 11206/12776 [1:59:42<18:08,  1.44it/s]                                                        88%|████████▊ | 11206/12776 [1:59:42<18:08,  1.44it/s] 88%|████████▊ | 11207/12776 [1:59:42<15:28,  1.69it/s]                                                        88%|████████▊ | 11207/12776 [1:59:42<15:28,  1.69it/s] 88%|████████▊ | 11208/12776 [1:59:43<13:22,  1.95it/s]                                                        88%|████████▊ | 11208/12776 [1:59:43<13:22,  1.95it/s] 88%|████████▊ | 11209/12776 [1:59:43<12:29,  2.09it/s]                                                        88%|████████▊ | 11209/12776 [1:59:43<12:29,  2.09it/s] 88%|████████▊ | 11210/12776 [1:59:43<11:18,  2.31it/s]                                                        88%|████████▊ | 11210/12776 [1:59:43<11:18,  2.31it/s] 88%|████████▊ | 11211/12776 [1:59:44<10:19,  2.53it/s]                                                        88%|████████▊ | 11211/12776 [1:59:44<10:19,  2.53it/s] 88%|████████▊ | 11212/12776 [1:59:44<10:01,  2.60it/s]                                                        88%|████████▊ | 11212/12776 [1:59:44<10:01,  2.60it/s] 88%|████████▊ | 11213/12776 [1:59:44<09:16,  2.81it/s]                                                        88%|████████▊ | 11213/12776 [1:59:44<09:16,  2.81it/s] 88%|████████▊ | 11214/12776 [1:59:44<08:38,  3.01it/s]                                                        88%|████████▊ | 11214/12776 [1:59:44<08:38,  3.01it/s] 88%|████████▊ | 11215/12776 [1:59:45<08:07,  3.20it/s]                                                        88%|████████▊ | 11215/12776 [1:59:45<08:07,  3.20it/s] 88%|████████▊ | 11216/12776 [1:59:45<08:12,  3.17it/s]                                                        88%|████████▊ | 11216/12776 [1:59:45<08:12,  3.17it/s] 88%|████████▊ | 11217/12776 [1:59:45<07:56,  3.27it/s]                                                        88%|████████▊ | 11217/12776 [1:59:45<07:56,  3.27it/s] 88%|████████▊ | 11218/12776 [1:59:46<07:33,  3.43it/s]                                                        88%|████████▊ | 11218/12776 [1:59:46<07:33,  3.43it/s] 88%|████████▊ | 11219/12776 [1:59:46<07:10,  3.62it/s]                                                        88%|████████▊ | 11219/12776 [1:59:46<07:10,  3.62it/s] 88%|████████▊ | 11220/12776 [1:59:46<07:32,  3.44it/s]                                                        88%|████████▊ | 11220/12776 [1:59:46<07:32,  3.44it/s] 88%|████████▊ | 11221/12776 [1:59:46<07:01,  3.69it/s]                                                        88%|████████▊ | 11221/12776 [1:59:46<07:01,  3.69it/s] 88%|████████▊ | 11222/12776 [1:59:47<06:37,  3.90it/s]                                                        88%|████████▊ | 11222/12776 [1:59:47<06:37,  3.90it/s] 88%|████████▊ | 11223/12776 [1:59:47<06:19,  4.09it/s]                                                        88%|████████▊ | 11223/12776 [1:59:47<06:19,  4.09it/s] 88%|████████▊ | 11224/12776 [1:59:47<06:03,  4.27it/s]                                                        88%|████████▊ | 11224/12776 [1:59:47<06:03,  4.27it/s] 88%|████████▊ | 11225/12776 [1:59:47<06:14,  4.14it/s]                                                        88%|████████▊ | 11225/12776 [1:59:47<06:14,  4.14it/s] 88%|████████▊ | 11226/12776 [1:59:48<05:52,  4.40it/s]                                                        88%|████████▊ | 11226/12776 [1:59:48<05:52,  4.40it/s] 88%|████████▊ | 11227/12776 [1:59:48<05:35,  4.61it/s]                                                        88%|████████▊ | 11227/12776 [1:59:48<05:35,  4.61it/s] 88%|████████▊ | 11228/12776 [1:59:48<05:24,  4.77it/s]                                                        88%|████████▊ | 11228/12776 [1:59:48<05:24,  4.77it/s] 88%|████████▊ | 11229/12776 [1:59:48<05:13,  4.93it/s]                                                        88%|████████▊ | 11229/12776 [1:59:48<05:13,  4.93it/s] 88%|████████▊ | 11230/12776 [1:59:48<05:04,  5.09it/s]                                                        88%|████████▊ | 11230/12776 [1:59:48<05:04,  5.09it/s] 88%|████████▊ | 11231/12776 [1:59:49<05:35,  4.61it/s]                                                        88%|████████▊ | 11231/12776 [1:59:49<05:35,  4.61it/s] 88%|████████▊ | 11232/12776 [1:59:49<05:17,  4.86it/s]                                                        88%|████████▊ | 11232/12776 [1:59:49<05:17,  4.86it/s] 88%|████████▊ | 11233/12776 [1:59:49<05:03,  5.08it/s]                                                       {'loss': 0.4363, 'grad_norm': 1.3997137546539307, 'learning_rate': 3.998044965786901e-05, 'epoch': 1.75}
+{'loss': 0.6176, 'grad_norm': 1.8607534170150757, 'learning_rate': 3.9956011730205276e-05, 'epoch': 1.75}
+{'loss': 0.6712, 'grad_norm': 2.017686605453491, 'learning_rate': 3.993157380254154e-05, 'epoch': 1.75}
+{'loss': 0.553, 'grad_norm': 2.3117191791534424, 'learning_rate': 3.990713587487781e-05, 'epoch': 1.75}
+{'loss': 0.3159, 'grad_norm': 2.349550247192383, 'learning_rate': 3.988269794721407e-05, 'epoch': 1.75}
+{'loss': 0.5679, 'grad_norm': 2.04817533493042, 'learning_rate': 3.985826001955034e-05, 'epoch': 1.75}
+{'loss': 0.4161, 'grad_norm': 7.92338228225708, 'learning_rate': 3.983382209188661e-05, 'epoch': 1.75}
+{'loss': 0.4948, 'grad_norm': 2.4229795932769775, 'learning_rate': 3.980938416422287e-05, 'epoch': 1.75}
+{'loss': 0.7086, 'grad_norm': 3.308713436126709, 'learning_rate': 3.9784946236559136e-05, 'epoch': 1.75}
+{'loss': 0.3026, 'grad_norm': 1.2768535614013672, 'learning_rate': 3.976050830889541e-05, 'epoch': 1.75}
+{'loss': 0.6669, 'grad_norm': 2.1575565338134766, 'learning_rate': 3.9736070381231666e-05, 'epoch': 1.75}
+{'loss': 0.5661, 'grad_norm': 2.513392448425293, 'learning_rate': 3.971163245356794e-05, 'epoch': 1.75}
+{'loss': 0.3623, 'grad_norm': 2.0377068519592285, 'learning_rate': 3.96871945259042e-05, 'epoch': 1.75}
+{'loss': 1.0925, 'grad_norm': 3.4843387603759766, 'learning_rate': 3.966275659824047e-05, 'epoch': 1.75}
+{'loss': 0.7483, 'grad_norm': 2.729309320449829, 'learning_rate': 3.963831867057673e-05, 'epoch': 1.75}
+{'loss': 0.5983, 'grad_norm': 3.7597618103027344, 'learning_rate': 3.9613880742913e-05, 'epoch': 1.75}
+{'loss': 0.7542, 'grad_norm': 3.6038522720336914, 'learning_rate': 3.958944281524926e-05, 'epoch': 1.75}
+{'loss': 0.9845, 'grad_norm': 2.91158390045166, 'learning_rate': 3.956500488758553e-05, 'epoch': 1.75}
+{'loss': 0.6904, 'grad_norm': 5.084097385406494, 'learning_rate': 3.95405669599218e-05, 'epoch': 1.75}
+{'loss': 0.6184, 'grad_norm': 2.9743316173553467, 'learning_rate': 3.951612903225806e-05, 'epoch': 1.75}
+{'loss': 0.5974, 'grad_norm': 5.7044172286987305, 'learning_rate': 3.949169110459433e-05, 'epoch': 1.75}
+{'loss': 1.2483, 'grad_norm': 2.698913097381592, 'learning_rate': 3.94672531769306e-05, 'epoch': 1.75}
+{'loss': 0.9042, 'grad_norm': 3.227285623550415, 'learning_rate': 3.944281524926686e-05, 'epoch': 1.75}
+{'loss': 0.9025, 'grad_norm': 3.3017385005950928, 'learning_rate': 3.941837732160313e-05, 'epoch': 1.75}
+{'loss': 0.437, 'grad_norm': 3.2290198802948, 'learning_rate': 3.939393939393939e-05, 'epoch': 1.75}
+{'loss': 0.7402, 'grad_norm': 3.120434522628784, 'learning_rate': 3.936950146627565e-05, 'epoch': 1.75}
+{'loss': 0.5906, 'grad_norm': 2.314493417739868, 'learning_rate': 3.934506353861192e-05, 'epoch': 1.75}
+{'loss': 1.206, 'grad_norm': 1.895832896232605, 'learning_rate': 3.9320625610948195e-05, 'epoch': 1.75}
+{'loss': 1.0012, 'grad_norm': 1.9516853094100952, 'learning_rate': 3.929618768328445e-05, 'epoch': 1.75}
+{'loss': 0.6269, 'grad_norm': 1.7214974164962769, 'learning_rate': 3.9271749755620724e-05, 'epoch': 1.75}
+{'loss': 0.8477, 'grad_norm': 2.6578586101531982, 'learning_rate': 3.924731182795699e-05, 'epoch': 1.75}
+{'loss': 0.7678, 'grad_norm': 2.1132657527923584, 'learning_rate': 3.922287390029325e-05, 'epoch': 1.75}
+{'loss': 0.6208, 'grad_norm': 3.0268445014953613, 'learning_rate': 3.919843597262952e-05, 'epoch': 1.75}
+{'loss': 0.8431, 'grad_norm': 4.13357400894165, 'learning_rate': 3.917399804496579e-05, 'epoch': 1.75}
+{'loss': 0.1721, 'grad_norm': 0.46277809143066406, 'learning_rate': 3.914956011730205e-05, 'epoch': 1.75}
+{'loss': 0.271, 'grad_norm': 0.6381204724311829, 'learning_rate': 3.912512218963832e-05, 'epoch': 1.75}
+{'loss': 0.3409, 'grad_norm': 1.2064273357391357, 'learning_rate': 3.9100684261974585e-05, 'epoch': 1.75}
+{'loss': 0.2178, 'grad_norm': 0.6451768279075623, 'learning_rate': 3.907624633431084e-05, 'epoch': 1.75}
+{'loss': 0.2548, 'grad_norm': 0.9377066493034363, 'learning_rate': 3.9051808406647114e-05, 'epoch': 1.75}
+{'loss': 0.2966, 'grad_norm': 1.069899320602417, 'learning_rate': 3.902737047898337e-05, 'epoch': 1.75}
+{'loss': 0.211, 'grad_norm': 1.6345460414886475, 'learning_rate': 3.9002932551319644e-05, 'epoch': 1.75}
+{'loss': 0.2923, 'grad_norm': 1.789792537689209, 'learning_rate': 3.8978494623655915e-05, 'epoch': 1.75}
+{'loss': 0.5062, 'grad_norm': 1.5872666835784912, 'learning_rate': 3.895405669599217e-05, 'epoch': 1.75}
+{'loss': 0.2508, 'grad_norm': 2.3181369304656982, 'learning_rate': 3.892961876832844e-05, 'epoch': 1.75}
+{'loss': 0.3586, 'grad_norm': 2.8838610649108887, 'learning_rate': 3.890518084066471e-05, 'epoch': 1.75}
+{'loss': 0.2989, 'grad_norm': 1.250127911567688, 'learning_rate': 3.888074291300097e-05, 'epoch': 1.75}
+{'loss': 0.3618, 'grad_norm': 0.9054256677627563, 'learning_rate': 3.885630498533724e-05, 'epoch': 1.75}
+{'loss': 0.6206, 'grad_norm': 3.5090181827545166, 'learning_rate': 3.8831867057673504e-05, 'epoch': 1.75}
+{'loss': 0.26, 'grad_norm': 1.8899184465408325, 'learning_rate': 3.880742913000977e-05, 'epoch': 1.75}
+{'loss': 0.4928, 'grad_norm': 1.9521454572677612, 'learning_rate': 3.8782991202346034e-05, 'epoch': 1.75}
+{'loss': 0.3366, 'grad_norm': 1.03826904296875, 'learning_rate': 3.8758553274682305e-05, 'epoch': 1.75}
+{'loss': 0.3439, 'grad_norm': 0.876063346862793, 'learning_rate': 3.873411534701856e-05, 'epoch': 1.75}
+{'loss': 0.2939, 'grad_norm': 3.456061363220215, 'learning_rate': 3.8709677419354835e-05, 'epoch': 1.75}
+{'loss': 0.5492, 'grad_norm': 1.792235016822815, 'learning_rate': 3.86852394916911e-05, 'epoch': 1.75}
+{'loss': 0.6838, 'grad_norm': 8.215913772583008, 'learning_rate': 3.8660801564027364e-05, 'epoch': 1.75}
+{'loss': 0.3709, 'grad_norm': 8.348455429077148, 'learning_rate': 3.863636363636363e-05, 'epoch': 1.75}
+{'loss': 0.3818, 'grad_norm': 1.567761778831482, 'learning_rate': 3.86119257086999e-05, 'epoch': 1.76}
+{'loss': 0.606, 'grad_norm': 1.7908146381378174, 'learning_rate': 3.858748778103616e-05, 'epoch': 1.76}
+{'loss': 0.5006, 'grad_norm': 2.4007041454315186, 'learning_rate': 3.856304985337243e-05, 'epoch': 1.76}
+{'loss': 0.5744, 'grad_norm': 1.7256523370742798, 'learning_rate': 3.8538611925708695e-05, 'epoch': 1.76}
+{'loss': 0.4488, 'grad_norm': 2.6474239826202393, 'learning_rate': 3.851417399804496e-05, 'epoch': 1.76}
+{'loss': 0.4047, 'grad_norm': 6.710626125335693, 'learning_rate': 3.8489736070381225e-05, 'epoch': 1.76}
+{'loss': 0.735, 'grad_norm': 2.4664697647094727, 'learning_rate': 3.8465298142717496e-05, 'epoch': 1.76}
+{'loss': 0.4921, 'grad_norm': 1.39060640335083, 'learning_rate': 3.8440860215053754e-05, 'epoch': 1.76}
+{'loss': 0.5779, 'grad_norm': 3.047032117843628, 'learning_rate': 3.8416422287390026e-05, 'epoch': 1.76}
+{'loss': 0.6209, 'grad_norm': 1.9049490690231323, 'learning_rate': 3.839198435972629e-05, 'epoch': 1.76}
+{'loss': 0.4992, 'grad_norm': 2.976374626159668, 'learning_rate': 3.8367546432062556e-05, 'epoch': 1.76}
+{'loss': 0.8037, 'grad_norm': 2.006279706954956, 'learning_rate': 3.834310850439882e-05, 'epoch': 1.76}
+{'loss': 0.9672, 'grad_norm': 4.789150238037109, 'learning_rate': 3.831867057673509e-05, 'epoch': 1.76}
+{'loss': 0.5118, 'grad_norm': 4.674681186676025, 'learning_rate': 3.829423264907135e-05, 'epoch': 1.76}
+{'loss': 0.7725, 'grad_norm': 8.977636337280273, 'learning_rate': 3.826979472140762e-05, 'epoch': 1.76}
+{'loss': 0.5044, 'grad_norm': 3.718141794204712, 'learning_rate': 3.8245356793743886e-05, 'epoch': 1.76}
+{'loss': 0.8362, 'grad_norm': 2.4552061557769775, 'learning_rate': 3.822091886608015e-05, 'epoch': 1.76}
+{'loss': 1.0312, 'grad_norm': 4.80755615234375, 'learning_rate': 3.8196480938416416e-05, 'epoch': 1.76}
+{'loss': 1.6852, 'grad_norm': 5.233382225036621, 'learning_rate': 3.817204301075269e-05, 'epoch': 1.76}
+{'loss': 1.3879, 'grad_norm': 5.555087566375732, 'learning_rate': 3.8147605083088946e-05, 'epoch': 1.76}
+{'loss': 1.2419, 'grad_norm': 5.3640923500061035, 'learning_rate': 3.812316715542522e-05, 'epoch': 1.76}
+{'loss': 0.566, 'grad_norm': 1.4658650159835815, 'learning_rate': 3.809872922776148e-05, 'epoch': 1.76}
+ 88%|████████▊ | 11233/12776 [1:59:49<05:03,  5.08it/s] 88%|████████▊ | 11234/12776 [1:59:49<04:52,  5.27it/s]                                                        88%|████████▊ | 11234/12776 [1:59:49<04:52,  5.27it/s] 88%|████████▊ | 11235/12776 [1:59:49<04:43,  5.44it/s]                                                        88%|████████▊ | 11235/12776 [1:59:49<04:43,  5.44it/s] 88%|████████▊ | 11236/12776 [1:59:49<04:34,  5.61it/s]                                                        88%|████████▊ | 11236/12776 [1:59:49<04:34,  5.61it/s] 88%|████████▊ | 11237/12776 [1:59:50<05:04,  5.06it/s]                                                        88%|████████▊ | 11237/12776 [1:59:50<05:04,  5.06it/s] 88%|████████▊ | 11238/12776 [1:59:50<09:02,  2.84it/s]                                                        88%|████████▊ | 11238/12776 [1:59:50<09:02,  2.84it/s] 88%|████████▊ | 11239/12776 [1:59:52<18:02,  1.42it/s]                                                        88%|████████▊ | 11239/12776 [1:59:52<18:02,  1.42it/s] 88%|████████▊ | 11240/12776 [1:59:53<20:51,  1.23it/s]                                                        88%|████████▊ | 11240/12776 [1:59:53<20:51,  1.23it/s] 88%|████████▊ | 11241/12776 [1:59:54<21:07,  1.21it/s]                                                        88%|████████▊ | 11241/12776 [1:59:54<21:07,  1.21it/s] 88%|████████▊ | 11242/12776 [1:59:55<20:32,  1.24it/s]                                                        88%|████████▊ | 11242/12776 [1:59:55<20:32,  1.24it/s] 88%|████████▊ | 11243/12776 [1:59:55<19:47,  1.29it/s]                                                        88%|████████▊ | 11243/12776 [1:59:55<19:47,  1.29it/s] 88%|████████▊ | 11244/12776 [1:59:56<19:01,  1.34it/s]                                                        88%|████████▊ | 11244/12776 [1:59:56<19:01,  1.34it/s] 88%|████████▊ | 11245/12776 [1:59:57<18:52,  1.35it/s]                                                        88%|████████▊ | 11245/12776 [1:59:57<18:52,  1.35it/s] 88%|████████▊ | 11246/12776 [1:59:57<17:51,  1.43it/s]                                                        88%|████████▊ | 11246/12776 [1:59:57<17:51,  1.43it/s] 88%|████████▊ | 11247/12776 [1:59:58<17:10,  1.48it/s]                                                        88%|████████▊ | 11247/12776 [1:59:58<17:10,  1.48it/s] 88%|████████▊ | 11248/12776 [1:59:58<16:13,  1.57it/s]                                                        88%|████████▊ | 11248/12776 [1:59:58<16:13,  1.57it/s] 88%|████████▊ | 11249/12776 [1:59:59<15:35,  1.63it/s]                                                        88%|████████▊ | 11249/12776 [1:59:59<15:35,  1.63it/s] 88%|████████▊ | 11250/12776 [2:00:00<14:52,  1.71it/s]                                                        88%|████████▊ | 11250/12776 [2:00:00<14:52,  1.71it/s] 88%|████████▊ | 11251/12776 [2:00:00<15:00,  1.69it/s]                                                        88%|████████▊ | 11251/12776 [2:00:00<15:00,  1.69it/s] 88%|████████▊ | 11252/12776 [2:00:01<14:10,  1.79it/s]                                                        88%|████████▊ | 11252/12776 [2:00:01<14:10,  1.79it/s] 88%|████████▊ | 11253/12776 [2:00:01<13:57,  1.82it/s]                                                        88%|████████▊ | 11253/12776 [2:00:01<13:57,  1.82it/s] 88%|████████▊ | 11254/12776 [2:00:02<12:56,  1.96it/s]                                                        88%|████████▊ | 11254/12776 [2:00:02<12:56,  1.96it/s] 88%|████████▊ | 11255/12776 [2:00:02<13:02,  1.94it/s]                                                        88%|████████▊ | 11255/12776 [2:00:02<13:02,  1.94it/s] 88%|████████▊ | 11256/12776 [2:00:02<11:59,  2.11it/s]                                                        88%|████████▊ | 11256/12776 [2:00:02<11:59,  2.11it/s] 88%|████████▊ | 11257/12776 [2:00:03<11:10,  2.27it/s]                                                        88%|████████▊ | 11257/12776 [2:00:03<11:10,  2.27it/s] 88%|████████▊ | 11258/12776 [2:00:03<11:04,  2.28it/s]                                                        88%|████████▊ | 11258/12776 [2:00:03<11:04,  2.28it/s] 88%|████████▊ | 11259/12776 [2:00:04<10:17,  2.46it/s]                                                        88%|████████▊ | 11259/12776 [2:00:04<10:17,  2.46it/s] 88%|████████▊ | 11260/12776 [2:00:04<09:38,  2.62it/s]                                                        88%|████████▊ | 11260/12776 [2:00:04<09:38,  2.62it/s] 88%|████████▊ | 11261/12776 [2:00:04<09:06,  2.77it/s]                                                        88%|████████▊ | 11261/12776 [2:00:04<09:06,  2.77it/s] 88%|████████▊ | 11262/12776 [2:00:05<09:21,  2.69it/s]                                                        88%|████████▊ | 11262/12776 [2:00:05<09:21,  2.69it/s] 88%|████████▊ | 11263/12776 [2:00:05<08:47,  2.87it/s]                                                        88%|████████▊ | 11263/12776 [2:00:05<08:47,  2.87it/s] 88%|████████▊ | 11264/12776 [2:00:05<08:16,  3.05it/s]                                                        88%|████████▊ | 11264/12776 [2:00:05<08:16,  3.05it/s] 88%|████████▊ | 11265/12776 [2:00:06<08:46,  2.87it/s]                                                        88%|████████▊ | 11265/12776 [2:00:06<08:46,  2.87it/s] 88%|████████▊ | 11266/12776 [2:00:06<08:08,  3.09it/s]                                                        88%|████████▊ | 11266/12776 [2:00:06<08:08,  3.09it/s] 88%|████████▊ | 11267/12776 [2:00:06<07:38,  3.29it/s]                                                        88%|████████▊ | 11267/12776 [2:00:06<07:38,  3.29it/s] 88%|████████▊ | 11268/12776 [2:00:06<07:14,  3.47it/s]                                                        88%|████████▊ | 11268/12776 [2:00:06<07:14,  3.47it/s] 88%|████████▊ | 11269/12776 [2:00:07<07:39,  3.28it/s]                                                        88%|████████▊ | 11269/12776 [2:00:07<07:39,  3.28it/s] 88%|████████▊ | 11270/12776 [2:00:07<07:06,  3.53it/s]                                                        88%|████████▊ | 11270/12776 [2:00:07<07:06,  3.53it/s] 88%|████████▊ | 11271/12776 [2:00:07<06:40,  3.75it/s]                                                        88%|████████▊ | 11271/12776 [2:00:07<06:40,  3.75it/s] 88%|████████▊ | 11272/12776 [2:00:07<06:21,  3.94it/s]                                                        88%|████████▊ | 11272/12776 [2:00:07<06:21,  3.94it/s] 88%|████████▊ | 11273/12776 [2:00:08<06:06,  4.11it/s]                                                        88%|████████▊ | 11273/12776 [2:00:08<06:06,  4.11it/s] 88%|████████▊ | 11274/12776 [2:00:08<06:44,  3.71it/s]                                                        88%|████████▊ | 11274/12776 [2:00:08<06:44,  3.71it/s] 88%|████████▊ | 11275/12776 [2:00:08<06:16,  3.99it/s]                                                        88%|████████▊ | 11275/12776 [2:00:08<06:16,  3.99it/s] 88%|████████▊ | 11276/12776 [2:00:08<05:53,  4.24it/s]                                                        88%|████████▊ | 11276/12776 [2:00:08<05:53,  4.24it/s] 88%|████████▊ | 11277/12776 [2:00:09<05:35,  4.47it/s]                                                        88%|████████▊ | 11277/12776 [2:00:09<05:35,  4.47it/s] 88%|████████▊ | 11278/12776 [2:00:09<05:20,  4.67it/s]                                                        88%|████████▊ | 11278/12776 [2:00:09<05:20,  4.67it/s] 88%|████████▊ | 11279/12776 [2:00:09<05:56,  4.20it/s]                                                        88%|████████▊ | 11279/12776 [2:00:09<05:56,  4.20it/s] 88%|████████▊ | 11280/12776 [2:00:09<05:31,  4.51it/s]                                                        88%|████████▊ | 11280/12776 [2:00:09<05:31,  4.51it/s] 88%|████████▊ | 11281/12776 [2:00:09<05:13,  4.76it/s]                                                        88%|████████▊ | 11281/12776 [2:00:09<05:13,  4.76it/s] 88%|████████▊ | 11282/12776 [2:00:10<05:01,  4.95it/s]                                                        88%|████████▊ | 11282/12776 [2:00:10<05:01,  4.95it/s] 88%|████████▊ | 11283/12776 [2:00:10<04:49,  5.16it/s]                                                        88%|████████▊ | 11283/12776 [2:00:10<04:49,  5.16it/s] 88%|████████▊ | 11284/12776 [2:00:10<04:38,  5.35it/s]                                                        88%|████████▊ | 11284/12776 [2:00:10<04:38,  5.35it/s] 88%|████████▊ | 11285/12776 [2:00:10<04:56,  5.03it/s]                                                        88%|████████▊ | 11285/12776 [2:00:10<04:56,  5.03it/s] 88%|████████▊ | 11286/12776 [2:00:10<04:40,  5.32it/s]                                                        88%|████████▊ | 11286/12776 [2:00:10<04:40,  5.32it/s] 88%|████████▊ | 11287/12776 [2:00:10<04:27,  5.56it/s]                                                        88%|████████▊ | 11287/12776 [2:00:10<04:27,  5.56it/s] 88%|████████▊ | 11288/12776 [2:00:11<09:10,  2.71it/s]                                                        88%|████████▊ | 11288/12776 [2:00:11<09:10,  2.71it/s] 88%|████████▊ | 11289/12776 [2:00:13<16:59,  1.46it/s]                                                        88%|████████▊ | 11289/12776 [2:00:13<16:59,  1.46it/s] 88%|████████▊ | 11290/12776 [2:00:14<18:47,  1.32it/s]                                                        88%|████████▊ | 11290/12776 [2:00:14<18:47,  1.32it/s] 88%|████████▊ | 11291/12776 [2:00:14<19:23,  1.28it/s]                                                        88%|████████▊ | 11291/12776 [2:00:14<19:23,  1.28it/s] 88%|████████▊ | 11292/12776 [2:00:15<19:29,  1.27it/s]                                                        88%|████████▊ | 11292/12776 [2:00:15<19:29,  1.27it/s] 88%|████████▊ | 11293/12776 [2:00:16<20:00,  1.24it/s]                                                        88%|████████▊ | 11293/12776 [2:00:16<20:00,  1.24it/s] 88%|████████▊ | 11294/12776 [2:00:17<20:10,  1.22it/s]                                                        88%|████████▊ | 11294/12776 [2:00:17<20:10,  1.22it/s] 88%|████████▊ | 11295/12776 [2:00:18<19:02,  1.30it/s]                                                        88%|████████▊ | 11295/12776 [2:00:18<19:02,  1.30it/s] 88%|████████▊ | 11296/12776 [2:00:18<19:04,  1.29it/s]                                                        88%|████████▊ | 11296/12776 [2:00:18<19:04,  1.29it/s] 88%|████████▊ | 11297/12776 [2:00:19<17:52,  1.38it/s]                                                        88%|████████▊ | 11297/12776 [2:00:19<17:52,  1.38it/s] 88%|████████▊ | 11298/12776 [2:00:20<16:59,  1.45it/s]                                                        88%|████████▊ | 11298/12776 [2:00:20<16:59,  1.45it/s] 88%|████████▊ | 11299/12776 [2:00:20<15:54,  1.55it/s]                                                        88%|████████▊ | 11299/12776 [2:00:20<15:54,  1.55it/s] 88%|████████▊ | 11300/12776 [2:00:21<16:08,  1.52it/s]                                                        88%|████████▊ | 11300/12776 [2:00:21<16:08,  1.52it/s] 88%|████████▊ | 11301/12776 [2:00:21<14:56,  1.64it/s]                                                        88%|████████▊ | 11301/12776 [2:00:21<14:56,  1.64it/s] 88%|████████▊ | 11302/12776 [2:00:22<14:22,  1.71it/s]                                                        88%|████████▊ | 11302/12776 [2:00:22<14:22,  1.71it/s] 88%|████████▊ | 11303/12776 [2:00:22<13:21,  1.84it/s]                                                        88%|████████▊ | 11303/12776 [2:00:22<13:21,  1.84it/s] 88%|████████▊ | 11304/12776 [2:00:23<12:30,  1.96it/s]                                                        88%|████████▊ | 11304/12776 [2:00:23<12:30,  1.96it/s] 88%|████████▊ | 11305/12776 [2:00:23<11:53,  2.06it/s]                                                        88%|████████▊ | 11305/12776 [2:00:23<11:53,  2.06it/s] 88%|████████▊ | 11306/12776 [2:00:24<11:12,  2.19it/s]                                                        88%|████████▊ | 11306/12776 [2:00:24<11:12,  2.19it/s] 89%|████████▊ | 11307/12776 [2:00:24<11:18,  2.17it/s]                                                        89%|████████▊ | 11307/12776 [2:00:24<11:18,  2.17it/s] 89%|████████▊ | 11308/12776 [2:00:24<10:36,  2.31it/s]                                                        89%|████████▊ | 11308/12776 [2:00:24<10:36,  2.31it/s] 89%|████████▊ | 11309/12776 [2:00:25<10:02,  2.43it/s]                                                        89%|████████▊ | 11309/12776 [2:00:25<10:02,  2.43it/s] 89%|████████▊ | 11310/12776 [2:00:25<10:22,  2.36it/s]                                                        89%|████████▊ | 11310/12776 [2:00:25<10:22,  2.36it/s] 89%|████████▊ | 11311/12776 [2:00:26<09:45,  2.50it/s]                                                       {'loss': 1.0166, 'grad_norm': 10.554597854614258, 'learning_rate': 3.807429130009775e-05, 'epoch': 1.76}
+{'loss': 0.2771, 'grad_norm': 2.0296871662139893, 'learning_rate': 3.804985337243401e-05, 'epoch': 1.76}
+{'loss': 0.7007, 'grad_norm': 2.8788039684295654, 'learning_rate': 3.802541544477028e-05, 'epoch': 1.76}
+{'loss': 0.7984, 'grad_norm': 3.588125228881836, 'learning_rate': 3.800097751710654e-05, 'epoch': 1.76}
+{'loss': 1.3304, 'grad_norm': 3.308217763900757, 'learning_rate': 3.797653958944281e-05, 'epoch': 1.76}
+{'loss': 1.4925, 'grad_norm': 2.5939760208129883, 'learning_rate': 3.795210166177908e-05, 'epoch': 1.76}
+{'loss': 0.2275, 'grad_norm': 0.596777617931366, 'learning_rate': 3.792766373411534e-05, 'epoch': 1.76}
+{'loss': 0.2228, 'grad_norm': 1.5214952230453491, 'learning_rate': 3.790322580645161e-05, 'epoch': 1.76}
+{'loss': 0.2029, 'grad_norm': 3.9668772220611572, 'learning_rate': 3.787878787878788e-05, 'epoch': 1.76}
+{'loss': 0.6679, 'grad_norm': 2.6096763610839844, 'learning_rate': 3.785434995112414e-05, 'epoch': 1.76}
+{'loss': 0.2772, 'grad_norm': 0.46722492575645447, 'learning_rate': 3.782991202346041e-05, 'epoch': 1.76}
+{'loss': 0.2544, 'grad_norm': 1.0145384073257446, 'learning_rate': 3.780547409579667e-05, 'epoch': 1.76}
+{'loss': 0.2445, 'grad_norm': 0.5798178315162659, 'learning_rate': 3.778103616813294e-05, 'epoch': 1.76}
+{'loss': 0.2651, 'grad_norm': 0.9099991917610168, 'learning_rate': 3.77565982404692e-05, 'epoch': 1.76}
+{'loss': 0.2738, 'grad_norm': 0.7325129508972168, 'learning_rate': 3.7732160312805474e-05, 'epoch': 1.76}
+{'loss': 0.2302, 'grad_norm': 0.7348865866661072, 'learning_rate': 3.770772238514173e-05, 'epoch': 1.76}
+{'loss': 0.2498, 'grad_norm': 0.6817787885665894, 'learning_rate': 3.7683284457478004e-05, 'epoch': 1.76}
+{'loss': 0.3367, 'grad_norm': 5.162353038787842, 'learning_rate': 3.765884652981427e-05, 'epoch': 1.76}
+{'loss': 0.24, 'grad_norm': 0.7174679636955261, 'learning_rate': 3.7634408602150534e-05, 'epoch': 1.76}
+{'loss': 0.3054, 'grad_norm': 1.1641637086868286, 'learning_rate': 3.76099706744868e-05, 'epoch': 1.76}
+{'loss': 0.3698, 'grad_norm': 1.7706217765808105, 'learning_rate': 3.758553274682307e-05, 'epoch': 1.76}
+{'loss': 0.2319, 'grad_norm': 0.8290618658065796, 'learning_rate': 3.756109481915933e-05, 'epoch': 1.76}
+{'loss': 0.3196, 'grad_norm': 1.1751283407211304, 'learning_rate': 3.75366568914956e-05, 'epoch': 1.76}
+{'loss': 1.5936, 'grad_norm': 5.826601982116699, 'learning_rate': 3.7512218963831864e-05, 'epoch': 1.76}
+{'loss': 0.3376, 'grad_norm': 1.5065512657165527, 'learning_rate': 3.748778103616813e-05, 'epoch': 1.76}
+{'loss': 0.3552, 'grad_norm': 1.3743765354156494, 'learning_rate': 3.7463343108504394e-05, 'epoch': 1.76}
+{'loss': 0.3617, 'grad_norm': 1.3303420543670654, 'learning_rate': 3.743890518084066e-05, 'epoch': 1.76}
+{'loss': 0.3999, 'grad_norm': 0.8232090473175049, 'learning_rate': 3.741446725317693e-05, 'epoch': 1.76}
+{'loss': 0.4174, 'grad_norm': 1.9334932565689087, 'learning_rate': 3.7390029325513195e-05, 'epoch': 1.76}
+{'loss': 0.3955, 'grad_norm': 1.5762377977371216, 'learning_rate': 3.736559139784946e-05, 'epoch': 1.76}
+{'loss': 0.5943, 'grad_norm': 1.6700406074523926, 'learning_rate': 3.7341153470185725e-05, 'epoch': 1.76}
+{'loss': 0.4755, 'grad_norm': 1.3183702230453491, 'learning_rate': 3.731671554252199e-05, 'epoch': 1.76}
+{'loss': 0.4606, 'grad_norm': 1.5598750114440918, 'learning_rate': 3.7292277614858254e-05, 'epoch': 1.76}
+{'loss': 0.4197, 'grad_norm': 1.2456644773483276, 'learning_rate': 3.7267839687194526e-05, 'epoch': 1.76}
+{'loss': 0.6294, 'grad_norm': 2.95483660697937, 'learning_rate': 3.724340175953079e-05, 'epoch': 1.76}
+{'loss': 0.3793, 'grad_norm': 1.7271779775619507, 'learning_rate': 3.7218963831867055e-05, 'epoch': 1.76}
+{'loss': 0.7018, 'grad_norm': 3.223212957382202, 'learning_rate': 3.719452590420332e-05, 'epoch': 1.76}
+{'loss': 0.6636, 'grad_norm': 7.701069355010986, 'learning_rate': 3.7170087976539585e-05, 'epoch': 1.76}
+{'loss': 0.8444, 'grad_norm': 1.763765573501587, 'learning_rate': 3.714565004887585e-05, 'epoch': 1.76}
+{'loss': 0.8607, 'grad_norm': 4.53816032409668, 'learning_rate': 3.712121212121212e-05, 'epoch': 1.76}
+{'loss': 0.611, 'grad_norm': 3.1749961376190186, 'learning_rate': 3.7096774193548386e-05, 'epoch': 1.76}
+{'loss': 1.0795, 'grad_norm': 5.895051956176758, 'learning_rate': 3.707233626588465e-05, 'epoch': 1.76}
+{'loss': 0.9543, 'grad_norm': 4.479997634887695, 'learning_rate': 3.7047898338220916e-05, 'epoch': 1.77}
+{'loss': 0.9561, 'grad_norm': 4.240951061248779, 'learning_rate': 3.702346041055718e-05, 'epoch': 1.77}
+{'loss': 0.8473, 'grad_norm': 3.595991849899292, 'learning_rate': 3.6999022482893445e-05, 'epoch': 1.77}
+{'loss': 1.0942, 'grad_norm': 3.00730037689209, 'learning_rate': 3.697458455522972e-05, 'epoch': 1.77}
+{'loss': 0.9541, 'grad_norm': 2.618523359298706, 'learning_rate': 3.695014662756598e-05, 'epoch': 1.77}
+{'loss': 1.0638, 'grad_norm': 2.935112237930298, 'learning_rate': 3.692570869990225e-05, 'epoch': 1.77}
+{'loss': 1.0236, 'grad_norm': 3.7480924129486084, 'learning_rate': 3.690127077223851e-05, 'epoch': 1.77}
+{'loss': 1.3881, 'grad_norm': 3.3669211864471436, 'learning_rate': 3.6876832844574776e-05, 'epoch': 1.77}
+{'loss': 1.0829, 'grad_norm': 2.294576406478882, 'learning_rate': 3.685239491691104e-05, 'epoch': 1.77}
+{'loss': 1.1662, 'grad_norm': 2.1476564407348633, 'learning_rate': 3.682795698924731e-05, 'epoch': 1.77}
+{'loss': 0.613, 'grad_norm': 1.508748173713684, 'learning_rate': 3.680351906158358e-05, 'epoch': 1.77}
+{'loss': 0.6305, 'grad_norm': 1.9185620546340942, 'learning_rate': 3.677908113391984e-05, 'epoch': 1.77}
+{'loss': 0.6495, 'grad_norm': 2.450026750564575, 'learning_rate': 3.675464320625611e-05, 'epoch': 1.77}
+{'loss': 0.5036, 'grad_norm': 2.664174795150757, 'learning_rate': 3.673020527859237e-05, 'epoch': 1.77}
+{'loss': 0.2408, 'grad_norm': 1.3441131114959717, 'learning_rate': 3.6705767350928637e-05, 'epoch': 1.77}
+{'loss': 0.2059, 'grad_norm': 1.2940044403076172, 'learning_rate': 3.66813294232649e-05, 'epoch': 1.77}
+{'loss': 0.1938, 'grad_norm': 0.5093015432357788, 'learning_rate': 3.665689149560117e-05, 'epoch': 1.77}
+{'loss': 0.213, 'grad_norm': 0.7823761105537415, 'learning_rate': 3.663245356793744e-05, 'epoch': 1.77}
+{'loss': 0.3456, 'grad_norm': 1.0145026445388794, 'learning_rate': 3.66080156402737e-05, 'epoch': 1.77}
+{'loss': 0.1895, 'grad_norm': 1.1629821062088013, 'learning_rate': 3.658357771260997e-05, 'epoch': 1.77}
+{'loss': 0.3128, 'grad_norm': 1.3927007913589478, 'learning_rate': 3.655913978494623e-05, 'epoch': 1.77}
+{'loss': 0.4427, 'grad_norm': 1.2417762279510498, 'learning_rate': 3.65347018572825e-05, 'epoch': 1.77}
+{'loss': 0.3086, 'grad_norm': 1.2290184497833252, 'learning_rate': 3.651026392961877e-05, 'epoch': 1.77}
+{'loss': 0.3525, 'grad_norm': 1.231251835823059, 'learning_rate': 3.648582600195503e-05, 'epoch': 1.77}
+{'loss': 0.2957, 'grad_norm': 1.0766758918762207, 'learning_rate': 3.64613880742913e-05, 'epoch': 1.77}
+{'loss': 0.2731, 'grad_norm': 0.900638222694397, 'learning_rate': 3.643695014662756e-05, 'epoch': 1.77}
+{'loss': 0.4606, 'grad_norm': 5.631997108459473, 'learning_rate': 3.641251221896383e-05, 'epoch': 1.77}
+{'loss': 0.4096, 'grad_norm': 1.9502328634262085, 'learning_rate': 3.638807429130009e-05, 'epoch': 1.77}
+{'loss': 0.3263, 'grad_norm': 2.7071378231048584, 'learning_rate': 3.6363636363636364e-05, 'epoch': 1.77}
+{'loss': 0.3564, 'grad_norm': 2.8107943534851074, 'learning_rate': 3.633919843597263e-05, 'epoch': 1.77}
+{'loss': 0.3068, 'grad_norm': 1.432214617729187, 'learning_rate': 3.6314760508308894e-05, 'epoch': 1.77}
+{'loss': 0.6622, 'grad_norm': 6.236049652099609, 'learning_rate': 3.629032258064516e-05, 'epoch': 1.77}
+{'loss': 0.4399, 'grad_norm': 1.3720581531524658, 'learning_rate': 3.626588465298142e-05, 'epoch': 1.77}
+{'loss': 0.6246, 'grad_norm': 6.22260856628418, 'learning_rate': 3.624144672531769e-05, 'epoch': 1.77}
+{'loss': 0.3889, 'grad_norm': 1.9137059450149536, 'learning_rate': 3.621700879765396e-05, 'epoch': 1.77}
+{'loss': 0.5285, 'grad_norm': 2.993480682373047, 'learning_rate': 3.6192570869990225e-05, 'epoch': 1.77}
+ 89%|████████▊ | 11311/12776 [2:00:26<09:45,  2.50it/s] 89%|████████▊ | 11312/12776 [2:00:26<09:15,  2.63it/s]                                                        89%|████████▊ | 11312/12776 [2:00:26<09:15,  2.63it/s] 89%|████████▊ | 11313/12776 [2:00:26<09:21,  2.60it/s]                                                        89%|████████▊ | 11313/12776 [2:00:26<09:21,  2.60it/s] 89%|████████▊ | 11314/12776 [2:00:27<08:47,  2.77it/s]                                                        89%|████████▊ | 11314/12776 [2:00:27<08:47,  2.77it/s] 89%|████████▊ | 11315/12776 [2:00:27<08:20,  2.92it/s]                                                        89%|████████▊ | 11315/12776 [2:00:27<08:20,  2.92it/s] 89%|████████▊ | 11316/12776 [2:00:27<08:33,  2.84it/s]                                                        89%|████████▊ | 11316/12776 [2:00:27<08:33,  2.84it/s] 89%|████████▊ | 11317/12776 [2:00:28<08:05,  3.01it/s]                                                        89%|████████▊ | 11317/12776 [2:00:28<08:05,  3.01it/s] 89%|████████▊ | 11318/12776 [2:00:28<07:42,  3.15it/s]                                                        89%|████████▊ | 11318/12776 [2:00:28<07:42,  3.15it/s] 89%|████████▊ | 11319/12776 [2:00:28<07:24,  3.28it/s]                                                        89%|████████▊ | 11319/12776 [2:00:28<07:24,  3.28it/s] 89%|████████▊ | 11320/12776 [2:00:28<07:20,  3.30it/s]                                                        89%|████████▊ | 11320/12776 [2:00:28<07:20,  3.30it/s] 89%|████████▊ | 11321/12776 [2:00:29<07:00,  3.46it/s]                                                        89%|████████▊ | 11321/12776 [2:00:29<07:00,  3.46it/s] 89%|████████▊ | 11322/12776 [2:00:29<06:42,  3.61it/s]                                                        89%|████████▊ | 11322/12776 [2:00:29<06:42,  3.61it/s] 89%|████████▊ | 11323/12776 [2:00:29<06:28,  3.74it/s]                                                        89%|████████▊ | 11323/12776 [2:00:29<06:28,  3.74it/s] 89%|████████▊ | 11324/12776 [2:00:29<06:16,  3.86it/s]                                                        89%|████████▊ | 11324/12776 [2:00:29<06:16,  3.86it/s] 89%|████████▊ | 11325/12776 [2:00:30<06:32,  3.70it/s]                                                        89%|████████▊ | 11325/12776 [2:00:30<06:32,  3.70it/s] 89%|████████▊ | 11326/12776 [2:00:30<06:13,  3.89it/s]                                                        89%|████████▊ | 11326/12776 [2:00:30<06:13,  3.89it/s] 89%|████████▊ | 11327/12776 [2:00:30<05:57,  4.06it/s]                                                        89%|████████▊ | 11327/12776 [2:00:30<05:57,  4.06it/s] 89%|████████▊ | 11328/12776 [2:00:30<05:43,  4.22it/s]                                                        89%|████████▊ | 11328/12776 [2:00:30<05:43,  4.22it/s] 89%|████████▊ | 11329/12776 [2:00:31<06:17,  3.83it/s]                                                        89%|████████▊ | 11329/12776 [2:00:31<06:17,  3.83it/s] 89%|████████▊ | 11330/12776 [2:00:31<05:55,  4.07it/s]                                                        89%|████████▊ | 11330/12776 [2:00:31<05:55,  4.07it/s] 89%|████████▊ | 11331/12776 [2:00:31<05:37,  4.28it/s]                                                        89%|████████▊ | 11331/12776 [2:00:31<05:37,  4.28it/s] 89%|████████▊ | 11332/12776 [2:00:31<05:26,  4.42it/s]                                                        89%|████████▊ | 11332/12776 [2:00:31<05:26,  4.42it/s] 89%|████████▊ | 11333/12776 [2:00:32<05:14,  4.58it/s]                                                        89%|████████▊ | 11333/12776 [2:00:32<05:14,  4.58it/s] 89%|████████▊ | 11334/12776 [2:00:32<05:45,  4.17it/s]                                                        89%|████████▊ | 11334/12776 [2:00:32<05:45,  4.17it/s] 89%|████████▊ | 11335/12776 [2:00:32<05:26,  4.42it/s]                                                        89%|���███████▊ | 11335/12776 [2:00:32<05:26,  4.42it/s] 89%|████████▊ | 11336/12776 [2:00:32<05:11,  4.63it/s]                                                        89%|████████▊ | 11336/12776 [2:00:32<05:11,  4.63it/s] 89%|████████▊ | 11337/12776 [2:00:32<04:57,  4.84it/s]                                                        89%|████████▊ | 11337/12776 [2:00:32<04:57,  4.84it/s] 89%|████████▊ | 11338/12776 [2:00:33<08:54,  2.69it/s]                                                        89%|████████▊ | 11338/12776 [2:00:33<08:54,  2.69it/s] 89%|████████▉ | 11339/12776 [2:00:35<18:48,  1.27it/s]                                                        89%|████████▉ | 11339/12776 [2:00:35<18:48,  1.27it/s] 89%|████████▉ | 11340/12776 [2:00:36<20:12,  1.18it/s]                                                        89%|████████▉ | 11340/12776 [2:00:36<20:12,  1.18it/s] 89%|████████▉ | 11341/12776 [2:00:37<20:42,  1.16it/s]                                                        89%|████████▉ | 11341/12776 [2:00:37<20:42,  1.16it/s] 89%|████████▉ | 11342/12776 [2:00:38<20:21,  1.17it/s]                                                        89%|████████▉ | 11342/12776 [2:00:38<20:21,  1.17it/s] 89%|████████▉ | 11343/12776 [2:00:38<19:25,  1.23it/s]                                                        89%|████████▉ | 11343/12776 [2:00:38<19:25,  1.23it/s] 89%|████████▉ | 11344/12776 [2:00:39<18:55,  1.26it/s]                                                        89%|████████▉ | 11344/12776 [2:00:39<18:55,  1.26it/s] 89%|████████▉ | 11345/12776 [2:00:40<17:51,  1.34it/s]                                                        89%|████████▉ | 11345/12776 [2:00:40<17:51,  1.34it/s] 89%|████████▉ | 11346/12776 [2:00:40<16:57,  1.41it/s]                                                        89%|████████▉ | 11346/12776 [2:00:40<16:57,  1.41it/s] 89%|████████▉ | 11347/12776 [2:00:41<17:10,  1.39it/s]                                                        89%|████████▉ | 11347/12776 [2:00:41<17:10,  1.39it/s] 89%|████████▉ | 11348/12776 [2:00:42<16:04,  1.48it/s]                                                        89%|████████▉ | 11348/12776 [2:00:42<16:04,  1.48it/s] 89%|████████▉ | 11349/12776 [2:00:42<15:29,  1.53it/s]                                                        89%|████████▉ | 11349/12776 [2:00:42<15:29,  1.53it/s] 89%|████████▉ | 11350/12776 [2:00:43<14:31,  1.64it/s]                                                        89%|████████▉ | 11350/12776 [2:00:43<14:31,  1.64it/s] 89%|████████▉ | 11351/12776 [2:00:43<14:32,  1.63it/s]                                                        89%|████████▉ | 11351/12776 [2:00:43<14:32,  1.63it/s] 89%|████████▉ | 11352/12776 [2:00:44<13:34,  1.75it/s]                                                        89%|████████▉ | 11352/12776 [2:00:44<13:34,  1.75it/s] 89%|████████▉ | 11353/12776 [2:00:44<12:40,  1.87it/s]                                                        89%|████████▉ | 11353/12776 [2:00:44<12:40,  1.87it/s] 89%|████████▉ | 11354/12776 [2:00:45<12:24,  1.91it/s]                                                        89%|████████▉ | 11354/12776 [2:00:45<12:24,  1.91it/s] 89%|████████▉ | 11355/12776 [2:00:45<11:42,  2.02it/s]                                                        89%|████████▉ | 11355/12776 [2:00:45<11:42,  2.02it/s] 89%|████████▉ | 11356/12776 [2:00:46<11:29,  2.06it/s]                                                        89%|████████▉ | 11356/12776 [2:00:46<11:29,  2.06it/s] 89%|████████▉ | 11357/12776 [2:00:46<10:48,  2.19it/s]                                                        89%|████████▉ | 11357/12776 [2:00:46<10:48,  2.19it/s] 89%|████████▉ | 11358/12776 [2:00:46<10:13,  2.31it/s]                                                        89%|████████▉ | 11358/12776 [2:00:46<10:13,  2.31it/s] 89%|████████▉ | 11359/12776 [2:00:47<09:56,  2.37it/s]                                                        89%|████████▉ | 11359/12776 [2:00:47<09:56,  2.37it/s] 89%|████████▉ | 11360/12776 [2:00:47<09:25,  2.51it/s]                                                        89%|████████▉ | 11360/12776 [2:00:47<09:25,  2.51it/s] 89%|████████▉ | 11361/12776 [2:00:48<08:57,  2.63it/s]                                                        89%|████████▉ | 11361/12776 [2:00:48<08:57,  2.63it/s] 89%|████████▉ | 11362/12776 [2:00:48<09:25,  2.50it/s]                                                        89%|████████▉ | 11362/12776 [2:00:48<09:25,  2.50it/s] 89%|████████▉ | 11363/12776 [2:00:48<08:48,  2.67it/s]                                                        89%|████████▉ | 11363/12776 [2:00:48<08:48,  2.67it/s] 89%|████████▉ | 11364/12776 [2:00:49<08:15,  2.85it/s]                                                        89%|████████▉ | 11364/12776 [2:00:49<08:15,  2.85it/s] 89%|████████▉ | 11365/12776 [2:00:49<07:50,  3.00it/s]                                                        89%|████████▉ | 11365/12776 [2:00:49<07:50,  3.00it/s] 89%|████████▉ | 11366/12776 [2:00:49<08:06,  2.90it/s]                                                        89%|████████▉ | 11366/12776 [2:00:49<08:06,  2.90it/s] 89%|████████▉ | 11367/12776 [2:00:50<07:35,  3.09it/s]                                                        89%|████████▉ | 11367/12776 [2:00:50<07:35,  3.09it/s] 89%|████████▉ | 11368/12776 [2:00:50<07:10,  3.27it/s]                                                        89%|████████▉ | 11368/12776 [2:00:50<07:10,  3.27it/s] 89%|████████▉ | 11369/12776 [2:00:50<06:46,  3.46it/s]                                                        89%|████████▉ | 11369/12776 [2:00:50<06:46,  3.46it/s] 89%|████████▉ | 11370/12776 [2:00:50<07:02,  3.33it/s]                                                        89%|████████▉ | 11370/12776 [2:00:50<07:02,  3.33it/s] 89%|████████▉ | 11371/12776 [2:00:51<06:39,  3.52it/s]                                                        89%|████████▉ | 11371/12776 [2:00:51<06:39,  3.52it/s] 89%|████████▉ | 11372/12776 [2:00:51<06:21,  3.68it/s]                                                        89%|████████▉ | 11372/12776 [2:00:51<06:21,  3.68it/s] 89%|████████▉ | 11373/12776 [2:00:51<06:07,  3.82it/s]                                                        89%|████████▉ | 11373/12776 [2:00:51<06:07,  3.82it/s] 89%|████████▉ | 11374/12776 [2:00:51<06:22,  3.67it/s]                                                        89%|████████▉ | 11374/12776 [2:00:51<06:22,  3.67it/s] 89%|████████▉ | 11375/12776 [2:00:52<06:02,  3.86it/s]                                                        89%|████████▉ | 11375/12776 [2:00:52<06:02,  3.86it/s] 89%|████████▉ | 11376/12776 [2:00:52<05:45,  4.06it/s]                                                        89%|████████▉ | 11376/12776 [2:00:52<05:45,  4.06it/s] 89%|████████▉ | 11377/12776 [2:00:52<05:29,  4.25it/s]                                                        89%|████████▉ | 11377/12776 [2:00:52<05:29,  4.25it/s] 89%|████████▉ | 11378/12776 [2:00:52<05:18,  4.39it/s]                                                        89%|████████▉ | 11378/12776 [2:00:52<05:18,  4.39it/s] 89%|████████▉ | 11379/12776 [2:00:53<05:37,  4.14it/s]                                                        89%|████████▉ | 11379/12776 [2:00:53<05:37,  4.14it/s] 89%|████████▉ | 11380/12776 [2:00:53<05:20,  4.35it/s]                                                        89%|████████▉ | 11380/12776 [2:00:53<05:20,  4.35it/s] 89%|████████▉ | 11381/12776 [2:00:53<05:08,  4.52it/s]                                                        89%|████████▉ | 11381/12776 [2:00:53<05:08,  4.52it/s] 89%|████████▉ | 11382/12776 [2:00:53<04:58,  4.67it/s]                                                        89%|████████▉ | 11382/12776 [2:00:53<04:58,  4.67it/s] 89%|████████▉ | 11383/12776 [2:00:53<04:51,  4.78it/s]                                                        89%|████████▉ | 11383/12776 [2:00:53<04:51,  4.78it/s] 89%|████████▉ | 11384/12776 [2:00:54<04:44,  4.88it/s]                                                        89%|████████▉ | 11384/12776 [2:00:54<04:44,  4.88it/s] 89%|████████▉ | 11385/12776 [2:00:54<05:05,  4.55it/s]                                                        89%|████████▉ | 11385/12776 [2:00:54<05:05,  4.55it/s] 89%|████████▉ | 11386/12776 [2:00:54<04:53,  4.73it/s]                                                        89%|████████▉ | 11386/12776 [2:00:54<04:53,  4.73it/s] 89%|████████▉ | 11387/12776 [2:00:54<04:42,  4.91it/s]                                                        89%|████████▉ | 11387/12776 [2:00:54<04:42,  4.91it/s] 89%|████████▉ | 11388/12776 [2:00:55<08:25,  2.74it/s]                                                        89%|████████▉ | 11388/12776 [2:00:55<08:25,  2.74it/s] 89%|████████▉ | 11389/12776 [2:00:56<13:53,  1.66it/s]                                                       {'loss': 0.484, 'grad_norm': 4.049930095672607, 'learning_rate': 3.616813294232649e-05, 'epoch': 1.77}
+{'loss': 0.4673, 'grad_norm': 3.0351674556732178, 'learning_rate': 3.6143695014662754e-05, 'epoch': 1.77}
+{'loss': 0.42, 'grad_norm': 3.3933756351470947, 'learning_rate': 3.611925708699902e-05, 'epoch': 1.77}
+{'loss': 0.5587, 'grad_norm': 2.0746243000030518, 'learning_rate': 3.6094819159335284e-05, 'epoch': 1.77}
+{'loss': 0.4801, 'grad_norm': 2.525590658187866, 'learning_rate': 3.6070381231671555e-05, 'epoch': 1.77}
+{'loss': 0.6689, 'grad_norm': 3.3657305240631104, 'learning_rate': 3.604594330400782e-05, 'epoch': 1.77}
+{'loss': 0.5339, 'grad_norm': 2.2926595211029053, 'learning_rate': 3.602150537634408e-05, 'epoch': 1.77}
+{'loss': 0.4416, 'grad_norm': 2.5583808422088623, 'learning_rate': 3.599706744868035e-05, 'epoch': 1.77}
+{'loss': 0.4616, 'grad_norm': 3.0211122035980225, 'learning_rate': 3.5972629521016615e-05, 'epoch': 1.77}
+{'loss': 0.6978, 'grad_norm': 5.442112445831299, 'learning_rate': 3.594819159335288e-05, 'epoch': 1.77}
+{'loss': 0.6472, 'grad_norm': 2.6855411529541016, 'learning_rate': 3.592375366568915e-05, 'epoch': 1.77}
+{'loss': 0.5912, 'grad_norm': 1.714129090309143, 'learning_rate': 3.5899315738025416e-05, 'epoch': 1.77}
+{'loss': 0.7321, 'grad_norm': 5.199019908905029, 'learning_rate': 3.5874877810361674e-05, 'epoch': 1.77}
+{'loss': 0.9314, 'grad_norm': 2.025068759918213, 'learning_rate': 3.5850439882697945e-05, 'epoch': 1.77}
+{'loss': 0.6754, 'grad_norm': 3.1110622882843018, 'learning_rate': 3.582600195503421e-05, 'epoch': 1.77}
+{'loss': 0.6682, 'grad_norm': 1.5338470935821533, 'learning_rate': 3.5801564027370475e-05, 'epoch': 1.77}
+{'loss': 0.6782, 'grad_norm': 1.7983545064926147, 'learning_rate': 3.5777126099706746e-05, 'epoch': 1.77}
+{'loss': 0.6843, 'grad_norm': 8.5779390335083, 'learning_rate': 3.5752688172043004e-05, 'epoch': 1.77}
+{'loss': 0.8621, 'grad_norm': 2.1752936840057373, 'learning_rate': 3.572825024437927e-05, 'epoch': 1.77}
+{'loss': 0.4116, 'grad_norm': 4.906739711761475, 'learning_rate': 3.570381231671554e-05, 'epoch': 1.77}
+{'loss': 1.0547, 'grad_norm': 3.4587199687957764, 'learning_rate': 3.5679374389051806e-05, 'epoch': 1.77}
+{'loss': 0.8674, 'grad_norm': 2.0558602809906006, 'learning_rate': 3.565493646138807e-05, 'epoch': 1.77}
+{'loss': 1.3265, 'grad_norm': 6.779541492462158, 'learning_rate': 3.563049853372434e-05, 'epoch': 1.77}
+{'loss': 0.8637, 'grad_norm': 4.761612892150879, 'learning_rate': 3.56060606060606e-05, 'epoch': 1.77}
+{'loss': 0.3855, 'grad_norm': 1.2794733047485352, 'learning_rate': 3.5581622678396865e-05, 'epoch': 1.77}
+{'loss': 0.5897, 'grad_norm': 1.7263725996017456, 'learning_rate': 3.5557184750733136e-05, 'epoch': 1.77}
+{'loss': 1.131, 'grad_norm': 2.5758883953094482, 'learning_rate': 3.55327468230694e-05, 'epoch': 1.77}
+{'loss': 0.8907, 'grad_norm': 3.0026354789733887, 'learning_rate': 3.5508308895405666e-05, 'epoch': 1.77}
+{'loss': 0.3081, 'grad_norm': 0.7817611694335938, 'learning_rate': 3.548387096774193e-05, 'epoch': 1.78}
+{'loss': 0.2291, 'grad_norm': 1.6110485792160034, 'learning_rate': 3.5459433040078196e-05, 'epoch': 1.78}
+{'loss': 0.2239, 'grad_norm': 1.1875455379486084, 'learning_rate': 3.543499511241446e-05, 'epoch': 1.78}
+{'loss': 0.2251, 'grad_norm': 0.6927761435508728, 'learning_rate': 3.541055718475073e-05, 'epoch': 1.78}
+{'loss': 0.3533, 'grad_norm': 1.9501267671585083, 'learning_rate': 3.5386119257087e-05, 'epoch': 1.78}
+{'loss': 0.4284, 'grad_norm': 1.7915222644805908, 'learning_rate': 3.536168132942326e-05, 'epoch': 1.78}
+{'loss': 0.3234, 'grad_norm': 1.3947383165359497, 'learning_rate': 3.5337243401759526e-05, 'epoch': 1.78}
+{'loss': 0.3943, 'grad_norm': 3.555690288543701, 'learning_rate': 3.531280547409579e-05, 'epoch': 1.78}
+{'loss': 0.2929, 'grad_norm': 0.8692519068717957, 'learning_rate': 3.5288367546432056e-05, 'epoch': 1.78}
+{'loss': 0.3083, 'grad_norm': 1.724339246749878, 'learning_rate': 3.526392961876833e-05, 'epoch': 1.78}
+{'loss': 0.3695, 'grad_norm': 1.0251134634017944, 'learning_rate': 3.523949169110459e-05, 'epoch': 1.78}
+{'loss': 0.4185, 'grad_norm': 1.2282941341400146, 'learning_rate': 3.521505376344086e-05, 'epoch': 1.78}
+{'loss': 0.2435, 'grad_norm': 0.9312837719917297, 'learning_rate': 3.519061583577712e-05, 'epoch': 1.78}
+{'loss': 0.3492, 'grad_norm': 1.5906509160995483, 'learning_rate': 3.516617790811339e-05, 'epoch': 1.78}
+{'loss': 0.2204, 'grad_norm': 0.7517256140708923, 'learning_rate': 3.514173998044965e-05, 'epoch': 1.78}
+{'loss': 0.3857, 'grad_norm': 2.044954776763916, 'learning_rate': 3.5117302052785916e-05, 'epoch': 1.78}
+{'loss': 1.0261, 'grad_norm': 5.702359676361084, 'learning_rate': 3.509286412512219e-05, 'epoch': 1.78}
+{'loss': 0.5519, 'grad_norm': 1.4534990787506104, 'learning_rate': 3.506842619745845e-05, 'epoch': 1.78}
+{'loss': 0.474, 'grad_norm': 1.8530081510543823, 'learning_rate': 3.504398826979472e-05, 'epoch': 1.78}
+{'loss': 0.3142, 'grad_norm': 2.5802159309387207, 'learning_rate': 3.501955034213098e-05, 'epoch': 1.78}
+{'loss': 0.4216, 'grad_norm': 0.7793805599212646, 'learning_rate': 3.499511241446725e-05, 'epoch': 1.78}
+{'loss': 0.5939, 'grad_norm': 2.537649631500244, 'learning_rate': 3.497067448680351e-05, 'epoch': 1.78}
+{'loss': 0.4864, 'grad_norm': 3.798841953277588, 'learning_rate': 3.4946236559139784e-05, 'epoch': 1.78}
+{'loss': 0.6817, 'grad_norm': 2.482792377471924, 'learning_rate': 3.492179863147605e-05, 'epoch': 1.78}
+{'loss': 0.185, 'grad_norm': 0.7896741032600403, 'learning_rate': 3.489736070381231e-05, 'epoch': 1.78}
+{'loss': 0.5607, 'grad_norm': 2.387078046798706, 'learning_rate': 3.487292277614858e-05, 'epoch': 1.78}
+{'loss': 0.525, 'grad_norm': 7.810622215270996, 'learning_rate': 3.484848484848484e-05, 'epoch': 1.78}
+{'loss': 0.6811, 'grad_norm': 5.911670207977295, 'learning_rate': 3.482404692082111e-05, 'epoch': 1.78}
+{'loss': 0.4714, 'grad_norm': 1.889855146408081, 'learning_rate': 3.479960899315738e-05, 'epoch': 1.78}
+{'loss': 0.3659, 'grad_norm': 1.3557449579238892, 'learning_rate': 3.4775171065493644e-05, 'epoch': 1.78}
+{'loss': 0.6288, 'grad_norm': 3.040102481842041, 'learning_rate': 3.475073313782991e-05, 'epoch': 1.78}
+{'loss': 0.5326, 'grad_norm': 7.0420756340026855, 'learning_rate': 3.4726295210166174e-05, 'epoch': 1.78}
+{'loss': 0.6711, 'grad_norm': 2.125647783279419, 'learning_rate': 3.470185728250244e-05, 'epoch': 1.78}
+{'loss': 0.509, 'grad_norm': 9.777316093444824, 'learning_rate': 3.46774193548387e-05, 'epoch': 1.78}
+{'loss': 0.7898, 'grad_norm': 2.167560577392578, 'learning_rate': 3.4652981427174975e-05, 'epoch': 1.78}
+{'loss': 0.7889, 'grad_norm': 5.202997207641602, 'learning_rate': 3.462854349951124e-05, 'epoch': 1.78}
+{'loss': 1.0625, 'grad_norm': 3.4150354862213135, 'learning_rate': 3.4604105571847504e-05, 'epoch': 1.78}
+{'loss': 0.8714, 'grad_norm': 2.184953451156616, 'learning_rate': 3.457966764418377e-05, 'epoch': 1.78}
+{'loss': 1.0218, 'grad_norm': 2.1605887413024902, 'learning_rate': 3.4555229716520034e-05, 'epoch': 1.78}
+{'loss': 1.2488, 'grad_norm': 3.93656063079834, 'learning_rate': 3.45307917888563e-05, 'epoch': 1.78}
+{'loss': 0.8522, 'grad_norm': 2.7407288551330566, 'learning_rate': 3.450635386119257e-05, 'epoch': 1.78}
+{'loss': 1.3899, 'grad_norm': 2.3924989700317383, 'learning_rate': 3.4481915933528835e-05, 'epoch': 1.78}
+{'loss': 1.1462, 'grad_norm': 2.2750840187072754, 'learning_rate': 3.44574780058651e-05, 'epoch': 1.78}
+{'loss': 1.0301, 'grad_norm': 1.9890302419662476, 'learning_rate': 3.4433040078201365e-05, 'epoch': 1.78}
+{'loss': 0.8329, 'grad_norm': 1.8051940202713013, 'learning_rate': 3.440860215053763e-05, 'epoch': 1.78}
+{'loss': 0.2328, 'grad_norm': 2.2022345066070557, 'learning_rate': 3.4384164222873894e-05, 'epoch': 1.78}
+{'loss': 0.4919, 'grad_norm': 2.2471179962158203, 'learning_rate': 3.4359726295210166e-05, 'epoch': 1.78}
+{'loss': 0.7256, 'grad_norm': 3.4314959049224854, 'learning_rate': 3.433528836754643e-05, 'epoch': 1.78}
+{'loss': 0.6489, 'grad_norm': 2.0382392406463623, 'learning_rate': 3.4310850439882695e-05, 'epoch': 1.78}
+{'loss': 0.5647, 'grad_norm': 12.943007469177246, 'learning_rate': 3.428641251221896e-05, 'epoch': 1.78}
+ 89%|████████▉ | 11389/12776 [2:00:56<13:53,  1.66it/s] 89%|████████▉ | 11390/12776 [2:00:57<16:44,  1.38it/s]                                                        89%|████████▉ | 11390/12776 [2:00:57<16:44,  1.38it/s] 89%|████████▉ | 11391/12776 [2:00:58<17:28,  1.32it/s]                                                        89%|████████▉ | 11391/12776 [2:00:58<17:28,  1.32it/s] 89%|████████▉ | 11392/12776 [2:00:59<17:27,  1.32it/s]                                                        89%|████████▉ | 11392/12776 [2:00:59<17:27,  1.32it/s] 89%|████████▉ | 11393/12776 [2:00:59<17:17,  1.33it/s]                                                        89%|████████▉ | 11393/12776 [2:00:59<17:17,  1.33it/s] 89%|████████▉ | 11394/12776 [2:01:00<16:42,  1.38it/s]                                                        89%|████████▉ | 11394/12776 [2:01:00<16:42,  1.38it/s] 89%|████████▉ | 11395/12776 [2:01:01<16:44,  1.37it/s]                                                        89%|████████▉ | 11395/12776 [2:01:01<16:44,  1.37it/s] 89%|████████▉ | 11396/12776 [2:01:01<15:46,  1.46it/s]                                                        89%|████████▉ | 11396/12776 [2:01:01<15:46,  1.46it/s] 89%|████████▉ | 11397/12776 [2:01:02<15:06,  1.52it/s]                                                        89%|████████▉ | 11397/12776 [2:01:02<15:06,  1.52it/s] 89%|████████▉ | 11398/12776 [2:01:03<14:23,  1.60it/s]                                                        89%|████████▉ | 11398/12776 [2:01:03<14:23,  1.60it/s] 89%|████████▉ | 11399/12776 [2:01:03<13:54,  1.65it/s]                                                        89%|████████▉ | 11399/12776 [2:01:03<13:54,  1.65it/s] 89%|████████▉ | 11400/12776 [2:01:04<13:10,  1.74it/s]                                                        89%|████████▉ | 11400/12776 [2:01:04<13:10,  1.74it/s] 89%|████████▉ | 11401/12776 [2:01:04<12:55,  1.77it/s]                                                        89%|████████▉ | 11401/12776 [2:01:04<12:55,  1.77it/s] 89%|████████▉ | 11402/12776 [2:01:05<12:05,  1.89it/s]                                                        89%|████████▉ | 11402/12776 [2:01:05<12:05,  1.89it/s] 89%|████████▉ | 11403/12776 [2:01:05<11:55,  1.92it/s]                                                        89%|████████▉ | 11403/12776 [2:01:05<11:55,  1.92it/s] 89%|████████▉ | 11404/12776 [2:01:05<11:08,  2.05it/s]                                                        89%|████████▉ | 11404/12776 [2:01:05<11:08,  2.05it/s] 89%|████████▉ | 11405/12776 [2:01:06<10:26,  2.19it/s]                                                        89%|████████▉ | 11405/12776 [2:01:06<10:26,  2.19it/s] 89%|████████▉ | 11406/12776 [2:01:06<10:41,  2.14it/s]                                                        89%|████████▉ | 11406/12776 [2:01:06<10:41,  2.14it/s] 89%|████████▉ | 11407/12776 [2:01:07<09:53,  2.31it/s]                                                        89%|████████▉ | 11407/12776 [2:01:07<09:53,  2.31it/s] 89%|████████▉ | 11408/12776 [2:01:07<09:17,  2.45it/s]                                                        89%|████████▉ | 11408/12776 [2:01:07<09:17,  2.45it/s] 89%|████████▉ | 11409/12776 [2:01:08<09:29,  2.40it/s]                                                        89%|████████▉ | 11409/12776 [2:01:08<09:29,  2.40it/s] 89%|████████▉ | 11410/12776 [2:01:08<08:51,  2.57it/s]                                                        89%|████████▉ | 11410/12776 [2:01:08<08:51,  2.57it/s] 89%|████████▉ | 11411/12776 [2:01:08<08:18,  2.74it/s]                                                        89%|████████▉ | 11411/12776 [2:01:08<08:18,  2.74it/s] 89%|████████▉ | 11412/12776 [2:01:09<08:19,  2.73it/s]                                                        89%|████████▉ | 11412/12776 [2:01:09<08:19,  2.73it/s] 89%|████████▉ | 11413/12776 [2:01:09<07:52,  2.88it/s]                                                        89%|████████▉ | 11413/12776 [2:01:09<07:52,  2.88it/s] 89%|████████▉ | 11414/12776 [2:01:09<07:29,  3.03it/s]                                                        89%|████████▉ | 11414/12776 [2:01:09<07:29,  3.03it/s] 89%|████████▉ | 11415/12776 [2:01:09<07:10,  3.16it/s]                                                        89%|████████▉ | 11415/12776 [2:01:09<07:10,  3.16it/s] 89%|████████▉ | 11416/12776 [2:01:10<07:33,  3.00it/s]                                                        89%|████████▉ | 11416/12776 [2:01:10<07:33,  3.00it/s] 89%|████████▉ | 11417/12776 [2:01:10<07:06,  3.19it/s]                                                        89%|████████▉ | 11417/12776 [2:01:10<07:06,  3.19it/s] 89%|████████▉ | 11418/12776 [2:01:10<06:43,  3.37it/s]                                                        89%|████████▉ | 11418/12776 [2:01:10<06:43,  3.37it/s] 89%|████████▉ | 11419/12776 [2:01:11<06:25,  3.52it/s]                                                        89%|████████▉ | 11419/12776 [2:01:11<06:25,  3.52it/s] 89%|████████▉ | 11420/12776 [2:01:11<06:43,  3.36it/s]                                                        89%|████████▉ | 11420/12776 [2:01:11<06:43,  3.36it/s] 89%|████████▉ | 11421/12776 [2:01:11<06:21,  3.55it/s]                                                        89%|████████▉ | 11421/12776 [2:01:11<06:21,  3.55it/s] 89%|████████▉ | 11422/12776 [2:01:11<06:01,  3.74it/s]                                                        89%|████████▉ | 11422/12776 [2:01:11<06:01,  3.74it/s] 89%|████████▉ | 11423/12776 [2:01:12<05:45,  3.91it/s]                                                        89%|████████▉ | 11423/12776 [2:01:12<05:45,  3.91it/s] 89%|████████▉ | 11424/12776 [2:01:12<05:59,  3.77it/s]                                                        89%|████████▉ | 11424/12776 [2:01:12<05:59,  3.77it/s] 89%|████████▉ | 11425/12776 [2:01:12<05:38,  4.00it/s]                                                        89%|████████▉ | 11425/12776 [2:01:12<05:38,  4.00it/s] 89%|████████▉ | 11426/12776 [2:01:12<05:22,  4.18it/s]                                                        89%|████████▉ | 11426/12776 [2:01:12<05:22,  4.18it/s] 89%|████████▉ | 11427/12776 [2:01:13<05:12,  4.32it/s]                                                        89%|████████▉ | 11427/12776 [2:01:13<05:12,  4.32it/s] 89%|████████▉ | 11428/12776 [2:01:13<05:02,  4.45it/s]                                                        89%|████████▉ | 11428/12776 [2:01:13<05:02,  4.45it/s] 89%|████████▉ | 11429/12776 [2:01:13<05:20,  4.20it/s]                                                        89%|████████▉ | 11429/12776 [2:01:13<05:20,  4.20it/s] 89%|████████▉ | 11430/12776 [2:01:13<05:06,  4.39it/s]                                                        89%|████████▉ | 11430/12776 [2:01:13<05:06,  4.39it/s] 89%|████████▉ | 11431/12776 [2:01:13<04:56,  4.54it/s]                                                        89%|████████▉ | 11431/12776 [2:01:13<04:56,  4.54it/s] 89%|████████▉ | 11432/12776 [2:01:14<04:46,  4.69it/s]                                                        89%|████████▉ | 11432/12776 [2:01:14<04:46,  4.69it/s] 89%|████████▉ | 11433/12776 [2:01:14<04:39,  4.81it/s]                                                        89%|████████▉ | 11433/12776 [2:01:14<04:39,  4.81it/s] 89%|████████▉ | 11434/12776 [2:01:14<04:33,  4.90it/s]                                                        89%|████████▉ | 11434/12776 [2:01:14<04:33,  4.90it/s] 90%|████████▉ | 11435/12776 [2:01:14<04:56,  4.52it/s]                                                        90%|████████▉ | 11435/12776 [2:01:14<04:56,  4.52it/s] 90%|████████▉ | 11436/12776 [2:01:14<04:44,  4.70it/s]                                                        90%|████████▉ | 11436/12776 [2:01:14<04:44,  4.70it/s] 90%|████████▉ | 11437/12776 [2:01:15<04:33,  4.90it/s]                                                        90%|████████▉ | 11437/12776 [2:01:15<04:33,  4.90it/s] 90%|████████▉ | 11438/12776 [2:01:15<08:09,  2.73it/s]                                                        90%|████████▉ | 11438/12776 [2:01:15<08:09,  2.73it/s] 90%|████████▉ | 11439/12776 [2:01:17<15:43,  1.42it/s]                                                        90%|████████▉ | 11439/12776 [2:01:17<15:43,  1.42it/s] 90%|████████▉ | 11440/12776 [2:01:18<18:16,  1.22it/s]                                                        90%|████████▉ | 11440/12776 [2:01:18<18:16,  1.22it/s] 90%|████████▉ | 11441/12776 [2:01:19<18:40,  1.19it/s]                                                        90%|████████▉ | 11441/12776 [2:01:19<18:40,  1.19it/s] 90%|████████▉ | 11442/12776 [2:01:20<18:29,  1.20it/s]                                                        90%|████████▉ | 11442/12776 [2:01:20<18:29,  1.20it/s] 90%|████████▉ | 11443/12776 [2:01:20<17:55,  1.24it/s]                                                        90%|████████▉ | 11443/12776 [2:01:20<17:55,  1.24it/s] 90%|████████▉ | 11444/12776 [2:01:21<17:04,  1.30it/s]                                                        90%|████████▉ | 11444/12776 [2:01:21<17:04,  1.30it/s] 90%|████████▉ | 11445/12776 [2:01:22<16:24,  1.35it/s]                                                        90%|████████▉ | 11445/12776 [2:01:22<16:24,  1.35it/s] 90%|████████▉ | 11446/12776 [2:01:22<16:13,  1.37it/s]                                                        90%|████████▉ | 11446/12776 [2:01:22<16:13,  1.37it/s] 90%|████████▉ | 11447/12776 [2:01:23<15:25,  1.44it/s]                                                        90%|████████▉ | 11447/12776 [2:01:23<15:25,  1.44it/s] 90%|████████▉ | 11448/12776 [2:01:24<14:40,  1.51it/s]                                                        90%|████████▉ | 11448/12776 [2:01:24<14:40,  1.51it/s] 90%|████████▉ | 11449/12776 [2:01:24<13:56,  1.59it/s]                                                        90%|████████▉ | 11449/12776 [2:01:24<13:56,  1.59it/s] 90%|████████▉ | 11450/12776 [2:01:25<13:32,  1.63it/s]                                                        90%|████████▉ | 11450/12776 [2:01:25<13:32,  1.63it/s] 90%|████████▉ | 11451/12776 [2:01:25<12:49,  1.72it/s]                                                        90%|████████▉ | 11451/12776 [2:01:25<12:49,  1.72it/s] 90%|████████▉ | 11452/12776 [2:01:26<12:34,  1.75it/s]                                                        90%|████████▉ | 11452/12776 [2:01:26<12:34,  1.75it/s] 90%|████████▉ | 11453/12776 [2:01:26<11:49,  1.87it/s]                                                        90%|████████▉ | 11453/12776 [2:01:26<11:49,  1.87it/s] 90%|████████▉ | 11454/12776 [2:01:27<11:27,  1.92it/s]                                                        90%|████████▉ | 11454/12776 [2:01:27<11:27,  1.92it/s] 90%|████████▉ | 11455/12776 [2:01:27<10:48,  2.04it/s]                                                        90%|████████▉ | 11455/12776 [2:01:27<10:48,  2.04it/s] 90%|████████▉ | 11456/12776 [2:01:28<10:12,  2.16it/s]                                                        90%|████████▉ | 11456/12776 [2:01:28<10:12,  2.16it/s] 90%|████████▉ | 11457/12776 [2:01:28<10:21,  2.12it/s]                                                        90%|████████▉ | 11457/12776 [2:01:28<10:21,  2.12it/s] 90%|████████▉ | 11458/12776 [2:01:28<09:40,  2.27it/s]                                                        90%|████████▉ | 11458/12776 [2:01:28<09:40,  2.27it/s] 90%|████████▉ | 11459/12776 [2:01:29<09:06,  2.41it/s]                                                        90%|████████▉ | 11459/12776 [2:01:29<09:06,  2.41it/s] 90%|████████▉ | 11460/12776 [2:01:29<09:07,  2.40it/s]                                                        90%|████████▉ | 11460/12776 [2:01:29<09:07,  2.40it/s] 90%|████████▉ | 11461/12776 [2:01:30<08:34,  2.56it/s]                                                        90%|████████▉ | 11461/12776 [2:01:30<08:34,  2.56it/s] 90%|████████▉ | 11462/12776 [2:01:30<08:07,  2.69it/s]                                                        90%|████████▉ | 11462/12776 [2:01:30<08:07,  2.69it/s] 90%|████████▉ | 11463/12776 [2:01:30<07:53,  2.78it/s]                                                        90%|████████▉ | 11463/12776 [2:01:30<07:53,  2.78it/s] 90%|████████▉ | 11464/12776 [2:01:31<07:27,  2.93it/s]                                                        90%|████████▉ | 11464/12776 [2:01:31<07:27,  2.93it/s] 90%|████████▉ | 11465/12776 [2:01:31<07:06,  3.07it/s]                                                        90%|████████▉ | 11465/12776 [2:01:31<07:06,  3.07it/s] 90%|████████▉ | 11466/12776 [2:01:31<06:48,  3.21it/s]                                                        90%|████████▉ | 11466/12776 [2:01:31<06:48,  3.21it/s] 90%|████████▉ | 11467/12776 [2:01:31<07:18,  2.99it/s]                                                       {'loss': 0.234, 'grad_norm': 3.573213815689087, 'learning_rate': 3.4261974584555225e-05, 'epoch': 1.78}
+{'loss': 0.2099, 'grad_norm': 0.828106164932251, 'learning_rate': 3.423753665689149e-05, 'epoch': 1.78}
+{'loss': 0.2266, 'grad_norm': 0.6829121112823486, 'learning_rate': 3.421309872922776e-05, 'epoch': 1.78}
+{'loss': 0.2118, 'grad_norm': 0.8972326517105103, 'learning_rate': 3.4188660801564026e-05, 'epoch': 1.78}
+{'loss': 0.2796, 'grad_norm': 2.1261706352233887, 'learning_rate': 3.416422287390029e-05, 'epoch': 1.78}
+{'loss': 0.257, 'grad_norm': 0.843867838382721, 'learning_rate': 3.4139784946236556e-05, 'epoch': 1.78}
+{'loss': 0.3085, 'grad_norm': 0.617863655090332, 'learning_rate': 3.411534701857282e-05, 'epoch': 1.78}
+{'loss': 0.3067, 'grad_norm': 1.1008343696594238, 'learning_rate': 3.4090909090909085e-05, 'epoch': 1.78}
+{'loss': 0.2741, 'grad_norm': 2.097799777984619, 'learning_rate': 3.406647116324536e-05, 'epoch': 1.78}
+{'loss': 0.4028, 'grad_norm': 5.063704967498779, 'learning_rate': 3.404203323558162e-05, 'epoch': 1.78}
+{'loss': 0.2896, 'grad_norm': 0.8035471439361572, 'learning_rate': 3.401759530791789e-05, 'epoch': 1.78}
+{'loss': 0.4598, 'grad_norm': 2.3350865840911865, 'learning_rate': 3.399315738025415e-05, 'epoch': 1.78}
+{'loss': 0.2244, 'grad_norm': 1.9042445421218872, 'learning_rate': 3.3968719452590416e-05, 'epoch': 1.78}
+{'loss': 0.4026, 'grad_norm': 3.0151047706604004, 'learning_rate': 3.394428152492668e-05, 'epoch': 1.78}
+{'loss': 0.4166, 'grad_norm': 1.309158205986023, 'learning_rate': 3.391984359726295e-05, 'epoch': 1.79}
+{'loss': 0.4382, 'grad_norm': 0.8744258880615234, 'learning_rate': 3.389540566959922e-05, 'epoch': 1.79}
+{'loss': 0.3675, 'grad_norm': 1.478337049484253, 'learning_rate': 3.387096774193548e-05, 'epoch': 1.79}
+{'loss': 0.663, 'grad_norm': 1.9362159967422485, 'learning_rate': 3.384652981427175e-05, 'epoch': 1.79}
+{'loss': 0.7617, 'grad_norm': 1.9572466611862183, 'learning_rate': 3.382209188660801e-05, 'epoch': 1.79}
+{'loss': 0.3307, 'grad_norm': 0.7849676012992859, 'learning_rate': 3.379765395894428e-05, 'epoch': 1.79}
+{'loss': 0.5584, 'grad_norm': 2.1287667751312256, 'learning_rate': 3.377321603128055e-05, 'epoch': 1.79}
+{'loss': 0.3862, 'grad_norm': 1.399261713027954, 'learning_rate': 3.374877810361681e-05, 'epoch': 1.79}
+{'loss': 0.5256, 'grad_norm': 2.173130750656128, 'learning_rate': 3.372434017595308e-05, 'epoch': 1.79}
+{'loss': 0.7287, 'grad_norm': 1.911555290222168, 'learning_rate': 3.369990224828934e-05, 'epoch': 1.79}
+{'loss': 0.5673, 'grad_norm': 3.9470415115356445, 'learning_rate': 3.367546432062561e-05, 'epoch': 1.79}
+{'loss': 0.3453, 'grad_norm': 2.7635679244995117, 'learning_rate': 3.365102639296187e-05, 'epoch': 1.79}
+{'loss': 0.6953, 'grad_norm': 2.662376880645752, 'learning_rate': 3.3626588465298144e-05, 'epoch': 1.79}
+{'loss': 0.7028, 'grad_norm': 4.1886444091796875, 'learning_rate': 3.360215053763441e-05, 'epoch': 1.79}
+{'loss': 0.7296, 'grad_norm': 3.9045183658599854, 'learning_rate': 3.357771260997067e-05, 'epoch': 1.79}
+{'loss': 0.6113, 'grad_norm': 2.272017240524292, 'learning_rate': 3.355327468230694e-05, 'epoch': 1.79}
+{'loss': 0.3347, 'grad_norm': 2.5397136211395264, 'learning_rate': 3.35288367546432e-05, 'epoch': 1.79}
+{'loss': 0.5801, 'grad_norm': 4.905436992645264, 'learning_rate': 3.350439882697947e-05, 'epoch': 1.79}
+{'loss': 0.8952, 'grad_norm': 1.6117602586746216, 'learning_rate': 3.347996089931573e-05, 'epoch': 1.79}
+{'loss': 0.6686, 'grad_norm': 2.91456937789917, 'learning_rate': 3.3455522971652004e-05, 'epoch': 1.79}
+{'loss': 0.6183, 'grad_norm': 3.0732598304748535, 'learning_rate': 3.343108504398827e-05, 'epoch': 1.79}
+{'loss': 0.7562, 'grad_norm': 1.9497085809707642, 'learning_rate': 3.3406647116324534e-05, 'epoch': 1.79}
+{'loss': 0.7507, 'grad_norm': 5.999476909637451, 'learning_rate': 3.33822091886608e-05, 'epoch': 1.79}
+{'loss': 0.8865, 'grad_norm': 2.643611431121826, 'learning_rate': 3.335777126099706e-05, 'epoch': 1.79}
+{'loss': 1.0517, 'grad_norm': 6.0978803634643555, 'learning_rate': 3.333333333333333e-05, 'epoch': 1.79}
+{'loss': 0.9786, 'grad_norm': 1.9144535064697266, 'learning_rate': 3.33088954056696e-05, 'epoch': 1.79}
+{'loss': 1.137, 'grad_norm': 4.194984436035156, 'learning_rate': 3.3284457478005865e-05, 'epoch': 1.79}
+{'loss': 1.0596, 'grad_norm': 2.2141382694244385, 'learning_rate': 3.326001955034213e-05, 'epoch': 1.79}
+{'loss': 0.6956, 'grad_norm': 1.324143886566162, 'learning_rate': 3.3235581622678394e-05, 'epoch': 1.79}
+{'loss': 0.6576, 'grad_norm': 2.4526519775390625, 'learning_rate': 3.321114369501466e-05, 'epoch': 1.79}
+{'loss': 0.6591, 'grad_norm': 1.4210792779922485, 'learning_rate': 3.3186705767350924e-05, 'epoch': 1.79}
+{'loss': 0.3686, 'grad_norm': 1.1268730163574219, 'learning_rate': 3.3162267839687195e-05, 'epoch': 1.79}
+{'loss': 0.7514, 'grad_norm': 2.319267988204956, 'learning_rate': 3.313782991202346e-05, 'epoch': 1.79}
+{'loss': 0.7639, 'grad_norm': 2.7974939346313477, 'learning_rate': 3.3113391984359725e-05, 'epoch': 1.79}
+{'loss': 1.1673, 'grad_norm': 4.421706676483154, 'learning_rate': 3.308895405669599e-05, 'epoch': 1.79}
+{'loss': 0.8485, 'grad_norm': 2.369821310043335, 'learning_rate': 3.3064516129032255e-05, 'epoch': 1.79}
+{'loss': 0.3957, 'grad_norm': 2.404873847961426, 'learning_rate': 3.304007820136852e-05, 'epoch': 1.79}
+{'loss': 0.2733, 'grad_norm': 0.9003664255142212, 'learning_rate': 3.301564027370479e-05, 'epoch': 1.79}
+{'loss': 0.2546, 'grad_norm': 0.6666517853736877, 'learning_rate': 3.2991202346041056e-05, 'epoch': 1.79}
+{'loss': 0.2187, 'grad_norm': 0.587860107421875, 'learning_rate': 3.296676441837732e-05, 'epoch': 1.79}
+{'loss': 0.2001, 'grad_norm': 1.2249459028244019, 'learning_rate': 3.2942326490713585e-05, 'epoch': 1.79}
+{'loss': 0.3076, 'grad_norm': 1.0504732131958008, 'learning_rate': 3.291788856304985e-05, 'epoch': 1.79}
+{'loss': 0.2633, 'grad_norm': 0.6395697593688965, 'learning_rate': 3.2893450635386115e-05, 'epoch': 1.79}
+{'loss': 0.4211, 'grad_norm': 1.8298180103302002, 'learning_rate': 3.2869012707722386e-05, 'epoch': 1.79}
+{'loss': 0.3673, 'grad_norm': 1.2210631370544434, 'learning_rate': 3.284457478005865e-05, 'epoch': 1.79}
+{'loss': 0.2212, 'grad_norm': 0.7577955722808838, 'learning_rate': 3.282013685239491e-05, 'epoch': 1.79}
+{'loss': 0.294, 'grad_norm': 1.5126709938049316, 'learning_rate': 3.279569892473118e-05, 'epoch': 1.79}
+{'loss': 0.236, 'grad_norm': 1.66423499584198, 'learning_rate': 3.2771260997067446e-05, 'epoch': 1.79}
+{'loss': 0.3446, 'grad_norm': 1.5655282735824585, 'learning_rate': 3.274682306940371e-05, 'epoch': 1.79}
+{'loss': 0.3498, 'grad_norm': 0.9023974537849426, 'learning_rate': 3.272238514173998e-05, 'epoch': 1.79}
+{'loss': 0.3794, 'grad_norm': 1.6287553310394287, 'learning_rate': 3.269794721407625e-05, 'epoch': 1.79}
+{'loss': 0.6226, 'grad_norm': 2.9960827827453613, 'learning_rate': 3.2673509286412505e-05, 'epoch': 1.79}
+{'loss': 0.3432, 'grad_norm': 8.331686019897461, 'learning_rate': 3.2649071358748776e-05, 'epoch': 1.79}
+{'loss': 0.3093, 'grad_norm': 1.214149832725525, 'learning_rate': 3.262463343108504e-05, 'epoch': 1.79}
+{'loss': 0.424, 'grad_norm': 1.9629710912704468, 'learning_rate': 3.2600195503421306e-05, 'epoch': 1.79}
+{'loss': 0.8316, 'grad_norm': 4.693020820617676, 'learning_rate': 3.257575757575758e-05, 'epoch': 1.79}
+{'loss': 0.5424, 'grad_norm': 3.069445848464966, 'learning_rate': 3.2551319648093836e-05, 'epoch': 1.79}
+{'loss': 0.3951, 'grad_norm': 1.917677879333496, 'learning_rate': 3.25268817204301e-05, 'epoch': 1.79}
+{'loss': 0.4418, 'grad_norm': 1.1094872951507568, 'learning_rate': 3.250244379276637e-05, 'epoch': 1.79}
+{'loss': 0.6939, 'grad_norm': 1.7542659044265747, 'learning_rate': 3.247800586510264e-05, 'epoch': 1.79}
+{'loss': 0.5249, 'grad_norm': 4.421488285064697, 'learning_rate': 3.24535679374389e-05, 'epoch': 1.79}
+{'loss': 0.6136, 'grad_norm': 3.417511224746704, 'learning_rate': 3.242913000977517e-05, 'epoch': 1.79}
+{'loss': 1.0622, 'grad_norm': 4.0269622802734375, 'learning_rate': 3.240469208211143e-05, 'epoch': 1.79}
+{'loss': 0.371, 'grad_norm': 1.2010209560394287, 'learning_rate': 3.2380254154447696e-05, 'epoch': 1.79}
+ 90%|████████▉ | 11467/12776 [2:01:31<07:18,  2.99it/s] 90%|████████▉ | 11468/12776 [2:01:32<06:49,  3.19it/s]                                                        90%|████████▉ | 11468/12776 [2:01:32<06:49,  3.19it/s] 90%|████████▉ | 11469/12776 [2:01:32<06:25,  3.39it/s]                                                        90%|████████▉ | 11469/12776 [2:01:32<06:25,  3.39it/s] 90%|████████▉ | 11470/12776 [2:01:32<06:07,  3.55it/s]                                                        90%|████████▉ | 11470/12776 [2:01:32<06:07,  3.55it/s] 90%|████████▉ | 11471/12776 [2:01:33<06:36,  3.29it/s]                                                        90%|████████▉ | 11471/12776 [2:01:33<06:36,  3.29it/s] 90%|████████▉ | 11472/12776 [2:01:33<06:11,  3.51it/s]                                                        90%|████████▉ | 11472/12776 [2:01:33<06:11,  3.51it/s] 90%|████████▉ | 11473/12776 [2:01:33<05:52,  3.70it/s]                                                        90%|████████▉ | 11473/12776 [2:01:33<05:52,  3.70it/s] 90%|████████▉ | 11474/12776 [2:01:33<05:36,  3.87it/s]                                                        90%|████████▉ | 11474/12776 [2:01:33<05:36,  3.87it/s] 90%|████████▉ | 11475/12776 [2:01:34<05:49,  3.72it/s]                                                        90%|████████▉ | 11475/12776 [2:01:34<05:49,  3.72it/s] 90%|████████▉ | 11476/12776 [2:01:34<05:28,  3.96it/s]                                                        90%|████████▉ | 11476/12776 [2:01:34<05:28,  3.96it/s] 90%|████████▉ | 11477/12776 [2:01:34<05:11,  4.17it/s]                                                        90%|████████▉ | 11477/12776 [2:01:34<05:11,  4.17it/s] 90%|████████▉ | 11478/12776 [2:01:34<05:00,  4.32it/s]                                                        90%|████████▉ | 11478/12776 [2:01:34<05:00,  4.32it/s] 90%|████████▉ | 11479/12776 [2:01:34<04:51,  4.45it/s]                                                        90%|████████▉ | 11479/12776 [2:01:34<04:51,  4.45it/s] 90%|████████▉ | 11480/12776 [2:01:35<05:17,  4.08it/s]                                                        90%|████████▉ | 11480/12776 [2:01:35<05:17,  4.08it/s] 90%|████████▉ | 11481/12776 [2:01:35<05:00,  4.31it/s]                                                        90%|████████▉ | 11481/12776 [2:01:35<05:00,  4.31it/s] 90%|████████▉ | 11482/12776 [2:01:35<04:47,  4.50it/s]                                                        90%|████████▉ | 11482/12776 [2:01:35<04:47,  4.50it/s] 90%|████████▉ | 11483/12776 [2:01:35<04:37,  4.66it/s]                                                        90%|████████▉ | 11483/12776 [2:01:35<04:37,  4.66it/s] 90%|████████▉ | 11484/12776 [2:01:36<04:29,  4.79it/s]                                                        90%|████████▉ | 11484/12776 [2:01:36<04:29,  4.79it/s] 90%|████████▉ | 11485/12776 [2:01:36<04:22,  4.91it/s]                                                        90%|████████▉ | 11485/12776 [2:01:36<04:22,  4.91it/s] 90%|████████▉ | 11486/12776 [2:01:36<04:52,  4.42it/s]                                                        90%|████████▉ | 11486/12776 [2:01:36<04:52,  4.42it/s] 90%|████████▉ | 11487/12776 [2:01:36<04:35,  4.68it/s]                                                        90%|████████▉ | 11487/12776 [2:01:36<04:35,  4.68it/s] 90%|████████▉ | 11488/12776 [2:01:37<07:39,  2.80it/s]                                                        90%|████████▉ | 11488/12776 [2:01:37<07:39,  2.80it/s] 90%|████████▉ | 11489/12776 [2:01:38<13:49,  1.55it/s]                                                        90%|████████▉ | 11489/12776 [2:01:38<13:49,  1.55it/s] 90%|████████▉ | 11490/12776 [2:01:39<15:37,  1.37it/s]                                                        90%|████████▉ | 11490/12776 [2:01:39<15:37,  1.37it/s] 90%|████████▉ | 11491/12776 [2:01:40<16:17,  1.31it/s]                                                        90%|████████▉ | 11491/12776 [2:01:40<16:17,  1.31it/s] 90%|████████▉ | 11492/12776 [2:01:41<16:53,  1.27it/s]                                                        90%|████████▉ | 11492/12776 [2:01:41<16:53,  1.27it/s] 90%|████████▉ | 11493/12776 [2:01:42<17:30,  1.22it/s]                                                        90%|████████▉ | 11493/12776 [2:01:42<17:30,  1.22it/s] 90%|████████▉ | 11494/12776 [2:01:42<16:43,  1.28it/s]                                                        90%|████████▉ | 11494/12776 [2:01:42<16:43,  1.28it/s] 90%|████████▉ | 11495/12776 [2:01:43<16:18,  1.31it/s]                                                        90%|████████▉ | 11495/12776 [2:01:43<16:18,  1.31it/s] 90%|████████▉ | 11496/12776 [2:01:44<15:23,  1.39it/s]                                                        90%|████████▉ | 11496/12776 [2:01:44<15:23,  1.39it/s] 90%|████████▉ | 11497/12776 [2:01:44<14:35,  1.46it/s]                                                        90%|████████▉ | 11497/12776 [2:01:44<14:35,  1.46it/s] 90%|████████▉ | 11498/12776 [2:01:45<13:45,  1.55it/s]                                                        90%|████████▉ | 11498/12776 [2:01:45<13:45,  1.55it/s] 90%|█████████ | 11499/12776 [2:01:45<13:21,  1.59it/s]                                                        90%|█████████ | 11499/12776 [2:01:45<13:21,  1.59it/s] 90%|█████████ | 11500/12776 [2:01:46<12:37,  1.68it/s]                                                        90%|█████████ | 11500/12776 [2:01:46<12:37,  1.68it/s] 90%|█████████ | 11501/12776 [2:01:47<12:41,  1.67it/s]                                                        90%|█████████ | 11501/12776 [2:01:47<12:41,  1.67it/s] 90%|█████████ | 11502/12776 [2:01:47<11:47,  1.80it/s]                                                        90%|█████████ | 11502/12776 [2:01:47<11:47,  1.80it/s] 90%|█████████ | 11503/12776 [2:01:48<11:45,  1.80it/s]                                                        90%|█████████ | 11503/12776 [2:01:48<11:45,  1.80it/s] 90%|█████████ | 11504/12776 [2:01:48<11:00,  1.93it/s]                                                        90%|█████████ | 11504/12776 [2:01:48<11:00,  1.93it/s] 90%|█████████ | 11505/12776 [2:01:49<10:57,  1.93it/s]                                                        90%|█████████ | 11505/12776 [2:01:49<10:57,  1.93it/s] 90%|█████████ | 11506/12776 [2:01:49<10:16,  2.06it/s]                                                        90%|█████████ | 11506/12776 [2:01:49<10:16,  2.06it/s] 90%|█████████ | 11507/12776 [2:01:49<09:39,  2.19it/s]                                                        90%|█████████ | 11507/12776 [2:01:49<09:39,  2.19it/s] 90%|█████████ | 11508/12776 [2:01:50<10:03,  2.10it/s]                                                        90%|█████████ | 11508/12776 [2:01:50<10:03,  2.10it/s] 90%|█████████ | 11509/12776 [2:01:50<09:15,  2.28it/s]                                                        90%|█████████ | 11509/12776 [2:01:50<09:15,  2.28it/s] 90%|█████████ | 11510/12776 [2:01:51<08:36,  2.45it/s]                                                        90%|█████████ | 11510/12776 [2:01:51<08:36,  2.45it/s] 90%|█████████ | 11511/12776 [2:01:51<08:40,  2.43it/s]                                                        90%|█████████ | 11511/12776 [2:01:51<08:40,  2.43it/s] 90%|█████████ | 11512/12776 [2:01:51<08:05,  2.60it/s]                                                        90%|█████████ | 11512/12776 [2:01:51<08:05,  2.60it/s] 90%|█████████ | 11513/12776 [2:01:52<07:35,  2.77it/s]                                                        90%|█████████ | 11513/12776 [2:01:52<07:35,  2.77it/s] 90%|█████████ | 11514/12776 [2:01:52<07:34,  2.78it/s]                                                        90%|█████████ | 11514/12776 [2:01:52<07:34,  2.78it/s] 90%|█████████ | 11515/12776 [2:01:52<07:04,  2.97it/s]                                                        90%|█████████ | 11515/12776 [2:01:52<07:04,  2.97it/s] 90%|█████████ | 11516/12776 [2:01:53<06:41,  3.14it/s]                                                        90%|█████████ | 11516/12776 [2:01:53<06:41,  3.14it/s] 90%|█████████ | 11517/12776 [2:01:53<06:20,  3.31it/s]                                                        90%|█████████ | 11517/12776 [2:01:53<06:20,  3.31it/s] 90%|█████████ | 11518/12776 [2:01:53<06:25,  3.27it/s]                                                        90%|█████████ | 11518/12776 [2:01:53<06:25,  3.27it/s] 90%|█████████ | 11519/12776 [2:01:53<06:03,  3.46it/s]                                                        90%|█████████ | 11519/12776 [2:01:53<06:03,  3.46it/s] 90%|█████████ | 11520/12776 [2:01:54<05:45,  3.63it/s]                                                        90%|█████████ | 11520/12776 [2:01:54<05:45,  3.63it/s] 90%|█████████ | 11521/12776 [2:01:54<05:32,  3.78it/s]                                                        90%|█████████ | 11521/12776 [2:01:54<05:32,  3.78it/s] 90%|█████████ | 11522/12776 [2:01:54<05:04,  4.12it/s]                                                        90%|█████████ | 11522/12776 [2:01:54<05:04,  4.12it/s] 90%|█████████ | 11523/12776 [2:01:54<05:38,  3.70it/s]                                                        90%|█████████ | 11523/12776 [2:01:54<05:38,  3.70it/s] 90%|█████████ | 11524/12776 [2:01:55<05:19,  3.92it/s]                                                        90%|█████████ | 11524/12776 [2:01:55<05:19,  3.92it/s] 90%|█████████ | 11525/12776 [2:01:55<05:03,  4.13it/s]                                                        90%|█████████ | 11525/12776 [2:01:55<05:03,  4.13it/s] 90%|█████████ | 11526/12776 [2:01:55<04:50,  4.30it/s]                                                        90%|█████████ | 11526/12776 [2:01:55<04:50,  4.30it/s] 90%|█████████ | 11527/12776 [2:01:55<04:41,  4.43it/s]                                                        90%|█████████ | 11527/12776 [2:01:55<04:41,  4.43it/s] 90%|█████████ | 11528/12776 [2:01:56<05:06,  4.07it/s]                                                        90%|█████████ | 11528/12776 [2:01:56<05:06,  4.07it/s] 90%|█████████ | 11529/12776 [2:01:56<04:50,  4.29it/s]                                                        90%|█████████ | 11529/12776 [2:01:56<04:50,  4.29it/s] 90%|█████████ | 11530/12776 [2:01:56<04:37,  4.49it/s]                                                        90%|█████████ | 11530/12776 [2:01:56<04:37,  4.49it/s] 90%|█████████ | 11531/12776 [2:01:56<04:28,  4.64it/s]                                                        90%|█████████ | 11531/12776 [2:01:56<04:28,  4.64it/s] 90%|█████████ | 11532/12776 [2:01:56<04:21,  4.77it/s]                                                        90%|█████████ | 11532/12776 [2:01:56<04:21,  4.77it/s] 90%|█████████ | 11533/12776 [2:01:57<05:06,  4.05it/s]                                                        90%|█████████ | 11533/12776 [2:01:57<05:06,  4.05it/s] 90%|█████████ | 11534/12776 [2:01:57<04:45,  4.35it/s]                                                        90%|█████████ | 11534/12776 [2:01:57<04:45,  4.35it/s] 90%|█████████ | 11535/12776 [2:01:57<04:30,  4.58it/s]                                                        90%|█████████ | 11535/12776 [2:01:57<04:30,  4.58it/s] 90%|█████████ | 11536/12776 [2:01:57<04:19,  4.78it/s]                                                        90%|█████████ | 11536/12776 [2:01:57<04:19,  4.78it/s] 90%|█████████ | 11537/12776 [2:01:57<04:10,  4.94it/s]                                                        90%|█████████ | 11537/12776 [2:01:57<04:10,  4.94it/s] 90%|█████████ | 11538/12776 [2:01:58<07:10,  2.88it/s]                                                        90%|█████████ | 11538/12776 [2:01:58<07:10,  2.88it/s] 90%|█████████ | 11539/12776 [2:01:59<13:08,  1.57it/s]                                                        90%|█████████ | 11539/12776 [2:01:59<13:08,  1.57it/s] 90%|█████████ | 11540/12776 [2:02:00<15:10,  1.36it/s]                                                        90%|█████████ | 11540/12776 [2:02:00<15:10,  1.36it/s] 90%|█████████ | 11541/12776 [2:02:01<15:49,  1.30it/s]                                                        90%|█████████ | 11541/12776 [2:02:01<15:49,  1.30it/s] 90%|█████████ | 11542/12776 [2:02:02<15:41,  1.31it/s]                                                        90%|█████████ | 11542/12776 [2:02:02<15:41,  1.31it/s] 90%|█████████ | 11543/12776 [2:02:03<15:42,  1.31it/s]                                                        90%|█████████ | 11543/12776 [2:02:03<15:42,  1.31it/s] 90%|█████████ | 11544/12776 [2:02:03<15:28,  1.33it/s]                                                        90%|█████████ | 11544/12776 [2:02:03<15:28,  1.33it/s] 90%|█████████ | 11545/12776 [2:02:04<14:48,  1.39it/s]                                                       {'loss': 0.6811, 'grad_norm': 3.1197423934936523, 'learning_rate': 3.235581622678397e-05, 'epoch': 1.8}
+{'loss': 0.4798, 'grad_norm': 2.041327714920044, 'learning_rate': 3.233137829912023e-05, 'epoch': 1.8}
+{'loss': 0.4145, 'grad_norm': 2.050049066543579, 'learning_rate': 3.23069403714565e-05, 'epoch': 1.8}
+{'loss': 0.7301, 'grad_norm': 5.6066575050354, 'learning_rate': 3.228250244379277e-05, 'epoch': 1.8}
+{'loss': 0.9764, 'grad_norm': 2.6833720207214355, 'learning_rate': 3.225806451612903e-05, 'epoch': 1.8}
+{'loss': 0.3526, 'grad_norm': 1.4853190183639526, 'learning_rate': 3.223362658846529e-05, 'epoch': 1.8}
+{'loss': 0.4752, 'grad_norm': 1.8210937976837158, 'learning_rate': 3.220918866080156e-05, 'epoch': 1.8}
+{'loss': 0.9654, 'grad_norm': 2.770622968673706, 'learning_rate': 3.218475073313783e-05, 'epoch': 1.8}
+{'loss': 0.4661, 'grad_norm': 2.7523603439331055, 'learning_rate': 3.216031280547409e-05, 'epoch': 1.8}
+{'loss': 0.7784, 'grad_norm': 4.842382907867432, 'learning_rate': 3.213587487781036e-05, 'epoch': 1.8}
+{'loss': 1.0925, 'grad_norm': 2.4741690158843994, 'learning_rate': 3.211143695014662e-05, 'epoch': 1.8}
+{'loss': 0.5622, 'grad_norm': 1.1925292015075684, 'learning_rate': 3.208699902248289e-05, 'epoch': 1.8}
+{'loss': 1.3511, 'grad_norm': 3.3205652236938477, 'learning_rate': 3.206256109481916e-05, 'epoch': 1.8}
+{'loss': 1.5569, 'grad_norm': 3.456491231918335, 'learning_rate': 3.2038123167155424e-05, 'epoch': 1.8}
+{'loss': 0.8711, 'grad_norm': 3.728654146194458, 'learning_rate': 3.201368523949169e-05, 'epoch': 1.8}
+{'loss': 1.4595, 'grad_norm': 2.7066574096679688, 'learning_rate': 3.198924731182795e-05, 'epoch': 1.8}
+{'loss': 1.2391, 'grad_norm': 2.8487234115600586, 'learning_rate': 3.196480938416422e-05, 'epoch': 1.8}
+{'loss': 0.6185, 'grad_norm': 1.6365246772766113, 'learning_rate': 3.194037145650048e-05, 'epoch': 1.8}
+{'loss': 0.3946, 'grad_norm': 0.9667677879333496, 'learning_rate': 3.191593352883675e-05, 'epoch': 1.8}
+{'loss': 0.4003, 'grad_norm': 1.4002686738967896, 'learning_rate': 3.189149560117302e-05, 'epoch': 1.8}
+{'loss': 0.3993, 'grad_norm': 1.4334136247634888, 'learning_rate': 3.1867057673509284e-05, 'epoch': 1.8}
+{'loss': 0.837, 'grad_norm': 3.9255292415618896, 'learning_rate': 3.184261974584555e-05, 'epoch': 1.8}
+{'loss': 0.2766, 'grad_norm': 2.0887675285339355, 'learning_rate': 3.1818181818181814e-05, 'epoch': 1.8}
+{'loss': 0.4492, 'grad_norm': 1.7812479734420776, 'learning_rate': 3.179374389051808e-05, 'epoch': 1.8}
+{'loss': 0.2828, 'grad_norm': 1.9391423463821411, 'learning_rate': 3.176930596285434e-05, 'epoch': 1.8}
+{'loss': 0.3079, 'grad_norm': 1.2495836019515991, 'learning_rate': 3.1744868035190615e-05, 'epoch': 1.8}
+{'loss': 0.2513, 'grad_norm': 1.1032434701919556, 'learning_rate': 3.172043010752688e-05, 'epoch': 1.8}
+{'loss': 0.2143, 'grad_norm': 1.8104841709136963, 'learning_rate': 3.1695992179863144e-05, 'epoch': 1.8}
+{'loss': 0.3067, 'grad_norm': 1.4198352098464966, 'learning_rate': 3.167155425219941e-05, 'epoch': 1.8}
+{'loss': 0.2628, 'grad_norm': 0.8170778155326843, 'learning_rate': 3.1647116324535674e-05, 'epoch': 1.8}
+{'loss': 0.455, 'grad_norm': 2.481227159500122, 'learning_rate': 3.162267839687194e-05, 'epoch': 1.8}
+{'loss': 0.3932, 'grad_norm': 1.1111358404159546, 'learning_rate': 3.159824046920821e-05, 'epoch': 1.8}
+{'loss': 0.4981, 'grad_norm': 2.602691888809204, 'learning_rate': 3.1573802541544475e-05, 'epoch': 1.8}
+{'loss': 0.5675, 'grad_norm': 0.9818456172943115, 'learning_rate': 3.154936461388074e-05, 'epoch': 1.8}
+{'loss': 0.3184, 'grad_norm': 1.2159143686294556, 'learning_rate': 3.1524926686217005e-05, 'epoch': 1.8}
+{'loss': 0.2769, 'grad_norm': 2.042997360229492, 'learning_rate': 3.150048875855327e-05, 'epoch': 1.8}
+{'loss': 0.4187, 'grad_norm': 1.841683268547058, 'learning_rate': 3.1476050830889534e-05, 'epoch': 1.8}
+{'loss': 0.3185, 'grad_norm': 1.4786103963851929, 'learning_rate': 3.1451612903225806e-05, 'epoch': 1.8}
+{'loss': 0.4065, 'grad_norm': 1.3413180112838745, 'learning_rate': 3.142717497556207e-05, 'epoch': 1.8}
+{'loss': 0.2588, 'grad_norm': 1.192056655883789, 'learning_rate': 3.1402737047898335e-05, 'epoch': 1.8}
+{'loss': 0.3511, 'grad_norm': 3.1147072315216064, 'learning_rate': 3.13782991202346e-05, 'epoch': 1.8}
+{'loss': 0.3664, 'grad_norm': 0.9415939450263977, 'learning_rate': 3.1353861192570865e-05, 'epoch': 1.8}
+{'loss': 0.5367, 'grad_norm': 2.014275074005127, 'learning_rate': 3.132942326490713e-05, 'epoch': 1.8}
+{'loss': 0.4566, 'grad_norm': 2.412937879562378, 'learning_rate': 3.13049853372434e-05, 'epoch': 1.8}
+{'loss': 0.4879, 'grad_norm': 1.8484764099121094, 'learning_rate': 3.1280547409579666e-05, 'epoch': 1.8}
+{'loss': 0.4734, 'grad_norm': 2.9329864978790283, 'learning_rate': 3.125610948191593e-05, 'epoch': 1.8}
+{'loss': 0.6371, 'grad_norm': 3.0174472332000732, 'learning_rate': 3.1231671554252196e-05, 'epoch': 1.8}
+{'loss': 0.5017, 'grad_norm': 2.343379020690918, 'learning_rate': 3.120723362658846e-05, 'epoch': 1.8}
+{'loss': 0.5473, 'grad_norm': 2.4087560176849365, 'learning_rate': 3.1182795698924725e-05, 'epoch': 1.8}
+{'loss': 0.5428, 'grad_norm': 5.394991874694824, 'learning_rate': 3.1158357771261e-05, 'epoch': 1.8}
+{'loss': 0.5551, 'grad_norm': 1.5382750034332275, 'learning_rate': 3.113391984359726e-05, 'epoch': 1.8}
+{'loss': 0.4123, 'grad_norm': 1.568321704864502, 'learning_rate': 3.110948191593353e-05, 'epoch': 1.8}
+{'loss': 0.6364, 'grad_norm': 2.0196034908294678, 'learning_rate': 3.108504398826979e-05, 'epoch': 1.8}
+{'loss': 0.8347, 'grad_norm': 2.843515634536743, 'learning_rate': 3.1060606060606056e-05, 'epoch': 1.8}
+{'loss': 0.7907, 'grad_norm': 2.5024216175079346, 'learning_rate': 3.103616813294232e-05, 'epoch': 1.8}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.103616813294232e-05, 'epoch': 1.8}
+{'loss': 0.9792, 'grad_norm': 2.9168970584869385, 'learning_rate': 3.101173020527859e-05, 'epoch': 1.8}
+{'loss': 0.7339, 'grad_norm': 3.7329742908477783, 'learning_rate': 3.098729227761486e-05, 'epoch': 1.8}
+{'loss': 1.2615, 'grad_norm': 4.0727972984313965, 'learning_rate': 3.096285434995112e-05, 'epoch': 1.8}
+{'loss': 0.8195, 'grad_norm': 2.7813782691955566, 'learning_rate': 3.093841642228739e-05, 'epoch': 1.8}
+{'loss': 1.0503, 'grad_norm': 3.9311771392822266, 'learning_rate': 3.091397849462365e-05, 'epoch': 1.8}
+{'loss': 0.7532, 'grad_norm': 2.577922821044922, 'learning_rate': 3.088954056695992e-05, 'epoch': 1.8}
+{'loss': 1.3083, 'grad_norm': 5.859790325164795, 'learning_rate': 3.086510263929619e-05, 'epoch': 1.8}
+{'loss': 0.8507, 'grad_norm': 2.1421098709106445, 'learning_rate': 3.084066471163245e-05, 'epoch': 1.8}
+{'loss': 1.1791, 'grad_norm': 3.9190313816070557, 'learning_rate': 3.081622678396872e-05, 'epoch': 1.81}
+{'loss': 0.9817, 'grad_norm': 1.5476529598236084, 'learning_rate': 3.079178885630498e-05, 'epoch': 1.81}
+{'loss': 0.6504, 'grad_norm': 1.8120455741882324, 'learning_rate': 3.076735092864125e-05, 'epoch': 1.81}
+{'loss': 0.5684, 'grad_norm': 1.9655290842056274, 'learning_rate': 3.074291300097751e-05, 'epoch': 1.81}
+{'loss': 0.7158, 'grad_norm': 2.028470993041992, 'learning_rate': 3.0718475073313784e-05, 'epoch': 1.81}
+{'loss': 0.5727, 'grad_norm': 1.6118348836898804, 'learning_rate': 3.069403714565005e-05, 'epoch': 1.81}
+{'loss': 0.5635, 'grad_norm': 1.6711117029190063, 'learning_rate': 3.066959921798631e-05, 'epoch': 1.81}
+{'loss': 0.7681, 'grad_norm': 1.72394859790802, 'learning_rate': 3.064516129032258e-05, 'epoch': 1.81}
+{'loss': 0.3143, 'grad_norm': 1.3141530752182007, 'learning_rate': 3.062072336265884e-05, 'epoch': 1.81}
+{'loss': 0.2761, 'grad_norm': 2.4097068309783936, 'learning_rate': 3.059628543499511e-05, 'epoch': 1.81}
+{'loss': 0.2951, 'grad_norm': 1.3687645196914673, 'learning_rate': 3.057184750733138e-05, 'epoch': 1.81}
+{'loss': 0.3424, 'grad_norm': 1.9082344770431519, 'learning_rate': 3.0547409579667644e-05, 'epoch': 1.81}
+{'loss': 0.3282, 'grad_norm': 1.1238840818405151, 'learning_rate': 3.052297165200391e-05, 'epoch': 1.81}
+{'loss': 0.3696, 'grad_norm': 14.494555473327637, 'learning_rate': 3.0498533724340174e-05, 'epoch': 1.81}
+ 90%|█████████ | 11545/12776 [2:02:04<14:48,  1.39it/s] 90%|█████████ | 11546/12776 [2:02:05<15:07,  1.36it/s]                                                        90%|█████████ | 11546/12776 [2:02:05<15:07,  1.36it/s] 90%|█████████ | 11547/12776 [2:02:05<14:10,  1.44it/s]                                                        90%|█████████ | 11547/12776 [2:02:05<14:10,  1.44it/s] 90%|█████████ | 11548/12776 [2:02:06<13:41,  1.50it/s]                                                        90%|█████████ | 11548/12776 [2:02:06<13:41,  1.50it/s] 90%|█████████ | 11549/12776 [2:02:07<12:48,  1.60it/s]                                                        90%|█████████ | 11549/12776 [2:02:07<12:48,  1.60it/s] 90%|█████████ | 11550/12776 [2:02:07<12:22,  1.65it/s]                                                        90%|█████████ | 11550/12776 [2:02:07<12:22,  1.65it/s] 90%|█████████ | 11551/12776 [2:02:08<11:39,  1.75it/s]                                                        90%|█████████ | 11551/12776 [2:02:08<11:39,  1.75it/s] 90%|█████████ | 11552/12776 [2:02:08<10:57,  1.86it/s]                                                        90%|█████████ | 11552/12776 [2:02:08<10:57,  1.86it/s] 90%|█████████ | 11553/12776 [2:02:09<10:19,  1.97it/s]                                                        90%|█████████ | 11553/12776 [2:02:09<10:19,  1.97it/s] 90%|█████████ | 11554/12776 [2:02:09<09:45,  2.09it/s]                                                        90%|█████████ | 11554/12776 [2:02:09<09:45,  2.09it/s] 90%|█████████ | 11555/12776 [2:02:09<09:44,  2.09it/s]                                                        90%|█████████ | 11555/12776 [2:02:09<09:44,  2.09it/s] 90%|█████████ | 11556/12776 [2:02:10<09:13,  2.20it/s]                                                        90%|█████████ | 11556/12776 [2:02:10<09:13,  2.20it/s] 90%|█████████ | 11557/12776 [2:02:10<08:48,  2.30it/s]                                                        90%|█████████ | 11557/12776 [2:02:10<08:48,  2.30it/s] 90%|█████████ | 11558/12776 [2:02:11<08:41,  2.34it/s]                                                        90%|█████████ | 11558/12776 [2:02:11<08:41,  2.34it/s] 90%|█████████ | 11559/12776 [2:02:11<08:15,  2.46it/s]                                                        90%|█████████ | 11559/12776 [2:02:11<08:15,  2.46it/s] 90%|█████████ | 11560/12776 [2:02:11<07:55,  2.56it/s]                                                        90%|█████████ | 11560/12776 [2:02:11<07:55,  2.56it/s] 90%|█████████ | 11561/12776 [2:02:12<08:01,  2.52it/s]                                                        90%|█████████ | 11561/12776 [2:02:12<08:01,  2.52it/s] 90%|█████████ | 11562/12776 [2:02:12<07:39,  2.64it/s]                                                        90%|█████████ | 11562/12776 [2:02:12<07:39,  2.64it/s] 91%|█████████ | 11563/12776 [2:02:12<07:18,  2.76it/s]                                                        91%|█████████ | 11563/12776 [2:02:12<07:18,  2.76it/s] 91%|█████████ | 11564/12776 [2:02:13<07:32,  2.68it/s]                                                        91%|█████████ | 11564/12776 [2:02:13<07:32,  2.68it/s] 91%|█████████ | 11565/12776 [2:02:13<07:03,  2.86it/s]                                                        91%|█████████ | 11565/12776 [2:02:13<07:03,  2.86it/s] 91%|█████████ | 11566/12776 [2:02:13<06:39,  3.03it/s]                                                        91%|█████████ | 11566/12776 [2:02:13<06:39,  3.03it/s] 91%|█████████ | 11567/12776 [2:02:14<07:02,  2.86it/s]                                                        91%|█████████ | 11567/12776 [2:02:14<07:02,  2.86it/s] 91%|█████████ | 11568/12776 [2:02:14<06:32,  3.08it/s]                                                        91%|█████████ | 11568/12776 [2:02:14<06:32,  3.08it/s] 91%|█████████ | 11569/12776 [2:02:14<06:09,  3.27it/s]                                                        91%|█████████ | 11569/12776 [2:02:14<06:09,  3.27it/s] 91%|█████████ | 11570/12776 [2:02:15<05:49,  3.45it/s]                                                        91%|█████████ | 11570/12776 [2:02:15<05:49,  3.45it/s] 91%|█████████ | 11571/12776 [2:02:15<06:07,  3.28it/s]                                                        91%|█████████ | 11571/12776 [2:02:15<06:07,  3.28it/s] 91%|█████████ | 11572/12776 [2:02:15<05:45,  3.48it/s]                                                        91%|█████████ | 11572/12776 [2:02:15<05:45,  3.48it/s] 91%|█████████ | 11573/12776 [2:02:15<05:27,  3.67it/s]                                                        91%|█████████ | 11573/12776 [2:02:15<05:27,  3.67it/s] 91%|█████████ | 11574/12776 [2:02:16<05:12,  3.84it/s]                                                        91%|█████████ | 11574/12776 [2:02:16<05:12,  3.84it/s] 91%|█████████ | 11575/12776 [2:02:16<05:01,  3.99it/s]                                                        91%|█████████ | 11575/12776 [2:02:16<05:01,  3.99it/s] 91%|█████████ | 11576/12776 [2:02:16<05:15,  3.81it/s]                                                        91%|█████████ | 11576/12776 [2:02:16<05:15,  3.81it/s] 91%|█████████ | 11577/12776 [2:02:16<04:57,  4.03it/s]                                                        91%|█████████ | 11577/12776 [2:02:16<04:57,  4.03it/s] 91%|█████████ | 11578/12776 [2:02:17<04:43,  4.23it/s]                                                        91%|█████████ | 11578/12776 [2:02:17<04:43,  4.23it/s] 91%|█████████ | 11579/12776 [2:02:17<04:33,  4.37it/s]                                                        91%|█████████ | 11579/12776 [2:02:17<04:33,  4.37it/s] 91%|█████████ | 11580/12776 [2:02:17<04:25,  4.51it/s]                                                        91%|█████████ | 11580/12776 [2:02:17<04:25,  4.51it/s] 91%|█████████ | 11581/12776 [2:02:17<04:54,  4.06it/s]                                                        91%|█████████ | 11581/12776 [2:02:17<04:54,  4.06it/s] 91%|█████████ | 11582/12776 [2:02:17<04:37,  4.30it/s]                                                        91%|█████████ | 11582/12776 [2:02:17<04:37,  4.30it/s] 91%|█████████ | 11583/12776 [2:02:18<04:25,  4.50it/s]                                                        91%|█████████ | 11583/12776 [2:02:18<04:25,  4.50it/s] 91%|█████████ | 11584/12776 [2:02:18<04:16,  4.65it/s]                                                        91%|█████████ | 11584/12776 [2:02:18<04:16,  4.65it/s] 91%|█████████ | 11585/12776 [2:02:18<04:08,  4.80it/s]                                                        91%|█████████ | 11585/12776 [2:02:18<04:08,  4.80it/s] 91%|█████████ | 11586/12776 [2:02:18<04:49,  4.11it/s]                                                        91%|█████████ | 11586/12776 [2:02:18<04:49,  4.11it/s] 91%|█████████ | 11587/12776 [2:02:19<04:28,  4.43it/s]                                                        91%|█████████ | 11587/12776 [2:02:19<04:28,  4.43it/s] 91%|█████████ | 11588/12776 [2:02:19<07:20,  2.70it/s]                                                        91%|█████████ | 11588/12776 [2:02:19<07:20,  2.70it/s] 91%|█████████ | 11589/12776 [2:02:21<13:47,  1.43it/s]                                                        91%|█████████ | 11589/12776 [2:02:21<13:47,  1.43it/s] 91%|█████████ | 11590/12776 [2:02:22<15:24,  1.28it/s]                                                        91%|█████████ | 11590/12776 [2:02:22<15:24,  1.28it/s] 91%|█████████ | 11591/12776 [2:02:23<15:45,  1.25it/s]                                                        91%|█████████ | 11591/12776 [2:02:23<15:45,  1.25it/s] 91%|█████████ | 11592/12776 [2:02:23<15:38,  1.26it/s]                                                        91%|█████████ | 11592/12776 [2:02:23<15:38,  1.26it/s] 91%|█████████ | 11593/12776 [2:02:24<15:15,  1.29it/s]                                                        91%|█████████ | 11593/12776 [2:02:24<15:15,  1.29it/s] 91%|█████████ | 11594/12776 [2:02:25<14:54,  1.32it/s]                                                        91%|█████████ | 11594/12776 [2:02:25<14:54,  1.32it/s] 91%|█████████ | 11595/12776 [2:02:26<14:59,  1.31it/s]                                                        91%|█████████ | 11595/12776 [2:02:26<14:59,  1.31it/s] 91%|█████████ | 11596/12776 [2:02:26<15:17,  1.29it/s]                                                        91%|█████████ | 11596/12776 [2:02:26<15:17,  1.29it/s] 91%|█████████ | 11597/12776 [2:02:27<14:18,  1.37it/s]                                                        91%|█████████ | 11597/12776 [2:02:27<14:18,  1.37it/s] 91%|█████████ | 11598/12776 [2:02:28<13:21,  1.47it/s]                                                        91%|█████████ | 11598/12776 [2:02:28<13:21,  1.47it/s] 91%|█████████ | 11599/12776 [2:02:28<12:39,  1.55it/s]                                                        91%|█████████ | 11599/12776 [2:02:28<12:39,  1.55it/s] 91%|█████████ | 11600/12776 [2:02:29<12:18,  1.59it/s]                                                        91%|█████████ | 11600/12776 [2:02:29<12:18,  1.59it/s]Saving model checkpoint to ./checkpoint-11600
+Configuration saved in ./checkpoint-11600/config.json
+Model weights saved in ./checkpoint-11600/model.safetensors
+Feature extractor saved in ./checkpoint-11600/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-11600/tokenizer_config.json
+Special tokens file saved in ./checkpoint-11600/special_tokens_map.json
+added tokens file saved in ./checkpoint-11600/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-10400] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 91%|█████████ | 11601/12776 [2:02:35<43:24,  2.22s/it]                                                        91%|█████████ | 11601/12776 [2:02:35<43:24,  2.22s/it] 91%|█████████ | 11602/12776 [2:02:35<33:07,  1.69s/it]                                                        91%|█████████ | 11602/12776 [2:02:35<33:07,  1.69s/it] 91%|█████████ | 11603/12776 [2:02:36<25:57,  1.33s/it]                                                        91%|█████████ | 11603/12776 [2:02:36<25:57,  1.33s/it] 91%|█████████ | 11604/12776 [2:02:36<20:34,  1.05s/it]                                                        91%|█████████ | 11604/12776 [2:02:36<20:34,  1.05s/it] 91%|█████████ | 11605/12776 [2:02:36<16:43,  1.17it/s]                                                        91%|█████████ | 11605/12776 [2:02:36<16:43,  1.17it/s] 91%|█████████ | 11606/12776 [2:02:37<14:28,  1.35it/s]                                                        91%|█████████ | 11606/12776 [2:02:37<14:28,  1.35it/s] 91%|█████████ | 11607/12776 [2:02:37<12:14,  1.59it/s]                                                        91%|█████████ | 11607/12776 [2:02:37<12:14,  1.59it/s] 91%|█████████ | 11608/12776 [2:02:38<10:37,  1.83it/s]                                                        91%|█████████ | 11608/12776 [2:02:38<10:37,  1.83it/s] 91%|█████���███ | 11609/12776 [2:02:38<09:54,  1.96it/s]                                                        91%|█████████ | 11609/12776 [2:02:38<09:54,  1.96it/s] 91%|█████████ | 11610/12776 [2:02:38<08:51,  2.19it/s]                                                        91%|█████████ | 11610/12776 [2:02:38<08:51,  2.19it/s] 91%|█████████ | 11611/12776 [2:02:39<08:03,  2.41it/s]                                                        91%|█████████ | 11611/12776 [2:02:39<08:03,  2.41it/s] 91%|█████████ | 11612/12776 [2:02:39<07:34,  2.56it/s]                                                        91%|█████████ | 11612/12776 [2:02:39<07:34,  2.56it/s] 91%|█████████ | 11613/12776 [2:02:39<07:05,  2.74it/s]                                                        91%|█████████ | 11613/12776 [2:02:39<07:05,  2.74it/s] 91%|█████████ | 11614/12776 [2:02:40<06:41,  2.90it/s]                                                        91%|█████████ | 11614/12776 [2:02:40<06:41,  2.90it/s] 91%|█████████ | 11615/12776 [2:02:40<06:17,  3.07it/s]                                                        91%|█████████ | 11615/12776 [2:02:40<06:17,  3.07it/s] 91%|█████████ | 11616/12776 [2:02:40<06:26,  3.00it/s]                                                        91%|█████████ | 11616/12776 [2:02:40<06:26,  3.00it/s] 91%|█████████ | 11617/12776 [2:02:40<06:01,  3.21it/s]                                                        91%|█████████ | 11617/12776 [2:02:40<06:01,  3.21it/s] 91%|█████████ | 11618/12776 [2:02:41<05:42,  3.38it/s]                                                        91%|█████████ | 11618/12776 [2:02:41<05:42,  3.38it/s] 91%|█████████ | 11619/12776 [2:02:41<05:26,  3.55it/s]                                                        91%|█████████ | 11619/12776 [2:02:41<05:26,  3.55it/s] 91%|█████████ | 11620/12776 [2:02:41<05:47,  3.33it/s]                                                        91%|█████████ | 11620/12776 [2:02:41<05:47,  3.33it/s] 91%|█████████ | 11621/12776 [2:02:42<05:23,  3.57it/s]                                                        91%|█████████ | 11621/12776 [2:02:42<05:23,  3.57it/s] 91%|█████████ | 11622/12776 [2:02:42<05:09,  3.73it/s]                                                        91%|█████████ | 11622/12776 [2:02:42<05:09,  3.73it/s] 91%|█████████ | 11623/12776 [2:02:42<04:52,  3.94it/s]                                                       {'loss': 0.3021, 'grad_norm': 0.5469344854354858, 'learning_rate': 3.047409579667644e-05, 'epoch': 1.81}
+{'loss': 0.2567, 'grad_norm': 0.5608265995979309, 'learning_rate': 3.0449657869012703e-05, 'epoch': 1.81}
+{'loss': 0.3498, 'grad_norm': 0.9688588380813599, 'learning_rate': 3.0425219941348968e-05, 'epoch': 1.81}
+{'loss': 0.2691, 'grad_norm': 1.1317929029464722, 'learning_rate': 3.0400782013685236e-05, 'epoch': 1.81}
+{'loss': 0.2949, 'grad_norm': 0.9738819003105164, 'learning_rate': 3.03763440860215e-05, 'epoch': 1.81}
+{'loss': 0.4013, 'grad_norm': 1.1929585933685303, 'learning_rate': 3.0351906158357766e-05, 'epoch': 1.81}
+{'loss': 0.4509, 'grad_norm': 1.0364540815353394, 'learning_rate': 3.0327468230694034e-05, 'epoch': 1.81}
+{'loss': 0.4985, 'grad_norm': 2.415992259979248, 'learning_rate': 3.03030303030303e-05, 'epoch': 1.81}
+{'loss': 0.443, 'grad_norm': 1.438122272491455, 'learning_rate': 3.0278592375366564e-05, 'epoch': 1.81}
+{'loss': 0.4412, 'grad_norm': 1.7964818477630615, 'learning_rate': 3.0254154447702832e-05, 'epoch': 1.81}
+{'loss': 0.4248, 'grad_norm': 4.328434467315674, 'learning_rate': 3.0229716520039097e-05, 'epoch': 1.81}
+{'loss': 0.4388, 'grad_norm': 2.232316017150879, 'learning_rate': 3.020527859237536e-05, 'epoch': 1.81}
+{'loss': 0.3977, 'grad_norm': 1.9502800703048706, 'learning_rate': 3.018084066471163e-05, 'epoch': 1.81}
+{'loss': 0.7417, 'grad_norm': 5.183807373046875, 'learning_rate': 3.0156402737047895e-05, 'epoch': 1.81}
+{'loss': 0.6555, 'grad_norm': 4.440307140350342, 'learning_rate': 3.013196480938416e-05, 'epoch': 1.81}
+{'loss': 0.4009, 'grad_norm': 1.0355379581451416, 'learning_rate': 3.0107526881720428e-05, 'epoch': 1.81}
+{'loss': 0.4825, 'grad_norm': 2.554629325866699, 'learning_rate': 3.0083088954056692e-05, 'epoch': 1.81}
+{'loss': 0.5349, 'grad_norm': 1.1384446620941162, 'learning_rate': 3.0058651026392957e-05, 'epoch': 1.81}
+{'loss': 0.3389, 'grad_norm': 1.610360026359558, 'learning_rate': 3.0034213098729225e-05, 'epoch': 1.81}
+{'loss': 0.5675, 'grad_norm': 1.8687621355056763, 'learning_rate': 3.000977517106549e-05, 'epoch': 1.81}
+{'loss': 0.3691, 'grad_norm': 4.0817108154296875, 'learning_rate': 2.9985337243401755e-05, 'epoch': 1.81}
+{'loss': 0.6155, 'grad_norm': 5.202278137207031, 'learning_rate': 2.9960899315738023e-05, 'epoch': 1.81}
+{'loss': 0.7993, 'grad_norm': 2.133866548538208, 'learning_rate': 2.9936461388074288e-05, 'epoch': 1.81}
+{'loss': 1.0991, 'grad_norm': 3.848477363586426, 'learning_rate': 2.9912023460410553e-05, 'epoch': 1.81}
+{'loss': 0.4259, 'grad_norm': 1.6100995540618896, 'learning_rate': 2.988758553274682e-05, 'epoch': 1.81}
+{'loss': 0.5782, 'grad_norm': 3.559994697570801, 'learning_rate': 2.9863147605083086e-05, 'epoch': 1.81}
+{'loss': 0.8529, 'grad_norm': 3.698235034942627, 'learning_rate': 2.983870967741935e-05, 'epoch': 1.81}
+{'loss': 0.5965, 'grad_norm': 3.099733591079712, 'learning_rate': 2.981427174975562e-05, 'epoch': 1.81}
+{'loss': 0.6342, 'grad_norm': 1.290650725364685, 'learning_rate': 2.9789833822091883e-05, 'epoch': 1.81}
+{'loss': 1.0052, 'grad_norm': 2.6807796955108643, 'learning_rate': 2.9765395894428148e-05, 'epoch': 1.81}
+{'loss': 0.6427, 'grad_norm': 1.7438832521438599, 'learning_rate': 2.9740957966764416e-05, 'epoch': 1.81}
+{'loss': 1.0218, 'grad_norm': 2.453213691711426, 'learning_rate': 2.971652003910068e-05, 'epoch': 1.81}
+{'loss': 0.3113, 'grad_norm': 2.5815582275390625, 'learning_rate': 2.9692082111436946e-05, 'epoch': 1.81}
+{'loss': 0.7022, 'grad_norm': 1.8507500886917114, 'learning_rate': 2.9667644183773214e-05, 'epoch': 1.81}
+{'loss': 1.1861, 'grad_norm': 5.00587797164917, 'learning_rate': 2.964320625610948e-05, 'epoch': 1.81}
+{'loss': 0.8452, 'grad_norm': 4.860931873321533, 'learning_rate': 2.9618768328445744e-05, 'epoch': 1.81}
+{'loss': 0.8327, 'grad_norm': 1.9036601781845093, 'learning_rate': 2.9594330400782012e-05, 'epoch': 1.81}
+{'loss': 1.3153, 'grad_norm': 2.3255369663238525, 'learning_rate': 2.9569892473118277e-05, 'epoch': 1.81}
+{'loss': 0.8975, 'grad_norm': 3.8739614486694336, 'learning_rate': 2.954545454545454e-05, 'epoch': 1.81}
+{'loss': 1.295, 'grad_norm': 2.4773049354553223, 'learning_rate': 2.952101661779081e-05, 'epoch': 1.81}
+{'loss': 0.1969, 'grad_norm': 1.24504816532135, 'learning_rate': 2.9496578690127075e-05, 'epoch': 1.81}
+{'loss': 0.9211, 'grad_norm': 3.3709805011749268, 'learning_rate': 2.947214076246334e-05, 'epoch': 1.81}
+{'loss': 0.5325, 'grad_norm': 4.954073905944824, 'learning_rate': 2.9447702834799608e-05, 'epoch': 1.81}
+{'loss': 0.796, 'grad_norm': 2.0279388427734375, 'learning_rate': 2.9423264907135872e-05, 'epoch': 1.81}
+{'loss': 0.3563, 'grad_norm': 0.8926810622215271, 'learning_rate': 2.9398826979472137e-05, 'epoch': 1.81}
+{'loss': 0.347, 'grad_norm': 0.5909395217895508, 'learning_rate': 2.9374389051808405e-05, 'epoch': 1.81}
+{'loss': 0.3006, 'grad_norm': 1.289638876914978, 'learning_rate': 2.934995112414467e-05, 'epoch': 1.81}
+{'loss': 0.2591, 'grad_norm': 0.5214605927467346, 'learning_rate': 2.9325513196480935e-05, 'epoch': 1.81}
+{'loss': 0.2681, 'grad_norm': 3.6117069721221924, 'learning_rate': 2.9301075268817203e-05, 'epoch': 1.81}
+{'loss': 0.371, 'grad_norm': 1.9626871347427368, 'learning_rate': 2.9276637341153468e-05, 'epoch': 1.81}
+{'loss': 0.3304, 'grad_norm': 0.8882358074188232, 'learning_rate': 2.9252199413489733e-05, 'epoch': 1.82}
+{'loss': 0.3598, 'grad_norm': 2.33246111869812, 'learning_rate': 2.9227761485826e-05, 'epoch': 1.82}
+{'loss': 0.3543, 'grad_norm': 1.491754174232483, 'learning_rate': 2.9203323558162266e-05, 'epoch': 1.82}
+{'loss': 0.3254, 'grad_norm': 1.187076449394226, 'learning_rate': 2.917888563049853e-05, 'epoch': 1.82}
+{'loss': 0.3543, 'grad_norm': 0.9200171232223511, 'learning_rate': 2.91544477028348e-05, 'epoch': 1.82}
+{'loss': 0.3431, 'grad_norm': 1.1525431871414185, 'learning_rate': 2.9130009775171064e-05, 'epoch': 1.82}
+{'loss': 0.2869, 'grad_norm': 0.8428003787994385, 'learning_rate': 2.910557184750733e-05, 'epoch': 1.82}
+{'loss': 0.585, 'grad_norm': 2.701383590698242, 'learning_rate': 2.9081133919843597e-05, 'epoch': 1.82}
+{'loss': 0.3806, 'grad_norm': 1.2327849864959717, 'learning_rate': 2.905669599217986e-05, 'epoch': 1.82}
+{'loss': 0.5627, 'grad_norm': 1.1938436031341553, 'learning_rate': 2.9032258064516126e-05, 'epoch': 1.82}
+{'loss': 0.4786, 'grad_norm': 1.1973328590393066, 'learning_rate': 2.9007820136852394e-05, 'epoch': 1.82}
+{'loss': 0.4375, 'grad_norm': 7.545814514160156, 'learning_rate': 2.898338220918866e-05, 'epoch': 1.82}
+{'loss': 0.607, 'grad_norm': 3.719249963760376, 'learning_rate': 2.8958944281524924e-05, 'epoch': 1.82}
+{'loss': 0.5607, 'grad_norm': 1.4751845598220825, 'learning_rate': 2.8934506353861192e-05, 'epoch': 1.82}
+{'loss': 0.5926, 'grad_norm': 9.847101211547852, 'learning_rate': 2.8910068426197457e-05, 'epoch': 1.82}
+{'loss': 0.4732, 'grad_norm': 1.2304680347442627, 'learning_rate': 2.8885630498533722e-05, 'epoch': 1.82}
+{'loss': 0.338, 'grad_norm': 4.059071063995361, 'learning_rate': 2.8861192570869987e-05, 'epoch': 1.82}
+{'loss': 0.4844, 'grad_norm': 1.5331727266311646, 'learning_rate': 2.8836754643206255e-05, 'epoch': 1.82}
+{'loss': 0.5759, 'grad_norm': 2.8758628368377686, 'learning_rate': 2.881231671554252e-05, 'epoch': 1.82}
+{'loss': 0.4847, 'grad_norm': 4.602398872375488, 'learning_rate': 2.8787878787878784e-05, 'epoch': 1.82}
+{'loss': 1.2581, 'grad_norm': 5.723301410675049, 'learning_rate': 2.8763440860215053e-05, 'epoch': 1.82}
+{'loss': 0.5796, 'grad_norm': 2.905620813369751, 'learning_rate': 2.8739002932551317e-05, 'epoch': 1.82}
+{'loss': 0.6131, 'grad_norm': 2.289865732192993, 'learning_rate': 2.8714565004887582e-05, 'epoch': 1.82}
+{'loss': 0.523, 'grad_norm': 1.782109022140503, 'learning_rate': 2.869012707722385e-05, 'epoch': 1.82}
+{'loss': 0.9899, 'grad_norm': 8.425079345703125, 'learning_rate': 2.8665689149560115e-05, 'epoch': 1.82}
+{'loss': 0.7031, 'grad_norm': 2.678179979324341, 'learning_rate': 2.864125122189638e-05, 'epoch': 1.82}
+{'loss': 0.5054, 'grad_norm': 9.00810432434082, 'learning_rate': 2.8616813294232648e-05, 'epoch': 1.82}
+{'loss': 0.8168, 'grad_norm': 2.7023744583129883, 'learning_rate': 2.8592375366568913e-05, 'epoch': 1.82}
+ 91%|█████████ | 11623/12776 [2:02:42<04:52,  3.94it/s] 91%|█████████ | 11624/12776 [2:02:42<05:08,  3.74it/s]                                                        91%|█████████ | 11624/12776 [2:02:42<05:08,  3.74it/s] 91%|█████████ | 11625/12776 [2:02:43<04:47,  4.00it/s]                                                        91%|█████████ | 11625/12776 [2:02:43<04:47,  4.00it/s] 91%|█████████ | 11626/12776 [2:02:43<04:31,  4.23it/s]                                                        91%|█████████ | 11626/12776 [2:02:43<04:31,  4.23it/s] 91%|█████████ | 11627/12776 [2:02:43<04:17,  4.47it/s]                                                        91%|█████████ | 11627/12776 [2:02:43<04:17,  4.47it/s] 91%|█████████ | 11628/12776 [2:02:43<04:05,  4.68it/s]                                                        91%|█████████ | 11628/12776 [2:02:43<04:05,  4.68it/s] 91%|█████████ | 11629/12776 [2:02:43<03:59,  4.79it/s]                                                        91%|█████████ | 11629/12776 [2:02:43<03:59,  4.79it/s] 91%|█████████ | 11630/12776 [2:02:44<04:12,  4.55it/s]                                                        91%|█████████ | 11630/12776 [2:02:44<04:12,  4.55it/s] 91%|█████████ | 11631/12776 [2:02:44<03:58,  4.80it/s]                                                        91%|█████████ | 11631/12776 [2:02:44<03:58,  4.80it/s] 91%|█████████ | 11632/12776 [2:02:44<03:48,  5.01it/s]                                                        91%|█████████ | 11632/12776 [2:02:44<03:48,  5.01it/s] 91%|█████████ | 11633/12776 [2:02:44<03:40,  5.17it/s]                                                        91%|█████████ | 11633/12776 [2:02:44<03:40,  5.17it/s] 91%|█████████ | 11634/12776 [2:02:44<03:34,  5.32it/s]                                                        91%|█████████ | 11634/12776 [2:02:44<03:34,  5.32it/s] 91%|█████████ | 11635/12776 [2:02:44<03:29,  5.45it/s]                                                        91%|█████████ | 11635/12776 [2:02:44<03:29,  5.45it/s] 91%|█████████ | 11636/12776 [2:02:45<03:51,  4.93it/s]                                                        91%|█████████ | 11636/12776 [2:02:45<03:51,  4.93it/s] 91%|█████████ | 11637/12776 [2:02:45<03:38,  5.21it/s]                                                        91%|█████████ | 11637/12776 [2:02:45<03:38,  5.21it/s] 91%|█████████ | 11638/12776 [2:02:46<06:26,  2.95it/s]                                                        91%|█████████ | 11638/12776 [2:02:46<06:26,  2.95it/s] 91%|█████████ | 11639/12776 [2:02:47<12:14,  1.55it/s]                                                        91%|█████████ | 11639/12776 [2:02:47<12:14,  1.55it/s] 91%|█████████ | 11640/12776 [2:02:48<13:49,  1.37it/s]                                                        91%|█████████ | 11640/12776 [2:02:48<13:49,  1.37it/s] 91%|█████████ | 11641/12776 [2:02:49<14:24,  1.31it/s]                                                        91%|█████████ | 11641/12776 [2:02:49<14:24,  1.31it/s] 91%|█████████ | 11642/12776 [2:02:49<14:41,  1.29it/s]                                                        91%|█████████ | 11642/12776 [2:02:49<14:41,  1.29it/s] 91%|█████████ | 11643/12776 [2:02:50<15:11,  1.24it/s]                                                        91%|█████████ | 11643/12776 [2:02:50<15:11,  1.24it/s] 91%|█████████ | 11644/12776 [2:02:51<14:26,  1.31it/s]                                                        91%|█████████ | 11644/12776 [2:02:51<14:26,  1.31it/s] 91%|█████████ | 11645/12776 [2:02:52<14:10,  1.33it/s]                                                        91%|█████████ | 11645/12776 [2:02:52<14:10,  1.33it/s] 91%|█████████ | 11646/12776 [2:02:52<13:19,  1.41it/s]                                                        91%|█████████ | 11646/12776 [2:02:52<13:19,  1.41it/s] 91%|█████████ | 11647/12776 [2:02:53<12:37,  1.49it/s]                                                        91%|█████████ | 11647/12776 [2:02:53<12:37,  1.49it/s] 91%|█████████ | 11648/12776 [2:02:53<11:51,  1.58it/s]                                                        91%|█████████ | 11648/12776 [2:02:53<11:51,  1.58it/s] 91%|█████████ | 11649/12776 [2:02:54<11:21,  1.65it/s]                                                        91%|█████████ | 11649/12776 [2:02:54<11:21,  1.65it/s] 91%|█████████ | 11650/12776 [2:02:55<10:47,  1.74it/s]                                                        91%|█████████ | 11650/12776 [2:02:55<10:47,  1.74it/s] 91%|█████████ | 11651/12776 [2:02:55<10:33,  1.78it/s]                                                        91%|█████████ | 11651/12776 [2:02:55<10:33,  1.78it/s] 91%|█████████ | 11652/12776 [2:02:56<09:48,  1.91it/s]                                                        91%|█████████ | 11652/12776 [2:02:56<09:48,  1.91it/s] 91%|█████████ | 11653/12776 [2:02:56<09:45,  1.92it/s]                                                        91%|█████████ | 11653/12776 [2:02:56<09:45,  1.92it/s] 91%|█████████ | 11654/12776 [2:02:56<09:03,  2.07it/s]                                                        91%|█████████ | 11654/12776 [2:02:56<09:03,  2.07it/s] 91%|█████████ | 11655/12776 [2:02:57<08:27,  2.21it/s]                                                        91%|█████████ | 11655/12776 [2:02:57<08:27,  2.21it/s] 91%|█████████ | 11656/12776 [2:02:57<08:09,  2.29it/s]                                                        91%|█████████ | 11656/12776 [2:02:57<08:09,  2.29it/s] 91%|█████████ | 11657/12776 [2:02:58<07:43,  2.41it/s]                                                        91%|█████████ | 11657/12776 [2:02:58<07:43,  2.41it/s] 91%|█████████ | 11658/12776 [2:02:58<07:18,  2.55it/s]                                                        91%|█████████ | 11658/12776 [2:02:58<07:18,  2.55it/s] 91%|█████████▏| 11659/12776 [2:02:58<07:39,  2.43it/s]                                                        91%|█████████▏| 11659/12776 [2:02:58<07:39,  2.43it/s] 91%|█████████▏| 11660/12776 [2:02:59<07:09,  2.60it/s]                                                        91%|█████████▏| 11660/12776 [2:02:59<07:09,  2.60it/s] 91%|█████████▏| 11661/12776 [2:02:59<06:45,  2.75it/s]                                                        91%|█████████▏| 11661/12776 [2:02:59<06:45,  2.75it/s] 91%|█████████▏| 11662/12776 [2:02:59<06:25,  2.89it/s]                                                        91%|█████████▏| 11662/12776 [2:02:59<06:25,  2.89it/s] 91%|█████████▏| 11663/12776 [2:03:00<06:33,  2.83it/s]                                                        91%|█████████▏| 11663/12776 [2:03:00<06:33,  2.83it/s] 91%|█████████▏| 11664/12776 [2:03:00<06:06,  3.03it/s]                                                        91%|█████████▏| 11664/12776 [2:03:00<06:06,  3.03it/s] 91%|█████████▏| 11665/12776 [2:03:00<05:46,  3.21it/s]                                                        91%|█████████▏| 11665/12776 [2:03:00<05:46,  3.21it/s] 91%|█████████▏| 11666/12776 [2:03:00<05:29,  3.37it/s]                                                        91%|█████████▏| 11666/12776 [2:03:00<05:29,  3.37it/s] 91%|█████████▏| 11667/12776 [2:03:01<05:16,  3.50it/s]                                                        91%|█████████▏| 11667/12776 [2:03:01<05:16,  3.50it/s] 91%|█████████▏| 11668/12776 [2:03:01<05:05,  3.63it/s]                                                        91%|█████████▏| 11668/12776 [2:03:01<05:05,  3.63it/s] 91%|█████████▏| 11669/12776 [2:03:01<04:54,  3.76it/s]                                                        91%|█████████▏| 11669/12776 [2:03:01<04:54,  3.76it/s] 91%|█████████▏| 11670/12776 [2:03:01<04:42,  3.92it/s]                                                        91%|█████████▏| 11670/12776 [2:03:01<04:42,  3.92it/s] 91%|█████████▏| 11671/12776 [2:03:02<04:55,  3.75it/s]                                                        91%|█████████▏| 11671/12776 [2:03:02<04:55,  3.75it/s] 91%|█████████▏| 11672/12776 [2:03:02<04:38,  3.96it/s]                                                        91%|█████████▏| 11672/12776 [2:03:02<04:38,  3.96it/s] 91%|█████████▏| 11673/12776 [2:03:02<04:25,  4.15it/s]                                                        91%|█████████▏| 11673/12776 [2:03:02<04:25,  4.15it/s] 91%|█████████▏| 11674/12776 [2:03:02<04:14,  4.33it/s]                                                        91%|█████████▏| 11674/12776 [2:03:02<04:14,  4.33it/s] 91%|█████████▏| 11675/12776 [2:03:03<04:05,  4.49it/s]                                                        91%|█████████▏| 11675/12776 [2:03:03<04:05,  4.49it/s] 91%|█████████▏| 11676/12776 [2:03:03<04:21,  4.20it/s]                                                        91%|█████████▏| 11676/12776 [2:03:03<04:21,  4.20it/s] 91%|█████████▏| 11677/12776 [2:03:03<04:06,  4.45it/s]                                                        91%|█████████▏| 11677/12776 [2:03:03<04:06,  4.45it/s] 91%|█████████▏| 11678/12776 [2:03:03<03:54,  4.68it/s]                                                        91%|█████████▏| 11678/12776 [2:03:03<03:54,  4.68it/s] 91%|█████████▏| 11679/12776 [2:03:03<03:45,  4.85it/s]                                                        91%|█████████▏| 11679/12776 [2:03:03<03:45,  4.85it/s] 91%|█████████▏| 11680/12776 [2:03:04<03:38,  5.02it/s]                                                        91%|█████████▏| 11680/12776 [2:03:04<03:38,  5.02it/s] 91%|█████████▏| 11681/12776 [2:03:04<03:31,  5.18it/s]                                                        91%|█████████▏| 11681/12776 [2:03:04<03:31,  5.18it/s] 91%|█████████▏| 11682/12776 [2:03:04<03:59,  4.58it/s]                                                        91%|█████████▏| 11682/12776 [2:03:04<03:59,  4.58it/s] 91%|█████████▏| 11683/12776 [2:03:04<03:45,  4.85it/s]                                                        91%|█████████▏| 11683/12776 [2:03:04<03:45,  4.85it/s] 91%|█████████▏| 11684/12776 [2:03:04<03:34,  5.10it/s]                                                        91%|█████████▏| 11684/12776 [2:03:04<03:34,  5.10it/s] 91%|█████████▏| 11685/12776 [2:03:05<03:25,  5.30it/s]                                                        91%|█████████▏| 11685/12776 [2:03:05<03:25,  5.30it/s] 91%|█████████▏| 11686/12776 [2:03:05<03:17,  5.51it/s]                                                        91%|█████████▏| 11686/12776 [2:03:05<03:17,  5.51it/s] 91%|█████████▏| 11687/12776 [2:03:05<03:12,  5.66it/s]                                                        91%|█████████▏| 11687/12776 [2:03:05<03:12,  5.66it/s] 91%|█████████▏| 11688/12776 [2:03:06<05:59,  3.03it/s]                                                        91%|█████████▏| 11688/12776 [2:03:06<05:59,  3.03it/s] 91%|█████████▏| 11689/12776 [2:03:07<11:44,  1.54it/s]                                                        91%|█████████▏| 11689/12776 [2:03:07<11:44,  1.54it/s] 91%|█████████▏| 11690/12776 [2:03:08<13:34,  1.33it/s]                                                        91%|█████████▏| 11690/12776 [2:03:08<13:34,  1.33it/s] 92%|█████████▏| 11691/12776 [2:03:09<14:40,  1.23it/s]                                                        92%|█████████▏| 11691/12776 [2:03:09<14:40,  1.23it/s] 92%|█████████▏| 11692/12776 [2:03:10<14:42,  1.23it/s]                                                        92%|█████████▏| 11692/12776 [2:03:10<14:42,  1.23it/s] 92%|█████████▏| 11693/12776 [2:03:11<14:18,  1.26it/s]                                                        92%|█████████▏| 11693/12776 [2:03:11<14:18,  1.26it/s] 92%|█████████▏| 11694/12776 [2:03:11<14:18,  1.26it/s]                                                        92%|█████████▏| 11694/12776 [2:03:11<14:18,  1.26it/s] 92%|█████████▏| 11695/12776 [2:03:12<14:04,  1.28it/s]                                                        92%|█████████▏| 11695/12776 [2:03:12<14:04,  1.28it/s] 92%|█████████▏| 11696/12776 [2:03:13<13:14,  1.36it/s]                                                        92%|█████████▏| 11696/12776 [2:03:13<13:14,  1.36it/s] 92%|█████████▏| 11697/12776 [2:03:13<12:33,  1.43it/s]                                                        92%|█████████▏| 11697/12776 [2:03:13<12:33,  1.43it/s] 92%|█████████▏| 11698/12776 [2:03:14<11:49,  1.52it/s]                                                        92%|█████████▏| 11698/12776 [2:03:14<11:49,  1.52it/s] 92%|█████████▏| 11699/12776 [2:03:14<11:28,  1.56it/s]                                                        92%|█████████▏| 11699/12776 [2:03:14<11:28,  1.56it/s] 92%|█████████▏| 11700/12776 [2:03:15<10:48,  1.66it/s]                                                        92%|█████████▏| 11700/12776 [2:03:15<10:48,  1.66it/s] 92%|█████████▏| 11701/12776 [2:03:16<11:09,  1.61it/s]                                                       {'loss': 0.7713, 'grad_norm': 2.590367078781128, 'learning_rate': 2.8567937438905178e-05, 'epoch': 1.82}
+{'loss': 0.5236, 'grad_norm': 5.677480220794678, 'learning_rate': 2.8543499511241446e-05, 'epoch': 1.82}
+{'loss': 0.6324, 'grad_norm': 5.0781426429748535, 'learning_rate': 2.851906158357771e-05, 'epoch': 1.82}
+{'loss': 0.4184, 'grad_norm': 3.239406108856201, 'learning_rate': 2.8494623655913975e-05, 'epoch': 1.82}
+{'loss': 1.0267, 'grad_norm': 2.7294325828552246, 'learning_rate': 2.8470185728250244e-05, 'epoch': 1.82}
+{'loss': 1.3683, 'grad_norm': 3.9341413974761963, 'learning_rate': 2.844574780058651e-05, 'epoch': 1.82}
+{'loss': 1.1582, 'grad_norm': 4.064838409423828, 'learning_rate': 2.8421309872922773e-05, 'epoch': 1.82}
+{'loss': 0.4504, 'grad_norm': 2.5269718170166016, 'learning_rate': 2.839687194525904e-05, 'epoch': 1.82}
+{'loss': 1.0721, 'grad_norm': 2.9901621341705322, 'learning_rate': 2.8372434017595306e-05, 'epoch': 1.82}
+{'loss': 0.8848, 'grad_norm': 2.5948750972747803, 'learning_rate': 2.834799608993157e-05, 'epoch': 1.82}
+{'loss': 0.9791, 'grad_norm': 7.479366779327393, 'learning_rate': 2.832355816226784e-05, 'epoch': 1.82}
+{'loss': 1.3042, 'grad_norm': 6.228923797607422, 'learning_rate': 2.8299120234604104e-05, 'epoch': 1.82}
+{'loss': 0.7378, 'grad_norm': 2.04010272026062, 'learning_rate': 2.827468230694037e-05, 'epoch': 1.82}
+{'loss': 0.6142, 'grad_norm': 3.85182523727417, 'learning_rate': 2.8250244379276637e-05, 'epoch': 1.82}
+{'loss': 0.6967, 'grad_norm': 2.9816079139709473, 'learning_rate': 2.8225806451612902e-05, 'epoch': 1.82}
+{'loss': 0.9379, 'grad_norm': 2.144773483276367, 'learning_rate': 2.8201368523949167e-05, 'epoch': 1.82}
+{'loss': 0.3605, 'grad_norm': 4.1807661056518555, 'learning_rate': 2.8176930596285435e-05, 'epoch': 1.82}
+{'loss': 0.345, 'grad_norm': 1.5326061248779297, 'learning_rate': 2.81524926686217e-05, 'epoch': 1.82}
+{'loss': 0.3745, 'grad_norm': 0.7716324925422668, 'learning_rate': 2.8128054740957964e-05, 'epoch': 1.82}
+{'loss': 0.4254, 'grad_norm': 0.9222685098648071, 'learning_rate': 2.8103616813294233e-05, 'epoch': 1.82}
+{'loss': 0.3535, 'grad_norm': 0.9720975160598755, 'learning_rate': 2.8079178885630497e-05, 'epoch': 1.82}
+{'loss': 0.4628, 'grad_norm': 1.215984582901001, 'learning_rate': 2.8054740957966762e-05, 'epoch': 1.82}
+{'loss': 0.4993, 'grad_norm': 1.166796326637268, 'learning_rate': 2.803030303030303e-05, 'epoch': 1.82}
+{'loss': 0.4912, 'grad_norm': 1.0708175897598267, 'learning_rate': 2.8005865102639295e-05, 'epoch': 1.82}
+{'loss': 0.5915, 'grad_norm': 1.6838656663894653, 'learning_rate': 2.798142717497556e-05, 'epoch': 1.82}
+{'loss': 0.4936, 'grad_norm': 1.128220558166504, 'learning_rate': 2.7956989247311828e-05, 'epoch': 1.82}
+{'loss': 0.7338, 'grad_norm': 2.355700969696045, 'learning_rate': 2.7932551319648093e-05, 'epoch': 1.82}
+{'loss': 0.8051, 'grad_norm': 2.227602481842041, 'learning_rate': 2.7908113391984354e-05, 'epoch': 1.82}
+{'loss': 0.9211, 'grad_norm': 1.5010913610458374, 'learning_rate': 2.7883675464320626e-05, 'epoch': 1.82}
+{'loss': 0.7823, 'grad_norm': 2.1629629135131836, 'learning_rate': 2.785923753665689e-05, 'epoch': 1.82}
+{'loss': 0.7279, 'grad_norm': 1.6575279235839844, 'learning_rate': 2.7834799608993152e-05, 'epoch': 1.82}
+{'loss': 0.768, 'grad_norm': 1.6518474817276, 'learning_rate': 2.7810361681329424e-05, 'epoch': 1.82}
+{'loss': 0.551, 'grad_norm': 2.1042938232421875, 'learning_rate': 2.778592375366569e-05, 'epoch': 1.82}
+{'loss': 0.6896, 'grad_norm': 5.784224987030029, 'learning_rate': 2.776148582600195e-05, 'epoch': 1.82}
+{'loss': 0.5671, 'grad_norm': 1.9954174757003784, 'learning_rate': 2.773704789833822e-05, 'epoch': 1.82}
+{'loss': 0.5581, 'grad_norm': 1.8126837015151978, 'learning_rate': 2.7712609970674486e-05, 'epoch': 1.82}
+{'loss': 0.4665, 'grad_norm': 7.1128106117248535, 'learning_rate': 2.7688172043010748e-05, 'epoch': 1.83}
+{'loss': 0.6816, 'grad_norm': 3.6487762928009033, 'learning_rate': 2.766373411534702e-05, 'epoch': 1.83}
+{'loss': 0.7622, 'grad_norm': 5.3055338859558105, 'learning_rate': 2.763929618768328e-05, 'epoch': 1.83}
+{'loss': 0.4507, 'grad_norm': 2.0376229286193848, 'learning_rate': 2.7614858260019546e-05, 'epoch': 1.83}
+{'loss': 0.4463, 'grad_norm': 2.7106873989105225, 'learning_rate': 2.7590420332355817e-05, 'epoch': 1.83}
+{'loss': 0.3819, 'grad_norm': 2.749912977218628, 'learning_rate': 2.756598240469208e-05, 'epoch': 1.83}
+{'loss': 0.4986, 'grad_norm': 4.844040393829346, 'learning_rate': 2.7541544477028343e-05, 'epoch': 1.83}
+{'loss': 0.2946, 'grad_norm': 2.121793746948242, 'learning_rate': 2.7517106549364615e-05, 'epoch': 1.83}
+{'loss': 0.6871, 'grad_norm': 2.5618605613708496, 'learning_rate': 2.7492668621700876e-05, 'epoch': 1.83}
+{'loss': 0.4507, 'grad_norm': 6.57215690612793, 'learning_rate': 2.746823069403714e-05, 'epoch': 1.83}
+{'loss': 0.4263, 'grad_norm': 1.615665316581726, 'learning_rate': 2.7443792766373413e-05, 'epoch': 1.83}
+{'loss': 0.9011, 'grad_norm': 2.805304765701294, 'learning_rate': 2.7419354838709674e-05, 'epoch': 1.83}
+{'loss': 0.4498, 'grad_norm': 2.946972370147705, 'learning_rate': 2.739491691104594e-05, 'epoch': 1.83}
+{'loss': 0.9303, 'grad_norm': 2.347245693206787, 'learning_rate': 2.737047898338221e-05, 'epoch': 1.83}
+{'loss': 1.0643, 'grad_norm': 2.6300110816955566, 'learning_rate': 2.7346041055718472e-05, 'epoch': 1.83}
+{'loss': 1.1481, 'grad_norm': 5.476263046264648, 'learning_rate': 2.7321603128054737e-05, 'epoch': 1.83}
+{'loss': 0.8726, 'grad_norm': 4.003941059112549, 'learning_rate': 2.7297165200391e-05, 'epoch': 1.83}
+{'loss': 0.7593, 'grad_norm': 3.651909351348877, 'learning_rate': 2.727272727272727e-05, 'epoch': 1.83}
+{'loss': 1.0815, 'grad_norm': 5.829402923583984, 'learning_rate': 2.7248289345063535e-05, 'epoch': 1.83}
+{'loss': 0.8428, 'grad_norm': 4.480926036834717, 'learning_rate': 2.72238514173998e-05, 'epoch': 1.83}
+{'loss': 1.0317, 'grad_norm': 3.8357913494110107, 'learning_rate': 2.7199413489736068e-05, 'epoch': 1.83}
+{'loss': 1.6432, 'grad_norm': 6.306792259216309, 'learning_rate': 2.7174975562072332e-05, 'epoch': 1.83}
+{'loss': 1.1734, 'grad_norm': 3.932875156402588, 'learning_rate': 2.7150537634408597e-05, 'epoch': 1.83}
+{'loss': 1.1751, 'grad_norm': 3.3402152061462402, 'learning_rate': 2.7126099706744865e-05, 'epoch': 1.83}
+{'loss': 1.3794, 'grad_norm': 4.163675785064697, 'learning_rate': 2.710166177908113e-05, 'epoch': 1.83}
+{'loss': 0.7359, 'grad_norm': 2.3751349449157715, 'learning_rate': 2.7077223851417395e-05, 'epoch': 1.83}
+{'loss': 0.4729, 'grad_norm': 1.7543164491653442, 'learning_rate': 2.7052785923753663e-05, 'epoch': 1.83}
+{'loss': 0.4018, 'grad_norm': 3.514760732650757, 'learning_rate': 2.7028347996089928e-05, 'epoch': 1.83}
+{'loss': 0.6138, 'grad_norm': 2.0676984786987305, 'learning_rate': 2.7003910068426193e-05, 'epoch': 1.83}
+{'loss': 0.8691, 'grad_norm': 2.1401822566986084, 'learning_rate': 2.697947214076246e-05, 'epoch': 1.83}
+{'loss': 0.3156, 'grad_norm': 1.3781170845031738, 'learning_rate': 2.6955034213098726e-05, 'epoch': 1.83}
+{'loss': 0.3452, 'grad_norm': 3.473524808883667, 'learning_rate': 2.693059628543499e-05, 'epoch': 1.83}
+{'loss': 0.2908, 'grad_norm': 2.099724531173706, 'learning_rate': 2.690615835777126e-05, 'epoch': 1.83}
+{'loss': 0.3801, 'grad_norm': 1.1955111026763916, 'learning_rate': 2.6881720430107523e-05, 'epoch': 1.83}
+{'loss': 0.3253, 'grad_norm': 1.2508939504623413, 'learning_rate': 2.6857282502443788e-05, 'epoch': 1.83}
+{'loss': 0.3607, 'grad_norm': 1.153827428817749, 'learning_rate': 2.6832844574780056e-05, 'epoch': 1.83}
+{'loss': 0.411, 'grad_norm': 0.8758578300476074, 'learning_rate': 2.680840664711632e-05, 'epoch': 1.83}
+{'loss': 0.3108, 'grad_norm': 1.912964105606079, 'learning_rate': 2.6783968719452586e-05, 'epoch': 1.83}
+{'loss': 0.4525, 'grad_norm': 1.8823193311691284, 'learning_rate': 2.6759530791788854e-05, 'epoch': 1.83}
+{'loss': 0.5261, 'grad_norm': 1.5550634860992432, 'learning_rate': 2.673509286412512e-05, 'epoch': 1.83}
+{'loss': 0.3801, 'grad_norm': 1.8203532695770264, 'learning_rate': 2.6710654936461384e-05, 'epoch': 1.83}
+{'loss': 0.4729, 'grad_norm': 3.2406249046325684, 'learning_rate': 2.6686217008797652e-05, 'epoch': 1.83}
+ 92%|█████████▏| 11701/12776 [2:03:16<11:09,  1.61it/s] 92%|█████████▏| 11702/12776 [2:03:16<10:16,  1.74it/s]                                                        92%|█████████▏| 11702/12776 [2:03:16<10:16,  1.74it/s] 92%|█████████▏| 11703/12776 [2:03:17<09:35,  1.86it/s]                                                        92%|█████████▏| 11703/12776 [2:03:17<09:35,  1.86it/s] 92%|█████████▏| 11704/12776 [2:03:17<09:35,  1.86it/s]                                                        92%|█████████▏| 11704/12776 [2:03:17<09:35,  1.86it/s] 92%|█████████▏| 11705/12776 [2:03:18<09:01,  1.98it/s]                                                        92%|█████████▏| 11705/12776 [2:03:18<09:01,  1.98it/s] 92%|█████████▏| 11706/12776 [2:03:18<08:52,  2.01it/s]                                                        92%|█████████▏| 11706/12776 [2:03:18<08:52,  2.01it/s] 92%|█████████▏| 11707/12776 [2:03:18<08:18,  2.14it/s]                                                        92%|█████████▏| 11707/12776 [2:03:18<08:18,  2.14it/s] 92%|█████████▏| 11708/12776 [2:03:19<07:50,  2.27it/s]                                                        92%|█████████▏| 11708/12776 [2:03:19<07:50,  2.27it/s] 92%|█████████▏| 11709/12776 [2:03:19<07:35,  2.34it/s]                                                        92%|█████████▏| 11709/12776 [2:03:19<07:35,  2.34it/s] 92%|█████████▏| 11710/12776 [2:03:20<07:10,  2.47it/s]                                                        92%|█████████▏| 11710/12776 [2:03:20<07:10,  2.47it/s] 92%|█████████▏| 11711/12776 [2:03:20<06:51,  2.59it/s]                                                        92%|█████████▏| 11711/12776 [2:03:20<06:51,  2.59it/s] 92%|█████████▏| 11712/12776 [2:03:20<07:05,  2.50it/s]                                                        92%|█████████▏| 11712/12776 [2:03:20<07:05,  2.50it/s] 92%|█████████▏| 11713/12776 [2:03:21<06:40,  2.65it/s]                                                        92%|█████████▏| 11713/12776 [2:03:21<06:40,  2.65it/s] 92%|█████████▏| 11714/12776 [2:03:21<06:17,  2.81it/s]                                                        92%|█████████▏| 11714/12776 [2:03:21<06:17,  2.81it/s] 92%|█████████▏| 11715/12776 [2:03:21<05:57,  2.97it/s]                                                        92%|█████████▏| 11715/12776 [2:03:21<05:57,  2.97it/s] 92%|█████████▏| 11716/12776 [2:03:22<05:58,  2.95it/s]                                                        92%|█████████▏| 11716/12776 [2:03:22<05:58,  2.95it/s] 92%|█████████▏| 11717/12776 [2:03:22<05:38,  3.12it/s]                                                        92%|█████████▏| 11717/12776 [2:03:22<05:38,  3.12it/s] 92%|█████████▏| 11718/12776 [2:03:22<05:21,  3.29it/s]                                                        92%|█████████▏| 11718/12776 [2:03:22<05:21,  3.29it/s] 92%|█████████▏| 11719/12776 [2:03:22<05:07,  3.43it/s]                                                        92%|█████████▏| 11719/12776 [2:03:22<05:07,  3.43it/s] 92%|█████████▏| 11720/12776 [2:03:23<05:14,  3.36it/s]                                                        92%|█████████▏| 11720/12776 [2:03:23<05:14,  3.36it/s] 92%|█████████▏| 11721/12776 [2:03:23<04:57,  3.55it/s]                                                        92%|█████████▏| 11721/12776 [2:03:23<04:57,  3.55it/s] 92%|█████████▏| 11722/12776 [2:03:23<04:42,  3.73it/s]                                                        92%|█████████▏| 11722/12776 [2:03:23<04:42,  3.73it/s] 92%|█████████▏| 11723/12776 [2:03:23<04:30,  3.90it/s]                                                        92%|█████████▏| 11723/12776 [2:03:23<04:30,  3.90it/s] 92%|█████████▏| 11724/12776 [2:03:24<04:43,  3.71it/s]                                                        92%|█████████▏| 11724/12776 [2:03:24<04:43,  3.71it/s] 92%|█████████▏| 11725/12776 [2:03:24<04:27,  3.93it/s]                                                        92%|█████████▏| 11725/12776 [2:03:24<04:27,  3.93it/s] 92%|█████████▏| 11726/12776 [2:03:24<04:13,  4.13it/s]                                                        92%|█████████▏| 11726/12776 [2:03:24<04:13,  4.13it/s] 92%|█████████▏| 11727/12776 [2:03:24<04:03,  4.31it/s]                                                        92%|█████████▏| 11727/12776 [2:03:24<04:03,  4.31it/s] 92%|█████████▏| 11728/12776 [2:03:25<03:56,  4.44it/s]                                                        92%|█████████▏| 11728/12776 [2:03:25<03:56,  4.44it/s] 92%|█████████▏| 11729/12776 [2:03:25<04:15,  4.10it/s]                                                        92%|█████████▏| 11729/12776 [2:03:25<04:15,  4.10it/s] 92%|█████████▏| 11730/12776 [2:03:25<04:02,  4.32it/s]                                                        92%|█████████▏| 11730/12776 [2:03:25<04:02,  4.32it/s] 92%|█████████▏| 11731/12776 [2:03:25<03:51,  4.51it/s]                                                        92%|█████████▏| 11731/12776 [2:03:25<03:51,  4.51it/s] 92%|█████████▏| 11732/12776 [2:03:25<03:43,  4.67it/s]                                                        92%|█████████▏| 11732/12776 [2:03:25<03:43,  4.67it/s] 92%|█████████▏| 11733/12776 [2:03:26<03:38,  4.78it/s]                                                        92%|█████████▏| 11733/12776 [2:03:26<03:38,  4.78it/s] 92%|█████████▏| 11734/12776 [2:03:26<03:33,  4.88it/s]                                                        92%|█████████▏| 11734/12776 [2:03:26<03:33,  4.88it/s] 92%|█████████▏| 11735/12776 [2:03:26<03:54,  4.45it/s]                                                        92%|█████████▏| 11735/12776 [2:03:26<03:54,  4.45it/s] 92%|█████████▏| 11736/12776 [2:03:26<03:42,  4.68it/s]                                                        92%|█████████▏| 11736/12776 [2:03:26<03:42,  4.68it/s] 92%|█████████▏| 11737/12776 [2:03:26<03:32,  4.90it/s]                                                        92%|█████████▏| 11737/12776 [2:03:26<03:32,  4.90it/s] 92%|█████████▏| 11738/12776 [2:03:27<06:23,  2.70it/s]                                                        92%|█████████▏| 11738/12776 [2:03:27<06:23,  2.70it/s] 92%|█████████▏| 11739/12776 [2:03:29<11:23,  1.52it/s]                                                        92%|█████████▏| 11739/12776 [2:03:29<11:23,  1.52it/s] 92%|█████████▏| 11740/12776 [2:03:29<12:42,  1.36it/s]                                                        92%|█████████▏| 11740/12776 [2:03:29<12:42,  1.36it/s] 92%|█████████▏| 11741/12776 [2:03:30<13:19,  1.30it/s]                                                        92%|█████████▏| 11741/12776 [2:03:30<13:19,  1.30it/s] 92%|█████████▏| 11742/12776 [2:03:31<13:17,  1.30it/s]                                                        92%|█████████▏| 11742/12776 [2:03:31<13:17,  1.30it/s] 92%|█████████▏| 11743/12776 [2:03:32<13:05,  1.31it/s]                                                        92%|█████████▏| 11743/12776 [2:03:32<13:05,  1.31it/s] 92%|█████████▏| 11744/12776 [2:03:33<12:44,  1.35it/s]                                                        92%|█████████▏| 11744/12776 [2:03:33<12:44,  1.35it/s] 92%|█████████▏| 11745/12776 [2:03:33<12:37,  1.36it/s]                                                        92%|█████████▏| 11745/12776 [2:03:33<12:37,  1.36it/s] 92%|█████████▏| 11746/12776 [2:03:34<12:38,  1.36it/s]                                                        92%|█████████▏| 11746/12776 [2:03:34<12:38,  1.36it/s] 92%|█████████▏| 11747/12776 [2:03:35<11:46,  1.46it/s]                                                        92%|█████████▏| 11747/12776 [2:03:35<11:46,  1.46it/s] 92%|█████████▏| 11748/12776 [2:03:35<11:21,  1.51it/s]                                                        92%|█████████▏| 11748/12776 [2:03:35<11:21,  1.51it/s] 92%|█████████▏| 11749/12776 [2:03:36<10:39,  1.61it/s]                                                        92%|█████████▏| 11749/12776 [2:03:36<10:39,  1.61it/s] 92%|█████████▏| 11750/12776 [2:03:36<10:33,  1.62it/s]                                                        92%|█████████▏| 11750/12776 [2:03:36<10:33,  1.62it/s] 92%|█████████▏| 11751/12776 [2:03:37<09:42,  1.76it/s]                                                        92%|█████████▏| 11751/12776 [2:03:37<09:42,  1.76it/s] 92%|█████████▏| 11752/12776 [2:03:37<09:02,  1.89it/s]                                                        92%|█████████▏| 11752/12776 [2:03:37<09:02,  1.89it/s] 92%|█████████▏| 11753/12776 [2:03:38<08:57,  1.90it/s]                                                        92%|█████████▏| 11753/12776 [2:03:38<08:57,  1.90it/s] 92%|█████████▏| 11754/12776 [2:03:38<08:22,  2.03it/s]                                                        92%|█████████▏| 11754/12776 [2:03:38<08:22,  2.03it/s] 92%|█████████▏| 11755/12776 [2:03:39<08:11,  2.08it/s]                                                        92%|█████████▏| 11755/12776 [2:03:39<08:11,  2.08it/s] 92%|█████████▏| 11756/12776 [2:03:39<07:44,  2.20it/s]                                                        92%|█████████▏| 11756/12776 [2:03:39<07:44,  2.20it/s] 92%|█████████▏| 11757/12776 [2:03:39<07:21,  2.31it/s]                                                        92%|█████████▏| 11757/12776 [2:03:39<07:21,  2.31it/s] 92%|█████████▏| 11758/12776 [2:03:40<07:00,  2.42it/s]                                                        92%|█████████▏| 11758/12776 [2:03:40<07:00,  2.42it/s] 92%|█████████▏| 11759/12776 [2:03:40<06:47,  2.50it/s]                                                        92%|█████████▏| 11759/12776 [2:03:40<06:47,  2.50it/s] 92%|█████████▏| 11760/12776 [2:03:40<06:29,  2.61it/s]                                                        92%|█████████▏| 11760/12776 [2:03:40<06:29,  2.61it/s] 92%|█████████▏| 11761/12776 [2:03:41<06:34,  2.57it/s]                                                        92%|█████████▏| 11761/12776 [2:03:41<06:34,  2.57it/s] 92%|█████████▏| 11762/12776 [2:03:41<06:14,  2.70it/s]                                                        92%|█████████▏| 11762/12776 [2:03:41<06:14,  2.70it/s] 92%|█████████▏| 11763/12776 [2:03:41<05:55,  2.85it/s]                                                        92%|█████████▏| 11763/12776 [2:03:41<05:55,  2.85it/s] 92%|█████████▏| 11764/12776 [2:03:42<05:38,  2.99it/s]                                                        92%|█████████▏| 11764/12776 [2:03:42<05:38,  2.99it/s] 92%|█████████▏| 11765/12776 [2:03:42<05:43,  2.94it/s]                                                        92%|█████████▏| 11765/12776 [2:03:42<05:43,  2.94it/s] 92%|█████████▏| 11766/12776 [2:03:42<05:24,  3.11it/s]                                                        92%|█████████▏| 11766/12776 [2:03:42<05:24,  3.11it/s] 92%|█████████▏| 11767/12776 [2:03:43<05:08,  3.27it/s]                                                        92%|█████████▏| 11767/12776 [2:03:43<05:08,  3.27it/s] 92%|█████████▏| 11768/12776 [2:03:43<04:55,  3.41it/s]                                                        92%|█████████▏| 11768/12776 [2:03:43<04:55,  3.41it/s] 92%|█████████▏| 11769/12776 [2:03:43<05:07,  3.27it/s]                                                        92%|█████████▏| 11769/12776 [2:03:43<05:07,  3.27it/s] 92%|█████████▏| 11770/12776 [2:03:44<04:49,  3.48it/s]                                                        92%|█████████▏| 11770/12776 [2:03:44<04:49,  3.48it/s] 92%|█████████▏| 11771/12776 [2:03:44<04:36,  3.63it/s]                                                        92%|█████████▏| 11771/12776 [2:03:44<04:36,  3.63it/s] 92%|█████████▏| 11772/12776 [2:03:44<04:25,  3.78it/s]                                                        92%|█████████▏| 11772/12776 [2:03:44<04:25,  3.78it/s] 92%|█████████▏| 11773/12776 [2:03:44<05:00,  3.34it/s]                                                        92%|█████████▏| 11773/12776 [2:03:44<05:00,  3.34it/s] 92%|█████████▏| 11774/12776 [2:03:45<04:41,  3.57it/s]                                                        92%|█████████▏| 11774/12776 [2:03:45<04:41,  3.57it/s] 92%|█████████▏| 11775/12776 [2:03:45<04:23,  3.80it/s]                                                        92%|█████████▏| 11775/12776 [2:03:45<04:23,  3.80it/s] 92%|█████████▏| 11776/12776 [2:03:45<04:12,  3.96it/s]                                                        92%|█████████▏| 11776/12776 [2:03:45<04:12,  3.96it/s] 92%|█████████▏| 11777/12776 [2:03:45<04:39,  3.58it/s]                                                        92%|█████████▏| 11777/12776 [2:03:45<04:39,  3.58it/s] 92%|█████████▏| 11778/12776 [2:03:46<04:18,  3.86it/s]                                                       {'loss': 0.4572, 'grad_norm': 0.9475135207176208, 'learning_rate': 2.6661779081133917e-05, 'epoch': 1.83}
+{'loss': 0.5409, 'grad_norm': 1.6032726764678955, 'learning_rate': 2.663734115347018e-05, 'epoch': 1.83}
+{'loss': 0.4271, 'grad_norm': 1.069624423980713, 'learning_rate': 2.661290322580645e-05, 'epoch': 1.83}
+{'loss': 0.4135, 'grad_norm': 1.8359605073928833, 'learning_rate': 2.6588465298142715e-05, 'epoch': 1.83}
+{'loss': 0.6246, 'grad_norm': 4.056179046630859, 'learning_rate': 2.656402737047898e-05, 'epoch': 1.83}
+{'loss': 0.4708, 'grad_norm': 1.7407777309417725, 'learning_rate': 2.6539589442815248e-05, 'epoch': 1.83}
+{'loss': 0.5088, 'grad_norm': 1.6690481901168823, 'learning_rate': 2.6515151515151512e-05, 'epoch': 1.83}
+{'loss': 0.6305, 'grad_norm': 1.289447546005249, 'learning_rate': 2.6490713587487777e-05, 'epoch': 1.83}
+{'loss': 0.7835, 'grad_norm': 2.4667999744415283, 'learning_rate': 2.6466275659824045e-05, 'epoch': 1.83}
+{'loss': 0.6884, 'grad_norm': 1.8310896158218384, 'learning_rate': 2.644183773216031e-05, 'epoch': 1.83}
+{'loss': 0.7651, 'grad_norm': 1.9276304244995117, 'learning_rate': 2.6417399804496575e-05, 'epoch': 1.83}
+{'loss': 0.5969, 'grad_norm': 2.878755807876587, 'learning_rate': 2.6392961876832843e-05, 'epoch': 1.83}
+{'loss': 0.6253, 'grad_norm': 1.5785597562789917, 'learning_rate': 2.6368523949169108e-05, 'epoch': 1.83}
+{'loss': 0.4936, 'grad_norm': 1.4301279783248901, 'learning_rate': 2.6344086021505373e-05, 'epoch': 1.83}
+{'loss': 0.7848, 'grad_norm': 3.3966879844665527, 'learning_rate': 2.631964809384164e-05, 'epoch': 1.83}
+{'loss': 0.7803, 'grad_norm': 5.173041820526123, 'learning_rate': 2.6295210166177906e-05, 'epoch': 1.83}
+{'loss': 0.8126, 'grad_norm': 3.9286694526672363, 'learning_rate': 2.627077223851417e-05, 'epoch': 1.83}
+{'loss': 0.6296, 'grad_norm': 6.795763969421387, 'learning_rate': 2.624633431085044e-05, 'epoch': 1.83}
+{'loss': 0.9323, 'grad_norm': 6.505904197692871, 'learning_rate': 2.6221896383186704e-05, 'epoch': 1.83}
+{'loss': 0.8992, 'grad_norm': 2.0406224727630615, 'learning_rate': 2.619745845552297e-05, 'epoch': 1.83}
+{'loss': 0.9607, 'grad_norm': 11.124521255493164, 'learning_rate': 2.6173020527859237e-05, 'epoch': 1.83}
+{'loss': 1.0505, 'grad_norm': 10.044174194335938, 'learning_rate': 2.61485826001955e-05, 'epoch': 1.84}
+{'loss': 0.7101, 'grad_norm': 5.03192138671875, 'learning_rate': 2.6124144672531766e-05, 'epoch': 1.84}
+{'loss': 0.6641, 'grad_norm': 3.8350234031677246, 'learning_rate': 2.6099706744868034e-05, 'epoch': 1.84}
+{'loss': 0.5772, 'grad_norm': 1.9279121160507202, 'learning_rate': 2.60752688172043e-05, 'epoch': 1.84}
+{'loss': 0.5835, 'grad_norm': 2.837632656097412, 'learning_rate': 2.6050830889540564e-05, 'epoch': 1.84}
+{'loss': 0.9303, 'grad_norm': 4.1351237297058105, 'learning_rate': 2.6026392961876832e-05, 'epoch': 1.84}
+{'loss': 0.7467, 'grad_norm': 2.9830517768859863, 'learning_rate': 2.6001955034213097e-05, 'epoch': 1.84}
+{'loss': 0.8099, 'grad_norm': 2.7831618785858154, 'learning_rate': 2.5977517106549362e-05, 'epoch': 1.84}
+{'loss': 1.2066, 'grad_norm': 2.7024545669555664, 'learning_rate': 2.595307917888563e-05, 'epoch': 1.84}
+{'loss': 1.3392, 'grad_norm': 7.1745524406433105, 'learning_rate': 2.5928641251221895e-05, 'epoch': 1.84}
+{'loss': 1.2869, 'grad_norm': 3.5159523487091064, 'learning_rate': 2.590420332355816e-05, 'epoch': 1.84}
+{'loss': 0.8537, 'grad_norm': 1.527916669845581, 'learning_rate': 2.5879765395894428e-05, 'epoch': 1.84}
+{'loss': 0.4337, 'grad_norm': 1.3133277893066406, 'learning_rate': 2.5855327468230693e-05, 'epoch': 1.84}
+{'loss': 0.5792, 'grad_norm': 1.1044042110443115, 'learning_rate': 2.5830889540566957e-05, 'epoch': 1.84}
+{'loss': 0.3835, 'grad_norm': 1.173004388809204, 'learning_rate': 2.5806451612903226e-05, 'epoch': 1.84}
+{'loss': 0.7743, 'grad_norm': 3.5974175930023193, 'learning_rate': 2.578201368523949e-05, 'epoch': 1.84}
+{'loss': 0.6945, 'grad_norm': 1.2620445489883423, 'learning_rate': 2.5757575757575755e-05, 'epoch': 1.84}
+{'loss': 0.4413, 'grad_norm': 0.7731130719184875, 'learning_rate': 2.573313782991202e-05, 'epoch': 1.84}
+{'loss': 0.416, 'grad_norm': 1.389115810394287, 'learning_rate': 2.5708699902248288e-05, 'epoch': 1.84}
+{'loss': 0.5073, 'grad_norm': 0.8706973195075989, 'learning_rate': 2.5684261974584553e-05, 'epoch': 1.84}
+{'loss': 0.3761, 'grad_norm': 0.5937680602073669, 'learning_rate': 2.5659824046920818e-05, 'epoch': 1.84}
+{'loss': 0.4664, 'grad_norm': 1.8303773403167725, 'learning_rate': 2.5635386119257086e-05, 'epoch': 1.84}
+{'loss': 0.5187, 'grad_norm': 1.462870717048645, 'learning_rate': 2.561094819159335e-05, 'epoch': 1.84}
+{'loss': 0.5145, 'grad_norm': 1.8268141746520996, 'learning_rate': 2.5586510263929615e-05, 'epoch': 1.84}
+{'loss': 0.5543, 'grad_norm': 1.6612815856933594, 'learning_rate': 2.5562072336265884e-05, 'epoch': 1.84}
+{'loss': 0.4739, 'grad_norm': 3.4995779991149902, 'learning_rate': 2.553763440860215e-05, 'epoch': 1.84}
+{'loss': 0.4802, 'grad_norm': 1.3062471151351929, 'learning_rate': 2.5513196480938413e-05, 'epoch': 1.84}
+{'loss': 1.0193, 'grad_norm': 6.06448221206665, 'learning_rate': 2.548875855327468e-05, 'epoch': 1.84}
+{'loss': 0.5685, 'grad_norm': 2.5236899852752686, 'learning_rate': 2.5464320625610946e-05, 'epoch': 1.84}
+{'loss': 0.5858, 'grad_norm': 1.4289917945861816, 'learning_rate': 2.543988269794721e-05, 'epoch': 1.84}
+{'loss': 0.6848, 'grad_norm': 1.3175865411758423, 'learning_rate': 2.541544477028348e-05, 'epoch': 1.84}
+{'loss': 0.62, 'grad_norm': 3.198878288269043, 'learning_rate': 2.5391006842619744e-05, 'epoch': 1.84}
+{'loss': 0.5892, 'grad_norm': 2.0949974060058594, 'learning_rate': 2.536656891495601e-05, 'epoch': 1.84}
+{'loss': 0.8502, 'grad_norm': 2.16085147857666, 'learning_rate': 2.5342130987292277e-05, 'epoch': 1.84}
+{'loss': 0.7301, 'grad_norm': 3.7577946186065674, 'learning_rate': 2.5317693059628542e-05, 'epoch': 1.84}
+{'loss': 0.7686, 'grad_norm': 1.981907606124878, 'learning_rate': 2.5293255131964807e-05, 'epoch': 1.84}
+{'loss': 0.663, 'grad_norm': 4.2499308586120605, 'learning_rate': 2.5268817204301075e-05, 'epoch': 1.84}
+{'loss': 0.8082, 'grad_norm': 5.370046138763428, 'learning_rate': 2.524437927663734e-05, 'epoch': 1.84}
+{'loss': 0.7207, 'grad_norm': 1.3637969493865967, 'learning_rate': 2.5219941348973604e-05, 'epoch': 1.84}
+{'loss': 0.6059, 'grad_norm': 1.314660668373108, 'learning_rate': 2.5195503421309873e-05, 'epoch': 1.84}
+{'loss': 0.684, 'grad_norm': 6.1758809089660645, 'learning_rate': 2.5171065493646137e-05, 'epoch': 1.84}
+{'loss': 0.7404, 'grad_norm': 2.818993330001831, 'learning_rate': 2.5146627565982402e-05, 'epoch': 1.84}
+{'loss': 0.8012, 'grad_norm': 1.828629732131958, 'learning_rate': 2.512218963831867e-05, 'epoch': 1.84}
+{'loss': 0.7304, 'grad_norm': 2.7739205360412598, 'learning_rate': 2.5097751710654935e-05, 'epoch': 1.84}
+{'loss': 0.6273, 'grad_norm': 8.794857025146484, 'learning_rate': 2.50733137829912e-05, 'epoch': 1.84}
+{'loss': 0.6637, 'grad_norm': 1.3181114196777344, 'learning_rate': 2.5048875855327468e-05, 'epoch': 1.84}
+{'loss': 0.5413, 'grad_norm': 1.4442148208618164, 'learning_rate': 2.5024437927663733e-05, 'epoch': 1.84}
+{'loss': 0.7349, 'grad_norm': 4.040245532989502, 'learning_rate': 2.4999999999999998e-05, 'epoch': 1.84}
+{'loss': 0.8002, 'grad_norm': 3.4803292751312256, 'learning_rate': 2.4975562072336266e-05, 'epoch': 1.84}
+{'loss': 0.4308, 'grad_norm': 27.806154251098633, 'learning_rate': 2.495112414467253e-05, 'epoch': 1.84}
+{'loss': 1.6088, 'grad_norm': 6.975183963775635, 'learning_rate': 2.4926686217008796e-05, 'epoch': 1.84}
+{'loss': 0.9311, 'grad_norm': 3.424659013748169, 'learning_rate': 2.4902248289345064e-05, 'epoch': 1.84}
+{'loss': 1.0048, 'grad_norm': 2.6333138942718506, 'learning_rate': 2.487781036168133e-05, 'epoch': 1.84}
+{'loss': 0.8559, 'grad_norm': 3.147855520248413, 'learning_rate': 2.4853372434017593e-05, 'epoch': 1.84}
+{'loss': 1.0206, 'grad_norm': 2.6393229961395264, 'learning_rate': 2.482893450635386e-05, 'epoch': 1.84}
+{'loss': 0.4979, 'grad_norm': 2.7040412425994873, 'learning_rate': 2.4804496578690126e-05, 'epoch': 1.84}
+ 92%|█████████▏| 11778/12776 [2:03:46<04:18,  3.86it/s] 92%|█████████▏| 11779/12776 [2:03:46<04:08,  4.01it/s]                                                        92%|█████████▏| 11779/12776 [2:03:46<04:08,  4.01it/s] 92%|█████████▏| 11780/12776 [2:03:46<03:55,  4.24it/s]                                                        92%|█████████▏| 11780/12776 [2:03:46<03:55,  4.24it/s] 92%|█████████▏| 11781/12776 [2:03:46<03:43,  4.45it/s]                                                        92%|█████████▏| 11781/12776 [2:03:46<03:43,  4.45it/s] 92%|█████████▏| 11782/12776 [2:03:47<03:59,  4.15it/s]                                                        92%|█████████▏| 11782/12776 [2:03:47<03:59,  4.15it/s] 92%|█████████▏| 11783/12776 [2:03:47<03:45,  4.40it/s]                                                        92%|█████████▏| 11783/12776 [2:03:47<03:45,  4.40it/s] 92%|█████████▏| 11784/12776 [2:03:47<03:35,  4.60it/s]                                                        92%|█████████▏| 11784/12776 [2:03:47<03:35,  4.60it/s] 92%|█████████▏| 11785/12776 [2:03:47<03:27,  4.78it/s]                                                        92%|█████████��| 11785/12776 [2:03:47<03:27,  4.78it/s] 92%|█████████▏| 11786/12776 [2:03:47<03:19,  4.95it/s]                                                        92%|█████████▏| 11786/12776 [2:03:47<03:19,  4.95it/s] 92%|█████████▏| 11787/12776 [2:03:47<03:14,  5.08it/s]                                                        92%|█████████▏| 11787/12776 [2:03:47<03:14,  5.08it/s] 92%|█████████▏| 11788/12776 [2:03:48<05:50,  2.82it/s]                                                        92%|█████████▏| 11788/12776 [2:03:48<05:50,  2.82it/s] 92%|█████████▏| 11789/12776 [2:03:50<11:42,  1.40it/s]                                                        92%|█████████▏| 11789/12776 [2:03:50<11:42,  1.40it/s] 92%|█████████▏| 11790/12776 [2:03:51<12:45,  1.29it/s]                                                        92%|█████████▏| 11790/12776 [2:03:51<12:45,  1.29it/s] 92%|█████████▏| 11791/12776 [2:03:52<13:25,  1.22it/s]                                                        92%|█████████▏| 11791/12776 [2:03:52<13:25,  1.22it/s] 92%|█████████▏| 11792/12776 [2:03:53<14:11,  1.16it/s]                                                        92%|█████████▏| 11792/12776 [2:03:53<14:11,  1.16it/s] 92%|█████████▏| 11793/12776 [2:03:53<14:04,  1.16it/s]                                                        92%|█████████▏| 11793/12776 [2:03:53<14:04,  1.16it/s] 92%|█████████▏| 11794/12776 [2:03:54<13:19,  1.23it/s]                                                        92%|█████████▏| 11794/12776 [2:03:54<13:19,  1.23it/s] 92%|█████████▏| 11795/12776 [2:03:55<12:45,  1.28it/s]                                                        92%|█████████▏| 11795/12776 [2:03:55<12:45,  1.28it/s] 92%|█████████▏| 11796/12776 [2:03:55<12:01,  1.36it/s]                                                        92%|█████████▏| 11796/12776 [2:03:55<12:01,  1.36it/s] 92%|█████████▏| 11797/12776 [2:03:56<11:31,  1.42it/s]                                                        92%|█████████▏| 11797/12776 [2:03:56<11:31,  1.42it/s] 92%|█████████▏| 11798/12776 [2:03:57<10:50,  1.50it/s]                                                        92%|█████████▏| 11798/12776 [2:03:57<10:50,  1.50it/s] 92%|█████████▏| 11799/12776 [2:03:57<10:23,  1.57it/s]                                                        92%|█████████▏| 11799/12776 [2:03:57<10:23,  1.57it/s] 92%|█████████▏| 11800/12776 [2:03:58<09:41,  1.68it/s]                                                        92%|█████████▏| 11800/12776 [2:03:58<09:41,  1.68it/s] 92%|█████████▏| 11801/12776 [2:03:58<09:23,  1.73it/s]                                                        92%|█████████▏| 11801/12776 [2:03:58<09:23,  1.73it/s] 92%|█████████▏| 11802/12776 [2:03:59<08:42,  1.86it/s]                                                        92%|█████████▏| 11802/12776 [2:03:59<08:42,  1.86it/s] 92%|█████████▏| 11803/12776 [2:03:59<08:39,  1.87it/s]                                                        92%|█████████▏| 11803/12776 [2:03:59<08:39,  1.87it/s] 92%|█████████▏| 11804/12776 [2:04:00<08:02,  2.01it/s]                                                        92%|█████████▏| 11804/12776 [2:04:00<08:02,  2.01it/s] 92%|█████████▏| 11805/12776 [2:04:00<07:32,  2.15it/s]                                                        92%|█████████▏| 11805/12776 [2:04:00<07:32,  2.15it/s] 92%|█████████▏| 11806/12776 [2:04:01<07:45,  2.09it/s]                                                        92%|█████████▏| 11806/12776 [2:04:01<07:45,  2.09it/s] 92%|█████████▏| 11807/12776 [2:04:01<07:09,  2.26it/s]                                                        92%|█████████▏| 11807/12776 [2:04:01<07:09,  2.26it/s] 92%|█████████▏| 11808/12776 [2:04:01<06:39,  2.43it/s]                                                        92%|█████████▏| 11808/12776 [2:04:01<06:39,  2.43it/s] 92%|█████████▏| 11809/12776 [2:04:02<06:28,  2.49it/s]                                                        92%|█████████▏| 11809/12776 [2:04:02<06:28,  2.49it/s] 92%|█████████▏| 11810/12776 [2:04:02<06:05,  2.64it/s]                                                        92%|█████████▏| 11810/12776 [2:04:02<06:05,  2.64it/s] 92%|█████████▏| 11811/12776 [2:04:02<05:47,  2.78it/s]                                                        92%|█████████▏| 11811/12776 [2:04:02<05:47,  2.78it/s] 92%|█████████▏| 11812/12776 [2:04:03<05:41,  2.82it/s]                                                        92%|█████████▏| 11812/12776 [2:04:03<05:41,  2.82it/s] 92%|█████████▏| 11813/12776 [2:04:03<05:22,  2.99it/s]                                                        92%|█████████▏| 11813/12776 [2:04:03<05:22,  2.99it/s] 92%|█████████▏| 11814/12776 [2:04:03<05:06,  3.14it/s]                                                        92%|█████████▏| 11814/12776 [2:04:03<05:06,  3.14it/s] 92%|█████████▏| 11815/12776 [2:04:03<04:52,  3.29it/s]                                                        92%|█████████▏| 11815/12776 [2:04:03<04:52,  3.29it/s] 92%|█████████▏| 11816/12776 [2:04:04<04:55,  3.25it/s]                                                        92%|█████████▏| 11816/12776 [2:04:04<04:55,  3.25it/s] 92%|█████████▏| 11817/12776 [2:04:04<04:40,  3.42it/s]                                                        92%|█████████▏| 11817/12776 [2:04:04<04:40,  3.42it/s] 93%|█████████▎| 11818/12776 [2:04:04<04:26,  3.59it/s]                                                        93%|█████████▎| 11818/12776 [2:04:04<04:26,  3.59it/s] 93%|█████████▎| 11819/12776 [2:04:05<04:16,  3.73it/s]                                                        93%|█████████▎| 11819/12776 [2:04:05<04:16,  3.73it/s] 93%|█████████▎| 11820/12776 [2:04:05<04:07,  3.86it/s]                                                        93%|█████████▎| 11820/12776 [2:04:05<04:07,  3.86it/s] 93%|█████████▎| 11821/12776 [2:04:05<04:21,  3.65it/s]                                                        93%|█████████▎| 11821/12776 [2:04:05<04:21,  3.65it/s] 93%|█████████▎| 11822/12776 [2:04:05<04:08,  3.84it/s]                                                        93%|█████████▎| 11822/12776 [2:04:05<04:08,  3.84it/s] 93%|█████████▎| 11823/12776 [2:04:06<03:57,  4.01it/s]                                                        93%|█████████▎| 11823/12776 [2:04:06<03:57,  4.01it/s] 93%|█████████▎| 11824/12776 [2:04:06<03:47,  4.18it/s]                                                        93%|█████████▎| 11824/12776 [2:04:06<03:47,  4.18it/s] 93%|█████████▎| 11825/12776 [2:04:06<04:00,  3.95it/s]                                                        93%|█████████▎| 11825/12776 [2:04:06<04:00,  3.95it/s] 93%|█████████▎| 11826/12776 [2:04:06<03:49,  4.13it/s]                                                        93%|█████████▎| 11826/12776 [2:04:06<03:49,  4.13it/s] 93%|█████████▎| 11827/12776 [2:04:06<03:41,  4.28it/s]                                                        93%|█████████▎| 11827/12776 [2:04:06<03:41,  4.28it/s] 93%|█████████▎| 11828/12776 [2:04:07<03:35,  4.39it/s]                                                        93%|█████████▎| 11828/12776 [2:04:07<03:35,  4.39it/s] 93%|█████████▎| 11829/12776 [2:04:07<03:29,  4.53it/s]                                                        93%|█████████▎| 11829/12776 [2:04:07<03:29,  4.53it/s] 93%|█████████▎| 11830/12776 [2:04:07<03:46,  4.18it/s]                                                        93%|█████████▎| 11830/12776 [2:04:07<03:46,  4.18it/s] 93%|█████████▎| 11831/12776 [2:04:07<03:35,  4.38it/s]                                                        93%|█████████▎| 11831/12776 [2:04:07<03:35,  4.38it/s] 93%|█████████▎| 11832/12776 [2:04:08<03:27,  4.55it/s]                                                        93%|█████████▎| 11832/12776 [2:04:08<03:27,  4.55it/s] 93%|█████████▎| 11833/12776 [2:04:08<03:20,  4.70it/s]                                                        93%|█████████▎| 11833/12776 [2:04:08<03:20,  4.70it/s] 93%|█████████▎| 11834/12776 [2:04:08<03:18,  4.75it/s]                                                        93%|█████████���| 11834/12776 [2:04:08<03:18,  4.75it/s] 93%|█████████▎| 11835/12776 [2:04:08<03:44,  4.19it/s]                                                        93%|█████████▎| 11835/12776 [2:04:08<03:44,  4.19it/s] 93%|█████████▎| 11836/12776 [2:04:08<03:30,  4.48it/s]                                                        93%|█████████▎| 11836/12776 [2:04:08<03:30,  4.48it/s] 93%|█████████▎| 11837/12776 [2:04:09<03:18,  4.73it/s]                                                        93%|█████████▎| 11837/12776 [2:04:09<03:18,  4.73it/s] 93%|█████████▎| 11838/12776 [2:04:09<06:20,  2.47it/s]                                                        93%|█████████▎| 11838/12776 [2:04:09<06:20,  2.47it/s] 93%|█████████▎| 11839/12776 [2:04:11<11:47,  1.32it/s]                                                        93%|█████████▎| 11839/12776 [2:04:11<11:47,  1.32it/s] 93%|█████████▎| 11840/12776 [2:04:12<13:11,  1.18it/s]                                                        93%|█████████▎| 11840/12776 [2:04:12<13:11,  1.18it/s] 93%|█████████▎| 11841/12776 [2:04:13<13:10,  1.18it/s]                                                        93%|█████████▎| 11841/12776 [2:04:13<13:10,  1.18it/s] 93%|█████████▎| 11842/12776 [2:04:14<12:53,  1.21it/s]                                                        93%|█████████▎| 11842/12776 [2:04:14<12:53,  1.21it/s] 93%|█████████▎| 11843/12776 [2:04:14<12:22,  1.26it/s]                                                        93%|█████████▎| 11843/12776 [2:04:14<12:22,  1.26it/s] 93%|█████████▎| 11844/12776 [2:04:15<11:52,  1.31it/s]                                                        93%|█████████▎| 11844/12776 [2:04:15<11:52,  1.31it/s] 93%|█████████▎| 11845/12776 [2:04:16<11:17,  1.37it/s]                                                        93%|█████████▎| 11845/12776 [2:04:16<11:17,  1.37it/s] 93%|█████████▎| 11846/12776 [2:04:17<11:29,  1.35it/s]                                                        93%|█████████▎| 11846/12776 [2:04:17<11:29,  1.35it/s] 93%|█████████▎| 11847/12776 [2:04:17<10:42,  1.44it/s]                                                        93%|█████████▎| 11847/12776 [2:04:17<10:42,  1.44it/s] 93%|█████████▎| 11848/12776 [2:04:18<10:24,  1.49it/s]                                                        93%|█████████▎| 11848/12776 [2:04:18<10:24,  1.49it/s] 93%|█████████▎| 11849/12776 [2:04:18<09:45,  1.58it/s]                                                        93%|█████████▎| 11849/12776 [2:04:18<09:45,  1.58it/s] 93%|█████████▎| 11850/12776 [2:04:19<09:32,  1.62it/s]                                                        93%|█████████▎| 11850/12776 [2:04:19<09:32,  1.62it/s] 93%|█████████▎| 11851/12776 [2:04:19<08:53,  1.73it/s]                                                        93%|█████████▎| 11851/12776 [2:04:19<08:53,  1.73it/s] 93%|█████████▎| 11852/12776 [2:04:20<08:38,  1.78it/s]                                                        93%|█████████▎| 11852/12776 [2:04:20<08:38,  1.78it/s] 93%|█████████▎| 11853/12776 [2:04:20<08:04,  1.90it/s]                                                        93%|█████████▎| 11853/12776 [2:04:20<08:04,  1.90it/s] 93%|█████████▎| 11854/12776 [2:04:21<08:00,  1.92it/s]                                                        93%|█████████▎| 11854/12776 [2:04:21<08:00,  1.92it/s] 93%|█████████▎| 11855/12776 [2:04:21<07:26,  2.06it/s]                                                       {'loss': 0.7279, 'grad_norm': 3.9352643489837646, 'learning_rate': 2.478005865102639e-05, 'epoch': 1.84}
+{'loss': 0.9773, 'grad_norm': 2.641021966934204, 'learning_rate': 2.475562072336266e-05, 'epoch': 1.84}
+{'loss': 0.9536, 'grad_norm': 2.157909631729126, 'learning_rate': 2.4731182795698924e-05, 'epoch': 1.84}
+{'loss': 1.4514, 'grad_norm': 3.1116890907287598, 'learning_rate': 2.470674486803519e-05, 'epoch': 1.84}
+{'loss': 0.6041, 'grad_norm': 3.8484981060028076, 'learning_rate': 2.4682306940371457e-05, 'epoch': 1.84}
+{'loss': 0.9883, 'grad_norm': 2.849015951156616, 'learning_rate': 2.4657869012707722e-05, 'epoch': 1.84}
+{'loss': 0.4752, 'grad_norm': 1.907538652420044, 'learning_rate': 2.4633431085043983e-05, 'epoch': 1.84}
+{'loss': 1.0383, 'grad_norm': 4.029622554779053, 'learning_rate': 2.4608993157380255e-05, 'epoch': 1.84}
+{'loss': 0.9394, 'grad_norm': 4.055169105529785, 'learning_rate': 2.458455522971652e-05, 'epoch': 1.85}
+{'loss': 0.3284, 'grad_norm': 1.425485372543335, 'learning_rate': 2.456011730205278e-05, 'epoch': 1.85}
+{'loss': 0.9571, 'grad_norm': 3.0947279930114746, 'learning_rate': 2.4535679374389053e-05, 'epoch': 1.85}
+{'loss': 0.8344, 'grad_norm': 2.302706718444824, 'learning_rate': 2.4511241446725318e-05, 'epoch': 1.85}
+{'loss': 0.5112, 'grad_norm': 5.67050838470459, 'learning_rate': 2.448680351906158e-05, 'epoch': 1.85}
+{'loss': 0.4184, 'grad_norm': 1.0470259189605713, 'learning_rate': 2.446236559139785e-05, 'epoch': 1.85}
+{'loss': 0.5126, 'grad_norm': 1.2459172010421753, 'learning_rate': 2.4437927663734115e-05, 'epoch': 1.85}
+{'loss': 0.4512, 'grad_norm': 0.8190130591392517, 'learning_rate': 2.4413489736070377e-05, 'epoch': 1.85}
+{'loss': 0.916, 'grad_norm': 3.6219711303710938, 'learning_rate': 2.438905180840665e-05, 'epoch': 1.85}
+{'loss': 0.6591, 'grad_norm': 2.1488611698150635, 'learning_rate': 2.4364613880742913e-05, 'epoch': 1.85}
+{'loss': 0.3781, 'grad_norm': 0.7660203576087952, 'learning_rate': 2.4340175953079175e-05, 'epoch': 1.85}
+{'loss': 0.4816, 'grad_norm': 1.8825334310531616, 'learning_rate': 2.4315738025415446e-05, 'epoch': 1.85}
+{'loss': 0.5589, 'grad_norm': 1.4968349933624268, 'learning_rate': 2.4291300097751708e-05, 'epoch': 1.85}
+{'loss': 0.4607, 'grad_norm': 3.8094868659973145, 'learning_rate': 2.4266862170087972e-05, 'epoch': 1.85}
+{'loss': 0.5093, 'grad_norm': 2.436556816101074, 'learning_rate': 2.4242424242424244e-05, 'epoch': 1.85}
+{'loss': 0.6517, 'grad_norm': 3.266652822494507, 'learning_rate': 2.4217986314760505e-05, 'epoch': 1.85}
+{'loss': 0.6738, 'grad_norm': 2.8735289573669434, 'learning_rate': 2.419354838709677e-05, 'epoch': 1.85}
+{'loss': 0.9188, 'grad_norm': 2.9087893962860107, 'learning_rate': 2.4169110459433035e-05, 'epoch': 1.85}
+{'loss': 0.5283, 'grad_norm': 1.770983338356018, 'learning_rate': 2.4144672531769303e-05, 'epoch': 1.85}
+{'loss': 0.6439, 'grad_norm': 4.079898357391357, 'learning_rate': 2.4120234604105568e-05, 'epoch': 1.85}
+{'loss': 0.9994, 'grad_norm': 2.5677294731140137, 'learning_rate': 2.4095796676441833e-05, 'epoch': 1.85}
+{'loss': 0.7825, 'grad_norm': 1.662412166595459, 'learning_rate': 2.40713587487781e-05, 'epoch': 1.85}
+{'loss': 0.6303, 'grad_norm': 1.9668818712234497, 'learning_rate': 2.4046920821114366e-05, 'epoch': 1.85}
+{'loss': 0.709, 'grad_norm': 2.1464879512786865, 'learning_rate': 2.402248289345063e-05, 'epoch': 1.85}
+{'loss': 0.6679, 'grad_norm': 5.349493980407715, 'learning_rate': 2.39980449657869e-05, 'epoch': 1.85}
+{'loss': 0.8859, 'grad_norm': 1.8723517656326294, 'learning_rate': 2.3973607038123163e-05, 'epoch': 1.85}
+{'loss': 0.5719, 'grad_norm': 2.6535778045654297, 'learning_rate': 2.3949169110459428e-05, 'epoch': 1.85}
+{'loss': 0.6213, 'grad_norm': 1.8219172954559326, 'learning_rate': 2.3924731182795696e-05, 'epoch': 1.85}
+{'loss': 0.6391, 'grad_norm': 2.3253509998321533, 'learning_rate': 2.390029325513196e-05, 'epoch': 1.85}
+{'loss': 0.4743, 'grad_norm': 5.5953545570373535, 'learning_rate': 2.3875855327468226e-05, 'epoch': 1.85}
+{'loss': 0.798, 'grad_norm': 4.139413356781006, 'learning_rate': 2.3851417399804494e-05, 'epoch': 1.85}
+{'loss': 0.4565, 'grad_norm': 10.96532154083252, 'learning_rate': 2.382697947214076e-05, 'epoch': 1.85}
+{'loss': 0.9685, 'grad_norm': 2.8876092433929443, 'learning_rate': 2.3802541544477024e-05, 'epoch': 1.85}
+{'loss': 1.0197, 'grad_norm': 3.1957509517669678, 'learning_rate': 2.3778103616813292e-05, 'epoch': 1.85}
+{'loss': 0.8237, 'grad_norm': 3.1257941722869873, 'learning_rate': 2.3753665689149557e-05, 'epoch': 1.85}
+{'loss': 0.8114, 'grad_norm': 3.2138302326202393, 'learning_rate': 2.372922776148582e-05, 'epoch': 1.85}
+{'loss': 0.7801, 'grad_norm': 7.504042625427246, 'learning_rate': 2.370478983382209e-05, 'epoch': 1.85}
+{'loss': 1.1196, 'grad_norm': 5.465351104736328, 'learning_rate': 2.3680351906158355e-05, 'epoch': 1.85}
+{'loss': 0.9248, 'grad_norm': 5.04585599899292, 'learning_rate': 2.365591397849462e-05, 'epoch': 1.85}
+{'loss': 0.6686, 'grad_norm': 1.4739549160003662, 'learning_rate': 2.3631476050830888e-05, 'epoch': 1.85}
+{'loss': 0.923, 'grad_norm': 3.8544936180114746, 'learning_rate': 2.3607038123167152e-05, 'epoch': 1.85}
+{'loss': 0.8553, 'grad_norm': 5.232646942138672, 'learning_rate': 2.3582600195503417e-05, 'epoch': 1.85}
+{'loss': 1.4067, 'grad_norm': 3.4390921592712402, 'learning_rate': 2.3558162267839685e-05, 'epoch': 1.85}
+{'loss': 0.9317, 'grad_norm': 4.184832572937012, 'learning_rate': 2.353372434017595e-05, 'epoch': 1.85}
+{'loss': 1.2859, 'grad_norm': 3.6399333477020264, 'learning_rate': 2.3509286412512215e-05, 'epoch': 1.85}
+{'loss': 0.8343, 'grad_norm': 2.263460636138916, 'learning_rate': 2.3484848484848483e-05, 'epoch': 1.85}
+{'loss': 0.7115, 'grad_norm': 1.4903042316436768, 'learning_rate': 2.3460410557184748e-05, 'epoch': 1.85}
+{'loss': 1.8051, 'grad_norm': 4.350191593170166, 'learning_rate': 2.3435972629521013e-05, 'epoch': 1.85}
+{'loss': 0.6415, 'grad_norm': 3.3762741088867188, 'learning_rate': 2.341153470185728e-05, 'epoch': 1.85}
+{'loss': 0.9455, 'grad_norm': 2.8959567546844482, 'learning_rate': 2.3387096774193546e-05, 'epoch': 1.85}
+{'loss': 1.3687, 'grad_norm': 4.2399444580078125, 'learning_rate': 2.336265884652981e-05, 'epoch': 1.85}
+{'loss': 0.6854, 'grad_norm': 3.185884952545166, 'learning_rate': 2.333822091886608e-05, 'epoch': 1.85}
+{'loss': 0.4202, 'grad_norm': 1.1546235084533691, 'learning_rate': 2.3313782991202344e-05, 'epoch': 1.85}
+{'loss': 0.594, 'grad_norm': 0.8933812975883484, 'learning_rate': 2.328934506353861e-05, 'epoch': 1.85}
+{'loss': 0.5386, 'grad_norm': 1.431520938873291, 'learning_rate': 2.3264907135874877e-05, 'epoch': 1.85}
+{'loss': 0.596, 'grad_norm': 1.0850852727890015, 'learning_rate': 2.324046920821114e-05, 'epoch': 1.85}
+{'loss': 0.6123, 'grad_norm': 1.6052504777908325, 'learning_rate': 2.3216031280547406e-05, 'epoch': 1.85}
+{'loss': 0.5618, 'grad_norm': 1.0671995878219604, 'learning_rate': 2.3191593352883674e-05, 'epoch': 1.85}
+{'loss': 0.5923, 'grad_norm': 1.0191351175308228, 'learning_rate': 2.316715542521994e-05, 'epoch': 1.85}
+{'loss': 0.5884, 'grad_norm': 1.2922497987747192, 'learning_rate': 2.3142717497556204e-05, 'epoch': 1.85}
+{'loss': 0.549, 'grad_norm': 0.6985080242156982, 'learning_rate': 2.3118279569892472e-05, 'epoch': 1.85}
+{'loss': 0.7027, 'grad_norm': 1.6202582120895386, 'learning_rate': 2.3093841642228737e-05, 'epoch': 1.85}
+{'loss': 1.267, 'grad_norm': 6.858791828155518, 'learning_rate': 2.3069403714565002e-05, 'epoch': 1.85}
+{'loss': 0.7434, 'grad_norm': 2.2614457607269287, 'learning_rate': 2.304496578690127e-05, 'epoch': 1.85}
+{'loss': 0.8123, 'grad_norm': 6.331788539886475, 'learning_rate': 2.3020527859237535e-05, 'epoch': 1.86}
+{'loss': 0.754, 'grad_norm': 1.8438414335250854, 'learning_rate': 2.29960899315738e-05, 'epoch': 1.86}
+{'loss': 0.5913, 'grad_norm': 1.1426644325256348, 'learning_rate': 2.2971652003910068e-05, 'epoch': 1.86}
+{'loss': 0.5843, 'grad_norm': 1.538586974143982, 'learning_rate': 2.2947214076246333e-05, 'epoch': 1.86}
+{'loss': 0.8857, 'grad_norm': 2.8902087211608887, 'learning_rate': 2.2922776148582597e-05, 'epoch': 1.86}
+ 93%|█████████▎| 11855/12776 [2:04:21<07:26,  2.06it/s] 93%|█████████▎| 11856/12776 [2:04:22<06:59,  2.19it/s]                                                        93%|█████████▎| 11856/12776 [2:04:22<06:59,  2.19it/s] 93%|█████████▎| 11857/12776 [2:04:22<06:35,  2.32it/s]                                                        93%|█████████▎| 11857/12776 [2:04:22<06:35,  2.32it/s] 93%|█████████▎| 11858/12776 [2:04:22<06:15,  2.45it/s]                                                        93%|█████████▎| 11858/12776 [2:04:22<06:15,  2.45it/s] 93%|█████████▎| 11859/12776 [2:04:23<05:58,  2.56it/s]                                                        93%|█████████▎| 11859/12776 [2:04:23<05:58,  2.56it/s] 93%|█████████▎| 11860/12776 [2:04:23<06:01,  2.54it/s]                                                        93%|█████████▎| 11860/12776 [2:04:23<06:01,  2.54it/s] 93%|█████████▎| 11861/12776 [2:04:23<05:42,  2.67it/s]                                                        93%|█████████▎| 11861/12776 [2:04:23<05:42,  2.67it/s] 93%|█████████▎| 11862/12776 [2:04:24<05:23,  2.83it/s]                                                        93%|█████████▎| 11862/12776 [2:04:24<05:23,  2.83it/s] 93%|█████████▎| 11863/12776 [2:04:24<05:06,  2.98it/s]                                                        93%|█████████▎| 11863/12776 [2:04:24<05:06,  2.98it/s] 93%|█████████▎| 11864/12776 [2:04:24<05:11,  2.93it/s]                                                        93%|█████████▎| 11864/12776 [2:04:24<05:11,  2.93it/s] 93%|█████████▎| 11865/12776 [2:04:25<04:53,  3.10it/s]                                                        93%|█████████▎| 11865/12776 [2:04:25<04:53,  3.10it/s] 93%|█████████▎| 11866/12776 [2:04:25<04:39,  3.26it/s]                                                        93%|█████████▎| 11866/12776 [2:04:25<04:39,  3.26it/s] 93%|█████████▎| 11867/12776 [2:04:25<04:27,  3.40it/s]                                                        93%|█████████▎| 11867/12776 [2:04:25<04:27,  3.40it/s] 93%|█████████▎| 11868/12776 [2:04:26<04:40,  3.23it/s]                                                        93%|█████████▎| 11868/12776 [2:04:26<04:40,  3.23it/s] 93%|█████████▎| 11869/12776 [2:04:26<04:23,  3.44it/s]                                                        93%|█████████▎| 11869/12776 [2:04:26<04:23,  3.44it/s] 93%|█████████▎| 11870/12776 [2:04:26<04:11,  3.61it/s]                                                        93%|█████████▎| 11870/12776 [2:04:26<04:11,  3.61it/s] 93%|█████████▎| 11871/12776 [2:04:26<04:00,  3.76it/s]                                                        93%|█████████▎| 11871/12776 [2:04:26<04:00,  3.76it/s] 93%|█████████▎| 11872/12776 [2:04:27<04:27,  3.37it/s]                                                        93%|█████████▎| 11872/12776 [2:04:27<04:27,  3.37it/s] 93%|█████████▎| 11873/12776 [2:04:27<04:09,  3.62it/s]                                                        93%|█████████▎| 11873/12776 [2:04:27<04:09,  3.62it/s] 93%|█████████▎| 11874/12776 [2:04:27<03:54,  3.85it/s]                                                        93%|█████████▎| 11874/12776 [2:04:27<03:54,  3.85it/s] 93%|█████████▎| 11875/12776 [2:04:27<03:41,  4.07it/s]                                                        93%|█████████▎| 11875/12776 [2:04:27<03:41,  4.07it/s] 93%|█████████▎| 11876/12776 [2:04:28<04:04,  3.68it/s]                                                        93%|█████████▎| 11876/12776 [2:04:28<04:04,  3.68it/s] 93%|█████████▎| 11877/12776 [2:04:28<03:48,  3.94it/s]                                                        93%|█████████▎| 11877/12776 [2:04:28<03:48,  3.94it/s] 93%|█████████▎| 11878/12776 [2:04:28<03:34,  4.18it/s]                                                        93%|█████████▎| 11878/12776 [2:04:28<03:34,  4.18it/s] 93%|█████████▎| 11879/12776 [2:04:28<03:25,  4.37it/s]                                                        93%|█████████▎| 11879/12776 [2:04:28<03:25,  4.37it/s] 93%|█████████▎| 11880/12776 [2:04:28<03:18,  4.52it/s]                                                        93%|█████████▎| 11880/12776 [2:04:28<03:18,  4.52it/s] 93%|█████████▎| 11881/12776 [2:04:29<03:40,  4.07it/s]                                                        93%|█████████▎| 11881/12776 [2:04:29<03:40,  4.07it/s] 93%|█████████▎| 11882/12776 [2:04:29<03:27,  4.32it/s]                                                        93%|█████████▎| 11882/12776 [2:04:29<03:27,  4.32it/s] 93%|█████████▎| 11883/12776 [2:04:29<03:17,  4.52it/s]                                                        93%|█████████▎| 11883/12776 [2:04:29<03:17,  4.52it/s] 93%|█████████▎| 11884/12776 [2:04:29<03:10,  4.69it/s]                                                        93%|█████████▎| 11884/12776 [2:04:29<03:10,  4.69it/s] 93%|█████████▎| 11885/12776 [2:04:30<03:04,  4.82it/s]                                                        93%|█████████▎| 11885/12776 [2:04:30<03:04,  4.82it/s] 93%|█████████▎| 11886/12776 [2:04:30<02:58,  4.97it/s]                                                        93%|█████████▎| 11886/12776 [2:04:30<02:58,  4.97it/s] 93%|█████████▎| 11887/12776 [2:04:30<03:12,  4.62it/s]                                                        93%|█████████▎| 11887/12776 [2:04:30<03:12,  4.62it/s] 93%|█████████▎| 11888/12776 [2:04:31<05:32,  2.67it/s]                                                        93%|█████████▎| 11888/12776 [2:04:31<05:32,  2.67it/s] 93%|█████████▎| 11889/12776 [2:04:32<10:39,  1.39it/s]                                                        93%|█████████▎| 11889/12776 [2:04:32<10:39,  1.39it/s] 93%|█████████▎| 11890/12776 [2:04:33<12:13,  1.21it/s]                                                        93%|█████████▎| 11890/12776 [2:04:33<12:13,  1.21it/s] 93%|█████████▎| 11891/12776 [2:04:34<12:22,  1.19it/s]                                                        93%|█████████▎| 11891/12776 [2:04:34<12:22,  1.19it/s] 93%|█████████▎| 11892/12776 [2:04:35<12:16,  1.20it/s]                                                        93%|█████████▎| 11892/12776 [2:04:35<12:16,  1.20it/s] 93%|█████████▎| 11893/12776 [2:04:36<12:37,  1.17it/s]                                                        93%|██���██████▎| 11893/12776 [2:04:36<12:37,  1.17it/s] 93%|█████████▎| 11894/12776 [2:04:37<12:31,  1.17it/s]                                                        93%|█████████▎| 11894/12776 [2:04:37<12:31,  1.17it/s] 93%|█████████▎| 11895/12776 [2:04:37<11:43,  1.25it/s]                                                        93%|█████████▎| 11895/12776 [2:04:37<11:43,  1.25it/s] 93%|█████████▎| 11896/12776 [2:04:38<11:30,  1.27it/s]                                                        93%|█████████▎| 11896/12776 [2:04:38<11:30,  1.27it/s] 93%|█████████▎| 11897/12776 [2:04:39<10:50,  1.35it/s]                                                        93%|█████████▎| 11897/12776 [2:04:39<10:50,  1.35it/s] 93%|█████████▎| 11898/12776 [2:04:39<10:16,  1.42it/s]                                                        93%|█████████▎| 11898/12776 [2:04:39<10:16,  1.42it/s] 93%|█████████▎| 11899/12776 [2:04:40<09:40,  1.51it/s]                                                        93%|█████████▎| 11899/12776 [2:04:40<09:40,  1.51it/s] 93%|█████████▎| 11900/12776 [2:04:41<09:14,  1.58it/s]                                                        93%|█████████▎| 11900/12776 [2:04:41<09:14,  1.58it/s] 93%|█████████▎| 11901/12776 [2:04:41<08:44,  1.67it/s]                                                        93%|█████████▎| 11901/12776 [2:04:41<08:44,  1.67it/s] 93%|█████████▎| 11902/12776 [2:04:42<08:42,  1.67it/s]                                                        93%|█████████▎| 11902/12776 [2:04:42<08:42,  1.67it/s] 93%|█████████▎| 11903/12776 [2:04:42<08:05,  1.80it/s]                                                        93%|█████████▎| 11903/12776 [2:04:42<08:05,  1.80it/s] 93%|█████████▎| 11904/12776 [2:04:43<07:59,  1.82it/s]                                                        93%|█████████▎| 11904/12776 [2:04:43<07:59,  1.82it/s] 93%|█████████▎| 11905/12776 [2:04:43<07:26,  1.95it/s]                                                        93%|█████████▎| 11905/12776 [2:04:43<07:26,  1.95it/s] 93%|█████████▎| 11906/12776 [2:04:44<07:29,  1.93it/s]                                                        93%|█████████▎| 11906/12776 [2:04:44<07:29,  1.93it/s] 93%|█████████▎| 11907/12776 [2:04:44<06:56,  2.09it/s]                                                        93%|█████████▎| 11907/12776 [2:04:44<06:56,  2.09it/s] 93%|█████████▎| 11908/12776 [2:04:44<06:27,  2.24it/s]                                                        93%|█████████▎| 11908/12776 [2:04:44<06:27,  2.24it/s] 93%|█████████▎| 11909/12776 [2:04:45<06:12,  2.33it/s]                                                        93%|█████████▎| 11909/12776 [2:04:45<06:12,  2.33it/s] 93%|█████████▎| 11910/12776 [2:04:45<05:49,  2.48it/s]                                                        93%|█████████▎| 11910/12776 [2:04:45<05:49,  2.48it/s] 93%|█████████▎| 11911/12776 [2:04:45<05:31,  2.61it/s]                                                        93%|█████████▎| 11911/12776 [2:04:45<05:31,  2.61it/s] 93%|█████████▎| 11912/12776 [2:04:46<05:45,  2.50it/s]                                                        93%|█████████▎| 11912/12776 [2:04:46<05:45,  2.50it/s] 93%|█████████▎| 11913/12776 [2:04:46<05:23,  2.67it/s]                                                        93%|█████████▎| 11913/12776 [2:04:46<05:23,  2.67it/s] 93%|█████████▎| 11914/12776 [2:04:47<05:04,  2.83it/s]                                                        93%|█████████▎| 11914/12776 [2:04:47<05:04,  2.83it/s] 93%|█████████▎| 11915/12776 [2:04:47<04:50,  2.97it/s]                                                        93%|█████████▎| 11915/12776 [2:04:47<04:50,  2.97it/s] 93%|█████████▎| 11916/12776 [2:04:47<04:57,  2.89it/s]                                                        93%|█████████▎| 11916/12776 [2:04:47<04:57,  2.89it/s] 93%|█████████▎| 11917/12776 [2:04:47<04:40,  3.06it/s]                                                        93%|█████████▎| 11917/12776 [2:04:48<04:40,  3.06it/s] 93%|█████████▎| 11918/12776 [2:04:48<04:26,  3.21it/s]                                                        93%|█████████▎| 11918/12776 [2:04:48<04:26,  3.21it/s] 93%|█████████▎| 11919/12776 [2:04:48<04:08,  3.44it/s]                                                        93%|█████████▎| 11919/12776 [2:04:48<04:08,  3.44it/s] 93%|█████████▎| 11920/12776 [2:04:48<04:18,  3.31it/s]                                                        93%|█████████▎| 11920/12776 [2:04:48<04:18,  3.31it/s] 93%|█████████▎| 11921/12776 [2:04:49<04:01,  3.54it/s]                                                        93%|█████████▎| 11921/12776 [2:04:49<04:01,  3.54it/s] 93%|█████████▎| 11922/12776 [2:04:49<03:48,  3.74it/s]                                                        93%|█████████▎| 11922/12776 [2:04:49<03:48,  3.74it/s] 93%|█████████▎| 11923/12776 [2:04:49<03:37,  3.92it/s]                                                        93%|█████████▎| 11923/12776 [2:04:49<03:37,  3.92it/s] 93%|█████████▎| 11924/12776 [2:04:49<03:43,  3.81it/s]                                                        93%|█████████▎| 11924/12776 [2:04:49<03:43,  3.81it/s] 93%|█████████▎| 11925/12776 [2:04:50<03:31,  4.03it/s]                                                        93%|█████████▎| 11925/12776 [2:04:50<03:31,  4.03it/s] 93%|█████████▎| 11926/12776 [2:04:50<03:20,  4.24it/s]                                                        93%|█████████▎| 11926/12776 [2:04:50<03:20,  4.24it/s] 93%|█████████▎| 11927/12776 [2:04:50<03:12,  4.41it/s]                                                        93%|█████████▎| 11927/12776 [2:04:50<03:12,  4.41it/s] 93%|█████████▎| 11928/12776 [2:04:50<03:06,  4.54it/s]                                                        93%|█████████▎| 11928/12776 [2:04:50<03:06,  4.54it/s] 93%|█████████▎| 11929/12776 [2:04:50<03:30,  4.02it/s]                                                        93%|█████████▎| 11929/12776 [2:04:50<03:30,  4.02it/s] 93%|█████████▎| 11930/12776 [2:04:51<03:17,  4.27it/s]                                                        93%|█████████▎| 11930/12776 [2:04:51<03:17,  4.27it/s] 93%|█████████▎| 11931/12776 [2:04:51<03:08,  4.49it/s]                                                        93%|█████████▎| 11931/12776 [2:04:51<03:08,  4.49it/s] 93%|█████████▎| 11932/12776 [2:04:51<03:00,  4.66it/s]                                                        93%|█████████▎| 11932/12776 [2:04:51<03:00,  4.66it/s] 93%|█████████▎| 11933/12776 [2:04:51<02:44,  5.12it/s]                                                       {'loss': 0.8052, 'grad_norm': 2.553431749343872, 'learning_rate': 2.2898338220918866e-05, 'epoch': 1.86}
+{'loss': 0.7178, 'grad_norm': 3.586069107055664, 'learning_rate': 2.287390029325513e-05, 'epoch': 1.86}
+{'loss': 0.7, 'grad_norm': 1.4194121360778809, 'learning_rate': 2.2849462365591395e-05, 'epoch': 1.86}
+{'loss': 0.613, 'grad_norm': 1.709073781967163, 'learning_rate': 2.2825024437927663e-05, 'epoch': 1.86}
+{'loss': 0.6448, 'grad_norm': 1.8721200227737427, 'learning_rate': 2.2800586510263928e-05, 'epoch': 1.86}
+{'loss': 0.7323, 'grad_norm': 3.26497745513916, 'learning_rate': 2.2776148582600193e-05, 'epoch': 1.86}
+{'loss': 1.2313, 'grad_norm': 3.8040733337402344, 'learning_rate': 2.275171065493646e-05, 'epoch': 1.86}
+{'loss': 0.5404, 'grad_norm': 3.2131476402282715, 'learning_rate': 2.2727272727272726e-05, 'epoch': 1.86}
+{'loss': 0.5706, 'grad_norm': 2.0257785320281982, 'learning_rate': 2.270283479960899e-05, 'epoch': 1.86}
+{'loss': 0.7594, 'grad_norm': 1.4355109930038452, 'learning_rate': 2.267839687194526e-05, 'epoch': 1.86}
+{'loss': 0.892, 'grad_norm': 2.189570426940918, 'learning_rate': 2.2653958944281524e-05, 'epoch': 1.86}
+{'loss': 0.7958, 'grad_norm': 1.5247050523757935, 'learning_rate': 2.262952101661779e-05, 'epoch': 1.86}
+{'loss': 0.8679, 'grad_norm': 2.8987576961517334, 'learning_rate': 2.2605083088954053e-05, 'epoch': 1.86}
+{'loss': 0.8147, 'grad_norm': 2.054922580718994, 'learning_rate': 2.258064516129032e-05, 'epoch': 1.86}
+{'loss': 0.4729, 'grad_norm': 2.206418752670288, 'learning_rate': 2.2556207233626586e-05, 'epoch': 1.86}
+{'loss': 1.0916, 'grad_norm': 3.0962157249450684, 'learning_rate': 2.253176930596285e-05, 'epoch': 1.86}
+{'loss': 1.1446, 'grad_norm': 3.1131982803344727, 'learning_rate': 2.250733137829912e-05, 'epoch': 1.86}
+{'loss': 1.1253, 'grad_norm': 3.34653639793396, 'learning_rate': 2.2482893450635384e-05, 'epoch': 1.86}
+{'loss': 1.4304, 'grad_norm': 7.987216472625732, 'learning_rate': 2.245845552297165e-05, 'epoch': 1.86}
+{'loss': 0.5307, 'grad_norm': 3.8446297645568848, 'learning_rate': 2.2434017595307917e-05, 'epoch': 1.86}
+{'loss': 0.765, 'grad_norm': 2.1328821182250977, 'learning_rate': 2.2409579667644182e-05, 'epoch': 1.86}
+{'loss': 1.1964, 'grad_norm': 4.147828578948975, 'learning_rate': 2.2385141739980447e-05, 'epoch': 1.86}
+{'loss': 0.7498, 'grad_norm': 1.5424137115478516, 'learning_rate': 2.2360703812316715e-05, 'epoch': 1.86}
+{'loss': 0.8177, 'grad_norm': 2.9707698822021484, 'learning_rate': 2.233626588465298e-05, 'epoch': 1.86}
+{'loss': 0.8357, 'grad_norm': 8.598952293395996, 'learning_rate': 2.2311827956989244e-05, 'epoch': 1.86}
+{'loss': 0.6847, 'grad_norm': 1.6900843381881714, 'learning_rate': 2.2287390029325513e-05, 'epoch': 1.86}
+{'loss': 1.5568, 'grad_norm': 5.230086803436279, 'learning_rate': 2.2262952101661777e-05, 'epoch': 1.86}
+{'loss': 1.0774, 'grad_norm': 3.5463037490844727, 'learning_rate': 2.2238514173998042e-05, 'epoch': 1.86}
+{'loss': 0.8156, 'grad_norm': 3.144085168838501, 'learning_rate': 2.221407624633431e-05, 'epoch': 1.86}
+{'loss': 0.6252, 'grad_norm': 2.6396665573120117, 'learning_rate': 2.2189638318670575e-05, 'epoch': 1.86}
+{'loss': 0.1875, 'grad_norm': 0.6238581538200378, 'learning_rate': 2.216520039100684e-05, 'epoch': 1.86}
+{'loss': 0.52, 'grad_norm': 2.998420238494873, 'learning_rate': 2.2140762463343108e-05, 'epoch': 1.86}
+{'loss': 1.4961, 'grad_norm': 3.8253471851348877, 'learning_rate': 2.2116324535679373e-05, 'epoch': 1.86}
+{'loss': 0.85, 'grad_norm': 4.558570861816406, 'learning_rate': 2.2091886608015638e-05, 'epoch': 1.86}
+{'loss': 0.634, 'grad_norm': 1.2218017578125, 'learning_rate': 2.2067448680351906e-05, 'epoch': 1.86}
+{'loss': 0.5668, 'grad_norm': 2.944087266921997, 'learning_rate': 2.204301075268817e-05, 'epoch': 1.86}
+{'loss': 0.6317, 'grad_norm': 1.4352192878723145, 'learning_rate': 2.2018572825024436e-05, 'epoch': 1.86}
+{'loss': 0.5786, 'grad_norm': 1.2094552516937256, 'learning_rate': 2.1994134897360704e-05, 'epoch': 1.86}
+{'loss': 0.6373, 'grad_norm': 1.98757803440094, 'learning_rate': 2.196969696969697e-05, 'epoch': 1.86}
+{'loss': 0.6269, 'grad_norm': 1.0095068216323853, 'learning_rate': 2.1945259042033233e-05, 'epoch': 1.86}
+{'loss': 0.6092, 'grad_norm': 1.0049716234207153, 'learning_rate': 2.19208211143695e-05, 'epoch': 1.86}
+{'loss': 0.5908, 'grad_norm': 1.7188302278518677, 'learning_rate': 2.1896383186705766e-05, 'epoch': 1.86}
+{'loss': 0.6305, 'grad_norm': 2.2089591026306152, 'learning_rate': 2.187194525904203e-05, 'epoch': 1.86}
+{'loss': 0.7027, 'grad_norm': 2.382549285888672, 'learning_rate': 2.18475073313783e-05, 'epoch': 1.86}
+{'loss': 0.6653, 'grad_norm': 1.0006448030471802, 'learning_rate': 2.1823069403714564e-05, 'epoch': 1.86}
+{'loss': 0.7094, 'grad_norm': 1.5975639820098877, 'learning_rate': 2.179863147605083e-05, 'epoch': 1.86}
+{'loss': 0.7759, 'grad_norm': 1.290367841720581, 'learning_rate': 2.1774193548387097e-05, 'epoch': 1.86}
+{'loss': 0.7417, 'grad_norm': 2.142758846282959, 'learning_rate': 2.1749755620723362e-05, 'epoch': 1.86}
+{'loss': 0.8493, 'grad_norm': 1.4143774509429932, 'learning_rate': 2.1725317693059627e-05, 'epoch': 1.86}
+{'loss': 0.7674, 'grad_norm': 1.213179588317871, 'learning_rate': 2.1700879765395895e-05, 'epoch': 1.86}
+{'loss': 0.8457, 'grad_norm': 1.534143328666687, 'learning_rate': 2.167644183773216e-05, 'epoch': 1.86}
+{'loss': 0.7507, 'grad_norm': 14.284627914428711, 'learning_rate': 2.1652003910068425e-05, 'epoch': 1.86}
+{'loss': 0.7684, 'grad_norm': 1.4157757759094238, 'learning_rate': 2.1627565982404693e-05, 'epoch': 1.86}
+{'loss': 0.7432, 'grad_norm': 1.829573392868042, 'learning_rate': 2.1603128054740958e-05, 'epoch': 1.86}
+{'loss': 0.9471, 'grad_norm': 1.5792940855026245, 'learning_rate': 2.1578690127077222e-05, 'epoch': 1.86}
+{'loss': 1.042, 'grad_norm': 3.3256168365478516, 'learning_rate': 2.155425219941349e-05, 'epoch': 1.86}
+{'loss': 0.8677, 'grad_norm': 2.1688032150268555, 'learning_rate': 2.1529814271749755e-05, 'epoch': 1.86}
+{'loss': 1.0286, 'grad_norm': 2.214085578918457, 'learning_rate': 2.150537634408602e-05, 'epoch': 1.86}
+{'loss': 0.8419, 'grad_norm': 2.812800168991089, 'learning_rate': 2.148093841642229e-05, 'epoch': 1.86}
+{'loss': 0.7129, 'grad_norm': 2.8499462604522705, 'learning_rate': 2.1456500488758553e-05, 'epoch': 1.87}
+{'loss': 0.9942, 'grad_norm': 2.5196754932403564, 'learning_rate': 2.1432062561094818e-05, 'epoch': 1.87}
+{'loss': 0.7659, 'grad_norm': 1.3992620706558228, 'learning_rate': 2.1407624633431086e-05, 'epoch': 1.87}
+{'loss': 0.7056, 'grad_norm': 3.1790847778320312, 'learning_rate': 2.138318670576735e-05, 'epoch': 1.87}
+{'loss': 1.2447, 'grad_norm': 3.775634527206421, 'learning_rate': 2.1358748778103612e-05, 'epoch': 1.87}
+{'loss': 0.5885, 'grad_norm': 2.107165813446045, 'learning_rate': 2.1334310850439884e-05, 'epoch': 1.87}
+{'loss': 0.6303, 'grad_norm': 2.489856243133545, 'learning_rate': 2.130987292277615e-05, 'epoch': 1.87}
+{'loss': 0.6447, 'grad_norm': 6.781438827514648, 'learning_rate': 2.128543499511241e-05, 'epoch': 1.87}
+{'loss': 0.6563, 'grad_norm': 3.0962233543395996, 'learning_rate': 2.126099706744868e-05, 'epoch': 1.87}
+{'loss': 1.2628, 'grad_norm': 3.9209532737731934, 'learning_rate': 2.1236559139784946e-05, 'epoch': 1.87}
+{'loss': 0.7253, 'grad_norm': 2.2564404010772705, 'learning_rate': 2.1212121212121208e-05, 'epoch': 1.87}
+{'loss': 1.0094, 'grad_norm': 5.064412593841553, 'learning_rate': 2.118768328445748e-05, 'epoch': 1.87}
+{'loss': 1.0255, 'grad_norm': 4.73468542098999, 'learning_rate': 2.1163245356793744e-05, 'epoch': 1.87}
+{'loss': 1.1333, 'grad_norm': 4.486331462860107, 'learning_rate': 2.1138807429130006e-05, 'epoch': 1.87}
+{'loss': 0.555, 'grad_norm': 4.3150811195373535, 'learning_rate': 2.1114369501466277e-05, 'epoch': 1.87}
+{'loss': 1.3198, 'grad_norm': 4.391129970550537, 'learning_rate': 2.1089931573802542e-05, 'epoch': 1.87}
+{'loss': 1.0015, 'grad_norm': 3.979997158050537, 'learning_rate': 2.1065493646138803e-05, 'epoch': 1.87}
+{'loss': 1.9898, 'grad_norm': 2.4785280227661133, 'learning_rate': 2.1041055718475068e-05, 'epoch': 1.87}
+{'loss': 1.0243, 'grad_norm': 2.6585922241210938, 'learning_rate': 2.1016617790811336e-05, 'epoch': 1.87}
+ 93%|█████████▎| 11933/12776 [2:04:51<02:44,  5.12it/s] 93%|█████████▎| 11934/12776 [2:04:51<02:38,  5.32it/s]                                                        93%|█████████▎| 11934/12776 [2:04:51<02:38,  5.32it/s] 93%|█████████▎| 11935/12776 [2:04:52<03:08,  4.46it/s]                                                        93%|█████████▎| 11935/12776 [2:04:52<03:08,  4.46it/s] 93%|█████████▎| 11936/12776 [2:04:52<02:57,  4.73it/s]                                                        93%|█████████▎| 11936/12776 [2:04:52<02:57,  4.73it/s] 93%|█████████▎| 11937/12776 [2:04:52<02:49,  4.96it/s]                                                        93%|█████████▎| 11937/12776 [2:04:52<02:49,  4.96it/s] 93%|█████████▎| 11938/12776 [2:04:53<05:11,  2.69it/s]                                                        93%|█████████▎| 11938/12776 [2:04:53<05:11,  2.69it/s] 93%|█████████▎| 11939/12776 [2:04:54<09:58,  1.40it/s]                                                        93%|█████████▎| 11939/12776 [2:04:54<09:58,  1.40it/s] 93%|█████████▎| 11940/12776 [2:04:55<10:48,  1.29it/s]                                                        93%|█████████▎| 11940/12776 [2:04:55<10:48,  1.29it/s] 93%|█████████▎| 11941/12776 [2:04:56<11:30,  1.21it/s]                                                        93%|█████████▎| 11941/12776 [2:04:56<11:30,  1.21it/s] 93%|█████████▎| 11942/12776 [2:04:57<11:10,  1.24it/s]                                                        93%|█████████▎| 11942/12776 [2:04:57<11:10,  1.24it/s] 93%|█████████▎| 11943/12776 [2:04:58<11:05,  1.25it/s]                                                        93%|█████████▎| 11943/12776 [2:04:58<11:05,  1.25it/s] 93%|█████████▎| 11944/12776 [2:04:58<10:45,  1.29it/s]                                                        93%|█████████▎| 11944/12776 [2:04:58<10:45,  1.29it/s] 93%|█████████▎| 11945/12776 [2:04:59<10:10,  1.36it/s]                                                        93%|█████████▎| 11945/12776 [2:04:59<10:10,  1.36it/s] 94%|█████████▎| 11946/12776 [2:05:00<10:14,  1.35it/s]                                                        94%|█████████▎| 11946/12776 [2:05:00<10:14,  1.35it/s] 94%|█████████▎| 11947/12776 [2:05:00<09:32,  1.45it/s]                                                        94%|█████████▎| 11947/12776 [2:05:00<09:32,  1.45it/s] 94%|█████████▎| 11948/12776 [2:05:01<09:16,  1.49it/s]                                                        94%|█████████▎| 11948/12776 [2:05:01<09:16,  1.49it/s] 94%|█████████▎| 11949/12776 [2:05:02<08:39,  1.59it/s]                                                        94%|█████████▎| 11949/12776 [2:05:02<08:39,  1.59it/s] 94%|█████████▎| 11950/12776 [2:05:02<08:25,  1.63it/s]                                                        94%|█████████▎| 11950/12776 [2:05:02<08:25,  1.63it/s] 94%|█████████▎| 11951/12776 [2:05:03<07:54,  1.74it/s]                                                        94%|█████████▎| 11951/12776 [2:05:03<07:54,  1.74it/s] 94%|████████��▎| 11952/12776 [2:05:03<07:50,  1.75it/s]                                                        94%|█████████▎| 11952/12776 [2:05:03<07:50,  1.75it/s] 94%|█████████▎| 11953/12776 [2:05:04<07:16,  1.89it/s]                                                        94%|█████████▎| 11953/12776 [2:05:04<07:16,  1.89it/s] 94%|█████████▎| 11954/12776 [2:05:04<07:08,  1.92it/s]                                                        94%|█████████▎| 11954/12776 [2:05:04<07:08,  1.92it/s] 94%|█████████▎| 11955/12776 [2:05:05<06:40,  2.05it/s]                                                        94%|█████████▎| 11955/12776 [2:05:05<06:40,  2.05it/s] 94%|█████████▎| 11956/12776 [2:05:05<06:15,  2.18it/s]                                                        94%|█████████▎| 11956/12776 [2:05:05<06:15,  2.18it/s] 94%|█████████▎| 11957/12776 [2:05:05<06:23,  2.13it/s]                                                        94%|█████████▎| 11957/12776 [2:05:05<06:23,  2.13it/s] 94%|█████████▎| 11958/12776 [2:05:06<05:56,  2.29it/s]                                                        94%|█████████▎| 11958/12776 [2:05:06<05:56,  2.29it/s] 94%|█████████▎| 11959/12776 [2:05:06<05:34,  2.44it/s]                                                        94%|█████████▎| 11959/12776 [2:05:06<05:34,  2.44it/s] 94%|█████████▎| 11960/12776 [2:05:07<05:33,  2.45it/s]                                                        94%|█████████▎| 11960/12776 [2:05:07<05:33,  2.45it/s] 94%|█████████▎| 11961/12776 [2:05:07<05:13,  2.60it/s]                                                        94%|█████████▎| 11961/12776 [2:05:07<05:13,  2.60it/s] 94%|█████████▎| 11962/12776 [2:05:07<04:57,  2.74it/s]                                                        94%|█████████▎| 11962/12776 [2:05:07<04:57,  2.74it/s] 94%|█████████▎| 11963/12776 [2:05:08<04:52,  2.78it/s]                                                        94%|█████████▎| 11963/12776 [2:05:08<04:52,  2.78it/s] 94%|█████████▎| 11964/12776 [2:05:08<04:35,  2.95it/s]                                                        94%|█████████▎| 11964/12776 [2:05:08<04:35,  2.95it/s] 94%|█████████▎| 11965/12776 [2:05:08<04:21,  3.10it/s]                                                        94%|█████████▎| 11965/12776 [2:05:08<04:21,  3.10it/s] 94%|█████████▎| 11966/12776 [2:05:08<04:10,  3.24it/s]                                                        94%|█████████▎| 11966/12776 [2:05:08<04:10,  3.24it/s] 94%|█████████▎| 11967/12776 [2:05:09<04:05,  3.29it/s]                                                        94%|█████████▎| 11967/12776 [2:05:09<04:05,  3.29it/s] 94%|█████████▎| 11968/12776 [2:05:09<03:54,  3.44it/s]                                                        94%|█████████▎| 11968/12776 [2:05:09<03:54,  3.44it/s] 94%|█████████▎| 11969/12776 [2:05:09<03:43,  3.61it/s]                                                        94%|█████████▎| 11969/12776 [2:05:09<03:43,  3.61it/s] 94%|█████████▎| 11970/12776 [2:05:09<03:34,  3.75it/s]                                                        94%|█████████▎| 11970/12776 [2:05:09<03:34,  3.75it/s] 94%|█████████▎| 11971/12776 [2:05:10<03:27,  3.89it/s]                                                        94%|█████████▎| 11971/12776 [2:05:10<03:27,  3.89it/s] 94%|█████████▎| 11972/12776 [2:05:10<03:32,  3.78it/s]                                                        94%|█████████▎| 11972/12776 [2:05:10<03:32,  3.78it/s] 94%|█████████▎| 11973/12776 [2:05:10<03:22,  3.96it/s]                                                        94%|█████████▎| 11973/12776 [2:05:10<03:22,  3.96it/s] 94%|█████████▎| 11974/12776 [2:05:10<03:14,  4.12it/s]                                                        94%|█████████▎| 11974/12776 [2:05:10<03:14,  4.12it/s] 94%|█████████▎| 11975/12776 [2:05:11<03:07,  4.28it/s]                                                        94%|█████████▎| 11975/12776 [2:05:11<03:07,  4.28it/s] 94%|█████████▎| 11976/12776 [2:05:11<03:00,  4.43it/s]                                                        94%|█████████▎| 11976/12776 [2:05:11<03:00,  4.43it/s] 94%|█████████▎| 11977/12776 [2:05:11<03:06,  4.29it/s]                                                        94%|█████████▎| 11977/12776 [2:05:11<03:06,  4.29it/s] 94%|█████████▍| 11978/12776 [2:05:11<02:59,  4.45it/s]                                                        94%|█████████▍| 11978/12776 [2:05:11<02:59,  4.45it/s] 94%|█████████▍| 11979/12776 [2:05:11<02:53,  4.61it/s]                                                        94%|█████████▍| 11979/12776 [2:05:11<02:53,  4.61it/s] 94%|█████████▍| 11980/12776 [2:05:12<02:47,  4.75it/s]                                                        94%|█████████▍| 11980/12776 [2:05:12<02:47,  4.75it/s] 94%|█████████▍| 11981/12776 [2:05:12<02:43,  4.87it/s]                                                        94%|█████████▍| 11981/12776 [2:05:12<02:43,  4.87it/s] 94%|█████████▍| 11982/12776 [2:05:12<03:06,  4.25it/s]                                                        94%|█████████▍| 11982/12776 [2:05:12<03:06,  4.25it/s] 94%|█████████▍| 11983/12776 [2:05:12<02:55,  4.51it/s]                                                        94%|█████████▍| 11983/12776 [2:05:12<02:55,  4.51it/s] 94%|█████████▍| 11984/12776 [2:05:13<02:47,  4.73it/s]                                                        94%|█████████▍| 11984/12776 [2:05:13<02:47,  4.73it/s] 94%|█████████▍| 11985/12776 [2:05:13<02:41,  4.91it/s]                                                        94%|█████████▍| 11985/12776 [2:05:13<02:41,  4.91it/s] 94%|█████████▍| 11986/12776 [2:05:13<02:35,  5.08it/s]                                                        94%|█████████▍| 11986/12776 [2:05:13<02:35,  5.08it/s] 94%|█████████▍| 11987/12776 [2:05:13<02:55,  4.49it/s]                                                        94%|█████████▍| 11987/12776 [2:05:13<02:55,  4.49it/s] 94%|█████████▍| 11988/12776 [2:05:14<04:43,  2.78it/s]                                                        94%|█████████▍| 11988/12776 [2:05:14<04:43,  2.78it/s] 94%|█████████▍| 11989/12776 [2:05:15<09:35,  1.37it/s]                                                        94%|█████████▍| 11989/12776 [2:05:15<09:35,  1.37it/s] 94%|█████████▍| 11990/12776 [2:05:16<10:14,  1.28it/s]                                                        94%|█████████▍| 11990/12776 [2:05:16<10:14,  1.28it/s] 94%|█████████▍| 11991/12776 [2:05:17<10:24,  1.26it/s]                                                        94%|█████████▍| 11991/12776 [2:05:17<10:24,  1.26it/s] 94%|█████████▍| 11992/12776 [2:05:18<10:38,  1.23it/s]                                                        94%|█████████▍| 11992/12776 [2:05:18<10:38,  1.23it/s] 94%|█████████▍| 11993/12776 [2:05:19<10:40,  1.22it/s]                                                        94%|█████████▍| 11993/12776 [2:05:19<10:40,  1.22it/s] 94%|█████████▍| 11994/12776 [2:05:20<10:04,  1.29it/s]                                                        94%|█████████▍| 11994/12776 [2:05:20<10:04,  1.29it/s] 94%|█████████▍| 11995/12776 [2:05:20<09:54,  1.31it/s]                                                        94%|█████████▍| 11995/12776 [2:05:20<09:54,  1.31it/s] 94%|█████████▍| 11996/12776 [2:05:21<09:20,  1.39it/s]                                                        94%|█████████▍| 11996/12776 [2:05:21<09:20,  1.39it/s] 94%|█████████▍| 11997/12776 [2:05:21<08:51,  1.47it/s]                                                        94%|█████████▍| 11997/12776 [2:05:21<08:51,  1.47it/s] 94%|█████████▍| 11998/12776 [2:05:22<08:22,  1.55it/s]                                                        94%|█████████▍| 11998/12776 [2:05:22<08:22,  1.55it/s] 94%|█████████▍| 11999/12776 [2:05:23<07:58,  1.62it/s]                                                        94%|█████████▍| 11999/12776 [2:05:23<07:58,  1.62it/s] 94%|█████████▍| 12000/12776 [2:05:23<07:36,  1.70it/s]                                                        94%|█████████▍| 12000/12776 [2:05:23<07:36,  1.70it/s]The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
+***** Running Evaluation *****
+  Num examples = 12383
+  Batch size = 16
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2.1016617790811336e-05, 'epoch': 1.87}
+{'loss': 0.9492, 'grad_norm': 3.6180975437164307, 'learning_rate': 2.09921798631476e-05, 'epoch': 1.87}
+{'loss': 1.0283, 'grad_norm': 4.195194721221924, 'learning_rate': 2.0967741935483866e-05, 'epoch': 1.87}
+{'loss': 0.5503, 'grad_norm': 1.3655951023101807, 'learning_rate': 2.0943304007820134e-05, 'epoch': 1.87}
+{'loss': 0.5951, 'grad_norm': 2.214198112487793, 'learning_rate': 2.09188660801564e-05, 'epoch': 1.87}
+{'loss': 0.8592, 'grad_norm': 2.850289821624756, 'learning_rate': 2.0894428152492664e-05, 'epoch': 1.87}
+{'loss': 0.8295, 'grad_norm': 0.8146681189537048, 'learning_rate': 2.0869990224828932e-05, 'epoch': 1.87}
+{'loss': 0.8397, 'grad_norm': 1.004290223121643, 'learning_rate': 2.0845552297165197e-05, 'epoch': 1.87}
+{'loss': 0.9296, 'grad_norm': 1.3244619369506836, 'learning_rate': 2.082111436950146e-05, 'epoch': 1.87}
+{'loss': 0.8361, 'grad_norm': 1.1102343797683716, 'learning_rate': 2.079667644183773e-05, 'epoch': 1.87}
+{'loss': 1.0738, 'grad_norm': 1.2211482524871826, 'learning_rate': 2.0772238514173995e-05, 'epoch': 1.87}
+{'loss': 0.876, 'grad_norm': 0.993922233581543, 'learning_rate': 2.074780058651026e-05, 'epoch': 1.87}
+{'loss': 0.8152, 'grad_norm': 1.7707405090332031, 'learning_rate': 2.0723362658846528e-05, 'epoch': 1.87}
+{'loss': 0.8948, 'grad_norm': 0.997743546962738, 'learning_rate': 2.0698924731182792e-05, 'epoch': 1.87}
+{'loss': 0.8765, 'grad_norm': 0.9178709983825684, 'learning_rate': 2.0674486803519057e-05, 'epoch': 1.87}
+{'loss': 0.9537, 'grad_norm': 2.155320167541504, 'learning_rate': 2.0650048875855325e-05, 'epoch': 1.87}
+{'loss': 0.9491, 'grad_norm': 1.6522964239120483, 'learning_rate': 2.062561094819159e-05, 'epoch': 1.87}
+{'loss': 0.9958, 'grad_norm': 1.9385637044906616, 'learning_rate': 2.0601173020527855e-05, 'epoch': 1.87}
+{'loss': 0.9823, 'grad_norm': 2.451505422592163, 'learning_rate': 2.0576735092864123e-05, 'epoch': 1.87}
+{'loss': 0.9815, 'grad_norm': 9.86997127532959, 'learning_rate': 2.0552297165200388e-05, 'epoch': 1.87}
+{'loss': 0.9844, 'grad_norm': 1.331578016281128, 'learning_rate': 2.0527859237536653e-05, 'epoch': 1.87}
+{'loss': 0.8801, 'grad_norm': 2.020040273666382, 'learning_rate': 2.050342130987292e-05, 'epoch': 1.87}
+{'loss': 1.0083, 'grad_norm': 2.5946261882781982, 'learning_rate': 2.0478983382209186e-05, 'epoch': 1.87}
+{'loss': 0.9543, 'grad_norm': 1.8665767908096313, 'learning_rate': 2.045454545454545e-05, 'epoch': 1.87}
+{'loss': 0.8922, 'grad_norm': 2.101576089859009, 'learning_rate': 2.043010752688172e-05, 'epoch': 1.87}
+{'loss': 0.9835, 'grad_norm': 1.9192637205123901, 'learning_rate': 2.0405669599217984e-05, 'epoch': 1.87}
+{'loss': 0.9142, 'grad_norm': 6.741157054901123, 'learning_rate': 2.038123167155425e-05, 'epoch': 1.87}
+{'loss': 0.9528, 'grad_norm': 2.8808178901672363, 'learning_rate': 2.0356793743890517e-05, 'epoch': 1.87}
+{'loss': 1.0117, 'grad_norm': 7.245316505432129, 'learning_rate': 2.033235581622678e-05, 'epoch': 1.87}
+{'loss': 0.736, 'grad_norm': 1.8030654191970825, 'learning_rate': 2.0307917888563046e-05, 'epoch': 1.87}
+{'loss': 0.9961, 'grad_norm': 19.674142837524414, 'learning_rate': 2.0283479960899314e-05, 'epoch': 1.87}
+{'loss': 0.7848, 'grad_norm': 4.334254741668701, 'learning_rate': 2.025904203323558e-05, 'epoch': 1.87}
+{'loss': 0.7798, 'grad_norm': 1.3288064002990723, 'learning_rate': 2.0234604105571844e-05, 'epoch': 1.87}
+{'loss': 1.0681, 'grad_norm': 3.3582398891448975, 'learning_rate': 2.0210166177908112e-05, 'epoch': 1.87}
+{'loss': 0.8853, 'grad_norm': 4.449483394622803, 'learning_rate': 2.0185728250244377e-05, 'epoch': 1.87}
+{'loss': 0.6398, 'grad_norm': 2.989022731781006, 'learning_rate': 2.0161290322580642e-05, 'epoch': 1.87}
+{'loss': 0.7581, 'grad_norm': 4.922720432281494, 'learning_rate': 2.013685239491691e-05, 'epoch': 1.87}
+{'loss': 0.9456, 'grad_norm': 4.551708221435547, 'learning_rate': 2.0112414467253175e-05, 'epoch': 1.87}
+{'loss': 0.8504, 'grad_norm': 2.336358070373535, 'learning_rate': 2.008797653958944e-05, 'epoch': 1.87}
+{'loss': 0.7855, 'grad_norm': 2.310737371444702, 'learning_rate': 2.0063538611925708e-05, 'epoch': 1.87}
+{'loss': 0.775, 'grad_norm': 10.708081245422363, 'learning_rate': 2.0039100684261973e-05, 'epoch': 1.87}
+{'loss': 0.7938, 'grad_norm': 3.0271284580230713, 'learning_rate': 2.0014662756598237e-05, 'epoch': 1.87}
+{'loss': 1.235, 'grad_norm': 7.497602462768555, 'learning_rate': 1.9990224828934506e-05, 'epoch': 1.87}
+{'loss': 1.2037, 'grad_norm': 7.694507598876953, 'learning_rate': 1.996578690127077e-05, 'epoch': 1.87}
+{'loss': 0.6108, 'grad_norm': 5.209584712982178, 'learning_rate': 1.9941348973607035e-05, 'epoch': 1.87}
+{'loss': 0.9119, 'grad_norm': 5.772160053253174, 'learning_rate': 1.9916911045943303e-05, 'epoch': 1.88}
+{'loss': 1.2656, 'grad_norm': 2.1883044242858887, 'learning_rate': 1.9892473118279568e-05, 'epoch': 1.88}
+{'loss': 0.9636, 'grad_norm': 3.705591917037964, 'learning_rate': 1.9868035190615833e-05, 'epoch': 1.88}
+{'loss': 1.0948, 'grad_norm': 2.439570188522339, 'learning_rate': 1.98435972629521e-05, 'epoch': 1.88}
+{'loss': 0.6857, 'grad_norm': 2.082089424133301, 'learning_rate': 1.9819159335288366e-05, 'epoch': 1.88}
+{'loss': 0.3912, 'grad_norm': 1.2534931898117065, 'learning_rate': 1.979472140762463e-05, 'epoch': 1.88}
+{'loss': 0.6163, 'grad_norm': 3.958746910095215, 'learning_rate': 1.97702834799609e-05, 'epoch': 1.88}
+{'loss': 0.4008, 'grad_norm': 1.1258634328842163, 'learning_rate': 1.9745845552297164e-05, 'epoch': 1.88}
+{'loss': 0.7214, 'grad_norm': 3.0614774227142334, 'learning_rate': 1.972140762463343e-05, 'epoch': 1.88}
+{'loss': 1.0756, 'grad_norm': 3.998211145401001, 'learning_rate': 1.9696969696969697e-05, 'epoch': 1.88}
+{'loss': 1.0684, 'grad_norm': 2.4418106079101562, 'learning_rate': 1.967253176930596e-05, 'epoch': 1.88}
+{'loss': 1.1258, 'grad_norm': 0.8472214937210083, 'learning_rate': 1.9648093841642226e-05, 'epoch': 1.88}
+{'loss': 1.1072, 'grad_norm': 1.6514756679534912, 'learning_rate': 1.9623655913978494e-05, 'epoch': 1.88}
+{'loss': 1.2211, 'grad_norm': 1.2808918952941895, 'learning_rate': 1.959921798631476e-05, 'epoch': 1.88}
+{'loss': 1.1116, 'grad_norm': 0.9649450182914734, 'learning_rate': 1.9574780058651024e-05, 'epoch': 1.88}
+{'loss': 1.1229, 'grad_norm': 1.2473208904266357, 'learning_rate': 1.9550342130987292e-05, 'epoch': 1.88}
+{'loss': 1.1491, 'grad_norm': 1.2597932815551758, 'learning_rate': 1.9525904203323557e-05, 'epoch': 1.88}
+{'loss': 1.136, 'grad_norm': 1.4384278059005737, 'learning_rate': 1.9501466275659822e-05, 'epoch': 1.88}
+{'loss': 1.0778, 'grad_norm': 1.308000087738037, 'learning_rate': 1.9477028347996087e-05, 'epoch': 1.88}
+{'loss': 1.0682, 'grad_norm': 1.234831690788269, 'learning_rate': 1.9452590420332355e-05, 'epoch': 1.88}
+{'loss': 1.0062, 'grad_norm': 0.9568355083465576, 'learning_rate': 1.942815249266862e-05, 'epoch': 1.88}
+{'loss': 1.0344, 'grad_norm': 3.727036952972412, 'learning_rate': 1.9403714565004884e-05, 'epoch': 1.88}
+{'loss': 1.0171, 'grad_norm': 1.185569167137146, 'learning_rate': 1.9379276637341153e-05, 'epoch': 1.88}
+
+  0%|          | 0/774 [00:00<?, ?it/s][A
+  0%|          | 2/774 [00:00<02:02,  6.31it/s][A
+  0%|          | 3/774 [00:00<02:45,  4.67it/s][A
+  1%|          | 4/774 [00:00<03:11,  4.02it/s][A
+  1%|          | 5/774 [00:01<03:11,  4.01it/s][A
+  1%|          | 6/774 [00:01<03:25,  3.74it/s][A
+  1%|          | 7/774 [00:01<03:23,  3.77it/s][A
+  1%|          | 8/774 [00:02<03:26,  3.72it/s][A
+  1%|          | 9/774 [00:02<03:12,  3.98it/s][A
+  1%|▏         | 10/774 [00:02<03:12,  3.98it/s][A
+  1%|▏         | 11/774 [00:02<03:27,  3.68it/s][A
+  2%|▏         | 12/774 [00:03<03:14,  3.92it/s][A
+  2%|▏         | 13/774 [00:03<03:06,  4.08it/s][A
+  2%|▏         | 14/774 [00:03<03:18,  3.82it/s][A
+  2%|▏         | 15/774 [00:03<03:37,  3.49it/s][A
+  2%|▏         | 16/774 [00:04<03:35,  3.52it/s][A
+  2%|▏         | 17/774 [00:04<03:12,  3.94it/s][A
+  2%|▏         | 18/774 [00:04<03:04,  4.11it/s][A
+  2%|▏         | 19/774 [00:04<03:13,  3.90it/s][A
+  3%|▎         | 20/774 [00:05<03:10,  3.96it/s][A
+  3%|▎         | 21/774 [00:05<03:14,  3.88it/s][A
+  3%|▎         | 22/774 [00:05<03:17,  3.81it/s][A
+  3%|▎         | 23/774 [00:05<03:29,  3.59it/s][A
+  3%|▎         | 24/774 [00:06<03:27,  3.61it/s][A
+  3%|▎         | 25/774 [00:06<03:33,  3.50it/s][A
+  3%|▎         | 26/774 [00:06<03:30,  3.55it/s][A
+  3%|▎         | 27/774 [00:07<03:29,  3.57it/s][A
+  4%|▎         | 28/774 [00:07<03:35,  3.46it/s][A
+  4%|▎         | 29/774 [00:07<03:40,  3.37it/s][A
+  4%|▍         | 30/774 [00:07<03:28,  3.57it/s][A
+  4%|▍         | 31/774 [00:08<03:28,  3.56it/s][A
+  4%|▍         | 32/774 [00:08<04:04,  3.04it/s][A
+  4%|▍         | 33/774 [00:08<03:52,  3.19it/s][A
+  4%|▍         | 34/774 [00:09<03:36,  3.41it/s][A
+  5%|▍         | 35/774 [00:09<03:44,  3.30it/s][A
+  5%|▍         | 36/774 [00:09<03:44,  3.29it/s][A
+  5%|▍         | 37/774 [00:10<03:44,  3.29it/s][A
+  5%|▍         | 38/774 [00:10<03:33,  3.44it/s][A
+  5%|▌         | 39/774 [00:10<03:19,  3.69it/s][A
+  5%|▌         | 40/774 [00:10<03:24,  3.59it/s][A
+  5%|▌         | 41/774 [00:11<03:21,  3.63it/s][A
+  5%|▌         | 42/774 [00:11<03:10,  3.85it/s][A
+  6%|▌         | 43/774 [00:11<03:22,  3.61it/s][A
+  6%|▌         | 44/774 [00:12<03:26,  3.54it/s][A
+  6%|▌         | 45/774 [00:12<03:14,  3.74it/s][A
+  6%|▌         | 46/774 [00:12<02:59,  4.05it/s][A
+  6%|▌         | 47/774 [00:12<02:48,  4.32it/s][A
+  6%|▌         | 48/774 [00:12<02:50,  4.26it/s][A
+  6%|▋         | 49/774 [00:13<02:51,  4.22it/s][A
+  6%|▋         | 50/774 [00:13<02:54,  4.15it/s][A
+  7%|▋         | 51/774 [00:13<02:54,  4.14it/s][A
+  7%|▋         | 52/774 [00:13<02:51,  4.20it/s][A
+  7%|▋         | 53/774 [00:14<03:00,  3.98it/s][A
+  7%|▋         | 54/774 [00:14<03:05,  3.89it/s][A
+  7%|▋         | 55/774 [00:14<03:14,  3.70it/s][A
+  7%|▋         | 56/774 [00:14<03:15,  3.68it/s][A
+  7%|▋         | 57/774 [00:15<03:27,  3.46it/s][A
+  7%|▋         | 58/774 [00:15<03:25,  3.48it/s][A
+  8%|▊         | 59/774 [00:15<03:09,  3.78it/s][A
+  8%|▊         | 60/774 [00:15<02:55,  4.08it/s][A
+  8%|▊         | 61/774 [00:16<02:32,  4.68it/s][A
+  8%|▊         | 62/774 [00:16<02:29,  4.77it/s][A
+  8%|▊         | 63/774 [00:16<02:54,  4.07it/s][A
+  8%|▊         | 64/774 [00:16<02:45,  4.28it/s][A
+  8%|▊         | 65/774 [00:17<02:49,  4.19it/s][A
+  9%|▊         | 66/774 [00:17<02:47,  4.24it/s][A
+  9%|▊         | 67/774 [00:17<02:40,  4.40it/s][A
+  9%|▉         | 68/774 [00:17<02:37,  4.49it/s][A
+  9%|▉         | 69/774 [00:17<02:29,  4.72it/s][A
+  9%|▉         | 70/774 [00:18<02:36,  4.49it/s][A
+  9%|▉         | 71/774 [00:18<02:31,  4.63it/s][A
+  9%|▉         | 72/774 [00:18<02:40,  4.37it/s][A
+  9%|▉         | 73/774 [00:18<02:50,  4.10it/s][A
+ 10%|▉         | 74/774 [00:19<02:57,  3.94it/s][A
+ 10%|▉         | 75/774 [00:19<03:04,  3.80it/s][A
+ 10%|▉         | 76/774 [00:19<03:00,  3.87it/s][A
+ 10%|▉         | 77/774 [00:20<03:13,  3.60it/s][A
+ 10%|█         | 78/774 [00:20<02:54,  3.99it/s][A
+ 10%|█         | 79/774 [00:20<02:42,  4.29it/s][A
+ 10%|█         | 80/774 [00:20<02:38,  4.37it/s][A
+ 10%|█         | 81/774 [00:20<02:17,  5.06it/s][A
+ 11%|█         | 82/774 [00:20<02:15,  5.11it/s][A
+ 11%|█         | 83/774 [00:21<02:19,  4.94it/s][A
+ 11%|█         | 84/774 [00:21<02:25,  4.73it/s][A
+ 11%|█         | 85/774 [00:21<02:34,  4.45it/s][A
+ 11%|█         | 86/774 [00:21<02:41,  4.26it/s][A
+ 11%|█         | 87/774 [00:22<02:42,  4.23it/s][A
+ 11%|█▏        | 88/774 [00:22<02:31,  4.54it/s][A
+ 11%|█▏        | 89/774 [00:22<02:25,  4.72it/s][A
+ 12%|█▏        | 90/774 [00:22<02:33,  4.46it/s][A
+ 12%|█▏        | 91/774 [00:23<02:47,  4.08it/s][A
+ 12%|█▏        | 92/774 [00:23<03:00,  3.78it/s][A
+ 12%|█▏        | 93/774 [00:23<02:56,  3.85it/s][A
+ 12%|█▏        | 94/774 [00:23<03:00,  3.76it/s][A
+ 12%|█▏        | 95/774 [00:24<02:59,  3.79it/s][A
+ 12%|█▏        | 96/774 [00:24<02:54,  3.89it/s][A
+ 13%|█▎        | 97/774 [00:24<02:39,  4.24it/s][A
+ 13%|█▎        | 98/774 [00:24<02:33,  4.41it/s][A
+ 13%|█▎        | 99/774 [00:25<02:45,  4.07it/s][A
+ 13%|█▎        | 100/774 [00:25<02:57,  3.80it/s][A
+ 13%|█▎        | 101/774 [00:25<03:00,  3.72it/s][A
+ 13%|█▎        | 102/774 [00:26<03:13,  3.47it/s][A
+ 13%|█▎        | 103/774 [00:26<03:16,  3.42it/s][A
+ 13%|█▎        | 104/774 [00:26<03:14,  3.44it/s][A
+ 14%|█▎        | 105/774 [00:26<03:13,  3.45it/s][A
+ 14%|█▎        | 106/774 [00:27<03:34,  3.12it/s][A
+ 14%|█▍        | 107/774 [00:27<03:46,  2.94it/s][A
+ 14%|█▍        | 108/774 [00:28<03:37,  3.06it/s][A
+ 14%|█▍        | 109/774 [00:28<03:35,  3.09it/s][A
+ 14%|█▍        | 110/774 [00:28<03:24,  3.24it/s][A
+ 14%|█▍        | 111/774 [00:28<03:23,  3.25it/s][A
+ 14%|█▍        | 112/774 [00:29<03:12,  3.43it/s][A
+ 15%|█▍        | 113/774 [00:29<03:18,  3.34it/s][A
+ 15%|█▍        | 114/774 [00:29<03:22,  3.26it/s][A
+ 15%|█▍        | 115/774 [00:30<03:16,  3.36it/s][A
+ 15%|█▍        | 116/774 [00:30<03:00,  3.64it/s][A
+ 15%|█▌        | 117/774 [00:30<03:07,  3.51it/s][A
+ 15%|█▌        | 118/774 [00:30<03:05,  3.54it/s][A
+ 15%|█▌        | 119/774 [00:31<02:58,  3.68it/s][A
+ 16%|█▌        | 120/774 [00:31<03:08,  3.48it/s][A
+ 16%|█▌        | 121/774 [00:31<03:03,  3.56it/s][A
+ 16%|█▌        | 122/774 [00:32<03:06,  3.50it/s][A
+ 16%|█▌        | 123/774 [00:32<02:57,  3.67it/s][A
+ 16%|█▌        | 124/774 [00:32<02:59,  3.62it/s][A
+ 16%|█▌        | 125/774 [00:32<02:59,  3.62it/s][A
+ 16%|█▋        | 126/774 [00:33<03:07,  3.45it/s][A
+ 16%|█▋        | 127/774 [00:33<03:16,  3.29it/s][A
+ 17%|█▋        | 128/774 [00:33<03:07,  3.44it/s][A
+ 17%|█▋        | 129/774 [00:34<03:08,  3.41it/s][A
+ 17%|█▋        | 130/774 [00:34<03:16,  3.28it/s][A
+ 17%|█▋        | 131/774 [00:34<03:06,  3.44it/s][A
+ 17%|█▋        | 132/774 [00:34<03:07,  3.43it/s][A
+ 17%|█▋        | 133/774 [00:35<03:02,  3.51it/s][A
+ 17%|█▋        | 134/774 [00:35<03:02,  3.52it/s][A
+ 17%|█▋        | 135/774 [00:35<03:20,  3.19it/s][A
+ 18%|█▊        | 136/774 [00:36<03:27,  3.07it/s][A
+ 18%|█▊        | 137/774 [00:36<03:26,  3.09it/s][A
+ 18%|█▊        | 138/774 [00:36<03:22,  3.14it/s][A
+ 18%|█▊        | 139/774 [00:37<03:23,  3.13it/s][A
+ 18%|█▊        | 140/774 [00:37<03:19,  3.18it/s][A
+ 18%|█▊        | 141/774 [00:37<03:12,  3.29it/s][A
+ 18%|█▊        | 142/774 [00:38<03:22,  3.13it/s][A
+ 18%|█▊        | 143/774 [00:38<03:18,  3.18it/s][A
+ 19%|█▊        | 144/774 [00:38<03:07,  3.36it/s][A
+ 19%|█▊        | 145/774 [00:38<03:00,  3.48it/s][A
+ 19%|█▉        | 146/774 [00:39<02:50,  3.68it/s][A
+ 19%|█▉        | 147/774 [00:39<02:42,  3.87it/s][A
+ 19%|█▉        | 148/774 [00:39<02:51,  3.65it/s][A
+ 19%|█▉        | 149/774 [00:40<03:03,  3.40it/s][A
+ 19%|█▉        | 150/774 [00:40<03:06,  3.35it/s][A
+ 20%|█▉        | 151/774 [00:40<02:55,  3.55it/s][A
+ 20%|█▉        | 152/774 [00:40<02:47,  3.72it/s][A
+ 20%|█▉        | 153/774 [00:41<02:53,  3.58it/s][A
+ 20%|█▉        | 154/774 [00:41<02:48,  3.68it/s][A
+ 20%|██        | 155/774 [00:41<02:45,  3.74it/s][A
+ 20%|██        | 156/774 [00:41<02:40,  3.85it/s][A
+ 20%|██        | 157/774 [00:42<02:34,  4.00it/s][A
+ 20%|██        | 158/774 [00:42<02:38,  3.90it/s][A
+ 21%|██        | 159/774 [00:42<02:40,  3.84it/s][A
+ 21%|██        | 160/774 [00:42<02:31,  4.05it/s][A
+ 21%|██        | 161/774 [00:43<02:41,  3.80it/s][A
+ 21%|██        | 162/774 [00:43<02:46,  3.69it/s][A
+ 21%|██        | 163/774 [00:43<02:45,  3.70it/s][A
+ 21%|██        | 164/774 [00:43<02:39,  3.83it/s][A
+ 21%|██▏       | 165/774 [00:44<02:37,  3.86it/s][A
+ 21%|██▏       | 166/774 [00:44<02:41,  3.76it/s][A
+ 22%|██▏       | 167/774 [00:44<02:44,  3.70it/s][A
+ 22%|██▏       | 168/774 [00:45<02:35,  3.90it/s][A
+ 22%|██▏       | 169/774 [00:45<02:27,  4.09it/s][A
+ 22%|██▏       | 170/774 [00:45<02:37,  3.84it/s][A
+ 22%|██▏       | 171/774 [00:45<02:46,  3.62it/s][A
+ 22%|██▏       | 172/774 [00:46<02:54,  3.44it/s][A
+ 22%|██▏       | 173/774 [00:46<02:51,  3.51it/s][A
+ 22%|██▏       | 174/774 [00:46<02:44,  3.66it/s][A
+ 23%|██▎       | 175/774 [00:46<02:43,  3.65it/s][A
+ 23%|██▎       | 176/774 [00:47<02:37,  3.79it/s][A
+ 23%|██▎       | 177/774 [00:47<02:51,  3.48it/s][A
+ 23%|██▎       | 178/774 [00:47<02:36,  3.82it/s][A
+ 23%|██▎       | 179/774 [00:47<02:22,  4.16it/s][A
+ 23%|██▎       | 180/774 [00:48<02:16,  4.36it/s][A
+ 23%|██▎       | 181/774 [00:48<02:20,  4.23it/s][A
+ 24%|██▎       | 182/774 [00:48<02:24,  4.10it/s][A
+ 24%|██▎       | 183/774 [00:48<02:25,  4.07it/s][A
+ 24%|██▍       | 184/774 [00:49<02:35,  3.79it/s][A
+ 24%|██▍       | 185/774 [00:49<02:44,  3.58it/s][A
+ 24%|██▍       | 186/774 [00:49<02:43,  3.60it/s][A
+ 24%|██▍       | 187/774 [00:50<02:36,  3.76it/s][A
+ 24%|██▍       | 188/774 [00:50<02:34,  3.79it/s][A
+ 24%|██▍       | 189/774 [00:50<02:30,  3.87it/s][A
+ 25%|██▍       | 190/774 [00:50<02:26,  3.99it/s][A
+ 25%|██▍       | 191/774 [00:51<02:31,  3.85it/s][A
+ 25%|██▍       | 192/774 [00:51<02:35,  3.73it/s][A
+ 25%|██▍       | 193/774 [00:51<02:38,  3.66it/s][A
+ 25%|██▌       | 194/774 [00:51<02:47,  3.46it/s][A
+ 25%|██▌       | 195/774 [00:52<02:55,  3.29it/s][A
+ 25%|██▌       | 196/774 [00:52<02:56,  3.28it/s][A
+ 25%|██▌       | 197/774 [00:52<02:52,  3.35it/s][A
+ 26%|██▌       | 198/774 [00:53<02:43,  3.53it/s][A
+ 26%|██▌       | 199/774 [00:53<02:44,  3.50it/s][A
+ 26%|██▌       | 200/774 [00:53<02:38,  3.61it/s][A
+ 26%|██▌       | 201/774 [00:53<02:36,  3.67it/s][A
+ 26%|██▌       | 202/774 [00:54<02:33,  3.73it/s][A
+ 26%|██▌       | 203/774 [00:54<02:26,  3.91it/s][A
+ 26%|██▋       | 204/774 [00:54<02:29,  3.80it/s][A
+ 26%|██▋       | 205/774 [00:55<02:39,  3.56it/s][A
+ 27%|██▋       | 206/774 [00:55<02:35,  3.65it/s][A
+ 27%|██▋       | 207/774 [00:55<02:33,  3.70it/s][A
+ 27%|██▋       | 208/774 [00:55<02:33,  3.68it/s][A
+ 27%|██▋       | 209/774 [00:56<02:32,  3.71it/s][A
+ 27%|██▋       | 210/774 [00:56<02:30,  3.75it/s][A
+ 27%|██▋       | 211/774 [00:56<02:27,  3.81it/s][A
+ 27%|██▋       | 212/774 [00:56<02:16,  4.11it/s][A
+ 28%|██▊       | 213/774 [00:56<02:01,  4.61it/s][A
+ 28%|██▊       | 214/774 [00:57<02:03,  4.52it/s][A
+ 28%|██▊       | 215/774 [00:57<02:03,  4.53it/s][A
+ 28%|██▊       | 216/774 [00:57<02:00,  4.63it/s][A
+ 28%|██▊       | 217/774 [00:57<02:05,  4.45it/s][A
+ 28%|██▊       | 218/774 [00:58<02:11,  4.23it/s][A
+ 28%|██▊       | 219/774 [00:58<02:20,  3.95it/s][A
+ 28%|██▊       | 220/774 [00:58<02:19,  3.98it/s][A
+ 29%|██▊       | 221/774 [00:58<02:25,  3.81it/s][A
+ 29%|██▊       | 222/774 [00:59<02:34,  3.58it/s][A
+ 29%|██▉       | 223/774 [00:59<02:51,  3.21it/s][A
+ 29%|██▉       | 224/774 [01:00<03:00,  3.04it/s][A
+ 29%|██▉       | 225/774 [01:00<03:12,  2.86it/s][A
+ 29%|██▉       | 226/774 [01:00<03:15,  2.80it/s][A
+ 29%|██▉       | 227/774 [01:01<03:11,  2.86it/s][A
+ 29%|██▉       | 228/774 [01:01<03:03,  2.97it/s][A
+ 30%|██▉       | 229/774 [01:01<03:19,  2.74it/s][A
+ 30%|██▉       | 230/774 [01:02<03:05,  2.93it/s][A
+ 30%|██▉       | 231/774 [01:02<03:02,  2.98it/s][A
+ 30%|██▉       | 232/774 [01:02<02:53,  3.12it/s][A
+ 30%|███       | 233/774 [01:03<03:08,  2.87it/s][A
+ 30%|███       | 234/774 [01:03<03:12,  2.81it/s][A
+ 30%|███       | 235/774 [01:03<03:11,  2.82it/s][A
+ 30%|███       | 236/774 [01:04<03:14,  2.76it/s][A
+ 31%|███       | 237/774 [01:04<03:10,  2.82it/s][A
+ 31%|███       | 238/774 [01:04<03:01,  2.96it/s][A
+ 31%|███       | 239/774 [01:05<02:59,  2.98it/s][A
+ 31%|███       | 240/774 [01:05<03:00,  2.96it/s][A
+ 31%|███       | 241/774 [01:05<03:03,  2.91it/s][A
+ 31%|███▏      | 242/774 [01:06<03:13,  2.75it/s][A
+ 31%|███▏      | 243/774 [01:06<03:22,  2.62it/s][A
+ 32%|███▏      | 244/774 [01:07<03:17,  2.68it/s][A
+ 32%|███▏      | 245/774 [01:07<03:09,  2.80it/s][A
+ 32%|███▏      | 246/774 [01:07<03:07,  2.81it/s][A
+ 32%|███▏      | 247/774 [01:08<03:45,  2.34it/s][A
+ 32%|███▏      | 248/774 [01:08<03:50,  2.28it/s][A
+ 32%|███▏      | 249/774 [01:09<03:26,  2.54it/s][A
+ 32%|███▏      | 250/774 [01:09<03:20,  2.61it/s][A
+ 32%|███▏      | 251/774 [01:09<03:18,  2.64it/s][A
+ 33%|███▎      | 252/774 [01:10<03:14,  2.69it/s][A
+ 33%|███▎      | 253/774 [01:10<03:12,  2.70it/s][A
+ 33%|███▎      | 254/774 [01:10<03:08,  2.76it/s][A
+ 33%|███▎      | 255/774 [01:11<03:03,  2.83it/s][A
+ 33%|███▎      | 256/774 [01:11<02:58,  2.90it/s][A
+ 33%|███▎      | 257/774 [01:11<02:56,  2.92it/s][A
+ 33%|███▎      | 258/774 [01:12<02:41,  3.19it/s][A
+ 33%|███▎      | 259/774 [01:12<02:24,  3.56it/s][A
+ 34%|███▎      | 260/774 [01:12<02:22,  3.61it/s][A
+ 34%|███▎      | 261/774 [01:12<02:27,  3.48it/s][A
+ 34%|███▍      | 262/774 [01:13<02:12,  3.86it/s][A
+ 34%|███▍      | 263/774 [01:13<02:05,  4.07it/s][A
+ 34%|███▍      | 264/774 [01:13<02:15,  3.77it/s][A
+ 34%|███▍      | 265/774 [01:13<02:08,  3.95it/s][A
+ 34%|███▍      | 266/774 [01:14<02:02,  4.14it/s][A
+ 34%|███▍      | 267/774 [01:14<02:01,  4.17it/s][A
+ 35%|███▍      | 268/774 [01:14<02:08,  3.93it/s][A
+ 35%|███▍      | 269/774 [01:14<02:14,  3.75it/s][A
+ 35%|███▍      | 270/774 [01:15<02:20,  3.60it/s][A
+ 35%|███▌      | 271/774 [01:15<02:15,  3.70it/s][A
+ 35%|███▌      | 272/774 [01:15<02:05,  4.01it/s][A
+ 35%|███▌      | 273/774 [01:15<02:00,  4.14it/s][A
+ 35%|███▌      | 274/774 [01:16<02:04,  4.02it/s][A
+ 36%|███▌      | 275/774 [01:16<01:58,  4.21it/s][A
+ 36%|███▌      | 276/774 [01:16<01:52,  4.43it/s][A
+ 36%|███▌      | 277/774 [01:16<01:56,  4.27it/s][A
+ 36%|███▌      | 278/774 [01:17<01:59,  4.16it/s][A
+ 36%|███▌      | 279/774 [01:17<01:53,  4.36it/s][A
+ 36%|███▌      | 280/774 [01:17<01:55,  4.29it/s][A
+ 36%|███▋      | 281/774 [01:17<02:05,  3.91it/s][A
+ 36%|███▋      | 282/774 [01:18<02:17,  3.59it/s][A
+ 37%|███▋      | 283/774 [01:18<02:12,  3.71it/s][A
+ 37%|███▋      | 284/774 [01:18<02:13,  3.68it/s][A
+ 37%|███▋      | 285/774 [01:18<02:05,  3.89it/s][A
+ 37%|███▋      | 286/774 [01:19<02:01,  4.02it/s][A
+ 37%|███▋      | 287/774 [01:19<02:12,  3.67it/s][A
+ 37%|███▋      | 288/774 [01:19<02:15,  3.60it/s][A
+ 37%|███▋      | 289/774 [01:20<02:13,  3.64it/s][A
+ 37%|███▋      | 290/774 [01:20<02:09,  3.73it/s][A
+ 38%|███▊      | 291/774 [01:20<02:08,  3.75it/s][A
+ 38%|███▊      | 292/774 [01:20<02:05,  3.84it/s][A
+ 38%|███▊      | 293/774 [01:20<01:53,  4.22it/s][A
+ 38%|███▊      | 294/774 [01:21<01:50,  4.34it/s][A
+ 38%|███▊      | 295/774 [01:21<01:49,  4.39it/s][A
+ 38%|███▊      | 296/774 [01:21<01:43,  4.60it/s][A
+ 38%|███▊      | 297/774 [01:21<01:38,  4.84it/s][A
+ 39%|███▊      | 298/774 [01:22<01:43,  4.61it/s][A
+ 39%|███▊      | 299/774 [01:22<01:47,  4.43it/s][A
+ 39%|███▉      | 300/774 [01:22<01:53,  4.16it/s][A
+ 39%|███▉      | 301/774 [01:22<01:46,  4.44it/s][A
+ 39%|███▉      | 302/774 [01:22<01:41,  4.65it/s][A
+ 39%|███▉      | 303/774 [01:23<01:38,  4.78it/s][A
+ 39%|███▉      | 304/774 [01:23<01:25,  5.49it/s][A
+ 39%|███▉      | 305/774 [01:23<01:24,  5.53it/s][A
+ 40%|███▉      | 306/774 [01:23<01:37,  4.82it/s][A
+ 40%|███▉      | 307/774 [01:23<01:42,  4.56it/s][A
+ 40%|███▉      | 308/774 [01:24<01:37,  4.76it/s][A
+ 40%|███▉      | 309/774 [01:24<01:38,  4.73it/s][A
+ 40%|████      | 310/774 [01:24<01:43,  4.48it/s][A
+ 40%|████      | 311/774 [01:24<01:41,  4.54it/s][A
+ 40%|████      | 312/774 [01:25<01:39,  4.66it/s][A
+ 40%|████      | 313/774 [01:25<01:39,  4.66it/s][A
+ 41%|████      | 314/774 [01:25<01:40,  4.57it/s][A
+ 41%|████      | 315/774 [01:25<01:49,  4.21it/s][A
+ 41%|████      | 316/774 [01:25<01:40,  4.56it/s][A
+ 41%|████      | 317/774 [01:26<01:33,  4.90it/s][A
+ 41%|████      | 318/774 [01:26<01:36,  4.70it/s][A
+ 41%|████      | 319/774 [01:26<01:39,  4.58it/s][A
+ 41%|████▏     | 320/774 [01:26<01:38,  4.60it/s][A
+ 41%|████▏     | 321/774 [01:26<01:30,  4.98it/s][A
+ 42%|████▏     | 322/774 [01:27<01:25,  5.28it/s][A
+ 42%|████▏     | 323/774 [01:27<01:17,  5.85it/s][A
+ 42%|████▏     | 324/774 [01:27<01:24,  5.35it/s][A
+ 42%|████▏     | 325/774 [01:27<01:28,  5.07it/s][A
+ 42%|████▏     | 326/774 [01:27<01:25,  5.26it/s][A
+ 42%|████▏     | 327/774 [01:28<01:28,  5.04it/s][A
+ 42%|████▏     | 328/774 [01:28<01:26,  5.15it/s][A
+ 43%|████▎     | 329/774 [01:28<01:34,  4.68it/s][A
+ 43%|████▎     | 330/774 [01:28<01:30,  4.89it/s][A
+ 43%|████▎     | 331/774 [01:28<01:22,  5.37it/s][A
+ 43%|████▎     | 332/774 [01:29<01:20,  5.50it/s][A
+ 43%|████▎     | 333/774 [01:29<01:22,  5.32it/s][A
+ 43%|████▎     | 334/774 [01:29<01:27,  5.05it/s][A
+ 43%|████▎     | 335/774 [01:29<01:27,  5.01it/s][A
+ 43%|████▎     | 336/774 [01:29<01:26,  5.06it/s][A
+ 44%|████▎     | 337/774 [01:29<01:20,  5.45it/s][A
+ 44%|████▎     | 338/774 [01:30<01:15,  5.81it/s][A
+ 44%|████▍     | 339/774 [01:30<01:09,  6.29it/s][A
+ 44%|████▍     | 340/774 [01:30<01:09,  6.25it/s][A
+ 44%|████▍     | 341/774 [01:30<01:27,  4.96it/s][A
+ 44%|████▍     | 342/774 [01:30<01:36,  4.46it/s][A
+ 44%|████▍     | 343/774 [01:31<01:38,  4.39it/s][A
+ 44%|████▍     | 344/774 [01:31<01:40,  4.26it/s][A
+ 45%|████▍     | 345/774 [01:31<01:44,  4.10it/s][A
+ 45%|████▍     | 346/774 [01:31<01:47,  4.00it/s][A
+ 45%|████▍     | 347/774 [01:32<01:43,  4.11it/s][A
+ 45%|████▍     | 348/774 [01:32<01:39,  4.30it/s][A
+ 45%|████▌     | 349/774 [01:32<01:33,  4.53it/s][A
+ 45%|████▌     | 350/774 [01:32<01:37,  4.36it/s][A
+ 45%|████▌     | 351/774 [01:33<01:37,  4.35it/s][A
+ 45%|████▌     | 352/774 [01:33<01:33,  4.52it/s][A
+ 46%|████▌     | 353/774 [01:33<01:33,  4.50it/s][A
+ 46%|████▌     | 354/774 [01:33<01:32,  4.54it/s][A
+ 46%|████▌     | 355/774 [01:34<01:37,  4.28it/s][A
+ 46%|████▌     | 356/774 [01:34<01:47,  3.89it/s][A
+ 46%|████▌     | 357/774 [01:34<02:04,  3.36it/s][A
+ 46%|████▋     | 358/774 [01:35<02:07,  3.26it/s][A
+ 46%|████▋     | 359/774 [01:35<02:06,  3.28it/s][A
+ 47%|████▋     | 360/774 [01:35<02:06,  3.27it/s][A
+ 47%|████▋     | 361/774 [01:35<02:01,  3.41it/s][A
+ 47%|████▋     | 362/774 [01:36<02:07,  3.24it/s][A
+ 47%|████▋     | 363/774 [01:36<02:07,  3.21it/s][A
+ 47%|████▋     | 364/774 [01:36<02:11,  3.12it/s][A
+ 47%|████▋     | 365/774 [01:37<02:07,  3.21it/s][A
+ 47%|████▋     | 366/774 [01:37<01:57,  3.48it/s][A
+ 47%|████▋     | 367/774 [01:37<01:51,  3.66it/s][A
+ 48%|████▊     | 368/774 [01:37<01:48,  3.76it/s][A
+ 48%|████▊     | 369/774 [01:38<01:55,  3.51it/s][A
+ 48%|████▊     | 370/774 [01:38<02:08,  3.13it/s][A
+ 48%|████▊     | 371/774 [01:38<02:00,  3.36it/s][A
+ 48%|████▊     | 372/774 [01:39<02:00,  3.33it/s][A
+ 48%|████▊     | 373/774 [01:39<01:59,  3.36it/s][A
+ 48%|████▊     | 374/774 [01:39<01:55,  3.47it/s][A
+ 48%|████▊     | 375/774 [01:40<01:55,  3.46it/s][A
+ 49%|████▊     | 376/774 [01:40<01:59,  3.32it/s][A
+ 49%|████▊     | 377/774 [01:40<02:12,  3.00it/s][A
+ 49%|████▉     | 378/774 [01:41<02:12,  2.98it/s][A
+ 49%|████▉     | 379/774 [01:41<02:03,  3.20it/s][A
+ 49%|████▉     | 380/774 [01:41<01:52,  3.49it/s][A
+ 49%|████▉     | 381/774 [01:41<01:44,  3.75it/s][A
+ 49%|████▉     | 382/774 [01:42<01:41,  3.88it/s][A
+ 49%|████▉     | 383/774 [01:42<01:39,  3.94it/s][A
+ 50%|████▉     | 384/774 [01:42<01:46,  3.65it/s][A
+ 50%|████▉     | 385/774 [01:43<01:55,  3.37it/s][A
+ 50%|████▉     | 386/774 [01:43<01:47,  3.60it/s][A
+ 50%|█████     | 387/774 [01:43<01:41,  3.82it/s][A
+ 50%|█████     | 388/774 [01:43<01:46,  3.62it/s][A
+ 50%|█████     | 389/774 [01:44<01:42,  3.74it/s][A
+ 50%|█████     | 390/774 [01:44<01:56,  3.31it/s][A
+ 51%|█████     | 391/774 [01:44<01:57,  3.25it/s][A
+ 51%|█████     | 392/774 [01:44<01:48,  3.53it/s][A
+ 51%|█████     | 393/774 [01:45<01:39,  3.82it/s][A
+ 51%|█████     | 394/774 [01:45<01:40,  3.79it/s][A
+ 51%|█████     | 395/774 [01:45<01:46,  3.54it/s][A
+ 51%|█████     | 396/774 [01:46<01:44,  3.61it/s][A
+ 51%|█████▏    | 397/774 [01:46<01:48,  3.48it/s][A
+ 51%|█████▏    | 398/774 [01:46<01:43,  3.62it/s][A
+ 52%|█████▏    | 399/774 [01:46<01:41,  3.69it/s][A
+ 52%|█████▏    | 400/774 [01:47<01:34,  3.97it/s][A
+ 52%|█████▏    | 401/774 [01:47<01:31,  4.09it/s][A
+ 52%|█████▏    | 402/774 [01:47<01:30,  4.09it/s][A
+ 52%|█████▏    | 403/774 [01:47<01:34,  3.91it/s][A
+ 52%|█████▏    | 404/774 [01:48<01:40,  3.70it/s][A
+ 52%|█████▏    | 405/774 [01:48<01:36,  3.82it/s][A
+ 52%|█████▏    | 406/774 [01:48<01:40,  3.68it/s][A
+ 53%|█████▎    | 407/774 [01:48<01:46,  3.43it/s][A
+ 53%|█████▎    | 408/774 [01:49<01:42,  3.58it/s][A
+ 53%|█████▎    | 409/774 [01:49<01:39,  3.68it/s][A
+ 53%|█████▎    | 410/774 [01:49<01:40,  3.62it/s][A
+ 53%|█████▎    | 411/774 [01:50<01:40,  3.62it/s][A
+ 53%|█████▎    | 412/774 [01:50<01:41,  3.57it/s][A
+ 53%|█████▎    | 413/774 [01:50<01:39,  3.64it/s][A
+ 53%|█████▎    | 414/774 [01:50<01:36,  3.73it/s][A
+ 54%|█████▎    | 415/774 [01:51<01:25,  4.21it/s][A
+ 54%|█████▎    | 416/774 [01:51<01:25,  4.17it/s][A
+ 54%|█████▍    | 417/774 [01:51<01:25,  4.19it/s][A
+ 54%|█████▍    | 418/774 [01:51<01:19,  4.48it/s][A
+ 54%|█████▍    | 419/774 [01:52<01:33,  3.80it/s][A
+ 54%|█████▍    | 420/774 [01:52<01:37,  3.62it/s][A
+ 54%|█████▍    | 421/774 [01:52<01:39,  3.56it/s][A
+ 55%|█████▍    | 422/774 [01:52<01:38,  3.56it/s][A
+ 55%|█████▍    | 423/774 [01:53<01:39,  3.54it/s][A
+ 55%|█████▍    | 424/774 [01:53<01:36,  3.63it/s][A
+ 55%|█████▍    | 425/774 [01:53<01:25,  4.09it/s][A
+ 55%|█████▌    | 426/774 [01:53<01:18,  4.44it/s][A
+ 55%|█████▌    | 427/774 [01:54<01:14,  4.64it/s][A
+ 55%|█████▌    | 428/774 [01:54<01:16,  4.51it/s][A
+ 55%|█████▌    | 429/774 [01:54<01:18,  4.37it/s][A
+ 56%|█████▌    | 430/774 [01:54<01:23,  4.14it/s][A
+ 56%|█████▌    | 431/774 [01:55<01:34,  3.61it/s][A
+ 56%|█████▌    | 432/774 [01:55<01:34,  3.63it/s][A
+ 56%|█████▌    | 433/774 [01:55<01:27,  3.89it/s][A
+ 56%|█████▌    | 434/774 [01:55<01:22,  4.11it/s][A
+ 56%|█████▌    | 435/774 [01:56<01:21,  4.14it/s][A
+ 56%|█████▋    | 436/774 [01:56<01:23,  4.05it/s][A
+ 56%|█████▋    | 437/774 [01:56<01:21,  4.14it/s][A
+ 57%|█████▋    | 438/774 [01:56<01:17,  4.36it/s][A
+ 57%|█████▋    | 439/774 [01:57<01:20,  4.16it/s][A
+ 57%|█████▋    | 440/774 [01:57<01:24,  3.96it/s][A
+ 57%|█████▋    | 441/774 [01:57<01:28,  3.76it/s][A
+ 57%|█████▋    | 442/774 [01:57<01:30,  3.68it/s][A
+ 57%|█████▋    | 443/774 [01:58<01:27,  3.78it/s][A
+ 57%|█████▋    | 444/774 [01:58<01:25,  3.85it/s][A
+ 57%|█████▋    | 445/774 [01:58<01:25,  3.85it/s][A
+ 58%|█████▊    | 446/774 [01:58<01:23,  3.94it/s][A
+ 58%|█████▊    | 447/774 [01:59<01:21,  4.00it/s][A
+ 58%|█████▊    | 448/774 [01:59<01:14,  4.38it/s][A
+ 58%|█████▊    | 449/774 [01:59<01:15,  4.31it/s][A
+ 58%|█████▊    | 450/774 [01:59<01:17,  4.17it/s][A
+ 58%|█████▊    | 451/774 [02:00<01:15,  4.27it/s][A
+ 58%|█████▊    | 452/774 [02:00<01:12,  4.46it/s][A
+ 59%|█████▊    | 453/774 [02:00<01:11,  4.48it/s][A
+ 59%|█████▊    | 454/774 [02:00<01:17,  4.12it/s][A
+ 59%|█████▉    | 455/774 [02:01<01:21,  3.93it/s][A
+ 59%|█████▉    | 456/774 [02:01<01:25,  3.73it/s][A
+ 59%|█████▉    | 457/774 [02:01<01:18,  4.01it/s][A
+ 59%|█████▉    | 458/774 [02:01<01:18,  4.03it/s][A
+ 59%|█████▉    | 459/774 [02:01<01:16,  4.10it/s][A
+ 59%|█████▉    | 460/774 [02:02<01:21,  3.83it/s][A
+ 60%|█████▉    | 461/774 [02:02<01:28,  3.52it/s][A
+ 60%|█████▉    | 462/774 [02:02<01:26,  3.61it/s][A
+ 60%|█████▉    | 463/774 [02:03<01:23,  3.72it/s][A
+ 60%|█████▉    | 464/774 [02:03<01:22,  3.75it/s][A
+ 60%|██████    | 465/774 [02:03<01:14,  4.14it/s][A
+ 60%|██████    | 466/774 [02:03<01:12,  4.28it/s][A
+ 60%|██████    | 467/774 [02:03<01:07,  4.52it/s][A
+ 60%|██████    | 468/774 [02:04<01:08,  4.47it/s][A
+ 61%|██████    | 469/774 [02:04<01:02,  4.87it/s][A
+ 61%|██████    | 470/774 [02:04<00:59,  5.14it/s][A
+ 61%|██████    | 471/774 [02:04<01:01,  4.89it/s][A
+ 61%|██████    | 472/774 [02:05<01:06,  4.53it/s][A
+ 61%|██████    | 473/774 [02:05<01:09,  4.32it/s][A
+ 61%|██████    | 474/774 [02:05<01:08,  4.40it/s][A
+ 61%|██████▏   | 475/774 [02:05<01:08,  4.34it/s][A
+ 61%|██████▏   | 476/774 [02:06<01:17,  3.85it/s][A
+ 62%|██████▏   | 477/774 [02:06<01:31,  3.23it/s][A
+ 62%|██████▏   | 478/774 [02:06<01:32,  3.19it/s][A
+ 62%|██████▏   | 479/774 [02:07<01:30,  3.26it/s][A
+ 62%|██████▏   | 480/774 [02:07<01:27,  3.37it/s][A
+ 62%|██████▏   | 481/774 [02:07<01:28,  3.32it/s][A
+ 62%|██████▏   | 482/774 [02:07<01:26,  3.37it/s][A
+ 62%|██████▏   | 483/774 [02:08<01:24,  3.46it/s][A
+ 63%|██████▎   | 484/774 [02:08<01:25,  3.39it/s][A
+ 63%|██████▎   | 485/774 [02:08<01:27,  3.30it/s][A
+ 63%|██████▎   | 486/774 [02:09<01:24,  3.42it/s][A
+ 63%|██████▎   | 487/774 [02:09<01:25,  3.37it/s][A
+ 63%|██████▎   | 488/774 [02:09<01:22,  3.45it/s][A
+ 63%|██████▎   | 489/774 [02:09<01:17,  3.67it/s][A
+ 63%|██████▎   | 490/774 [02:10<01:17,  3.65it/s][A
+ 63%|██████▎   | 491/774 [02:10<01:16,  3.68it/s][A
+ 64%|██████▎   | 492/774 [02:10<01:18,  3.59it/s][A
+ 64%|██████▎   | 493/774 [02:11<01:18,  3.56it/s][A
+ 64%|██████▍   | 494/774 [02:11<01:17,  3.61it/s][A
+ 64%|██████▍   | 495/774 [02:11<01:17,  3.61it/s][A
+ 64%|██████▍   | 496/774 [02:11<01:22,  3.37it/s][A
+ 64%|██████▍   | 497/774 [02:12<01:23,  3.33it/s][A
+ 64%|██████▍   | 498/774 [02:12<01:22,  3.36it/s][A
+ 64%|██████▍   | 499/774 [02:12<01:19,  3.45it/s][A
+ 65%|██████▍   | 500/774 [02:13<01:17,  3.54it/s][A
+ 65%|██████▍   | 501/774 [02:13<01:14,  3.67it/s][A
+ 65%|██████▍   | 502/774 [02:13<01:13,  3.68it/s][A
+ 65%|██████▍   | 503/774 [02:13<01:19,  3.43it/s][A
+ 65%|██████▌   | 504/774 [02:14<01:21,  3.31it/s][A
+ 65%|██████▌   | 505/774 [02:14<01:18,  3.42it/s][A
+ 65%|██████▌   | 506/774 [02:14<01:18,  3.40it/s][A
+ 66%|██████▌   | 507/774 [02:15<01:23,  3.21it/s][A
+ 66%|██████▌   | 508/774 [02:15<01:21,  3.27it/s][A
+ 66%|██████▌   | 509/774 [02:15<01:19,  3.31it/s][A
+ 66%|██████▌   | 510/774 [02:16<01:17,  3.40it/s][A
+ 66%|██████▌   | 511/774 [02:16<01:12,  3.62it/s][A
+ 66%|██████▌   | 512/774 [02:16<01:10,  3.70it/s][A
+ 66%|██████▋   | 513/774 [02:16<01:13,  3.53it/s][A
+ 66%|██████▋   | 514/774 [02:17<01:16,  3.41it/s][A
+ 67%|██████▋   | 515/774 [02:17<01:21,  3.16it/s][A
+ 67%|██████▋   | 516/774 [02:17<01:16,  3.37it/s][A
+ 67%|██████▋   | 517/774 [02:18<01:10,  3.67it/s][A
+ 67%|██████▋   | 518/774 [02:18<01:07,  3.77it/s][A
+ 67%|██████▋   | 519/774 [02:18<01:10,  3.62it/s][A
+ 67%|██████▋   | 520/774 [02:18<01:09,  3.64it/s][A
+ 67%|██████▋   | 521/774 [02:19<01:08,  3.70it/s][A
+ 67%|██████▋   | 522/774 [02:19<01:05,  3.86it/s][A
+ 68%|██████▊   | 523/774 [02:19<01:03,  3.96it/s][A
+ 68%|██████▊   | 524/774 [02:19<01:06,  3.73it/s][A
+ 68%|██████▊   | 525/774 [02:20<01:10,  3.55it/s][A
+ 68%|██████▊   | 526/774 [02:20<01:11,  3.45it/s][A
+ 68%|██████▊   | 527/774 [02:20<01:12,  3.39it/s][A
+ 68%|██████▊   | 528/774 [02:21<01:11,  3.46it/s][A
+ 68%|██████▊   | 529/774 [02:21<01:07,  3.64it/s][A
+ 68%|██████▊   | 530/774 [02:21<01:08,  3.56it/s][A
+ 69%|██████▊   | 531/774 [02:21<01:07,  3.62it/s][A
+ 69%|██████▊   | 532/774 [02:22<01:04,  3.75it/s][A
+ 69%|██████▉   | 533/774 [02:22<01:01,  3.90it/s][A
+ 69%|██████▉   | 534/774 [02:22<00:58,  4.11it/s][A
+ 69%|██████▉   | 535/774 [02:22<01:00,  3.97it/s][A
+ 69%|██████▉   | 536/774 [02:23<01:02,  3.80it/s][A
+ 69%|██████▉   | 537/774 [02:23<01:02,  3.80it/s][A
+ 70%|██████▉   | 538/774 [02:23<01:06,  3.54it/s][A
+ 70%|██████▉   | 539/774 [02:24<01:06,  3.56it/s][A
+ 70%|██████▉   | 540/774 [02:24<01:05,  3.56it/s][A
+ 70%|██████▉   | 541/774 [02:24<01:04,  3.60it/s][A
+ 70%|███████   | 542/774 [02:24<01:04,  3.62it/s][A
+ 70%|███████   | 543/774 [02:25<01:04,  3.57it/s][A
+ 70%|███████   | 544/774 [02:25<01:04,  3.57it/s][A
+ 70%|███████   | 545/774 [02:25<01:01,  3.72it/s][A
+ 71%|███████   | 546/774 [02:25<00:57,  3.93it/s][A
+ 71%|███████   | 547/774 [02:26<00:55,  4.09it/s][A
+ 71%|███████   | 548/774 [02:26<00:54,  4.15it/s][A
+ 71%|███████   | 549/774 [02:26<00:56,  3.96it/s][A
+ 71%|███████   | 550/774 [02:26<00:59,  3.76it/s][A
+ 71%|███████   | 551/774 [02:27<01:01,  3.60it/s][A
+ 71%|███████▏  | 552/774 [02:27<01:05,  3.41it/s][A
+ 71%|███████▏  | 553/774 [02:27<01:09,  3.20it/s][A
+ 72%|███████▏  | 554/774 [02:28<01:07,  3.25it/s][A
+ 72%|███████▏  | 555/774 [02:28<01:07,  3.26it/s][A
+ 72%|███████▏  | 556/774 [02:28<01:03,  3.43it/s][A
+ 72%|███████▏  | 557/774 [02:29<01:07,  3.21it/s][A
+ 72%|███████▏  | 558/774 [02:29<01:01,  3.54it/s][A
+ 72%|███████▏  | 559/774 [02:29<00:56,  3.81it/s][A
+ 72%|███████▏  | 560/774 [02:29<01:00,  3.52it/s][A
+ 72%|███████▏  | 561/774 [02:30<00:57,  3.72it/s][A
+ 73%|███████▎  | 562/774 [02:30<00:52,  4.03it/s][A
+ 73%|███████▎  | 563/774 [02:30<00:50,  4.19it/s][A
+ 73%|███████▎  | 564/774 [02:30<00:52,  4.00it/s][A
+ 73%|███████▎  | 565/774 [02:31<00:54,  3.84it/s][A
+ 73%|███████▎  | 566/774 [02:31<00:50,  4.13it/s][A
+ 73%|███████▎  | 567/774 [02:31<00:46,  4.47it/s][A
+ 73%|███████▎  | 568/774 [02:31<00:47,  4.35it/s][A
+ 74%|███████▎  | 569/774 [02:31<00:48,  4.26it/s][A
+ 74%|███████▎  | 570/774 [02:32<00:48,  4.24it/s][A
+ 74%|███████▍  | 571/774 [02:32<00:52,  3.89it/s][A
+ 74%|███████▍  | 572/774 [02:32<00:53,  3.75it/s][A
+ 74%|███████▍  | 573/774 [02:33<00:53,  3.75it/s][A
+ 74%|███████▍  | 574/774 [02:33<00:51,  3.85it/s][A
+ 74%|███████▍  | 575/774 [02:33<00:51,  3.87it/s][A
+ 74%|███████▍  | 576/774 [02:33<00:56,  3.50it/s][A
+ 75%|███████▍  | 577/774 [02:34<00:55,  3.53it/s][A
+ 75%|███████▍  | 578/774 [02:34<00:54,  3.60it/s][A
+ 75%|███████▍  | 579/774 [02:34<00:56,  3.44it/s][A
+ 75%|███████▍  | 580/774 [02:35<00:55,  3.47it/s][A
+ 75%|███████▌  | 581/774 [02:35<00:55,  3.50it/s][A
+ 75%|███████▌  | 582/774 [02:35<00:53,  3.61it/s][A
+ 75%|███████▌  | 583/774 [02:35<00:52,  3.67it/s][A
+ 75%|███████▌  | 584/774 [02:36<00:50,  3.73it/s][A
+ 76%|███████▌  | 585/774 [02:36<00:52,  3.57it/s][A
+ 76%|███████▌  | 586/774 [02:36<00:53,  3.54it/s][A
+ 76%|███████▌  | 587/774 [02:36<00:51,  3.62it/s][A
+ 76%|███████▌  | 588/774 [02:37<00:51,  3.63it/s][A
+ 76%|███████▌  | 589/774 [02:37<00:49,  3.75it/s][A
+ 76%|███████▌  | 590/774 [02:37<00:45,  4.02it/s][A
+ 76%|███████▋  | 591/774 [02:37<00:47,  3.89it/s][A
+ 76%|███████▋  | 592/774 [02:38<00:49,  3.65it/s][A
+ 77%|███████▋  | 593/774 [02:38<00:50,  3.59it/s][A
+ 77%|███████▋  | 594/774 [02:38<00:50,  3.58it/s][A
+ 77%|███████▋  | 595/774 [02:39<00:53,  3.32it/s][A
+ 77%|███████▋  | 596/774 [02:39<00:56,  3.15it/s][A
+ 77%|███████▋  | 597/774 [02:39<00:56,  3.16it/s][A
+ 77%|███████▋  | 598/774 [02:40<00:57,  3.06it/s][A
+ 77%|███████▋  | 599/774 [02:40<00:58,  3.01it/s][A
+ 78%|███████▊  | 600/774 [02:40<00:57,  3.00it/s][A
+ 78%|███████▊  | 601/774 [02:41<00:58,  2.97it/s][A
+ 78%|███████▊  | 602/774 [02:41<00:58,  2.95it/s][A
+ 78%|███████▊  | 603/774 [02:41<00:57,  2.97it/s][A
+ 78%|███████▊  | 604/774 [02:42<00:57,  2.94it/s][A
+ 78%|███████▊  | 605/774 [02:42<00:56,  2.97it/s][A
+ 78%|███████▊  | 606/774 [02:42<00:58,  2.89it/s][A
+ 78%|███████▊  | 607/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 608/774 [02:43<00:56,  2.93it/s][A
+ 79%|███████▊  | 609/774 [02:43<00:54,  3.05it/s][A
+ 79%|███████▉  | 610/774 [02:44<00:55,  2.97it/s][A
+ 79%|███████▉  | 611/774 [02:44<00:59,  2.74it/s][A
+ 79%|███████▉  | 612/774 [02:45<01:01,  2.63it/s][A
+ 79%|███████▉  | 613/774 [02:45<00:56,  2.83it/s][A
+ 79%|███████▉  | 614/774 [02:45<00:55,  2.90it/s][A
+ 79%|███████▉  | 615/774 [02:46<00:52,  3.04it/s][A
+ 80%|███████▉  | 616/774 [02:46<00:51,  3.09it/s][A
+ 80%|███████▉  | 617/774 [02:46<00:50,  3.11it/s][A
+ 80%|███████▉  | 618/774 [02:46<00:47,  3.27it/s][A
+ 80%|███████▉  | 619/774 [02:47<00:45,  3.43it/s][A
+ 80%|████████  | 620/774 [02:47<00:44,  3.46it/s][A
+ 80%|████████  | 621/774 [02:47<00:41,  3.73it/s][A
+ 80%|████████  | 622/774 [02:47<00:38,  3.98it/s][A
+ 80%|████████  | 623/774 [02:48<00:38,  3.93it/s][A
+ 81%|████████  | 624/774 [02:48<00:41,  3.61it/s][A
+ 81%|████████  | 625/774 [02:48<00:41,  3.56it/s][A
+ 81%|████████  | 626/774 [02:49<00:44,  3.31it/s][A
+ 81%|████████  | 627/774 [02:49<00:45,  3.23it/s][A
+ 81%|████████  | 628/774 [02:49<00:45,  3.22it/s][A
+ 81%|████████▏ | 629/774 [02:50<00:43,  3.32it/s][A
+ 81%|████████▏ | 630/774 [02:50<00:40,  3.55it/s][A
+ 82%|████████▏ | 631/774 [02:50<00:38,  3.73it/s][A
+ 82%|████████▏ | 632/774 [02:50<00:38,  3.73it/s][A
+ 82%|████████▏ | 633/774 [02:51<00:39,  3.54it/s][A
+ 82%|████████▏ | 634/774 [02:51<00:40,  3.44it/s][A
+ 82%|████████▏ | 635/774 [02:51<00:39,  3.53it/s][A
+ 82%|████████▏ | 636/774 [02:52<00:40,  3.45it/s][A
+ 82%|████████▏ | 637/774 [02:52<00:39,  3.50it/s][A
+ 82%|████████▏ | 638/774 [02:52<00:39,  3.47it/s][A
+ 83%|████████▎ | 639/774 [02:52<00:43,  3.09it/s][A
+ 83%|████████▎ | 640/774 [02:53<00:50,  2.68it/s][A
+ 83%|████████▎ | 641/774 [02:53<00:49,  2.70it/s][A
+ 83%|████████▎ | 642/774 [02:54<00:45,  2.88it/s][A
+ 83%|████████▎ | 643/774 [02:54<00:45,  2.89it/s][A
+ 83%|████████▎ | 644/774 [02:54<00:41,  3.10it/s][A
+ 83%|████████▎ | 645/774 [02:54<00:37,  3.40it/s][A
+ 83%|████████▎ | 646/774 [02:55<00:35,  3.64it/s][A
+ 84%|████████▎ | 647/774 [02:55<00:32,  3.90it/s][A
+ 84%|████████▎ | 648/774 [02:55<00:31,  4.05it/s][A
+ 84%|████████▍ | 649/774 [02:55<00:30,  4.07it/s][A
+ 84%|████████▍ | 650/774 [02:56<00:28,  4.28it/s][A
+ 84%|████████▍ | 651/774 [02:56<00:28,  4.25it/s][A
+ 84%|████████▍ | 652/774 [02:56<00:29,  4.13it/s][A
+ 84%|████████▍ | 653/774 [02:56<00:31,  3.86it/s][A
+ 84%|████████▍ | 654/774 [02:57<00:29,  4.10it/s][A
+ 85%|████████▍ | 655/774 [02:57<00:27,  4.40it/s][A
+ 85%|████████▍ | 656/774 [02:57<00:27,  4.25it/s][A
+ 85%|████████▍ | 657/774 [02:57<00:26,  4.44it/s][A
+ 85%|████████▌ | 658/774 [02:58<00:27,  4.22it/s][A
+ 85%|████████▌ | 659/774 [02:58<00:29,  3.87it/s][A
+ 85%|████████▌ | 660/774 [02:58<00:30,  3.77it/s][A
+ 85%|████████▌ | 661/774 [02:58<00:30,  3.71it/s][A
+ 86%|████████▌ | 662/774 [02:59<00:28,  3.88it/s][A
+ 86%|████████▌ | 663/774 [02:59<00:30,  3.66it/s][A
+ 86%|████████▌ | 664/774 [02:59<00:30,  3.63it/s][A
+ 86%|████████▌ | 665/774 [02:59<00:27,  3.91it/s][A
+ 86%|████████▌ | 666/774 [03:00<00:25,  4.31it/s][A
+ 86%|████████▌ | 667/774 [03:00<00:23,  4.57it/s][A
+ 86%|████████▋ | 668/774 [03:00<00:24,  4.41it/s][A
+ 86%|████████▋ | 669/774 [03:00<00:25,  4.14it/s][A
+ 87%|████████▋ | 670/774 [03:00<00:24,  4.33it/s][A
+ 87%|████████▋ | 671/774 [03:01<00:26,  3.93it/s][A
+ 87%|████████▋ | 672/774 [03:01<00:25,  4.00it/s][A
+ 87%|████████▋ | 673/774 [03:01<00:24,  4.09it/s][A
+ 87%|████████▋ | 674/774 [03:02<00:24,  4.03it/s][A
+ 87%|████████▋ | 675/774 [03:02<00:23,  4.28it/s][A
+ 87%|████████▋ | 676/774 [03:02<00:21,  4.46it/s][A
+ 87%|████████▋ | 677/774 [03:02<00:21,  4.42it/s][A
+ 88%|████████▊ | 678/774 [03:02<00:21,  4.45it/s][A
+ 88%|████████▊ | 679/774 [03:03<00:22,  4.21it/s][A
+ 88%|████████▊ | 680/774 [03:03<00:22,  4.22it/s][A
+ 88%|████████▊ | 681/774 [03:03<00:20,  4.50it/s][A
+ 88%|████████▊ | 682/774 [03:03<00:20,  4.52it/s][A
+ 88%|████████▊ | 683/774 [03:04<00:21,  4.15it/s][A
+ 88%|████████▊ | 684/774 [03:04<00:23,  3.89it/s][A
+ 89%|████████▊ | 685/774 [03:04<00:24,  3.70it/s][A
+ 89%|████████▊ | 686/774 [03:04<00:22,  3.83it/s][A
+ 89%|████████▉ | 687/774 [03:05<00:21,  4.04it/s][A
+ 89%|████████▉ | 688/774 [03:05<00:21,  4.05it/s][A
+ 89%|████████▉ | 689/774 [03:05<00:20,  4.20it/s][A
+ 89%|████████▉ | 690/774 [03:05<00:19,  4.31it/s][A
+ 89%|████████▉ | 691/774 [03:06<00:18,  4.40it/s][A
+ 89%|████████▉ | 692/774 [03:06<00:18,  4.46it/s][A
+ 90%|████████▉ | 693/774 [03:06<00:18,  4.47it/s][A
+ 90%|████████▉ | 694/774 [03:06<00:19,  4.19it/s][A
+ 90%|████████▉ | 695/774 [03:07<00:20,  3.85it/s][A
+ 90%|████████▉ | 696/774 [03:07<00:19,  3.95it/s][A
+ 90%|█████████ | 697/774 [03:07<00:19,  3.96it/s][A
+ 90%|█████████ | 698/774 [03:07<00:17,  4.35it/s][A
+ 90%|█████████ | 699/774 [03:07<00:15,  4.73it/s][A
+ 90%|█████████ | 700/774 [03:08<00:17,  4.34it/s][A
+ 91%|█████████ | 701/774 [03:08<00:16,  4.41it/s][A
+ 91%|█████████ | 702/774 [03:08<00:16,  4.39it/s][A
+ 91%|█████████ | 703/774 [03:08<00:17,  4.17it/s][A
+ 91%|█████████ | 704/774 [03:09<00:16,  4.15it/s][A
+ 91%|█████████ | 705/774 [03:09<00:15,  4.51it/s][A
+ 91%|█████████ | 706/774 [03:09<00:14,  4.70it/s][A
+ 91%|█████████▏| 707/774 [03:09<00:14,  4.63it/s][A
+ 91%|█████████▏| 708/774 [03:09<00:13,  4.90it/s][A
+ 92%|█████████▏| 709/774 [03:10<00:13,  4.75it/s][A
+ 92%|█████████▏| 710/774 [03:10<00:13,  4.73it/s][A
+ 92%|█████████▏| 711/774 [03:10<00:12,  4.90it/s][A
+ 92%|█████████▏| 712/774 [03:10<00:12,  5.13it/s][A
+ 92%|█████████▏| 713/774 [03:10<00:12,  4.96it/s][A
+ 92%|█████████▏| 714/774 [03:11<00:12,  4.65it/s][A
+ 92%|█████████▏| 715/774 [03:11<00:12,  4.75it/s][A
+ 93%|█████████▎| 716/774 [03:11<00:10,  5.32it/s][A
+ 93%|█████████▎| 717/774 [03:11<00:10,  5.37it/s][A
+ 93%|█████████▎| 718/774 [03:11<00:11,  4.78it/s][A
+ 93%|█████████▎| 719/774 [03:12<00:11,  4.64it/s][A
+ 93%|█████████▎| 720/774 [03:12<00:10,  4.97it/s][A
+ 93%|█████████▎| 721/774 [03:12<00:10,  5.24it/s][A
+ 93%|█████████▎| 722/774 [03:12<00:09,  5.72it/s][A
+ 93%|█████████▎| 723/774 [03:12<00:09,  5.47it/s][A
+ 94%|█████████▎| 724/774 [03:13<00:09,  5.40it/s][A
+ 94%|█████████▎| 725/774 [03:13<00:08,  5.53it/s][A
+ 94%|█████████▍| 726/774 [03:13<00:08,  5.58it/s][A
+ 94%|█████████▍| 727/774 [03:13<00:08,  5.36it/s][A
+ 94%|█████████▍| 728/774 [03:13<00:09,  4.86it/s][A
+ 94%|█████████▍| 729/774 [03:13<00:08,  5.15it/s][A
+ 94%|█████████▍| 730/774 [03:14<00:08,  5.43it/s][A
+ 94%|█████████▍| 731/774 [03:14<00:07,  5.42it/s][A
+ 95%|█████████▍| 732/774 [03:14<00:07,  5.57it/s][A
+ 95%|█████████▍| 733/774 [03:14<00:07,  5.57it/s][A
+ 95%|█████████▍| 734/774 [03:14<00:07,  5.62it/s][A
+ 95%|█████████▍| 735/774 [03:15<00:06,  5.84it/s][A
+ 95%|█████████▌| 736/774 [03:15<00:06,  5.87it/s][A
+ 95%|█████████▌| 737/774 [03:15<00:06,  5.77it/s][A
+ 95%|█████████▌| 738/774 [03:15<00:06,  5.56it/s][A
+ 95%|█████████▌| 739/774 [03:15<00:06,  5.49it/s][A
+ 96%|█████████▌| 740/774 [03:15<00:06,  5.38it/s][A
+ 96%|█████████▌| 741/774 [03:16<00:06,  5.09it/s][A
+ 96%|█████████▌| 742/774 [03:16<00:06,  5.27it/s][A
+ 96%|█████████▌| 743/774 [03:16<00:05,  5.59it/s][A
+ 96%|█████████▌| 744/774 [03:16<00:05,  5.37it/s][A
+ 96%|█████████▋| 745/774 [03:17<00:06,  4.42it/s][A
+ 96%|█████████▋| 746/774 [03:17<00:07,  3.86it/s][A
+ 97%|█████████▋| 747/774 [03:17<00:06,  4.06it/s][A
+ 97%|█████████▋| 748/774 [03:17<00:06,  4.28it/s][A
+ 97%|█████████▋| 749/774 [03:17<00:05,  4.57it/s][A
+ 97%|█████████▋| 750/774 [03:18<00:05,  4.27it/s][A
+ 97%|█████████▋| 751/774 [03:18<00:05,  4.47it/s][A
+ 97%|█████████▋| 752/774 [03:18<00:04,  4.43it/s][A
+ 97%|█████████▋| 753/774 [03:18<00:04,  4.71it/s][A
+ 97%|█████████▋| 754/774 [03:18<00:03,  5.34it/s][A
+ 98%|█████████▊| 755/774 [03:19<00:03,  5.64it/s][A
+ 98%|█████████▊| 756/774 [03:19<00:03,  5.49it/s][A
+ 98%|█████████▊| 757/774 [03:19<00:03,  5.30it/s][A
+ 98%|█████████▊| 758/774 [03:19<00:03,  5.26it/s][A
+ 98%|█████████▊| 759/774 [03:19<00:02,  5.48it/s][A
+ 98%|█████████▊| 760/774 [03:20<00:02,  5.45it/s][A
+ 98%|█████████▊| 761/774 [03:20<00:02,  5.90it/s][A
+ 98%|█████████▊| 762/774 [03:20<00:01,  6.01it/s][A
+ 99%|█████████▊| 763/774 [03:20<00:01,  6.20it/s][A
+ 99%|█████████▊| 764/774 [03:20<00:01,  6.32it/s][A
+ 99%|█████████▉| 765/774 [03:20<00:01,  6.26it/s][A
+ 99%|█████████▉| 766/774 [03:21<00:01,  5.35it/s][A
+ 99%|█████████▉| 767/774 [03:21<00:01,  5.51it/s][A
+ 99%|█████████▉| 768/774 [03:21<00:01,  5.48it/s][A
+ 99%|█████████▉| 769/774 [03:21<00:00,  5.19it/s][A
+ 99%|█████████▉| 770/774 [03:21<00:00,  5.06it/s][A
+100%|█████████▉| 771/774 [03:22<00:00,  5.37it/s][A
+100%|█████████▉| 772/774 [03:22<00:00,  5.08it/s][A
+100%|█████████▉| 773/774 [03:22<00:00,  4.91it/s][A                                                       
+                                                 [A 94%|█████████▍| 12000/12776 [2:08:48<07:36,  1.70it/s]
+100%|██████████| 774/774 [03:24<00:00,  4.91it/s][A
+                                                 [ASaving model checkpoint to ./checkpoint-12000
+Configuration saved in ./checkpoint-12000/config.json
+Model weights saved in ./checkpoint-12000/model.safetensors
+Feature extractor saved in ./checkpoint-12000/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-12000/tokenizer_config.json
+Special tokens file saved in ./checkpoint-12000/special_tokens_map.json
+added tokens file saved in ./checkpoint-12000/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-10800] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 94%|█████████▍| 12001/12776 [2:08:54<13:43:40, 63.77s/it]                                                           94%|█████████▍| 12001/12776 [2:08:54<13:43:40, 63.77s/it] 94%|█████████▍| 12002/12776 [2:08:55<9:37:56, 44.80s/it]                                                           94%|█████████▍| 12002/12776 [2:08:55<9:37:56, 44.80s/it] 94%|█████████▍| 12003/12776 [2:08:55<6:45:42, 31.49s/it]                                                          94%|█████████▍| 12003/12776 [2:08:55<6:45:42, 31.49s/it] 94%|█████████▍| 12004/12776 [2:08:56<4:45:34, 22.20s/it]                                                          94%|█████████▍| 12004/12776 [2:08:56<4:45:34, 22.20s/it] 94%|█████████▍| 12005/12776 [2:08:56<3:21:09, 15.65s/it]                                                          94%|█████████▍| 12005/12776 [2:08:56<3:21:09, 15.65s/it] 94%|█████████▍| 12006/12776 [2:08:57<2:22:06, 11.07s/it]                                                          94%|█████████▍| 12006/12776 [2:08:57<2:22:06, 11.07s/it] 94%|█████████▍| 12007/12776 [2:08:57<1:40:55,  7.87s/it]                                                          94%|█████████▍| 12007/12776 [2:08:57<1:40:55,  7.87s/it] 94%|█████████▍| 12008/12776 [2:08:57<1:11:53,  5.62s/it]                                                          94%|█████████▍| 12008/12776 [2:08:57<1:11:53,  5.62s/it] 94%|█���███████▍| 12009/12776 [2:08:58<51:31,  4.03s/it]                                                          94%|█████████▍| 12009/12776 [2:08:58<51:31,  4.03s/it] 94%|█████████▍| 12010/12776 [2:08:58<37:14,  2.92s/it]                                                        94%|█████████▍| 12010/12776 [2:08:58<37:14,  2.92s/it] 94%|█████████▍| 12011/12776 [2:08:58<27:26,  2.15s/it]                                                        94%|█████████▍| 12011/12776 [2:08:58<27:26,  2.15s/it] 94%|█████████▍| 12012/12776 [2:08:59<20:19,  1.60s/it]                                                        94%|█████████▍| 12012/12776 [2:08:59<20:19,  1.60s/it] 94%|█████████▍| 12013/12776 [2:08:59<15:17,  1.20s/it]                                                        94%|█████████▍| 12013/12776 [2:08:59<15:17,  1.20s/it] 94%|█████████▍| 12014/12776 [2:08:59<12:03,  1.05it/s]                                                        94%|█████████▍| 12014/12776 [2:08:59<12:03,  1.05it/s] 94%|█████████▍| 12015/12776 [2:09:00<09:28,  1.34it/s]                                                        94%|█████████▍| 12015/12776 [2:09:00<09:28,  1.34it/s] 94%|█████████▍| 12016/12776 [2:09:00<07:38,  1.66it/s]                                                        94%|█████████▍| 12016/12776 [2:09:00<07:38,  1.66it/s] 94%|█████████▍| 12017/12776 [2:09:00<06:18,  2.00it/s]                                                        94%|█████████▍| 12017/12776 [2:09:00<06:18,  2.00it/s] 94%|█████████▍| 12018/12776 [2:09:00<05:39,  2.23it/s]                                                        94%|█████████▍| 12018/12776 [2:09:00<05:39,  2.23it/s] 94%|█████████▍| 12019/12776 [2:09:01<04:50,  2.60it/s]                                                        94%|█████████▍| 12019/12776 [2:09:01<04:50,  2.60it/s] 94%|█████████▍| 12020/12776 [2:09:01<04:15,  2.96it/s]                                                        94%|█████████▍| 12020/12776 [2:09:01<04:15,  2.96it/s] 94%|█████████▍| 12021/12776 [2:09:01<03:49,  3.29it/s]                                                        94%|█████████▍| 12021/12776 [2:09:01<03:49,  3.29it/s] 94%|█████████▍| 12022/12776 [2:09:01<03:29,  3.60it/s]                                                        94%|█████████▍| 12022/12776 [2:09:01<03:29,  3.60it/s] 94%|█████████▍| 12023/12776 [2:09:02<03:35,  3.49it/s]                                                        94%|█████████▍| 12023/12776 [2:09:02<03:35,  3.49it/s] 94%|█████████▍| 12024/12776 [2:09:02<03:16,  3.83it/s]                                                        94%|█████████▍| 12024/12776 [2:09:02<03:16,  3.83it/s] 94%|█████████▍| 12025/12776 [2:09:02<03:01,  4.13it/s]                                                        94%|█████████▍| 12025/12776 [2:09:02<03:01,  4.13it/s] 94%|█████████▍| 12026/12776 [2:09:02<02:50,  4.39it/s]                                                        94%|█████████▍| 12026/12776 [2:09:02<02:50,  4.39it/s] 94%|█████████▍| 12027/12776 [2:09:02<02:42,  4.60it/s]                                                        94%|█████████▍| 12027/12776 [2:09:02<02:42,  4.60it/s] 94%|█████████▍| 12028/12776 [2:09:03<02:54,  4.29it/s]                                                        94%|█████████▍| 12028/12776 [2:09:03<02:54,  4.29it/s] 94%|█████████▍| 12029/12776 [2:09:03<02:43,  4.56it/s]                                                        94%|█████████▍| 12029/12776 [2:09:03<02:43,  4.56it/s] 94%|█████████▍| 12030/12776 [2:09:03<02:35,  4.79it/s]                                                        94%|█████████▍| 12030/12776 [2:09:03<02:35,  4.79it/s] 94%|█████████▍| 12031/12776 [2:09:03<02:29,  4.98it/s]                                                        94%|█████████▍| 12031/12776 [2:09:03<02:29,  4.98it/s] 94%|█████████▍| 12032/12776 [2:09:03<02:24,  5.15it/s]                                                        94%|█████████▍| 12032/12776 [2:09:03<02:24,  5.15it/s] 94%|█████████▍| 12033/12776 [2:09:04<02:19,  5.32it/s]                                                        94%|█████████▍| 12033/12776 [2:09:04<02:19,  5.32it/s] 94%|█████████▍| 12034/12776 [2:09:04<02:30,  4.92it/s]                                                        94%|█████████▍| 12034/12776 [2:09:04<02:30,  4.92it/s] 94%|█████████▍| 12035/12776 [2:09:04<02:23,  5.17it/s]                                                        94%|█████████▍| 12035/12776 [2:09:04<02:23,  5.17it/s] 94%|█████████▍| 12036/12776 [2:09:04<02:17,  5.38it/s]                                                        94%|█████████▍| 12036/12776 [2:09:04<02:17,  5.38it/s] 94%|█████████▍| 12037/12776 [2:09:04<02:12,  5.58it/s]                                                        94%|█████████▍| 12037/12776 [2:09:04<02:12,  5.58it/s] 94%|█████████▍| 12038/12776 [2:09:05<04:19,  2.85it/s]                                                        94%|█████████▍| 12038/12776 [2:09:05<04:19,  2.85it/s] 94%|█████████▍| 12039/12776 [2:09:07<08:22,  1.47it/s]                                                        94%|█████████▍| 12039/12776 [2:09:07<08:22,  1.47it/s] 94%|█████████▍| 12040/12776 [2:09:07<09:17,  1.32it/s]                                                        94%|█████████▍| 12040/12776 [2:09:07<09:17,  1.32it/s] 94%|█████████▍| 12041/12776 [2:09:08<09:30,  1.29it/s]                                                        94%|█████████▍| 12041/12776 [2:09:08<09:30,  1.29it/s] 94%|█████████▍| 12042/12776 [2:09:09<09:21,  1.31it/s]                                                        94%|█████████▍| 12042/12776 [2:09:09<09:21,  1.31it/s] 94%|█████████▍| 12043/12776 [2:09:10<09:14,  1.32it/s]                                                        94%|█████████▍| 12043/12776 [2:09:10<09:14,  1.32it/s] 94%|█████████▍| 12044/12776 [2:09:10<08:50,  1.38it/s]                                                        94%|█████████▍| 12044/12776 [2:09:10<08:50,  1.38it/s] 94%|█████████▍| 12045/12776 [2:09:11<08:50,  1.38it/s]                                                        94%|█████████▍| 12045/12776 [2:09:11<08:50,  1.38it/s] 94%|█████████▍| 12046/12776 [2:09:12<08:21,  1.46it/s]                                                        94%|█████████▍| 12046/12776 [2:09:12<08:21,  1.46it/s] 94%|█████████▍| 12047/12776 [2:09:12<07:54,  1.54it/s]                                                        94%|█████████▍| 12047/12776 [2:09:12<07:54,  1.54it/s] 94%|█████████▍| 12048/12776 [2:09:13<07:31,  1.61it/s]                                                        94%|█████████▍| 12048/12776 [2:09:13<07:31,  1.61it/s] 94%|█████████▍| 12049/12776 [2:09:13<07:17,  1.66it/s]                                                        94%|█████████▍| 12049/12776 [2:09:13<07:17,  1.66it/s] 94%|█████████▍| 12050/12776 [2:09:14<06:55,  1.75it/s]                                                        94%|█████████▍| 12050/12776 [2:09:14<06:55,  1.75it/s] 94%|█████████▍| 12051/12776 [2:09:14<06:51,  1.76it/s]                                                        94%|█████████▍| 12051/12776 [2:09:14<06:51,  1.76it/s] 94%|█████████▍| 12052/12776 [2:09:15<06:26,  1.87it/s]                                                        94%|█████████▍| 12052/12776 [2:09:15<06:26,  1.87it/s] 94%|█████████▍| 12053/12776 [2:09:15<06:16,  1.92it/s]                                                        94%|█████████▍| 12053/12776 [2:09:15<06:16,  1.92it/s] 94%|█████████▍| 12054/12776 [2:09:16<05:52,  2.05it/s]                                                        94%|█████████▍| 12054/12776 [2:09:16<05:52,  2.05it/s] 94%|█████████▍| 12055/12776 [2:09:16<05:32,  2.17it/s]                                                        94%|█████████▍| 12055/12776 [2:09:16<05:32,  2.17it/s] 94%|█████████▍| 12056/12776 [2:09:17<05:35,  2.15it/s]                                                        94%|█████████▍| 12056/12776 [2:09:17<05:35,  2.15it/s] 94%|█████████▍| 12057/12776 [2:09:17<05:12,  2.30it/s]                                                        94%|█████████▍| 12057/12776 [2:09:17<05:12,  2.30it/s] 94%|���████████▍| 12058/12776 [2:09:17<04:54,  2.44it/s]                                                        94%|█████████▍| 12058/12776 [2:09:17<04:54,  2.44it/s] 94%|█████████▍| 12059/12776 [2:09:18<04:58,  2.40it/s]                                                        94%|█████████▍| 12059/12776 [2:09:18<04:58,  2.40it/s] 94%|█████████▍| 12060/12776 [2:09:18<04:37,  2.58it/s]                                                        94%|█████████▍| 12060/12776 [2:09:18<04:37,  2.58it/s] 94%|█████████▍| 12061/12776 [2:09:18<04:19,  2.75it/s]                                                        94%|█████████▍| 12061/12776 [2:09:18<04:19,  2.75it/s] 94%|█████████▍| 12062/12776 [2:09:19<04:15,  2.80it/s]                                                        94%|█████████▍| 12062/12776 [2:09:19<04:15,  2.80it/s] 94%|█████████▍| 12063/12776 [2:09:19<03:58,  2.99it/s]                                                        94%|█████████▍| 12063/12776 [2:09:19<03:58,  2.99it/s] 94%|█████████▍| 12064/12776 [2:09:19<03:46,  3.15it/s]                                                        94%|█████████▍| 12064/12776 [2:09:19<03:46,  3.15it/s] 94%|█████████▍| 12065/12776 [2:09:20<03:36,  3.28it/s]                                                        94%|█████████▍| 12065/12776 [2:09:20<03:36,  3.28it/s] 94%|█████████▍| 12066/12776 [2:09:20<03:35,  3.29it/s]                                                        94%|█████████▍| 12066/12776 [2:09:20<03:35,  3.29it/s] 94%|█████████▍| 12067/12776 [2:09:20<03:26,  3.44it/s]                                                        94%|█████████▍| 12067/12776 [2:09:20<03:26,  3.44it/s] 94%|█████████▍| 12068/12776 [2:09:20<03:17,  3.59it/s]                                                        94%|█████████▍| 12068/12776 [2:09:20<03:17,  3.59it/s] 94%|█████████▍| 12069/12776 [2:09:21<03:11,  3.70it/s]                                                        94%|█████████▍| 12069/12776 [2:09:21<03:11,  3.70it/s] 94%|█████████▍| 12070/12776 [2:09:21<03:03,  3.84it/s]                                                        94%|█████████▍| 12070/12776 [2:09:21<03:03,  3.84it/s] 94%|█████████▍| 12071/12776 [2:09:21<03:01,  3.89it/s]                                                        94%|█████████▍| 12071/12776 [2:09:21<03:01,  3.89it/s] 94%|█████████▍| 12072/12776 [2:09:21<02:53,  4.05it/s]                                                        94%|█████████▍| 12072/12776 [2:09:21<02:53,  4.05it/s] 94%|█████████▍| 12073/12776 [2:09:22<02:48,  4.18it/s]                                                        94%|█████████▍| 12073/12776 [2:09:22<02:48,  4.18it/s] 95%|█████████▍| 12074/12776 [2:09:22<02:41,  4.34it/s]                                                        95%|█████████▍| 12074/12776 [2:09:22<02:41,  4.34it/s] 95%|█████████▍| 12075/12776 [2:09:22<02:36,  4.48it/s]                                                        95%|█████████▍| 12075/12776 [2:09:22<02:36,  4.48it/s] 95%|█████████▍| 12076/12776 [2:09:22<02:41,  4.34it/s]                                                        95%|█████████▍| 12076/12776 [2:09:22<02:41,  4.34it/s] 95%|█████████▍| 12077/12776 [2:09:23<02:33,  4.55it/s]                                                       {'eval_loss': 0.9464879631996155, 'eval_wer': 0.8378987185753448, 'eval_runtime': 205.315, 'eval_samples_per_second': 60.312, 'eval_steps_per_second': 3.77, 'epoch': 1.88}
+{'loss': 1.4622, 'grad_norm': 4.092573165893555, 'learning_rate': 1.9354838709677417e-05, 'epoch': 1.88}
+{'loss': 1.3294, 'grad_norm': 2.7687883377075195, 'learning_rate': 1.9330400782013682e-05, 'epoch': 1.88}
+{'loss': 1.0254, 'grad_norm': 1.519518494606018, 'learning_rate': 1.930596285434995e-05, 'epoch': 1.88}
+{'loss': 0.9684, 'grad_norm': 1.2389482259750366, 'learning_rate': 1.9281524926686215e-05, 'epoch': 1.88}
+{'loss': 1.044, 'grad_norm': 11.871675491333008, 'learning_rate': 1.925708699902248e-05, 'epoch': 1.88}
+{'loss': 0.875, 'grad_norm': 2.5590877532958984, 'learning_rate': 1.9232649071358748e-05, 'epoch': 1.88}
+{'loss': 0.9461, 'grad_norm': 2.1708011627197266, 'learning_rate': 1.9208211143695013e-05, 'epoch': 1.88}
+{'loss': 0.9795, 'grad_norm': 5.7094502449035645, 'learning_rate': 1.9183773216031278e-05, 'epoch': 1.88}
+{'loss': 1.1206, 'grad_norm': 2.582882881164551, 'learning_rate': 1.9159335288367546e-05, 'epoch': 1.88}
+{'loss': 1.0056, 'grad_norm': 2.906423330307007, 'learning_rate': 1.913489736070381e-05, 'epoch': 1.88}
+{'loss': 0.9007, 'grad_norm': 4.6472320556640625, 'learning_rate': 1.9110459433040076e-05, 'epoch': 1.88}
+{'loss': 0.851, 'grad_norm': 3.4553442001342773, 'learning_rate': 1.9086021505376344e-05, 'epoch': 1.88}
+{'loss': 1.0446, 'grad_norm': 3.4687905311584473, 'learning_rate': 1.906158357771261e-05, 'epoch': 1.88}
+{'loss': 0.9464, 'grad_norm': 1.8787938356399536, 'learning_rate': 1.9037145650048873e-05, 'epoch': 1.88}
+{'loss': 0.8582, 'grad_norm': 2.372514009475708, 'learning_rate': 1.901270772238514e-05, 'epoch': 1.88}
+{'loss': 0.7178, 'grad_norm': 3.6794726848602295, 'learning_rate': 1.8988269794721406e-05, 'epoch': 1.88}
+{'loss': 0.8946, 'grad_norm': 1.5233488082885742, 'learning_rate': 1.896383186705767e-05, 'epoch': 1.88}
+{'loss': 0.7768, 'grad_norm': 2.525583505630493, 'learning_rate': 1.893939393939394e-05, 'epoch': 1.88}
+{'loss': 1.2274, 'grad_norm': 6.541940689086914, 'learning_rate': 1.8914956011730204e-05, 'epoch': 1.88}
+{'loss': 0.8303, 'grad_norm': 8.669783592224121, 'learning_rate': 1.889051808406647e-05, 'epoch': 1.88}
+{'loss': 1.1194, 'grad_norm': 5.07046365737915, 'learning_rate': 1.8866080156402737e-05, 'epoch': 1.88}
+{'loss': 1.1634, 'grad_norm': 2.6451454162597656, 'learning_rate': 1.8841642228739002e-05, 'epoch': 1.88}
+{'loss': 1.2416, 'grad_norm': 11.504127502441406, 'learning_rate': 1.8817204301075267e-05, 'epoch': 1.88}
+{'loss': 0.732, 'grad_norm': 4.630967617034912, 'learning_rate': 1.8792766373411535e-05, 'epoch': 1.88}
+{'loss': 1.012, 'grad_norm': 5.2356648445129395, 'learning_rate': 1.87683284457478e-05, 'epoch': 1.88}
+{'loss': 0.7886, 'grad_norm': 6.520271301269531, 'learning_rate': 1.8743890518084065e-05, 'epoch': 1.88}
+{'loss': 1.1509, 'grad_norm': 8.99795913696289, 'learning_rate': 1.871945259042033e-05, 'epoch': 1.88}
+{'loss': 0.8647, 'grad_norm': 2.0889909267425537, 'learning_rate': 1.8695014662756598e-05, 'epoch': 1.88}
+{'loss': 1.0876, 'grad_norm': 9.168989181518555, 'learning_rate': 1.8670576735092862e-05, 'epoch': 1.88}
+{'loss': 1.0499, 'grad_norm': 3.345892906188965, 'learning_rate': 1.8646138807429127e-05, 'epoch': 1.88}
+{'loss': 1.5739, 'grad_norm': 3.142423391342163, 'learning_rate': 1.8621700879765395e-05, 'epoch': 1.88}
+{'loss': 1.2463, 'grad_norm': 6.219061851501465, 'learning_rate': 1.859726295210166e-05, 'epoch': 1.88}
+{'loss': 0.8387, 'grad_norm': 3.891333818435669, 'learning_rate': 1.8572825024437925e-05, 'epoch': 1.88}
+{'loss': 0.5832, 'grad_norm': 2.0812160968780518, 'learning_rate': 1.8548387096774193e-05, 'epoch': 1.88}
+{'loss': 0.3952, 'grad_norm': 2.1762547492980957, 'learning_rate': 1.8523949169110458e-05, 'epoch': 1.88}
+{'loss': 0.6069, 'grad_norm': 1.8617538213729858, 'learning_rate': 1.8499511241446723e-05, 'epoch': 1.88}
+{'loss': 0.8268, 'grad_norm': 2.6567537784576416, 'learning_rate': 1.847507331378299e-05, 'epoch': 1.88}
+{'loss': 0.7337, 'grad_norm': 3.485652208328247, 'learning_rate': 1.8450635386119256e-05, 'epoch': 1.88}
+{'loss': 1.2784, 'grad_norm': 1.9477510452270508, 'learning_rate': 1.842619745845552e-05, 'epoch': 1.88}
+{'loss': 1.221, 'grad_norm': 0.9730574488639832, 'learning_rate': 1.840175953079179e-05, 'epoch': 1.88}
+{'loss': 1.1966, 'grad_norm': 1.2898110151290894, 'learning_rate': 1.8377321603128054e-05, 'epoch': 1.88}
+{'loss': 1.1641, 'grad_norm': 0.9540336728096008, 'learning_rate': 1.8352883675464318e-05, 'epoch': 1.89}
+{'loss': 1.1545, 'grad_norm': 4.748780727386475, 'learning_rate': 1.8328445747800586e-05, 'epoch': 1.89}
+{'loss': 1.2024, 'grad_norm': 1.0382719039916992, 'learning_rate': 1.830400782013685e-05, 'epoch': 1.89}
+{'loss': 1.3272, 'grad_norm': 3.70912766456604, 'learning_rate': 1.8279569892473116e-05, 'epoch': 1.89}
+{'loss': 1.1662, 'grad_norm': 2.094374656677246, 'learning_rate': 1.8255131964809384e-05, 'epoch': 1.89}
+{'loss': 1.0841, 'grad_norm': 1.1259913444519043, 'learning_rate': 1.823069403714565e-05, 'epoch': 1.89}
+{'loss': 1.1885, 'grad_norm': 1.507389783859253, 'learning_rate': 1.8206256109481914e-05, 'epoch': 1.89}
+{'loss': 1.1419, 'grad_norm': 1.2091017961502075, 'learning_rate': 1.8181818181818182e-05, 'epoch': 1.89}
+{'loss': 1.1869, 'grad_norm': 2.392782688140869, 'learning_rate': 1.8157380254154447e-05, 'epoch': 1.89}
+{'loss': 1.1169, 'grad_norm': 3.752117156982422, 'learning_rate': 1.813294232649071e-05, 'epoch': 1.89}
+{'loss': 1.2533, 'grad_norm': 3.1273210048675537, 'learning_rate': 1.810850439882698e-05, 'epoch': 1.89}
+{'loss': 1.0157, 'grad_norm': 1.2055354118347168, 'learning_rate': 1.8084066471163245e-05, 'epoch': 1.89}
+{'loss': 1.0156, 'grad_norm': 4.132384300231934, 'learning_rate': 1.805962854349951e-05, 'epoch': 1.89}
+{'loss': 1.3168, 'grad_norm': 1.6139036417007446, 'learning_rate': 1.8035190615835778e-05, 'epoch': 1.89}
+{'loss': 1.38, 'grad_norm': 1.806853175163269, 'learning_rate': 1.801075268817204e-05, 'epoch': 1.89}
+{'loss': 1.1021, 'grad_norm': 1.3997470140457153, 'learning_rate': 1.7986314760508307e-05, 'epoch': 1.89}
+{'loss': 1.0186, 'grad_norm': 2.818284511566162, 'learning_rate': 1.7961876832844575e-05, 'epoch': 1.89}
+{'loss': 1.4395, 'grad_norm': 10.638510704040527, 'learning_rate': 1.7937438905180837e-05, 'epoch': 1.89}
+{'loss': 1.2109, 'grad_norm': 4.576744079589844, 'learning_rate': 1.7913000977517105e-05, 'epoch': 1.89}
+{'loss': 1.1902, 'grad_norm': 9.343724250793457, 'learning_rate': 1.7888563049853373e-05, 'epoch': 1.89}
+{'loss': 1.0696, 'grad_norm': 1.4243091344833374, 'learning_rate': 1.7864125122189635e-05, 'epoch': 1.89}
+{'loss': 1.3022, 'grad_norm': 4.524175643920898, 'learning_rate': 1.7839687194525903e-05, 'epoch': 1.89}
+{'loss': 1.1638, 'grad_norm': 2.529660224914551, 'learning_rate': 1.781524926686217e-05, 'epoch': 1.89}
+{'loss': 1.2616, 'grad_norm': 2.6119816303253174, 'learning_rate': 1.7790811339198432e-05, 'epoch': 1.89}
+{'loss': 0.9599, 'grad_norm': 4.102001667022705, 'learning_rate': 1.77663734115347e-05, 'epoch': 1.89}
+{'loss': 1.0821, 'grad_norm': 6.444525241851807, 'learning_rate': 1.7741935483870965e-05, 'epoch': 1.89}
+{'loss': 1.2128, 'grad_norm': 3.062340021133423, 'learning_rate': 1.771749755620723e-05, 'epoch': 1.89}
+{'loss': 1.2174, 'grad_norm': 2.605454444885254, 'learning_rate': 1.76930596285435e-05, 'epoch': 1.89}
+{'loss': 1.1536, 'grad_norm': 4.152836799621582, 'learning_rate': 1.7668621700879763e-05, 'epoch': 1.89}
+{'loss': 1.4407, 'grad_norm': 5.219440460205078, 'learning_rate': 1.7644183773216028e-05, 'epoch': 1.89}
+{'loss': 1.3388, 'grad_norm': 3.174717426300049, 'learning_rate': 1.7619745845552296e-05, 'epoch': 1.89}
+{'loss': 1.0997, 'grad_norm': 3.0966529846191406, 'learning_rate': 1.759530791788856e-05, 'epoch': 1.89}
+{'loss': 1.1042, 'grad_norm': 7.947587013244629, 'learning_rate': 1.7570869990224826e-05, 'epoch': 1.89}
+{'loss': 1.5781, 'grad_norm': 5.379951477050781, 'learning_rate': 1.7546432062561094e-05, 'epoch': 1.89}
+{'loss': 1.295, 'grad_norm': 3.88952898979187, 'learning_rate': 1.752199413489736e-05, 'epoch': 1.89}
+ 95%|█████████▍| 12077/12776 [2:09:23<02:33,  4.55it/s] 95%|█████████▍| 12078/12776 [2:09:23<02:27,  4.73it/s]                                                        95%|█████████▍| 12078/12776 [2:09:23<02:27,  4.73it/s] 95%|█████████▍| 12079/12776 [2:09:23<02:22,  4.90it/s]                                                        95%|█████████▍| 12079/12776 [2:09:23<02:22,  4.90it/s] 95%|█████████▍| 12080/12776 [2:09:23<02:18,  5.04it/s]                                                        95%|█████████▍| 12080/12776 [2:09:23<02:18,  5.04it/s] 95%|█████████▍| 12081/12776 [2:09:23<02:27,  4.70it/s]                                                        95%|█████████▍| 12081/12776 [2:09:23<02:27,  4.70it/s] 95%|█████████▍| 12082/12776 [2:09:24<02:20,  4.94it/s]                                                        95%|█████████▍| 12082/12776 [2:09:24<02:20,  4.94it/s] 95%|█████████▍| 12083/12776 [2:09:24<02:15,  5.12it/s]                                                        95%|█████████▍| 12083/12776 [2:09:24<02:15,  5.12it/s] 95%|█████████▍| 12084/12776 [2:09:24<02:10,  5.30it/s]                                                        95%|█████████▍| 12084/12776 [2:09:24<02:10,  5.30it/s] 95%|█████████▍| 12085/12776 [2:09:24<02:06,  5.45it/s]                                                        95%|█████████▍| 12085/12776 [2:09:24<02:06,  5.45it/s] 95%|█████████▍| 12086/12776 [2:09:24<02:02,  5.62it/s]                                                        95%|█████████▍| 12086/12776 [2:09:24<02:02,  5.62it/s] 95%|█████████▍| 12087/12776 [2:09:24<02:18,  4.99it/s]                                                        95%|█████████▍| 12087/12776 [2:09:24<02:18,  4.99it/s] 95%|█████████▍| 12088/12776 [2:09:25<04:23,  2.61it/s]                                                        95%|█████████▍| 12088/12776 [2:09:25<04:23,  2.61it/s] 95%|█████████▍| 12089/12776 [2:09:27<08:33,  1.34it/s]                                                        95%|█████████▍| 12089/12776 [2:09:27<08:33,  1.34it/s] 95%|█████████▍| 12090/12776 [2:09:28<09:28,  1.21it/s]                                                        95%|█████████▍| 12090/12776 [2:09:28<09:28,  1.21it/s] 95%|█████████▍| 12091/12776 [2:09:29<09:36,  1.19it/s]                                                        95%|█████████▍| 12091/12776 [2:09:29<09:36,  1.19it/s] 95%|█████████▍| 12092/12776 [2:09:30<09:27,  1.21it/s]                                                        95%|█████████▍| 12092/12776 [2:09:30<09:27,  1.21it/s] 95%|█████████▍| 12093/12776 [2:09:30<09:06,  1.25it/s]                                                        95%|█████████▍| 12093/12776 [2:09:30<09:06,  1.25it/s] 95%|█████████▍| 12094/12776 [2:09:31<08:44,  1.30it/s]                                                        95%|█████████▍| 12094/12776 [2:09:31<08:44,  1.30it/s] 95%|█████████▍| 12095/12776 [2:09:32<08:41,  1.30it/s]                                                        95%|█████████▍| 12095/12776 [2:09:32<08:41,  1.30it/s] 95%|█████████▍| 12096/12776 [2:09:33<08:40,  1.31it/s]                                                        95%|█████████▍| 12096/12776 [2:09:33<08:40,  1.31it/s] 95%|█████████▍| 12097/12776 [2:09:33<08:04,  1.40it/s]                                                        95%|█████████▍| 12097/12776 [2:09:33<08:04,  1.40it/s] 95%|█████████▍| 12098/12776 [2:09:34<07:38,  1.48it/s]                                                        95%|█████████▍| 12098/12776 [2:09:34<07:38,  1.48it/s] 95%|█████████▍| 12099/12776 [2:09:34<07:09,  1.58it/s]                                                        95%|█████████▍| 12099/12776 [2:09:34<07:09,  1.58it/s] 95%|█████████▍| 12100/12776 [2:09:35<07:05,  1.59it/s]                                                        95%|█████████▍| 12100/12776 [2:09:35<07:05,  1.59it/s] 95%|█████████▍| 12101/12776 [2:09:35<06:35,  1.71it/s]                                                        95%|█████████▍| 12101/12776 [2:09:35<06:35,  1.71it/s] 95%|█████████▍| 12102/12776 [2:09:36<06:09,  1.82it/s]                                                        95%|█████████▍| 12102/12776 [2:09:36<06:09,  1.82it/s] 95%|█████████▍| 12103/12776 [2:09:36<05:56,  1.89it/s]                                                        95%|█████████▍| 12103/12776 [2:09:36<05:56,  1.89it/s] 95%|█████████▍| 12104/12776 [2:09:37<05:37,  1.99it/s]                                                        95%|█████████▍| 12104/12776 [2:09:37<05:37,  1.99it/s] 95%|█████████▍| 12105/12776 [2:09:37<05:28,  2.04it/s]                                                        95%|█████████▍| 12105/12776 [2:09:37<05:28,  2.04it/s] 95%|█████████▍| 12106/12776 [2:09:38<05:11,  2.15it/s]                                                        95%|█████████▍| 12106/12776 [2:09:38<05:11,  2.15it/s] 95%|█████████▍| 12107/12776 [2:09:38<05:00,  2.23it/s]                                                        95%|█████████▍| 12107/12776 [2:09:38<05:00,  2.23it/s] 95%|█████████▍| 12108/12776 [2:09:38<05:06,  2.18it/s]                                                        95%|█████████▍| 12108/12776 [2:09:38<05:06,  2.18it/s] 95%|█████████▍| 12109/12776 [2:09:39<04:49,  2.30it/s]                                                        95%|█████████▍| 12109/12776 [2:09:39<04:49,  2.30it/s] 95%|█████████▍| 12110/12776 [2:09:39<04:34,  2.43it/s]                                                        95%|█████████▍| 12110/12776 [2:09:39<04:34,  2.43it/s] 95%|█████████▍| 12111/12776 [2:09:40<04:32,  2.44it/s]                                                        95%|█████████▍| 12111/12776 [2:09:40<04:32,  2.44it/s] 95%|█████████▍| 12112/12776 [2:09:40<04:17,  2.58it/s]                                                        95%|█████████▍| 12112/12776 [2:09:40<04:17,  2.58it/s] 95%|█████████▍| 12113/12776 [2:09:40<04:05,  2.70it/s]                                                        95%|█████████▍| 12113/12776 [2:09:40<04:05,  2.70it/s] 95%|█████████▍| 12114/12776 [2:09:41<03:58,  2.78it/s]                                                        95%|█████████▍| 12114/12776 [2:09:41<03:58,  2.78it/s] 95%|█████████▍| 12115/12776 [2:09:41<03:46,  2.92it/s]                                                        95%|█████████▍| 12115/12776 [2:09:41<03:46,  2.92it/s] 95%|█████████▍| 12116/12776 [2:09:41<03:36,  3.05it/s]                                                        95%|█���███████▍| 12116/12776 [2:09:41<03:36,  3.05it/s] 95%|█████████▍| 12117/12776 [2:09:41<03:27,  3.18it/s]                                                        95%|█████████▍| 12117/12776 [2:09:41<03:27,  3.18it/s] 95%|█████████▍| 12118/12776 [2:09:42<03:39,  3.00it/s]                                                        95%|█████████▍| 12118/12776 [2:09:42<03:39,  3.00it/s] 95%|█████████▍| 12119/12776 [2:09:42<03:25,  3.19it/s]                                                        95%|█████████▍| 12119/12776 [2:09:42<03:25,  3.19it/s] 95%|█████████▍| 12120/12776 [2:09:42<03:14,  3.38it/s]                                                        95%|█████████▍| 12120/12776 [2:09:42<03:14,  3.38it/s] 95%|█████████▍| 12121/12776 [2:09:43<03:05,  3.54it/s]                                                        95%|█████████▍| 12121/12776 [2:09:43<03:05,  3.54it/s] 95%|█████████▍| 12122/12776 [2:09:43<03:20,  3.27it/s]                                                        95%|█████████▍| 12122/12776 [2:09:43<03:20,  3.27it/s] 95%|█████████▍| 12123/12776 [2:09:43<03:06,  3.51it/s]                                                        95%|█████████▍| 12123/12776 [2:09:43<03:06,  3.51it/s] 95%|█████████▍| 12124/12776 [2:09:43<02:55,  3.71it/s]                                                        95%|█████████▍| 12124/12776 [2:09:43<02:55,  3.71it/s] 95%|█████████▍| 12125/12776 [2:09:44<02:46,  3.91it/s]                                                        95%|█████████▍| 12125/12776 [2:09:44<02:46,  3.91it/s] 95%|█████████▍| 12126/12776 [2:09:44<02:53,  3.75it/s]                                                        95%|█████████▍| 12126/12776 [2:09:44<02:53,  3.75it/s] 95%|█████████▍| 12127/12776 [2:09:44<02:41,  4.02it/s]                                                        95%|█████████▍| 12127/12776 [2:09:44<02:41,  4.02it/s] 95%|█████████▍| 12128/12776 [2:09:44<02:34,  4.21it/s]                                                        95%|█████████▍| 12128/12776 [2:09:44<02:34,  4.21it/s] 95%|█████████▍| 12129/12776 [2:09:45<02:27,  4.39it/s]                                                        95%|█████████▍| 12129/12776 [2:09:45<02:27,  4.39it/s] 95%|█████████▍| 12130/12776 [2:09:45<02:22,  4.54it/s]                                                        95%|█████████▍| 12130/12776 [2:09:45<02:22,  4.54it/s] 95%|█████████▍| 12131/12776 [2:09:45<02:36,  4.13it/s]                                                        95%|█████████▍| 12131/12776 [2:09:45<02:36,  4.13it/s] 95%|█████████▍| 12132/12776 [2:09:45<02:26,  4.38it/s]                                                        95%|█████████▍| 12132/12776 [2:09:45<02:26,  4.38it/s] 95%|█████████▍| 12133/12776 [2:09:45<02:20,  4.58it/s]                                                        95%|█████████▍| 12133/12776 [2:09:45<02:20,  4.58it/s] 95%|█████████▍| 12134/12776 [2:09:46<02:15,  4.75it/s]                                                        95%|█████████▍| 12134/12776 [2:09:46<02:15,  4.75it/s] 95%|█████████▍| 12135/12776 [2:09:46<02:11,  4.88it/s]                                                        95%|█████████▍| 12135/12776 [2:09:46<02:11,  4.88it/s] 95%|█████████▍| 12136/12776 [2:09:46<02:07,  5.01it/s]                                                        95%|█████████▍| 12136/12776 [2:09:46<02:07,  5.01it/s] 95%|█████████▍| 12137/12776 [2:09:46<02:24,  4.41it/s]                                                        95%|█████████▍| 12137/12776 [2:09:46<02:24,  4.41it/s] 95%|█████████▌| 12138/12776 [2:09:47<03:46,  2.82it/s]                                                        95%|█████████▌| 12138/12776 [2:09:47<03:46,  2.82it/s] 95%|█████████▌| 12139/12776 [2:09:49<07:39,  1.39it/s]                                                        95%|█████████▌| 12139/12776 [2:09:49<07:39,  1.39it/s] 95%|█████████▌| 12140/12776 [2:09:50<08:40,  1.22it/s]                                                        95%|█████████▌| 12140/12776 [2:09:50<08:40,  1.22it/s] 95%|█████████▌| 12141/12776 [2:09:50<08:44,  1.21it/s]                                                        95%|█████████▌| 12141/12776 [2:09:50<08:44,  1.21it/s] 95%|█████████▌| 12142/12776 [2:09:51<08:34,  1.23it/s]                                                        95%|█████████▌| 12142/12776 [2:09:51<08:34,  1.23it/s] 95%|█████████▌| 12143/12776 [2:09:52<08:17,  1.27it/s]                                                        95%|█████████▌| 12143/12776 [2:09:52<08:17,  1.27it/s] 95%|█████████▌| 12144/12776 [2:09:53<08:01,  1.31it/s]                                                        95%|█████████▌| 12144/12776 [2:09:53<08:01,  1.31it/s] 95%|█████████▌| 12145/12776 [2:09:53<08:02,  1.31it/s]                                                        95%|█████████▌| 12145/12776 [2:09:53<08:02,  1.31it/s] 95%|█████████▌| 12146/12776 [2:09:54<07:57,  1.32it/s]                                                        95%|█████████▌| 12146/12776 [2:09:54<07:57,  1.32it/s] 95%|█████████▌| 12147/12776 [2:09:55<07:25,  1.41it/s]                                                        95%|█████████▌| 12147/12776 [2:09:55<07:25,  1.41it/s] 95%|█████████▌| 12148/12776 [2:09:55<07:00,  1.49it/s]                                                        95%|█████████▌| 12148/12776 [2:09:55<07:00,  1.49it/s] 95%|█████████▌| 12149/12776 [2:09:56<06:37,  1.58it/s]                                                        95%|█████████▌| 12149/12776 [2:09:56<06:37,  1.58it/s] 95%|█████████▌| 12150/12776 [2:09:57<06:34,  1.59it/s]                                                        95%|█████████▌| 12150/12776 [2:09:57<06:34,  1.59it/s] 95%|█████████▌| 12151/12776 [2:09:57<06:08,  1.70it/s]                                                        95%|█████████▌| 12151/12776 [2:09:57<06:08,  1.70it/s] 95%|█████████▌| 12152/12776 [2:09:57<05:43,  1.82it/s]                                                        95%|█████████▌| 12152/12776 [2:09:57<05:43,  1.82it/s] 95%|█████████▌| 12153/12776 [2:09:58<05:24,  1.92it/s]                                                        95%|█████████▌| 12153/12776 [2:09:58<05:24,  1.92it/s] 95%|█████████▌| 12154/12776 [2:09:58<05:07,  2.02it/s]                                                        95%|█████████▌| 12154/12776 [2:09:58<05:07,  2.02it/s] 95%|█████████▌| 12155/12776 [2:09:59<04:56,  2.10it/s]                                                       {'loss': 1.4006, 'grad_norm': 3.303056001663208, 'learning_rate': 1.7497556207233624e-05, 'epoch': 1.89}
+{'loss': 1.1342, 'grad_norm': 3.121760845184326, 'learning_rate': 1.7473118279569892e-05, 'epoch': 1.89}
+{'loss': 1.2744, 'grad_norm': 6.825216293334961, 'learning_rate': 1.7448680351906157e-05, 'epoch': 1.89}
+{'loss': 1.141, 'grad_norm': 3.57566237449646, 'learning_rate': 1.742424242424242e-05, 'epoch': 1.89}
+{'loss': 1.4476, 'grad_norm': 6.941944122314453, 'learning_rate': 1.739980449657869e-05, 'epoch': 1.89}
+{'loss': 1.4218, 'grad_norm': 3.8195106983184814, 'learning_rate': 1.7375366568914954e-05, 'epoch': 1.89}
+{'loss': 0.7745, 'grad_norm': 2.770172119140625, 'learning_rate': 1.735092864125122e-05, 'epoch': 1.89}
+{'loss': 0.8573, 'grad_norm': 2.7398681640625, 'learning_rate': 1.7326490713587487e-05, 'epoch': 1.89}
+{'loss': 0.7147, 'grad_norm': 5.767883777618408, 'learning_rate': 1.7302052785923752e-05, 'epoch': 1.89}
+{'loss': 0.7094, 'grad_norm': 2.8740317821502686, 'learning_rate': 1.7277614858260017e-05, 'epoch': 1.89}
+{'loss': 1.3801, 'grad_norm': 3.8529751300811768, 'learning_rate': 1.7253176930596285e-05, 'epoch': 1.89}
+{'loss': 0.7619, 'grad_norm': 1.6639562845230103, 'learning_rate': 1.722873900293255e-05, 'epoch': 1.89}
+{'loss': 1.4944, 'grad_norm': 1.0102930068969727, 'learning_rate': 1.7204301075268815e-05, 'epoch': 1.89}
+{'loss': 1.4635, 'grad_norm': 1.1898506879806519, 'learning_rate': 1.7179863147605083e-05, 'epoch': 1.89}
+{'loss': 1.491, 'grad_norm': 1.3134690523147583, 'learning_rate': 1.7155425219941348e-05, 'epoch': 1.89}
+{'loss': 1.4765, 'grad_norm': 1.2267783880233765, 'learning_rate': 1.7130987292277613e-05, 'epoch': 1.89}
+{'loss': 1.3881, 'grad_norm': 1.480941653251648, 'learning_rate': 1.710654936461388e-05, 'epoch': 1.89}
+{'loss': 1.3982, 'grad_norm': 1.0729501247406006, 'learning_rate': 1.7082111436950146e-05, 'epoch': 1.89}
+{'loss': 1.5171, 'grad_norm': 1.248131275177002, 'learning_rate': 1.705767350928641e-05, 'epoch': 1.89}
+{'loss': 1.4509, 'grad_norm': 2.5417959690093994, 'learning_rate': 1.703323558162268e-05, 'epoch': 1.89}
+{'loss': 1.3934, 'grad_norm': 1.5075997114181519, 'learning_rate': 1.7008797653958943e-05, 'epoch': 1.89}
+{'loss': 1.4268, 'grad_norm': 5.500137805938721, 'learning_rate': 1.6984359726295208e-05, 'epoch': 1.89}
+{'loss': 1.368, 'grad_norm': 1.5557372570037842, 'learning_rate': 1.6959921798631476e-05, 'epoch': 1.89}
+{'loss': 1.3948, 'grad_norm': 2.184826374053955, 'learning_rate': 1.693548387096774e-05, 'epoch': 1.89}
+{'loss': 1.3585, 'grad_norm': 1.7609930038452148, 'learning_rate': 1.6911045943304006e-05, 'epoch': 1.89}
+{'loss': 1.3926, 'grad_norm': 2.9163129329681396, 'learning_rate': 1.6886608015640274e-05, 'epoch': 1.89}
+{'loss': 1.2834, 'grad_norm': 1.8992156982421875, 'learning_rate': 1.686217008797654e-05, 'epoch': 1.89}
+{'loss': 1.4879, 'grad_norm': 2.0363168716430664, 'learning_rate': 1.6837732160312804e-05, 'epoch': 1.89}
+{'loss': 1.119, 'grad_norm': 1.4370715618133545, 'learning_rate': 1.6813294232649072e-05, 'epoch': 1.89}
+{'loss': 1.2856, 'grad_norm': 1.8881416320800781, 'learning_rate': 1.6788856304985337e-05, 'epoch': 1.9}
+{'loss': 1.3787, 'grad_norm': 1.4255024194717407, 'learning_rate': 1.67644183773216e-05, 'epoch': 1.9}
+{'loss': 1.2557, 'grad_norm': 2.249227285385132, 'learning_rate': 1.6739980449657866e-05, 'epoch': 1.9}
+{'loss': 1.3487, 'grad_norm': 4.423599720001221, 'learning_rate': 1.6715542521994134e-05, 'epoch': 1.9}
+{'loss': 1.192, 'grad_norm': 4.681179523468018, 'learning_rate': 1.66911045943304e-05, 'epoch': 1.9}
+{'loss': 1.1183, 'grad_norm': 1.816692590713501, 'learning_rate': 1.6666666666666664e-05, 'epoch': 1.9}
+{'loss': 1.3788, 'grad_norm': 4.783669948577881, 'learning_rate': 1.6642228739002932e-05, 'epoch': 1.9}
+{'loss': 1.1074, 'grad_norm': 3.048429250717163, 'learning_rate': 1.6617790811339197e-05, 'epoch': 1.9}
+{'loss': 1.274, 'grad_norm': 3.0569400787353516, 'learning_rate': 1.6593352883675462e-05, 'epoch': 1.9}
+{'loss': 1.0185, 'grad_norm': 2.585090398788452, 'learning_rate': 1.656891495601173e-05, 'epoch': 1.9}
+{'loss': 1.3508, 'grad_norm': 2.0414528846740723, 'learning_rate': 1.6544477028347995e-05, 'epoch': 1.9}
+{'loss': 1.2198, 'grad_norm': 2.784245729446411, 'learning_rate': 1.652003910068426e-05, 'epoch': 1.9}
+{'loss': 1.103, 'grad_norm': 2.629258394241333, 'learning_rate': 1.6495601173020528e-05, 'epoch': 1.9}
+{'loss': 1.291, 'grad_norm': 6.373242378234863, 'learning_rate': 1.6471163245356793e-05, 'epoch': 1.9}
+{'loss': 1.0345, 'grad_norm': 3.1650185585021973, 'learning_rate': 1.6446725317693057e-05, 'epoch': 1.9}
+{'loss': 0.8684, 'grad_norm': 2.357555389404297, 'learning_rate': 1.6422287390029326e-05, 'epoch': 1.9}
+{'loss': 1.0331, 'grad_norm': 3.3148272037506104, 'learning_rate': 1.639784946236559e-05, 'epoch': 1.9}
+{'loss': 0.9614, 'grad_norm': 3.1584115028381348, 'learning_rate': 1.6373411534701855e-05, 'epoch': 1.9}
+{'loss': 1.0514, 'grad_norm': 5.584422588348389, 'learning_rate': 1.6348973607038123e-05, 'epoch': 1.9}
+{'loss': 1.5685, 'grad_norm': 3.8823444843292236, 'learning_rate': 1.6324535679374388e-05, 'epoch': 1.9}
+{'loss': 1.2219, 'grad_norm': 3.763765811920166, 'learning_rate': 1.6300097751710653e-05, 'epoch': 1.9}
+{'loss': 1.0447, 'grad_norm': 2.749150037765503, 'learning_rate': 1.6275659824046918e-05, 'epoch': 1.9}
+{'loss': 1.004, 'grad_norm': 5.038095951080322, 'learning_rate': 1.6251221896383186e-05, 'epoch': 1.9}
+{'loss': 1.027, 'grad_norm': 8.337298393249512, 'learning_rate': 1.622678396871945e-05, 'epoch': 1.9}
+{'loss': 1.4483, 'grad_norm': 3.5762979984283447, 'learning_rate': 1.6202346041055716e-05, 'epoch': 1.9}
+{'loss': 1.2446, 'grad_norm': 2.6334099769592285, 'learning_rate': 1.6177908113391984e-05, 'epoch': 1.9}
+{'loss': 1.2401, 'grad_norm': 2.6013121604919434, 'learning_rate': 1.615347018572825e-05, 'epoch': 1.9}
+{'loss': 1.249, 'grad_norm': 3.092857599258423, 'learning_rate': 1.6129032258064513e-05, 'epoch': 1.9}
+{'loss': 0.4669, 'grad_norm': 4.714329719543457, 'learning_rate': 1.610459433040078e-05, 'epoch': 1.9}
+{'loss': 0.4258, 'grad_norm': 1.6379783153533936, 'learning_rate': 1.6080156402737046e-05, 'epoch': 1.9}
+{'loss': 0.6978, 'grad_norm': 3.6598150730133057, 'learning_rate': 1.605571847507331e-05, 'epoch': 1.9}
+{'loss': 1.3925, 'grad_norm': 4.708147048950195, 'learning_rate': 1.603128054740958e-05, 'epoch': 1.9}
+{'loss': 0.5404, 'grad_norm': 1.7821239233016968, 'learning_rate': 1.6006842619745844e-05, 'epoch': 1.9}
+{'loss': 1.1651, 'grad_norm': 1.8768203258514404, 'learning_rate': 1.598240469208211e-05, 'epoch': 1.9}
+{'loss': 1.237, 'grad_norm': 0.9923833012580872, 'learning_rate': 1.5957966764418374e-05, 'epoch': 1.9}
+{'loss': 1.2096, 'grad_norm': 0.9963833689689636, 'learning_rate': 1.5933528836754642e-05, 'epoch': 1.9}
+{'loss': 1.2101, 'grad_norm': 1.4401146173477173, 'learning_rate': 1.5909090909090907e-05, 'epoch': 1.9}
+{'loss': 1.0725, 'grad_norm': 0.8284012079238892, 'learning_rate': 1.588465298142717e-05, 'epoch': 1.9}
+{'loss': 1.2081, 'grad_norm': 1.0937573909759521, 'learning_rate': 1.586021505376344e-05, 'epoch': 1.9}
+{'loss': 1.0808, 'grad_norm': 0.7755000591278076, 'learning_rate': 1.5835777126099705e-05, 'epoch': 1.9}
+{'loss': 1.183, 'grad_norm': 1.1034554243087769, 'learning_rate': 1.581133919843597e-05, 'epoch': 1.9}
+{'loss': 1.0971, 'grad_norm': 1.378918170928955, 'learning_rate': 1.5786901270772238e-05, 'epoch': 1.9}
+{'loss': 1.2029, 'grad_norm': 1.6147481203079224, 'learning_rate': 1.5762463343108502e-05, 'epoch': 1.9}
+{'loss': 1.002, 'grad_norm': 1.2892570495605469, 'learning_rate': 1.5738025415444767e-05, 'epoch': 1.9}
+{'loss': 1.105, 'grad_norm': 1.0253342390060425, 'learning_rate': 1.5713587487781035e-05, 'epoch': 1.9}
+{'loss': 1.1433, 'grad_norm': 1.6563441753387451, 'learning_rate': 1.56891495601173e-05, 'epoch': 1.9}
+{'loss': 0.9589, 'grad_norm': 1.512212872505188, 'learning_rate': 1.5664711632453565e-05, 'epoch': 1.9}
+{'loss': 1.0516, 'grad_norm': 0.9118117690086365, 'learning_rate': 1.5640273704789833e-05, 'epoch': 1.9}
+{'loss': 1.2032, 'grad_norm': 2.5056610107421875, 'learning_rate': 1.5615835777126098e-05, 'epoch': 1.9}
+ 95%|█████████▌| 12155/12776 [2:09:59<04:56,  2.10it/s] 95%|█████████▌| 12156/12776 [2:09:59<04:41,  2.20it/s]                                                        95%|█████████▌| 12156/12776 [2:09:59<04:41,  2.20it/s] 95%|█████████▌| 12157/12776 [2:10:00<04:30,  2.29it/s]                                                        95%|█████████▌| 12157/12776 [2:10:00<04:30,  2.29it/s] 95%|█████████▌| 12158/12776 [2:10:00<04:45,  2.16it/s]                                                        95%|█████████▌| 12158/12776 [2:10:00<04:45,  2.16it/s] 95%|█████████▌| 12159/12776 [2:10:00<04:25,  2.32it/s]                                                        95%|█████████▌| 12159/12776 [2:10:00<04:25,  2.32it/s] 95%|█████████▌| 12160/12776 [2:10:01<04:10,  2.46it/s]                                                        95%|█████████▌| 12160/12776 [2:10:01<04:10,  2.46it/s] 95%|█████████▌| 12161/12776 [2:10:01<04:13,  2.43it/s]                                                        95%|█████████▌| 12161/12776 [2:10:01<04:13,  2.43it/s] 95%|█████████▌| 12162/12776 [2:10:02<03:57,  2.58it/s]                                                        95%|█████████▌| 12162/12776 [2:10:02<03:57,  2.58it/s] 95%|█████████▌| 12163/12776 [2:10:02<03:44,  2.73it/s]                                                        95%|█████████▌| 12163/12776 [2:10:02<03:44,  2.73it/s] 95%|█████████▌| 12164/12776 [2:10:02<03:36,  2.82it/s]                                                        95%|█████████▌| 12164/12776 [2:10:02<03:36,  2.82it/s] 95%|█████████▌| 12165/12776 [2:10:03<03:25,  2.98it/s]                                                        95%|█████████▌| 12165/12776 [2:10:03<03:25,  2.98it/s] 95%|█████████▌| 12166/12776 [2:10:03<03:15,  3.12it/s]                                                        95%|█████████▌| 12166/12776 [2:10:03<03:15,  3.12it/s] 95%|█████████▌| 12167/12776 [2:10:03<03:05,  3.28it/s]                                                        95%|█████████▌| 12167/12776 [2:10:03<03:05,  3.28it/s] 95%|█████████▌| 12168/12776 [2:10:03<03:05,  3.27it/s]                                                        95%|█████████▌| 12168/12776 [2:10:03<03:05,  3.27it/s] 95%|█████████▌| 12169/12776 [2:10:04<02:56,  3.44it/s]                                                        95%|█████████▌| 12169/12776 [2:10:04<02:56,  3.44it/s] 95%|█████████▌| 12170/12776 [2:10:04<02:49,  3.58it/s]                                                        95%|█████████▌| 12170/12776 [2:10:04<02:49,  3.58it/s] 95%|█████████▌| 12171/12776 [2:10:04<02:43,  3.70it/s]                                                        95%|█████████▌| 12171/12776 [2:10:04<02:43,  3.70it/s] 95%|█████████▌| 12172/12776 [2:10:04<02:57,  3.41it/s]                                                        95%|█████████▌| 12172/12776 [2:10:05<02:57,  3.41it/s] 95%|█████████▌| 12173/12776 [2:10:05<02:46,  3.63it/s]                                                        95%|█████████▌| 12173/12776 [2:10:05<02:46,  3.63it/s] 95%|█████████▌| 12174/12776 [2:10:05<02:37,  3.83it/s]                                                        95%|█████████▌| 12174/12776 [2:10:05<02:37,  3.83it/s] 95%|█████████▌| 12175/12776 [2:10:05<02:29,  4.02it/s]                                                        95%|█████████▌| 12175/12776 [2:10:05<02:29,  4.02it/s] 95%|█████████▌| 12176/12776 [2:10:05<02:23,  4.19it/s]                                                        95%|█████████▌| 12176/12776 [2:10:05<02:23,  4.19it/s] 95%|█████████▌| 12177/12776 [2:10:06<02:26,  4.09it/s]                                                        95%|█████████▌| 12177/12776 [2:10:06<02:26,  4.09it/s] 95%|█████████▌| 12178/12776 [2:10:06<02:20,  4.26it/s]                                                        95%|█████████▌| 12178/12776 [2:10:06<02:20,  4.26it/s] 95%|█████████▌| 12179/12776 [2:10:06<02:15,  4.42it/s]                                                        95%|█████████▌| 12179/12776 [2:10:06<02:15,  4.42it/s] 95%|█████████▌| 12180/12776 [2:10:06<02:10,  4.57it/s]                                                        95%|█████████▌| 12180/12776 [2:10:06<02:10,  4.57it/s] 95%|█████████▌| 12181/12776 [2:10:06<02:06,  4.70it/s]                                                        95%|█████████▌| 12181/12776 [2:10:06<02:06,  4.70it/s] 95%|█████████▌| 12182/12776 [2:10:07<02:24,  4.11it/s]                                                        95%|█████████▌| 12182/12776 [2:10:07<02:24,  4.11it/s] 95%|█████████▌| 12183/12776 [2:10:07<02:15,  4.37it/s]                                                        95%|█████████▌| 12183/12776 [2:10:07<02:15,  4.37it/s] 95%|█████████▌| 12184/12776 [2:10:07<02:08,  4.59it/s]                                                        95%|█████████▌| 12184/12776 [2:10:07<02:08,  4.59it/s] 95%|█████████▌| 12185/12776 [2:10:07<02:04,  4.75it/s]                                                        95%|█████████▌| 12185/12776 [2:10:07<02:04,  4.75it/s] 95%|█████████▌| 12186/12776 [2:10:08<02:00,  4.90it/s]                                                        95%|█████████▌| 12186/12776 [2:10:08<02:00,  4.90it/s] 95%|█████████▌| 12187/12776 [2:10:08<01:56,  5.04it/s]                                                        95%|█████████▌| 12187/12776 [2:10:08<01:56,  5.04it/s] 95%|█████████▌| 12188/12776 [2:10:08<03:30,  2.79it/s]                                                        95%|█████████▌| 12188/12776 [2:10:08<03:30,  2.79it/s] 95%|█████████▌| 12189/12776 [2:10:10<07:04,  1.38it/s]                                                        95%|█████████▌| 12189/12776 [2:10:10<07:04,  1.38it/s] 95%|█████████▌| 12190/12776 [2:10:11<07:40,  1.27it/s]                                                        95%|█████████▌| 12190/12776 [2:10:11<07:40,  1.27it/s] 95%|█████████▌| 12191/12776 [2:10:12<07:59,  1.22it/s]                                                        95%|█████████▌| 12191/12776 [2:10:12<07:59,  1.22it/s] 95%|█████████▌| 12192/12776 [2:10:13<07:45,  1.26it/s]                                                        95%|█████████▌| 12192/12776 [2:10:13<07:45,  1.26it/s] 95%|█████████▌| 12193/12776 [2:10:13<07:30,  1.29it/s]                                                        95%|█████████▌| 12193/12776 [2:10:13<07:30,  1.29it/s] 95%|█████████▌| 12194/12776 [2:10:14<07:25,  1.31it/s]                                                        95%|█████████▌| 12194/12776 [2:10:14<07:25,  1.31it/s] 95%|█████████▌| 12195/12776 [2:10:15<07:18,  1.32it/s]                                                        95%|█████████▌| 12195/12776 [2:10:15<07:18,  1.32it/s] 95%|█████████▌| 12196/12776 [2:10:15<06:46,  1.43it/s]                                                        95%|█████████▌| 12196/12776 [2:10:15<06:46,  1.43it/s] 95%|█████████▌| 12197/12776 [2:10:16<06:24,  1.50it/s]                                                        95%|█████████▌| 12197/12776 [2:10:16<06:24,  1.50it/s] 95%|█████████▌| 12198/12776 [2:10:17<06:03,  1.59it/s]                                                        95%|█████████▌| 12198/12776 [2:10:17<06:03,  1.59it/s] 95%|█████████▌| 12199/12776 [2:10:17<05:56,  1.62it/s]                                                        95%|█████████▌| 12199/12776 [2:10:17<05:56,  1.62it/s] 95%|█████████▌| 12200/12776 [2:10:18<05:34,  1.72it/s]                                                        95%|█████████▌| 12200/12776 [2:10:18<05:34,  1.72it/s] 95%|█████████▌| 12201/12776 [2:10:18<05:13,  1.83it/s]                                                        95%|█████████▌| 12201/12776 [2:10:18<05:13,  1.83it/s] 96%|█████████▌| 12202/12776 [2:10:19<04:58,  1.92it/s]                                                        96%|█████████▌| 12202/12776 [2:10:19<04:58,  1.92it/s] 96%|█████████▌| 12203/12776 [2:10:19<04:43,  2.02it/s]                                                        96%|█████████▌| 12203/12776 [2:10:19<04:43,  2.02it/s] 96%|█████████▌| 12204/12776 [2:10:19<04:34,  2.09it/s]                                                        96%|█████████▌| 12204/12776 [2:10:19<04:34,  2.09it/s] 96%|█████████▌| 12205/12776 [2:10:20<04:20,  2.19it/s]                                                        96%|█████████▌| 12205/12776 [2:10:20<04:20,  2.19it/s] 96%|█████████▌| 12206/12776 [2:10:20<04:08,  2.29it/s]                                                        96%|█████████▌| 12206/12776 [2:10:20<04:08,  2.29it/s] 96%|█████████▌| 12207/12776 [2:10:21<04:21,  2.18it/s]                                                        96%|█████████▌| 12207/12776 [2:10:21<04:21,  2.18it/s] 96%|█████████▌| 12208/12776 [2:10:21<04:02,  2.34it/s]                                                        96%|█████████▌| 12208/12776 [2:10:21<04:02,  2.34it/s] 96%|█████████▌| 12209/12776 [2:10:21<03:47,  2.49it/s]                                                        96%|█████████▌| 12209/12776 [2:10:21<03:47,  2.49it/s] 96%|█████████▌| 12210/12776 [2:10:22<03:54,  2.41it/s]                                                        96%|█████████▌| 12210/12776 [2:10:22<03:54,  2.41it/s] 96%|█████████▌| 12211/12776 [2:10:22<03:40,  2.56it/s]                                                        96%|█████████▌| 12211/12776 [2:10:22<03:40,  2.56it/s] 96%|█████████▌| 12212/12776 [2:10:23<03:29,  2.70it/s]                                                        96%|█████████▌| 12212/12776 [2:10:23<03:29,  2.70it/s] 96%|█████████▌| 12213/12776 [2:10:23<03:20,  2.80it/s]                                                        96%|█████████▌| 12213/12776 [2:10:23<03:20,  2.80it/s] 96%|█████████▌| 12214/12776 [2:10:23<03:11,  2.94it/s]                                                        96%|█████████▌| 12214/12776 [2:10:23<03:11,  2.94it/s] 96%|█████████▌| 12215/12776 [2:10:23<03:03,  3.06it/s]                                                        96%|█████████▌| 12215/12776 [2:10:23<03:03,  3.06it/s] 96%|█████████▌| 12216/12776 [2:10:24<02:57,  3.16it/s]                                                        96%|█████████▌| 12216/12776 [2:10:24<02:57,  3.16it/s] 96%|█████████▌| 12217/12776 [2:10:24<03:07,  2.98it/s]                                                        96%|█████████▌| 12217/12776 [2:10:24<03:07,  2.98it/s] 96%|█████████▌| 12218/12776 [2:10:24<02:56,  3.16it/s]                                                        96%|█████████▌| 12218/12776 [2:10:24<02:56,  3.16it/s] 96%|█████████▌| 12219/12776 [2:10:25<02:47,  3.33it/s]                                                        96%|█████████▌| 12219/12776 [2:10:25<02:47,  3.33it/s] 96%|█████████▌| 12220/12776 [2:10:25<02:39,  3.49it/s]                                                        96%|█████████▌| 12220/12776 [2:10:25<02:39,  3.49it/s] 96%|█████████▌| 12221/12776 [2:10:25<02:47,  3.32it/s]                                                        96%|█████████▌| 12221/12776 [2:10:25<02:47,  3.32it/s] 96%|█████████▌| 12222/12776 [2:10:25<02:37,  3.52it/s]                                                        96%|█████████▌| 12222/12776 [2:10:25<02:37,  3.52it/s] 96%|█████████▌| 12223/12776 [2:10:26<02:28,  3.72it/s]                                                        96%|█████████▌| 12223/12776 [2:10:26<02:28,  3.72it/s] 96%|█████████▌| 12224/12776 [2:10:26<02:22,  3.88it/s]                                                        96%|█████████▌| 12224/12776 [2:10:26<02:22,  3.88it/s] 96%|█████████▌| 12225/12776 [2:10:26<02:28,  3.72it/s]                                                        96%|█████████▌| 12225/12776 [2:10:26<02:28,  3.72it/s] 96%|█████████▌| 12226/12776 [2:10:26<02:19,  3.94it/s]                                                        96%|█████████▌| 12226/12776 [2:10:26<02:19,  3.94it/s] 96%|█████████▌| 12227/12776 [2:10:27<02:12,  4.14it/s]                                                        96%|█████████▌| 12227/12776 [2:10:27<02:12,  4.14it/s] 96%|█████████▌| 12228/12776 [2:10:27<02:07,  4.31it/s]                                                        96%|█████████▌| 12228/12776 [2:10:27<02:07,  4.31it/s] 96%|█████████▌| 12229/12776 [2:10:27<02:03,  4.45it/s]                                                        96%|█████████▌| 12229/12776 [2:10:27<02:03,  4.45it/s] 96%|█████████▌| 12230/12776 [2:10:27<02:15,  4.03it/s]                                                        96%|█████████▌| 12230/12776 [2:10:27<02:15,  4.03it/s] 96%|█████████▌| 12231/12776 [2:10:28<02:07,  4.27it/s]                                                        96%|█████████▌| 12231/12776 [2:10:28<02:07,  4.27it/s] 96%|█████████▌| 12232/12776 [2:10:28<02:01,  4.49it/s]                                                        96%|█████████▌| 12232/12776 [2:10:28<02:01,  4.49it/s] 96%|█████████▌| 12233/12776 [2:10:28<01:56,  4.65it/s]                                                       {'loss': 1.0465, 'grad_norm': 3.2829456329345703, 'learning_rate': 1.5591397849462363e-05, 'epoch': 1.9}
+{'loss': 1.1008, 'grad_norm': 4.280078411102295, 'learning_rate': 1.556695992179863e-05, 'epoch': 1.9}
+{'loss': 1.0278, 'grad_norm': 1.5060573816299438, 'learning_rate': 1.5542521994134896e-05, 'epoch': 1.9}
+{'loss': 1.1257, 'grad_norm': 2.12257981300354, 'learning_rate': 1.551808406647116e-05, 'epoch': 1.9}
+{'loss': 1.2765, 'grad_norm': 2.310483694076538, 'learning_rate': 1.549364613880743e-05, 'epoch': 1.9}
+{'loss': 1.1797, 'grad_norm': 2.682175636291504, 'learning_rate': 1.5469208211143694e-05, 'epoch': 1.9}
+{'loss': 1.0389, 'grad_norm': 3.6471972465515137, 'learning_rate': 1.544477028347996e-05, 'epoch': 1.9}
+{'loss': 1.0323, 'grad_norm': 2.102436065673828, 'learning_rate': 1.5420332355816226e-05, 'epoch': 1.9}
+{'loss': 1.1483, 'grad_norm': 4.041779041290283, 'learning_rate': 1.539589442815249e-05, 'epoch': 1.9}
+{'loss': 1.1631, 'grad_norm': 3.722378730773926, 'learning_rate': 1.5371456500488756e-05, 'epoch': 1.9}
+{'loss': 1.3785, 'grad_norm': 4.074342727661133, 'learning_rate': 1.5347018572825024e-05, 'epoch': 1.9}
+{'loss': 0.945, 'grad_norm': 6.567094326019287, 'learning_rate': 1.532258064516129e-05, 'epoch': 1.9}
+{'loss': 1.2596, 'grad_norm': 7.025415897369385, 'learning_rate': 1.5298142717497554e-05, 'epoch': 1.9}
+{'loss': 1.2277, 'grad_norm': 3.313201904296875, 'learning_rate': 1.5273704789833822e-05, 'epoch': 1.9}
+{'loss': 1.2767, 'grad_norm': 4.0791120529174805, 'learning_rate': 1.5249266862170087e-05, 'epoch': 1.9}
+{'loss': 1.1568, 'grad_norm': 4.89728307723999, 'learning_rate': 1.5224828934506352e-05, 'epoch': 1.91}
+{'loss': 1.1248, 'grad_norm': 3.8333678245544434, 'learning_rate': 1.5200391006842618e-05, 'epoch': 1.91}
+{'loss': 1.0531, 'grad_norm': 4.54195499420166, 'learning_rate': 1.5175953079178883e-05, 'epoch': 1.91}
+{'loss': 1.124, 'grad_norm': 4.03914737701416, 'learning_rate': 1.515151515151515e-05, 'epoch': 1.91}
+{'loss': 1.2547, 'grad_norm': 7.725468635559082, 'learning_rate': 1.5127077223851416e-05, 'epoch': 1.91}
+{'loss': 1.0578, 'grad_norm': 4.000185966491699, 'learning_rate': 1.510263929618768e-05, 'epoch': 1.91}
+{'loss': 1.0488, 'grad_norm': 3.3531341552734375, 'learning_rate': 1.5078201368523947e-05, 'epoch': 1.91}
+{'loss': 1.0028, 'grad_norm': 1.9735767841339111, 'learning_rate': 1.5053763440860214e-05, 'epoch': 1.91}
+{'loss': 1.0639, 'grad_norm': 6.0913262367248535, 'learning_rate': 1.5029325513196479e-05, 'epoch': 1.91}
+{'loss': 0.9197, 'grad_norm': 1.993403434753418, 'learning_rate': 1.5004887585532745e-05, 'epoch': 1.91}
+{'loss': 1.3543, 'grad_norm': 3.3327815532684326, 'learning_rate': 1.4980449657869012e-05, 'epoch': 1.91}
+{'loss': 0.9099, 'grad_norm': 3.145503044128418, 'learning_rate': 1.4956011730205276e-05, 'epoch': 1.91}
+{'loss': 0.6648, 'grad_norm': 2.0428433418273926, 'learning_rate': 1.4931573802541543e-05, 'epoch': 1.91}
+{'loss': 0.533, 'grad_norm': 1.2480318546295166, 'learning_rate': 1.490713587487781e-05, 'epoch': 1.91}
+{'loss': 1.3884, 'grad_norm': 3.0121002197265625, 'learning_rate': 1.4882697947214074e-05, 'epoch': 1.91}
+{'loss': 0.562, 'grad_norm': 1.4688293933868408, 'learning_rate': 1.485826001955034e-05, 'epoch': 1.91}
+{'loss': 0.3936, 'grad_norm': 1.736817479133606, 'learning_rate': 1.4833822091886607e-05, 'epoch': 1.91}
+{'loss': 0.5264, 'grad_norm': 2.580030918121338, 'learning_rate': 1.4809384164222872e-05, 'epoch': 1.91}
+{'loss': 1.3178, 'grad_norm': 5.177095413208008, 'learning_rate': 1.4784946236559138e-05, 'epoch': 1.91}
+{'loss': 1.284, 'grad_norm': 0.8959721326828003, 'learning_rate': 1.4760508308895405e-05, 'epoch': 1.91}
+{'loss': 1.3241, 'grad_norm': 1.049804449081421, 'learning_rate': 1.473607038123167e-05, 'epoch': 1.91}
+{'loss': 1.1876, 'grad_norm': 4.327507495880127, 'learning_rate': 1.4711632453567936e-05, 'epoch': 1.91}
+{'loss': 1.2219, 'grad_norm': 2.699472188949585, 'learning_rate': 1.4687194525904203e-05, 'epoch': 1.91}
+{'loss': 1.2485, 'grad_norm': 1.9174275398254395, 'learning_rate': 1.4662756598240468e-05, 'epoch': 1.91}
+{'loss': 1.3244, 'grad_norm': 3.461306095123291, 'learning_rate': 1.4638318670576734e-05, 'epoch': 1.91}
+{'loss': 1.2386, 'grad_norm': 2.7849996089935303, 'learning_rate': 1.4613880742913e-05, 'epoch': 1.91}
+{'loss': 1.2349, 'grad_norm': 3.3640100955963135, 'learning_rate': 1.4589442815249265e-05, 'epoch': 1.91}
+{'loss': 1.253, 'grad_norm': 0.8825156688690186, 'learning_rate': 1.4565004887585532e-05, 'epoch': 1.91}
+{'loss': 1.1779, 'grad_norm': 2.1742002964019775, 'learning_rate': 1.4540566959921798e-05, 'epoch': 1.91}
+{'loss': 1.1674, 'grad_norm': 1.2652525901794434, 'learning_rate': 1.4516129032258063e-05, 'epoch': 1.91}
+{'loss': 1.1022, 'grad_norm': 1.1848137378692627, 'learning_rate': 1.449169110459433e-05, 'epoch': 1.91}
+{'loss': 0.9741, 'grad_norm': 4.88668155670166, 'learning_rate': 1.4467253176930596e-05, 'epoch': 1.91}
+{'loss': 1.3092, 'grad_norm': 1.7493770122528076, 'learning_rate': 1.4442815249266861e-05, 'epoch': 1.91}
+{'loss': 1.1745, 'grad_norm': 2.5054683685302734, 'learning_rate': 1.4418377321603127e-05, 'epoch': 1.91}
+{'loss': 1.1197, 'grad_norm': 1.4955793619155884, 'learning_rate': 1.4393939393939392e-05, 'epoch': 1.91}
+{'loss': 1.1417, 'grad_norm': 1.2048388719558716, 'learning_rate': 1.4369501466275659e-05, 'epoch': 1.91}
+{'loss': 1.2411, 'grad_norm': 1.501969814300537, 'learning_rate': 1.4345063538611925e-05, 'epoch': 1.91}
+{'loss': 1.1179, 'grad_norm': 1.6169402599334717, 'learning_rate': 1.432062561094819e-05, 'epoch': 1.91}
+{'loss': 1.1619, 'grad_norm': 2.1456000804901123, 'learning_rate': 1.4296187683284456e-05, 'epoch': 1.91}
+{'loss': 1.2492, 'grad_norm': 3.0159125328063965, 'learning_rate': 1.4271749755620723e-05, 'epoch': 1.91}
+{'loss': 1.0944, 'grad_norm': 1.6835179328918457, 'learning_rate': 1.4247311827956988e-05, 'epoch': 1.91}
+{'loss': 0.9977, 'grad_norm': 2.42319393157959, 'learning_rate': 1.4222873900293254e-05, 'epoch': 1.91}
+{'loss': 1.3491, 'grad_norm': 1.6393407583236694, 'learning_rate': 1.419843597262952e-05, 'epoch': 1.91}
+{'loss': 1.274, 'grad_norm': 4.471738815307617, 'learning_rate': 1.4173998044965786e-05, 'epoch': 1.91}
+{'loss': 1.1159, 'grad_norm': 2.172912836074829, 'learning_rate': 1.4149560117302052e-05, 'epoch': 1.91}
+{'loss': 1.0586, 'grad_norm': 2.280036449432373, 'learning_rate': 1.4125122189638319e-05, 'epoch': 1.91}
+{'loss': 0.9635, 'grad_norm': 5.947153568267822, 'learning_rate': 1.4100684261974583e-05, 'epoch': 1.91}
+{'loss': 1.4723, 'grad_norm': 5.538031101226807, 'learning_rate': 1.407624633431085e-05, 'epoch': 1.91}
+{'loss': 1.4025, 'grad_norm': 3.303689479827881, 'learning_rate': 1.4051808406647116e-05, 'epoch': 1.91}
+{'loss': 1.0212, 'grad_norm': 2.7196755409240723, 'learning_rate': 1.4027370478983381e-05, 'epoch': 1.91}
+{'loss': 1.114, 'grad_norm': 14.392826080322266, 'learning_rate': 1.4002932551319648e-05, 'epoch': 1.91}
+{'loss': 1.1304, 'grad_norm': 5.36333703994751, 'learning_rate': 1.3978494623655914e-05, 'epoch': 1.91}
+{'loss': 0.9993, 'grad_norm': 2.031944513320923, 'learning_rate': 1.3954056695992177e-05, 'epoch': 1.91}
+{'loss': 1.1506, 'grad_norm': 7.611963748931885, 'learning_rate': 1.3929618768328445e-05, 'epoch': 1.91}
+{'loss': 0.9725, 'grad_norm': 2.747000217437744, 'learning_rate': 1.3905180840664712e-05, 'epoch': 1.91}
+{'loss': 0.7786, 'grad_norm': 5.841665267944336, 'learning_rate': 1.3880742913000975e-05, 'epoch': 1.91}
+{'loss': 1.0598, 'grad_norm': 6.18753719329834, 'learning_rate': 1.3856304985337243e-05, 'epoch': 1.91}
+{'loss': 1.1976, 'grad_norm': 3.2494606971740723, 'learning_rate': 1.383186705767351e-05, 'epoch': 1.91}
+{'loss': 1.413, 'grad_norm': 7.703588008880615, 'learning_rate': 1.3807429130009773e-05, 'epoch': 1.91}
+{'loss': 1.4166, 'grad_norm': 3.169772148132324, 'learning_rate': 1.378299120234604e-05, 'epoch': 1.91}
+{'loss': 1.3009, 'grad_norm': 3.6064870357513428, 'learning_rate': 1.3758553274682307e-05, 'epoch': 1.91}
+{'loss': 0.8335, 'grad_norm': 2.5471582412719727, 'learning_rate': 1.373411534701857e-05, 'epoch': 1.91}
+{'loss': 1.1152, 'grad_norm': 3.2321269512176514, 'learning_rate': 1.3709677419354837e-05, 'epoch': 1.91}
+ 96%|█████████▌| 12233/12776 [2:10:28<01:56,  4.65it/s] 96%|█████████▌| 12234/12776 [2:10:28<01:52,  4.80it/s]                                                        96%|█████████▌| 12234/12776 [2:10:28<01:52,  4.80it/s] 96%|█████████▌| 12235/12776 [2:10:28<01:50,  4.90it/s]                                                        96%|█████████▌| 12235/12776 [2:10:28<01:50,  4.90it/s] 96%|█████████▌| 12236/12776 [2:10:29<02:00,  4.46it/s]                                                        96%|█████████▌| 12236/12776 [2:10:29<02:00,  4.46it/s] 96%|█████████▌| 12237/12776 [2:10:29<01:54,  4.72it/s]                                                        96%|█████████▌| 12237/12776 [2:10:29<01:54,  4.72it/s] 96%|█████████▌| 12238/12776 [2:10:29<03:08,  2.85it/s]                                                        96%|█████████▌| 12238/12776 [2:10:29<03:08,  2.85it/s] 96%|█████████▌| 12239/12776 [2:10:31<05:39,  1.58it/s]                                                        96%|█████████▌| 12239/12776 [2:10:31<05:39,  1.58it/s] 96%|█████████▌| 12240/12776 [2:10:32<06:26,  1.39it/s]                                                        96%|█████████▌| 12240/12776 [2:10:32<06:26,  1.39it/s] 96%|█████████▌| 12241/12776 [2:10:33<06:41,  1.33it/s]                                                        96%|█████████▌| 12241/12776 [2:10:33<06:41,  1.33it/s] 96%|█████████▌| 12242/12776 [2:10:33<06:55,  1.29it/s]                                                        96%|█████████▌| 12242/12776 [2:10:33<06:55,  1.29it/s] 96%|█████████▌| 12243/12776 [2:10:34<07:00,  1.27it/s]                                                        96%|█████████▌| 12243/12776 [2:10:34<07:00,  1.27it/s] 96%|█████████▌| 12244/12776 [2:10:35<06:39,  1.33it/s]                                                        96%|█████████▌| 12244/12776 [2:10:35<06:39,  1.33it/s] 96%|█████████▌| 12245/12776 [2:10:36<06:39,  1.33it/s]                                                        96%|█████████▌| 12245/12776 [2:10:36<06:39,  1.33it/s] 96%|█████████▌| 12246/12776 [2:10:36<06:13,  1.42it/s]                                                        96%|█████████▌| 12246/12776 [2:10:36<06:13,  1.42it/s] 96%|█████████▌| 12247/12776 [2:10:37<06:02,  1.46it/s]                                                        96%|█████████▌| 12247/12776 [2:10:37<06:02,  1.46it/s] 96%|█████████▌| 12248/12776 [2:10:37<05:39,  1.55it/s]                                                        96%|█████████▌| 12248/12776 [2:10:37<05:39,  1.55it/s] 96%|█████████▌| 12249/12776 [2:10:38<05:34,  1.58it/s]                                                        96%|█████████▌| 12249/12776 [2:10:38<05:34,  1.58it/s] 96%|█████████▌| 12250/12776 [2:10:39<05:19,  1.65it/s]                                                        96%|█████████▌| 12250/12776 [2:10:39<05:19,  1.65it/s] 96%|█████████▌| 12251/12776 [2:10:39<05:14,  1.67it/s]                                                        96%|█████████▌| 12251/12776 [2:10:39<05:14,  1.67it/s] 96%|█████████▌| 12252/12776 [2:10:40<04:50,  1.80it/s]                                                        96%|█████████▌| 12252/12776 [2:10:40<04:50,  1.80it/s] 96%|█████████▌| 12253/12776 [2:10:40<04:54,  1.78it/s]                                                        96%|█████████▌| 12253/12776 [2:10:40<04:54,  1.78it/s] 96%|█████████▌| 12254/12776 [2:10:41<04:31,  1.92it/s]                                                        96%|█████████▌| 12254/12776 [2:10:41<04:31,  1.92it/s] 96%|█████████▌| 12255/12776 [2:10:41<04:27,  1.95it/s]                                                        96%|█████████▌| 12255/12776 [2:10:41<04:27,  1.95it/s] 96%|█████████▌| 12256/12776 [2:10:41<04:06,  2.11it/s]                                                        96%|█████████▌| 12256/12776 [2:10:41<04:06,  2.11it/s] 96%|█████████▌| 12257/12776 [2:10:42<03:48,  2.27it/s]                                                        96%|█████████▌| 12257/12776 [2:10:42<03:48,  2.27it/s] 96%|█████████▌| 12258/12776 [2:10:42<03:45,  2.30it/s]                                                        96%|█████████▌| 12258/12776 [2:10:42<03:45,  2.30it/s] 96%|█████████▌| 12259/12776 [2:10:43<03:31,  2.45it/s]                                                        96%|█████████▌| 12259/12776 [2:10:43<03:31,  2.45it/s] 96%|█████████▌| 12260/12776 [2:10:43<03:19,  2.58it/s]                                                        96%|█████████▌| 12260/12776 [2:10:43<03:19,  2.58it/s] 96%|█████████▌| 12261/12776 [2:10:43<03:26,  2.49it/s]                                                        96%|█████████▌| 12261/12776 [2:10:43<03:26,  2.49it/s] 96%|█████████▌| 12262/12776 [2:10:44<03:12,  2.67it/s]                                                        96%|█████████▌| 12262/12776 [2:10:44<03:12,  2.67it/s] 96%|█████████▌| 12263/12776 [2:10:44<03:00,  2.84it/s]                                                        96%|█████████▌| 12263/12776 [2:10:44<03:00,  2.84it/s] 96%|█████████▌| 12264/12776 [2:10:44<02:51,  2.98it/s]                                                        96%|█████████▌| 12264/12776 [2:10:44<02:51,  2.98it/s] 96%|█████████▌| 12265/12776 [2:10:45<02:54,  2.92it/s]                                                        96%|█████████▌| 12265/12776 [2:10:45<02:54,  2.92it/s] 96%|█████████▌| 12266/12776 [2:10:45<02:45,  3.08it/s]                                                        96%|█████████▌| 12266/12776 [2:10:45<02:45,  3.08it/s] 96%|█████████▌| 12267/12776 [2:10:45<02:36,  3.24it/s]                                                        96%|█████████▌| 12267/12776 [2:10:45<02:36,  3.24it/s] 96%|█████████▌| 12268/12776 [2:10:45<02:29,  3.40it/s]                                                        96%|█████████▌| 12268/12776 [2:10:45<02:29,  3.40it/s] 96%|█████████▌| 12269/12776 [2:10:46<02:36,  3.23it/s]                                                        96%|█████████▌| 12269/12776 [2:10:46<02:36,  3.23it/s] 96%|█████████▌| 12270/12776 [2:10:46<02:26,  3.44it/s]                                                        96%|█████████▌| 12270/12776 [2:10:46<02:26,  3.44it/s] 96%|█████████▌| 12271/12776 [2:10:46<02:18,  3.64it/s]                                                        96%|█████████▌| 12271/12776 [2:10:46<02:18,  3.64it/s] 96%|█████████▌| 12272/12776 [2:10:47<02:12,  3.81it/s]                                                        96%|█████████▌| 12272/12776 [2:10:47<02:12,  3.81it/s] 96%|█████████▌| 12273/12776 [2:10:47<02:27,  3.41it/s]                                                        96%|█████████▌| 12273/12776 [2:10:47<02:27,  3.41it/s] 96%|█████████▌| 12274/12776 [2:10:47<02:16,  3.68it/s]                                                        96%|█████████▌| 12274/12776 [2:10:47<02:16,  3.68it/s] 96%|█████████▌| 12275/12776 [2:10:47<02:07,  3.92it/s]                                                        96%|█████████▌| 12275/12776 [2:10:47<02:07,  3.92it/s] 96%|█████████▌| 12276/12776 [2:10:48<02:01,  4.12it/s]                                                        96%|█████████▌| 12276/12776 [2:10:48<02:01,  4.12it/s] 96%|█████████▌| 12277/12776 [2:10:48<01:55,  4.30it/s]                                                        96%|█████████▌| 12277/12776 [2:10:48<01:55,  4.30it/s] 96%|█████████▌| 12278/12776 [2:10:48<02:10,  3.81it/s]                                                        96%|█████████▌| 12278/12776 [2:10:48<02:10,  3.81it/s] 96%|█████████▌| 12279/12776 [2:10:48<02:02,  4.07it/s]                                                        96%|█████████▌| 12279/12776 [2:10:48<02:02,  4.07it/s] 96%|█████████▌| 12280/12776 [2:10:48<01:55,  4.29it/s]                                                        96%|█████████▌| 12280/12776 [2:10:48<01:55,  4.29it/s] 96%|█████████▌| 12281/12776 [2:10:49<01:50,  4.46it/s]                                                        96%|█████████▌| 12281/12776 [2:10:49<01:50,  4.46it/s] 96%|█████████▌| 12282/12776 [2:10:49<01:47,  4.61it/s]                                                        96%|█████████▌| 12282/12776 [2:10:49<01:47,  4.61it/s] 96%|█████████▌| 12283/12776 [2:10:49<02:01,  4.05it/s]                                                        96%|█████████▌| 12283/12776 [2:10:49<02:01,  4.05it/s] 96%|█████████▌| 12284/12776 [2:10:49<01:53,  4.33it/s]                                                        96%|█████████▌| 12284/12776 [2:10:49<01:53,  4.33it/s] 96%|█████████▌| 12285/12776 [2:10:50<01:47,  4.57it/s]                                                        96%|█████████▌| 12285/12776 [2:10:50<01:47,  4.57it/s] 96%|█████████▌| 12286/12776 [2:10:50<01:42,  4.79it/s]                                                        96%|█████████▌| 12286/12776 [2:10:50<01:42,  4.79it/s] 96%|█████████▌| 12287/12776 [2:10:50<01:38,  4.96it/s]                                                        96%|█████████▌| 12287/12776 [2:10:50<01:38,  4.96it/s] 96%|█████████▌| 12288/12776 [2:10:51<02:44,  2.96it/s]                                                        96%|█████████▌| 12288/12776 [2:10:51<02:44,  2.96it/s] 96%|█████████▌| 12289/12776 [2:10:52<05:23,  1.51it/s]                                                        96%|█████████▌| 12289/12776 [2:10:52<05:23,  1.51it/s] 96%|█████████▌| 12290/12776 [2:10:53<06:04,  1.33it/s]                                                        96%|█████████▌| 12290/12776 [2:10:53<06:04,  1.33it/s] 96%|█████████▌| 12291/12776 [2:10:54<06:16,  1.29it/s]                                                        96%|█████████▌| 12291/12776 [2:10:54<06:16,  1.29it/s] 96%|█████████▌| 12292/12776 [2:10:55<06:11,  1.30it/s]                                                        96%|█████████▌| 12292/12776 [2:10:55<06:11,  1.30it/s] 96%|█████████▌| 12293/12776 [2:10:55<06:02,  1.33it/s]                                                        96%|█████████▌| 12293/12776 [2:10:55<06:02,  1.33it/s] 96%|█████████▌| 12294/12776 [2:10:56<05:48,  1.38it/s]                                                        96%|█████████▌| 12294/12776 [2:10:56<05:48,  1.38it/s] 96%|█████████▌| 12295/12776 [2:10:57<05:33,  1.44it/s]                                                        96%|█████████▌| 12295/12776 [2:10:57<05:33,  1.44it/s] 96%|█████████▌| 12296/12776 [2:10:57<05:16,  1.52it/s]                                                        96%|█████████▌| 12296/12776 [2:10:57<05:16,  1.52it/s] 96%|█████████▋| 12297/12776 [2:10:58<05:00,  1.60it/s]                                                        96%|█████████▋| 12297/12776 [2:10:58<05:00,  1.60it/s] 96%|█████████▋| 12298/12776 [2:10:58<04:53,  1.63it/s]                                                        96%|█████████▋| 12298/12776 [2:10:58<04:53,  1.63it/s] 96%|█████████▋| 12299/12776 [2:10:59<04:33,  1.74it/s]                                                        96%|█████████▋| 12299/12776 [2:10:59<04:33,  1.74it/s] 96%|█████████▋| 12300/12776 [2:10:59<04:15,  1.86it/s]                                                        96%|█████████▋| 12300/12776 [2:10:59<04:15,  1.86it/s] 96%|█████████▋| 12301/12776 [2:11:00<04:13,  1.87it/s]                                                        96%|█████████▋| 12301/12776 [2:11:00<04:13,  1.87it/s] 96%|█████████▋| 12302/12776 [2:11:00<03:57,  1.99it/s]                                                        96%|█████████▋| 12302/12776 [2:11:00<03:57,  1.99it/s] 96%|█████████▋| 12303/12776 [2:11:01<03:50,  2.05it/s]                                                        96%|█████████▋| 12303/12776 [2:11:01<03:50,  2.05it/s] 96%|█████████▋| 12304/12776 [2:11:01<03:36,  2.18it/s]                                                        96%|█████████▋| 12304/12776 [2:11:01<03:36,  2.18it/s] 96%|█████████▋| 12305/12776 [2:11:01<03:24,  2.30it/s]                                                        96%|█████████▋| 12305/12776 [2:11:01<03:24,  2.30it/s] 96%|█████████▋| 12306/12776 [2:11:02<03:15,  2.40it/s]                                                        96%|█████████▋| 12306/12776 [2:11:02<03:15,  2.40it/s] 96%|█████████▋| 12307/12776 [2:11:02<03:06,  2.52it/s]                                                        96%|█████████▋| 12307/12776 [2:11:02<03:06,  2.52it/s] 96%|█████████▋| 12308/12776 [2:11:02<02:59,  2.60it/s]                                                        96%|█████████▋| 12308/12776 [2:11:02<02:59,  2.60it/s] 96%|█████████▋| 12309/12776 [2:11:03<03:03,  2.55it/s]                                                        96%|█████████▋| 12309/12776 [2:11:03<03:03,  2.55it/s] 96%|█████████▋| 12310/12776 [2:11:03<02:53,  2.69it/s]                                                       {'loss': 0.8062, 'grad_norm': 3.2195959091186523, 'learning_rate': 1.3685239491691105e-05, 'epoch': 1.91}
+{'loss': 0.541, 'grad_norm': 2.1859567165374756, 'learning_rate': 1.3660801564027368e-05, 'epoch': 1.92}
+{'loss': 0.5953, 'grad_norm': 1.7205095291137695, 'learning_rate': 1.3636363636363635e-05, 'epoch': 1.92}
+{'loss': 0.5505, 'grad_norm': 4.676856517791748, 'learning_rate': 1.36119257086999e-05, 'epoch': 1.92}
+{'loss': 0.9582, 'grad_norm': 2.9190919399261475, 'learning_rate': 1.3587487781036166e-05, 'epoch': 1.92}
+{'loss': 1.1218, 'grad_norm': 2.5346970558166504, 'learning_rate': 1.3563049853372433e-05, 'epoch': 1.92}
+{'loss': 1.4646, 'grad_norm': 0.8043964505195618, 'learning_rate': 1.3538611925708697e-05, 'epoch': 1.92}
+{'loss': 1.4516, 'grad_norm': 1.0437607765197754, 'learning_rate': 1.3514173998044964e-05, 'epoch': 1.92}
+{'loss': 1.4577, 'grad_norm': 1.5638072490692139, 'learning_rate': 1.348973607038123e-05, 'epoch': 1.92}
+{'loss': 1.4553, 'grad_norm': 1.5612990856170654, 'learning_rate': 1.3465298142717495e-05, 'epoch': 1.92}
+{'loss': 1.5817, 'grad_norm': 1.4811925888061523, 'learning_rate': 1.3440860215053762e-05, 'epoch': 1.92}
+{'loss': 1.4004, 'grad_norm': 1.6641279458999634, 'learning_rate': 1.3416422287390028e-05, 'epoch': 1.92}
+{'loss': 1.3879, 'grad_norm': 1.62802255153656, 'learning_rate': 1.3391984359726293e-05, 'epoch': 1.92}
+{'loss': 1.3205, 'grad_norm': 1.1935174465179443, 'learning_rate': 1.336754643206256e-05, 'epoch': 1.92}
+{'loss': 1.2546, 'grad_norm': 2.306370496749878, 'learning_rate': 1.3343108504398826e-05, 'epoch': 1.92}
+{'loss': 1.4271, 'grad_norm': 3.6753339767456055, 'learning_rate': 1.331867057673509e-05, 'epoch': 1.92}
+{'loss': 1.318, 'grad_norm': 2.2451188564300537, 'learning_rate': 1.3294232649071357e-05, 'epoch': 1.92}
+{'loss': 1.3066, 'grad_norm': 3.015744209289551, 'learning_rate': 1.3269794721407624e-05, 'epoch': 1.92}
+{'loss': 1.2688, 'grad_norm': 0.8539578318595886, 'learning_rate': 1.3245356793743889e-05, 'epoch': 1.92}
+{'loss': 1.1816, 'grad_norm': 6.493170261383057, 'learning_rate': 1.3220918866080155e-05, 'epoch': 1.92}
+{'loss': 1.213, 'grad_norm': 1.088154673576355, 'learning_rate': 1.3196480938416422e-05, 'epoch': 1.92}
+{'loss': 1.1441, 'grad_norm': 2.4062836170196533, 'learning_rate': 1.3172043010752686e-05, 'epoch': 1.92}
+{'loss': 1.3002, 'grad_norm': 1.412260890007019, 'learning_rate': 1.3147605083088953e-05, 'epoch': 1.92}
+{'loss': 1.1326, 'grad_norm': 3.7139716148376465, 'learning_rate': 1.312316715542522e-05, 'epoch': 1.92}
+{'loss': 1.2002, 'grad_norm': 4.428962707519531, 'learning_rate': 1.3098729227761484e-05, 'epoch': 1.92}
+{'loss': 1.1597, 'grad_norm': 1.157462239265442, 'learning_rate': 1.307429130009775e-05, 'epoch': 1.92}
+{'loss': 1.1902, 'grad_norm': 3.1243197917938232, 'learning_rate': 1.3049853372434017e-05, 'epoch': 1.92}
+{'loss': 1.0987, 'grad_norm': 1.9715791940689087, 'learning_rate': 1.3025415444770282e-05, 'epoch': 1.92}
+{'loss': 1.2952, 'grad_norm': 8.637343406677246, 'learning_rate': 1.3000977517106548e-05, 'epoch': 1.92}
+{'loss': 1.6552, 'grad_norm': 6.371841907501221, 'learning_rate': 1.2976539589442815e-05, 'epoch': 1.92}
+{'loss': 1.0953, 'grad_norm': 2.1483147144317627, 'learning_rate': 1.295210166177908e-05, 'epoch': 1.92}
+{'loss': 1.0004, 'grad_norm': 1.9315729141235352, 'learning_rate': 1.2927663734115346e-05, 'epoch': 1.92}
+{'loss': 0.8629, 'grad_norm': 3.5723578929901123, 'learning_rate': 1.2903225806451613e-05, 'epoch': 1.92}
+{'loss': 1.2251, 'grad_norm': 2.428359270095825, 'learning_rate': 1.2878787878787878e-05, 'epoch': 1.92}
+{'loss': 1.1408, 'grad_norm': 3.252751350402832, 'learning_rate': 1.2854349951124144e-05, 'epoch': 1.92}
+{'loss': 1.0692, 'grad_norm': 2.705955743789673, 'learning_rate': 1.2829912023460409e-05, 'epoch': 1.92}
+{'loss': 1.0189, 'grad_norm': 4.607133388519287, 'learning_rate': 1.2805474095796675e-05, 'epoch': 1.92}
+{'loss': 1.3724, 'grad_norm': 2.5843207836151123, 'learning_rate': 1.2781036168132942e-05, 'epoch': 1.92}
+{'loss': 0.8495, 'grad_norm': 4.102626323699951, 'learning_rate': 1.2756598240469207e-05, 'epoch': 1.92}
+{'loss': 1.047, 'grad_norm': 3.620480537414551, 'learning_rate': 1.2732160312805473e-05, 'epoch': 1.92}
+{'loss': 0.7859, 'grad_norm': 7.478079795837402, 'learning_rate': 1.270772238514174e-05, 'epoch': 1.92}
+{'loss': 0.8097, 'grad_norm': 25.899625778198242, 'learning_rate': 1.2683284457478004e-05, 'epoch': 1.92}
+{'loss': 1.3919, 'grad_norm': 2.6998584270477295, 'learning_rate': 1.2658846529814271e-05, 'epoch': 1.92}
+{'loss': 1.2487, 'grad_norm': 5.945512771606445, 'learning_rate': 1.2634408602150537e-05, 'epoch': 1.92}
+{'loss': 0.9452, 'grad_norm': 6.3635029792785645, 'learning_rate': 1.2609970674486802e-05, 'epoch': 1.92}
+{'loss': 1.1909, 'grad_norm': 5.040246963500977, 'learning_rate': 1.2585532746823069e-05, 'epoch': 1.92}
+{'loss': 1.3885, 'grad_norm': 4.729580402374268, 'learning_rate': 1.2561094819159335e-05, 'epoch': 1.92}
+{'loss': 1.4546, 'grad_norm': 4.690464973449707, 'learning_rate': 1.25366568914956e-05, 'epoch': 1.92}
+{'loss': 1.3911, 'grad_norm': 4.445624828338623, 'learning_rate': 1.2512218963831867e-05, 'epoch': 1.92}
+{'loss': 0.9071, 'grad_norm': 2.9584579467773438, 'learning_rate': 1.2487781036168133e-05, 'epoch': 1.92}
+{'loss': 0.8461, 'grad_norm': 4.518764495849609, 'learning_rate': 1.2463343108504398e-05, 'epoch': 1.92}
+{'loss': 0.6349, 'grad_norm': 3.6593761444091797, 'learning_rate': 1.2438905180840664e-05, 'epoch': 1.92}
+{'loss': 0.6635, 'grad_norm': 3.229717254638672, 'learning_rate': 1.241446725317693e-05, 'epoch': 1.92}
+{'loss': 0.4047, 'grad_norm': 1.8013453483581543, 'learning_rate': 1.2390029325513196e-05, 'epoch': 1.92}
+{'loss': 0.5197, 'grad_norm': 3.6808831691741943, 'learning_rate': 1.2365591397849462e-05, 'epoch': 1.92}
+{'loss': 0.539, 'grad_norm': 2.7701351642608643, 'learning_rate': 1.2341153470185729e-05, 'epoch': 1.92}
+{'loss': 1.382, 'grad_norm': 1.3340402841567993, 'learning_rate': 1.2316715542521992e-05, 'epoch': 1.92}
+{'loss': 1.4128, 'grad_norm': 1.5330002307891846, 'learning_rate': 1.229227761485826e-05, 'epoch': 1.92}
+{'loss': 1.3799, 'grad_norm': 0.8415979743003845, 'learning_rate': 1.2267839687194526e-05, 'epoch': 1.92}
+{'loss': 1.4151, 'grad_norm': 0.8485933542251587, 'learning_rate': 1.224340175953079e-05, 'epoch': 1.92}
+{'loss': 1.4682, 'grad_norm': 1.0369019508361816, 'learning_rate': 1.2218963831867058e-05, 'epoch': 1.92}
+{'loss': 1.4581, 'grad_norm': 1.0517786741256714, 'learning_rate': 1.2194525904203324e-05, 'epoch': 1.92}
+{'loss': 1.3303, 'grad_norm': 1.096622347831726, 'learning_rate': 1.2170087976539587e-05, 'epoch': 1.92}
+{'loss': 1.2956, 'grad_norm': 3.3548057079315186, 'learning_rate': 1.2145650048875854e-05, 'epoch': 1.92}
+{'loss': 1.3452, 'grad_norm': 1.0958143472671509, 'learning_rate': 1.2121212121212122e-05, 'epoch': 1.93}
+{'loss': 1.322, 'grad_norm': 2.7665908336639404, 'learning_rate': 1.2096774193548385e-05, 'epoch': 1.93}
+{'loss': 1.2135, 'grad_norm': 1.590676188468933, 'learning_rate': 1.2072336265884652e-05, 'epoch': 1.93}
+{'loss': 1.506, 'grad_norm': 4.261856555938721, 'learning_rate': 1.2047898338220916e-05, 'epoch': 1.93}
+{'loss': 1.3583, 'grad_norm': 5.3091230392456055, 'learning_rate': 1.2023460410557183e-05, 'epoch': 1.93}
+{'loss': 1.3899, 'grad_norm': 2.1772117614746094, 'learning_rate': 1.199902248289345e-05, 'epoch': 1.93}
+{'loss': 1.2902, 'grad_norm': 1.1379402875900269, 'learning_rate': 1.1974584555229714e-05, 'epoch': 1.93}
+{'loss': 1.4014, 'grad_norm': 1.190233826637268, 'learning_rate': 1.195014662756598e-05, 'epoch': 1.93}
+{'loss': 1.2716, 'grad_norm': 1.8833681344985962, 'learning_rate': 1.1925708699902247e-05, 'epoch': 1.93}
+{'loss': 1.2386, 'grad_norm': 2.2918057441711426, 'learning_rate': 1.1901270772238512e-05, 'epoch': 1.93}
+{'loss': 1.4535, 'grad_norm': 2.4016504287719727, 'learning_rate': 1.1876832844574778e-05, 'epoch': 1.93}
+{'loss': 1.2666, 'grad_norm': 2.2692906856536865, 'learning_rate': 1.1852394916911045e-05, 'epoch': 1.93}
+{'loss': 1.1711, 'grad_norm': 5.164487838745117, 'learning_rate': 1.182795698924731e-05, 'epoch': 1.93}
+ 96%|█████████▋| 12310/12776 [2:11:03<02:53,  2.69it/s] 96%|█████████▋| 12311/12776 [2:11:04<02:44,  2.83it/s]                                                        96%|█████████▋| 12311/12776 [2:11:04<02:44,  2.83it/s] 96%|█████████▋| 12312/12776 [2:11:04<02:37,  2.95it/s]                                                        96%|█████████▋| 12312/12776 [2:11:04<02:37,  2.95it/s] 96%|█████████▋| 12313/12776 [2:11:04<02:38,  2.92it/s]                                                        96%|█████████▋| 12313/12776 [2:11:04<02:38,  2.92it/s] 96%|█████████▋| 12314/12776 [2:11:04<02:31,  3.05it/s]                                                        96%|█████████▋| 12314/12776 [2:11:04<02:31,  3.05it/s] 96%|█████████▋| 12315/12776 [2:11:05<02:24,  3.19it/s]                                                        96%|█████████▋| 12315/12776 [2:11:05<02:24,  3.19it/s] 96%|█████████▋| 12316/12776 [2:11:05<02:18,  3.33it/s]                                                        96%|█████████▋| 12316/12776 [2:11:05<02:18,  3.33it/s] 96%|█████████▋| 12317/12776 [2:11:05<02:14,  3.42it/s]                                                        96%|█████████▋| 12317/12776 [2:11:05<02:14,  3.42it/s] 96%|█████████▋| 12318/12776 [2:11:06<02:07,  3.58it/s]                                                        96%|█████████▋| 12318/12776 [2:11:06<02:07,  3.58it/s] 96%|█████████▋| 12319/12776 [2:11:06<02:03,  3.70it/s]                                                        96%|█████████▋| 12319/12776 [2:11:06<02:03,  3.70it/s] 96%|█████████▋| 12320/12776 [2:11:06<02:00,  3.79it/s]                                                        96%|█████████▋| 12320/12776 [2:11:06<02:00,  3.79it/s] 96%|█████████▋| 12321/12776 [2:11:06<02:11,  3.47it/s]                                                        96%|█████████▋| 12321/12776 [2:11:06<02:11,  3.47it/s] 96%|█████████▋| 12322/12776 [2:11:07<02:04,  3.65it/s]                                                        96%|█████████▋| 12322/12776 [2:11:07<02:04,  3.65it/s] 96%|█████████▋| 12323/12776 [2:11:07<01:58,  3.83it/s]                                                        96%|█████████▋| 12323/12776 [2:11:07<01:58,  3.83it/s] 96%|█████████▋| 12324/12776 [2:11:07<01:53,  3.97it/s]                                                        96%|█████████▋| 12324/12776 [2:11:07<01:53,  3.97it/s] 96%|█████████▋| 12325/12776 [2:11:07<02:01,  3.71it/s]                                                        96%|█████████▋| 12325/12776 [2:11:07<02:01,  3.71it/s] 96%|█████████▋| 12326/12776 [2:11:08<01:53,  3.95it/s]                                                        96%|█████████▋| 12326/12776 [2:11:08<01:53,  3.95it/s] 96%|█████████▋| 12327/12776 [2:11:08<01:47,  4.16it/s]                                                        96%|█████████▋| 12327/12776 [2:11:08<01:47,  4.16it/s] 96%|█████████▋| 12328/12776 [2:11:08<01:43,  4.33it/s]                                                        96%|█████████▋| 12328/12776 [2:11:08<01:43,  4.33it/s] 97%|█████████▋| 12329/12776 [2:11:08<01:39,  4.47it/s]                                                        97%|█████████▋| 12329/12776 [2:11:08<01:39,  4.47it/s] 97%|█████████▋| 12330/12776 [2:11:09<01:50,  4.04it/s]                                                        97%|█████████▋| 12330/12776 [2:11:09<01:50,  4.04it/s] 97%|█████████▋| 12331/12776 [2:11:09<01:43,  4.28it/s]                                                        97%|█████████▋| 12331/12776 [2:11:09<01:43,  4.28it/s] 97%|█████████▋| 12332/12776 [2:11:09<01:39,  4.48it/s]                                                        97%|█████████▋| 12332/12776 [2:11:09<01:39,  4.48it/s] 97%|█████████▋| 12333/12776 [2:11:09<01:35,  4.64it/s]                                                        97%|█████████▋| 12333/12776 [2:11:09<01:35,  4.64it/s] 97%|█████████▋| 12334/12776 [2:11:09<01:32,  4.79it/s]                                                        97%|█████████▋| 12334/12776 [2:11:09<01:32,  4.79it/s] 97%|█████████▋| 12335/12776 [2:11:10<01:29,  4.90it/s]                                                        97%|█████████▋| 12335/12776 [2:11:10<01:29,  4.90it/s] 97%|█████████▋| 12336/12776 [2:11:10<01:39,  4.41it/s]                                                        97%|█████████▋| 12336/12776 [2:11:10<01:39,  4.41it/s] 97%|█████████▋| 12337/12776 [2:11:10<01:34,  4.66it/s]                                                        97%|█████████▋| 12337/12776 [2:11:10<01:34,  4.66it/s] 97%|█████████▋| 12338/12776 [2:11:11<02:55,  2.50it/s]                                                        97%|█████████▋| 12338/12776 [2:11:11<02:55,  2.50it/s] 97%|█████████▋| 12339/12776 [2:11:12<05:15,  1.38it/s]                                                        97%|█████████▋| 12339/12776 [2:11:12<05:15,  1.38it/s] 97%|█████████▋| 12340/12776 [2:11:13<05:44,  1.26it/s]                                                        97%|█████████▋| 12340/12776 [2:11:13<05:44,  1.26it/s] 97%|█████████▋| 12341/12776 [2:11:14<05:53,  1.23it/s]                                                        97%|█████████▋| 12341/12776 [2:11:14<05:53,  1.23it/s] 97%|█████████▋| 12342/12776 [2:11:15<05:48,  1.25it/s]                                                        97%|█████████▋| 12342/12776 [2:11:15<05:48,  1.25it/s] 97%|█████████▋| 12343/12776 [2:11:16<05:53,  1.22it/s]                                                        97%|█████████▋| 12343/12776 [2:11:16<05:53,  1.22it/s] 97%|█████████▋| 12344/12776 [2:11:16<05:37,  1.28it/s]                                                        97%|█████████▋| 12344/12776 [2:11:16<05:37,  1.28it/s] 97%|█████████▋| 12345/12776 [2:11:17<05:20,  1.35it/s]                                                        97%|█████████▋| 12345/12776 [2:11:17<05:20,  1.35it/s] 97%|█████████▋| 12346/12776 [2:11:18<05:12,  1.38it/s]                                                        97%|█████████▋| 12346/12776 [2:11:18<05:12,  1.38it/s] 97%|█████████▋| 12347/12776 [2:11:18<04:53,  1.46it/s]                                                        97%|█████████▋| 12347/12776 [2:11:18<04:53,  1.46it/s] 97%|█████████▋| 12348/12776 [2:11:19<04:39,  1.53it/s]                                                        97%|█████████▋| 12348/12776 [2:11:19<04:39,  1.53it/s] 97%|█████████▋| 12349/12776 [2:11:19<04:25,  1.61it/s]                                                        97%|█████████▋| 12349/12776 [2:11:19<04:25,  1.61it/s] 97%|█████████▋| 12350/12776 [2:11:20<04:26,  1.60it/s]                                                        97%|█████████▋| 12350/12776 [2:11:20<04:26,  1.60it/s] 97%|█████████▋| 12351/12776 [2:11:21<04:09,  1.70it/s]                                                        97%|█████████▋| 12351/12776 [2:11:21<04:09,  1.70it/s] 97%|█████████▋| 12352/12776 [2:11:21<04:10,  1.69it/s]                                                        97%|█████████▋| 12352/12776 [2:11:21<04:10,  1.69it/s] 97%|█████████▋| 12353/12776 [2:11:22<03:50,  1.83it/s]                                                        97%|█████████▋| 12353/12776 [2:11:22<03:50,  1.83it/s] 97%|█████████▋| 12354/12776 [2:11:22<03:34,  1.96it/s]                                                        97%|█████████▋| 12354/12776 [2:11:22<03:34,  1.96it/s] 97%|█████████▋| 12355/12776 [2:11:23<03:27,  2.03it/s]                                                        97%|█████████▋| 12355/12776 [2:11:23<03:27,  2.03it/s] 97%|█████████▋| 12356/12776 [2:11:23<03:15,  2.15it/s]                                                        97%|█████████▋| 12356/12776 [2:11:23<03:15,  2.15it/s] 97%|█████████▋| 12357/12776 [2:11:23<03:04,  2.27it/s]                                                        97%|█████████▋| 12357/12776 [2:11:23<03:04,  2.27it/s] 97%|█████████▋| 12358/12776 [2:11:24<02:59,  2.33it/s]                                                        97%|█████████▋| 12358/12776 [2:11:24<02:59,  2.33it/s] 97%|█████████▋| 12359/12776 [2:11:24<02:50,  2.45it/s]                                                        97%|█████████▋| 12359/12776 [2:11:24<02:50,  2.45it/s] 97%|█████████▋| 12360/12776 [2:11:24<02:42,  2.56it/s]                                                        97%|█████████▋| 12360/12776 [2:11:24<02:42,  2.56it/s] 97%|█████████▋| 12361/12776 [2:11:25<02:47,  2.48it/s]                                                        97%|█████████▋| 12361/12776 [2:11:25<02:47,  2.48it/s] 97%|█████████▋| 12362/12776 [2:11:25<02:37,  2.62it/s]                                                        97%|█████████▋| 12362/12776 [2:11:25<02:37,  2.62it/s] 97%|█████████▋| 12363/12776 [2:11:25<02:29,  2.77it/s]                                                        97%|█████████▋| 12363/12776 [2:11:25<02:29,  2.77it/s] 97%|█████████▋| 12364/12776 [2:11:26<02:34,  2.67it/s]                                                        97%|█████████▋| 12364/12776 [2:11:26<02:34,  2.67it/s] 97%|█████████▋| 12365/12776 [2:11:26<02:24,  2.85it/s]                                                        97%|█████████▋| 12365/12776 [2:11:26<02:24,  2.85it/s] 97%|█████████▋| 12366/12776 [2:11:26<02:15,  3.02it/s]                                                        97%|█████████▋| 12366/12776 [2:11:26<02:15,  3.02it/s] 97%|█████████▋| 12367/12776 [2:11:27<02:25,  2.82it/s]                                                        97%|█████████▋| 12367/12776 [2:11:27<02:25,  2.82it/s] 97%|█████████▋| 12368/12776 [2:11:27<02:14,  3.03it/s]                                                        97%|█████████▋| 12368/12776 [2:11:27<02:14,  3.03it/s] 97%|█████████▋| 12369/12776 [2:11:27<02:06,  3.23it/s]                                                        97%|█████████▋| 12369/12776 [2:11:27<02:06,  3.23it/s] 97%|█████████▋| 12370/12776 [2:11:28<01:59,  3.41it/s]                                                        97%|█████████▋| 12370/12776 [2:11:28<01:59,  3.41it/s] 97%|█████████▋| 12371/12776 [2:11:28<02:05,  3.22it/s]                                                        97%|█████████▋| 12371/12776 [2:11:28<02:05,  3.22it/s] 97%|█████████▋| 12372/12776 [2:11:28<01:57,  3.44it/s]                                                        97%|█████████▋| 12372/12776 [2:11:28<01:57,  3.44it/s] 97%|█████████▋| 12373/12776 [2:11:29<01:50,  3.65it/s]                                                        97%|█████████▋| 12373/12776 [2:11:29<01:50,  3.65it/s] 97%|█████████▋| 12374/12776 [2:11:29<01:44,  3.83it/s]                                                        97%|█████████▋| 12374/12776 [2:11:29<01:44,  3.83it/s] 97%|█████████▋| 12375/12776 [2:11:29<01:40,  4.00it/s]                                                        97%|█████████▋| 12375/12776 [2:11:29<01:40,  4.00it/s] 97%|█████████▋| 12376/12776 [2:11:29<01:49,  3.67it/s]                                                        97%|█████████▋| 12376/12776 [2:11:29<01:49,  3.67it/s] 97%|█████████▋| 12377/12776 [2:11:30<01:41,  3.92it/s]                                                        97%|█████████▋| 12377/12776 [2:11:30<01:41,  3.92it/s] 97%|█████████▋| 12378/12776 [2:11:30<01:36,  4.14it/s]                                                        97%|█████████▋| 12378/12776 [2:11:30<01:36,  4.14it/s] 97%|█████████▋| 12379/12776 [2:11:30<01:32,  4.30it/s]                                                        97%|█████████▋| 12379/12776 [2:11:30<01:32,  4.30it/s] 97%|█████████▋| 12380/12776 [2:11:30<01:28,  4.47it/s]                                                        97%|█████████▋| 12380/12776 [2:11:30<01:28,  4.47it/s] 97%|█████████▋| 12381/12776 [2:11:30<01:37,  4.04it/s]                                                        97%|█████████▋| 12381/12776 [2:11:30<01:37,  4.04it/s] 97%|█████████▋| 12382/12776 [2:11:31<01:31,  4.30it/s]                                                        97%|█████████▋| 12382/12776 [2:11:31<01:31,  4.30it/s] 97%|█████████▋| 12383/12776 [2:11:31<01:27,  4.49it/s]                                                        97%|█████████▋| 12383/12776 [2:11:31<01:27,  4.49it/s] 97%|█████████▋| 12384/12776 [2:11:31<01:24,  4.66it/s]                                                        97%|█████████▋| 12384/12776 [2:11:31<01:24,  4.66it/s] 97%|█████████▋| 12385/12776 [2:11:31<01:21,  4.80it/s]                                                        97%|█████████▋| 12385/12776 [2:11:31<01:21,  4.80it/s] 97%|█████████▋| 12386/12776 [2:11:31<01:27,  4.44it/s]                                                        97%|█████████▋| 12386/12776 [2:11:31<01:27,  4.44it/s] 97%|█████████▋| 12387/12776 [2:11:32<01:23,  4.67it/s]                                                       {'loss': 1.2333, 'grad_norm': 3.0277092456817627, 'learning_rate': 1.1803519061583576e-05, 'epoch': 1.93}
+{'loss': 1.3658, 'grad_norm': 2.941403865814209, 'learning_rate': 1.1779081133919843e-05, 'epoch': 1.93}
+{'loss': 1.2713, 'grad_norm': 4.588296413421631, 'learning_rate': 1.1754643206256108e-05, 'epoch': 1.93}
+{'loss': 1.054, 'grad_norm': 1.65639328956604, 'learning_rate': 1.1730205278592374e-05, 'epoch': 1.93}
+{'loss': 1.3822, 'grad_norm': 2.7353925704956055, 'learning_rate': 1.170576735092864e-05, 'epoch': 1.93}
+{'loss': 1.4245, 'grad_norm': 4.178305149078369, 'learning_rate': 1.1681329423264905e-05, 'epoch': 1.93}
+{'loss': 1.1584, 'grad_norm': 4.869446754455566, 'learning_rate': 1.1656891495601172e-05, 'epoch': 1.93}
+{'loss': 1.1799, 'grad_norm': 2.118698835372925, 'learning_rate': 1.1632453567937438e-05, 'epoch': 1.93}
+{'loss': 1.1696, 'grad_norm': 7.572887897491455, 'learning_rate': 1.1608015640273703e-05, 'epoch': 1.93}
+{'loss': 1.0388, 'grad_norm': 2.0993082523345947, 'learning_rate': 1.158357771260997e-05, 'epoch': 1.93}
+{'loss': 1.1272, 'grad_norm': 6.564274787902832, 'learning_rate': 1.1559139784946236e-05, 'epoch': 1.93}
+{'loss': 1.0672, 'grad_norm': 4.388607978820801, 'learning_rate': 1.1534701857282501e-05, 'epoch': 1.93}
+{'loss': 1.1001, 'grad_norm': 3.7181320190429688, 'learning_rate': 1.1510263929618767e-05, 'epoch': 1.93}
+{'loss': 1.2546, 'grad_norm': 2.3277645111083984, 'learning_rate': 1.1485826001955034e-05, 'epoch': 1.93}
+{'loss': 1.56, 'grad_norm': 5.6356282234191895, 'learning_rate': 1.1461388074291299e-05, 'epoch': 1.93}
+{'loss': 1.4428, 'grad_norm': 2.452418804168701, 'learning_rate': 1.1436950146627565e-05, 'epoch': 1.93}
+{'loss': 1.2936, 'grad_norm': 8.939882278442383, 'learning_rate': 1.1412512218963832e-05, 'epoch': 1.93}
+{'loss': 1.0515, 'grad_norm': 2.89434552192688, 'learning_rate': 1.1388074291300096e-05, 'epoch': 1.93}
+{'loss': 1.2122, 'grad_norm': 5.483556270599365, 'learning_rate': 1.1363636363636363e-05, 'epoch': 1.93}
+{'loss': 0.9129, 'grad_norm': 3.8645455837249756, 'learning_rate': 1.133919843597263e-05, 'epoch': 1.93}
+{'loss': 1.2189, 'grad_norm': 3.9111804962158203, 'learning_rate': 1.1314760508308894e-05, 'epoch': 1.93}
+{'loss': 0.7437, 'grad_norm': 3.18015456199646, 'learning_rate': 1.129032258064516e-05, 'epoch': 1.93}
+{'loss': 1.3882, 'grad_norm': 8.127317428588867, 'learning_rate': 1.1265884652981426e-05, 'epoch': 1.93}
+{'loss': 1.6402, 'grad_norm': 2.8855273723602295, 'learning_rate': 1.1241446725317692e-05, 'epoch': 1.93}
+{'loss': 0.8951, 'grad_norm': 6.404877662658691, 'learning_rate': 1.1217008797653959e-05, 'epoch': 1.93}
+{'loss': 0.9152, 'grad_norm': 12.837389945983887, 'learning_rate': 1.1192570869990223e-05, 'epoch': 1.93}
+{'loss': 0.7218, 'grad_norm': 1.6619367599487305, 'learning_rate': 1.116813294232649e-05, 'epoch': 1.93}
+{'loss': 0.4666, 'grad_norm': 1.492072343826294, 'learning_rate': 1.1143695014662756e-05, 'epoch': 1.93}
+{'loss': 1.1153, 'grad_norm': 2.7796103954315186, 'learning_rate': 1.1119257086999021e-05, 'epoch': 1.93}
+{'loss': 1.9017, 'grad_norm': 6.7155938148498535, 'learning_rate': 1.1094819159335288e-05, 'epoch': 1.93}
+{'loss': 1.356, 'grad_norm': 0.7669269442558289, 'learning_rate': 1.1070381231671554e-05, 'epoch': 1.93}
+{'loss': 1.3973, 'grad_norm': 0.7935886383056641, 'learning_rate': 1.1045943304007819e-05, 'epoch': 1.93}
+{'loss': 1.2953, 'grad_norm': 1.2134813070297241, 'learning_rate': 1.1021505376344085e-05, 'epoch': 1.93}
+{'loss': 1.3599, 'grad_norm': 0.8597235083580017, 'learning_rate': 1.0997067448680352e-05, 'epoch': 1.93}
+{'loss': 1.2758, 'grad_norm': 0.8375195860862732, 'learning_rate': 1.0972629521016617e-05, 'epoch': 1.93}
+{'loss': 1.3688, 'grad_norm': 1.4954638481140137, 'learning_rate': 1.0948191593352883e-05, 'epoch': 1.93}
+{'loss': 1.3371, 'grad_norm': 1.4607149362564087, 'learning_rate': 1.092375366568915e-05, 'epoch': 1.93}
+{'loss': 1.34, 'grad_norm': 1.151136040687561, 'learning_rate': 1.0899315738025414e-05, 'epoch': 1.93}
+{'loss': 1.3688, 'grad_norm': 1.224950909614563, 'learning_rate': 1.0874877810361681e-05, 'epoch': 1.93}
+{'loss': 1.2989, 'grad_norm': 6.187594890594482, 'learning_rate': 1.0850439882697947e-05, 'epoch': 1.93}
+{'loss': 1.6841, 'grad_norm': 1.705400824546814, 'learning_rate': 1.0826001955034212e-05, 'epoch': 1.93}
+{'loss': 1.2696, 'grad_norm': 1.4183433055877686, 'learning_rate': 1.0801564027370479e-05, 'epoch': 1.93}
+{'loss': 1.5075, 'grad_norm': 2.378031015396118, 'learning_rate': 1.0777126099706745e-05, 'epoch': 1.93}
+{'loss': 1.2745, 'grad_norm': 1.5742651224136353, 'learning_rate': 1.075268817204301e-05, 'epoch': 1.93}
+{'loss': 1.3535, 'grad_norm': 1.3239703178405762, 'learning_rate': 1.0728250244379277e-05, 'epoch': 1.93}
+{'loss': 1.6419, 'grad_norm': 3.5635673999786377, 'learning_rate': 1.0703812316715543e-05, 'epoch': 1.93}
+{'loss': 1.3285, 'grad_norm': 2.4734835624694824, 'learning_rate': 1.0679374389051806e-05, 'epoch': 1.93}
+{'loss': 1.4164, 'grad_norm': 2.6944169998168945, 'learning_rate': 1.0654936461388074e-05, 'epoch': 1.93}
+{'loss': 1.3136, 'grad_norm': 1.283929467201233, 'learning_rate': 1.063049853372434e-05, 'epoch': 1.93}
+{'loss': 1.1611, 'grad_norm': 1.476118564605713, 'learning_rate': 1.0606060606060604e-05, 'epoch': 1.93}
+{'loss': 1.3293, 'grad_norm': 2.960695505142212, 'learning_rate': 1.0581622678396872e-05, 'epoch': 1.93}
+{'loss': 1.3852, 'grad_norm': 4.04657506942749, 'learning_rate': 1.0557184750733139e-05, 'epoch': 1.94}
+{'loss': 1.2408, 'grad_norm': 1.8370991945266724, 'learning_rate': 1.0532746823069402e-05, 'epoch': 1.94}
+{'loss': 1.395, 'grad_norm': 3.660257577896118, 'learning_rate': 1.0508308895405668e-05, 'epoch': 1.94}
+{'loss': 1.2782, 'grad_norm': 4.005573749542236, 'learning_rate': 1.0483870967741933e-05, 'epoch': 1.94}
+{'loss': 1.4618, 'grad_norm': 8.64341926574707, 'learning_rate': 1.04594330400782e-05, 'epoch': 1.94}
+{'loss': 1.9106, 'grad_norm': 8.12671947479248, 'learning_rate': 1.0434995112414466e-05, 'epoch': 1.94}
+{'loss': 1.3265, 'grad_norm': 2.215592384338379, 'learning_rate': 1.041055718475073e-05, 'epoch': 1.94}
+{'loss': 1.1993, 'grad_norm': 4.294419288635254, 'learning_rate': 1.0386119257086997e-05, 'epoch': 1.94}
+{'loss': 1.1445, 'grad_norm': 2.263684034347534, 'learning_rate': 1.0361681329423264e-05, 'epoch': 1.94}
+{'loss': 1.4243, 'grad_norm': 3.5883326530456543, 'learning_rate': 1.0337243401759529e-05, 'epoch': 1.94}
+{'loss': 0.9561, 'grad_norm': 1.9435137510299683, 'learning_rate': 1.0312805474095795e-05, 'epoch': 1.94}
+{'loss': 0.8797, 'grad_norm': 3.0882208347320557, 'learning_rate': 1.0288367546432062e-05, 'epoch': 1.94}
+{'loss': 1.2746, 'grad_norm': 7.617794036865234, 'learning_rate': 1.0263929618768326e-05, 'epoch': 1.94}
+{'loss': 1.4779, 'grad_norm': 4.225636959075928, 'learning_rate': 1.0239491691104593e-05, 'epoch': 1.94}
+{'loss': 1.4581, 'grad_norm': 15.488640785217285, 'learning_rate': 1.021505376344086e-05, 'epoch': 1.94}
+{'loss': 1.4015, 'grad_norm': 5.777149200439453, 'learning_rate': 1.0190615835777124e-05, 'epoch': 1.94}
+{'loss': 1.2572, 'grad_norm': 12.551581382751465, 'learning_rate': 1.016617790811339e-05, 'epoch': 1.94}
+{'loss': 1.0849, 'grad_norm': 4.459105491638184, 'learning_rate': 1.0141739980449657e-05, 'epoch': 1.94}
+{'loss': 1.3216, 'grad_norm': 3.434291362762451, 'learning_rate': 1.0117302052785922e-05, 'epoch': 1.94}
+{'loss': 0.7784, 'grad_norm': 2.0103023052215576, 'learning_rate': 1.0092864125122188e-05, 'epoch': 1.94}
+{'loss': 0.7125, 'grad_norm': 2.6893234252929688, 'learning_rate': 1.0068426197458455e-05, 'epoch': 1.94}
+{'loss': 0.998, 'grad_norm': 5.581327438354492, 'learning_rate': 1.004398826979472e-05, 'epoch': 1.94}
+{'loss': 1.2393, 'grad_norm': 3.5496435165405273, 'learning_rate': 1.0019550342130986e-05, 'epoch': 1.94}
+{'loss': 1.3163, 'grad_norm': 2.4860739707946777, 'learning_rate': 9.995112414467253e-06, 'epoch': 1.94}
+{'loss': 0.7082, 'grad_norm': 1.844963550567627, 'learning_rate': 9.970674486803518e-06, 'epoch': 1.94}
+{'loss': 0.6463, 'grad_norm': 2.7974820137023926, 'learning_rate': 9.946236559139784e-06, 'epoch': 1.94}
+ 97%|█████████▋| 12387/12776 [2:11:32<01:23,  4.67it/s] 97%|█████████▋| 12388/12776 [2:11:32<02:24,  2.68it/s]                                                        97%|█████████▋| 12388/12776 [2:11:32<02:24,  2.68it/s] 97%|█████████▋| 12389/12776 [2:11:34<04:49,  1.34it/s]                                                        97%|█████████▋| 12389/12776 [2:11:34<04:49,  1.34it/s] 97%|█████████▋| 12390/12776 [2:11:35<05:14,  1.23it/s]                                                        97%|█████████▋| 12390/12776 [2:11:35<05:14,  1.23it/s] 97%|█████████▋| 12391/12776 [2:11:36<05:19,  1.20it/s]                                                        97%|█████████▋| 12391/12776 [2:11:36<05:19,  1.20it/s] 97%|█████████▋| 12392/12776 [2:11:37<05:14,  1.22it/s]                                                        97%|█████████▋| 12392/12776 [2:11:37<05:14,  1.22it/s] 97%|█████████▋| 12393/12776 [2:11:38<05:21,  1.19it/s]                                                        97%|█████████▋| 12393/12776 [2:11:38<05:21,  1.19it/s] 97%|█████████▋| 12394/12776 [2:11:38<05:02,  1.26it/s]                                                        97%|█████████▋| 12394/12776 [2:11:38<05:02,  1.26it/s] 97%|█████████▋| 12395/12776 [2:11:39<04:47,  1.33it/s]                                                        97%|█████████▋| 12395/12776 [2:11:39<04:47,  1.33it/s] 97%|█████████▋| 12396/12776 [2:11:40<04:36,  1.38it/s]                                                        97%|█████████▋| 12396/12776 [2:11:40<04:36,  1.38it/s] 97%|█████████▋| 12397/12776 [2:11:40<04:22,  1.44it/s]                                                        97%|█████████▋| 12397/12776 [2:11:40<04:22,  1.44it/s] 97%|█████████▋| 12398/12776 [2:11:41<04:08,  1.52it/s]                                                        97%|█████████▋| 12398/12776 [2:11:41<04:08,  1.52it/s] 97%|█████████▋| 12399/12776 [2:11:41<03:56,  1.59it/s]                                                        97%|█████████▋| 12399/12776 [2:11:41<03:56,  1.59it/s] 97%|█████████▋| 12400/12776 [2:11:42<03:54,  1.60it/s]                                                        97%|█████████▋| 12400/12776 [2:11:42<03:54,  1.60it/s]Saving model checkpoint to ./checkpoint-12400
+Configuration saved in ./checkpoint-12400/config.json
+Model weights saved in ./checkpoint-12400/model.safetensors
+Feature extractor saved in ./checkpoint-12400/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-12400/tokenizer_config.json
+Special tokens file saved in ./checkpoint-12400/special_tokens_map.json
+added tokens file saved in ./checkpoint-12400/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-11200] due to args.save_total_limit
+/opt/conda/lib/python3.12/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
+  warnings.warn(
+ 97%|█████████▋| 12401/12776 [2:11:48<13:48,  2.21s/it]                                                        97%|█████████▋| 12401/12776 [2:11:48<13:48,  2.21s/it] 97%|█████████▋| 12402/12776 [2:11:48<10:30,  1.69s/it]                                                        97%|█████████▋| 12402/12776 [2:11:48<10:30,  1.69s/it] 97%|█████████▋| 12403/12776 [2:11:49<08:28,  1.36s/it]                                                        97%|█████████▋| 12403/12776 [2:11:49<08:28,  1.36s/it] 97%|█████████▋| 12404/12776 [2:11:49<06:45,  1.09s/it]                                                        97%|█████████▋| 12404/12776 [2:11:49<06:45,  1.09s/it] 97%|█████████▋| 12405/12776 [2:11:50<05:40,  1.09it/s]                                                        97%|█████████▋| 12405/12776 [2:11:50<05:40,  1.09it/s] 97%|█████████▋| 12406/12776 [2:11:50<04:38,  1.33it/s]                                                        97%|█████████▋| 12406/12776 [2:11:50<04:38,  1.33it/s] 97%|█████████▋| 12407/12776 [2:11:51<03:54,  1.58it/s]                                                        97%|█████████▋| 12407/12776 [2:11:51<03:54,  1.58it/s] 97%|█████████▋| 12408/12776 [2:11:51<03:27,  1.77it/s]                                                        97%|█████████▋| 12408/12776 [2:11:51<03:27,  1.77it/s] 97%|█████████▋| 12409/12776 [2:11:51<03:00,  2.04it/s]                                                        97%|█████████▋| 12409/12776 [2:11:51<03:00,  2.04it/s] 97%|█████████▋| 12410/12776 [2:11:52<02:40,  2.28it/s]                                                        97%|█████████▋| 12410/12776 [2:11:52<02:40,  2.28it/s] 97%|█████████▋| 12411/12776 [2:11:52<02:26,  2.50it/s]                                                        97%|█████████▋| 12411/12776 [2:11:52<02:26,  2.50it/s] 97%|█████████▋| 12412/12776 [2:11:52<02:24,  2.51it/s]                                                        97%|█████████▋| 12412/12776 [2:11:52<02:24,  2.51it/s] 97%|█████████▋| 12413/12776 [2:11:53<02:12,  2.73it/s]                                                        97%|█████████▋| 12413/12776 [2:11:53<02:12,  2.73it/s] 97%|█████████▋| 12414/12776 [2:11:53<02:02,  2.95it/s]                                                        97%|█████████▋| 12414/12776 [2:11:53<02:02,  2.95it/s] 97%|█████████▋| 12415/12776 [2:11:53<02:03,  2.92it/s]                                                        97%|█████████▋| 12415/12776 [2:11:53<02:03,  2.92it/s] 97%|█████████▋| 12416/12776 [2:11:54<01:54,  3.15it/s]                                                        97%|█████████▋| 12416/12776 [2:11:54<01:54,  3.15it/s] 97%|█████████▋| 12417/12776 [2:11:54<01:47,  3.35it/s]                                                        97%|█████████▋| 12417/12776 [2:11:54<01:47,  3.35it/s] 97%|█████████▋| 12418/12776 [2:11:54<01:41,  3.52it/s]                                                        97%|█████████▋| 12418/12776 [2:11:54<01:41,  3.52it/s] 97%|█████████▋| 12419/12776 [2:11:54<01:47,  3.31it/s]                                                        97%|█████████▋| 12419/12776 [2:11:54<01:47,  3.31it/s] 97%|█████████▋| 12420/12776 [2:11:55<01:41,  3.52it/s]                                                        97%|█████████▋| 12420/12776 [2:11:55<01:41,  3.52it/s] 97%|█████████▋| 12421/12776 [2:11:55<01:35,  3.73it/s]                                                        97%|█████████▋| 12421/12776 [2:11:55<01:35,  3.73it/s] 97%|█████████▋| 12422/12776 [2:11:55<01:30,  3.92it/s]                                                        97%|█████████▋| 12422/12776 [2:11:55<01:30,  3.92it/s] 97%|█████████▋| 12423/12776 [2:11:55<01:26,  4.09it/s]                                                        97%|█████████▋| 12423/12776 [2:11:55<01:26,  4.09it/s] 97%|█████████▋| 12424/12776 [2:11:56<01:36,  3.64it/s]                                                        97%|█████████▋| 12424/12776 [2:11:56<01:36,  3.64it/s] 97%|█████████▋| 12425/12776 [2:11:56<01:29,  3.93it/s]                                                        97%|█████████▋| 12425/12776 [2:11:56<01:29,  3.93it/s] 97%|█████████▋| 12426/12776 [2:11:56<01:23,  4.19it/s]                                                        97%|█████████▋| 12426/12776 [2:11:56<01:23,  4.19it/s] 97%|█████████▋| 12427/12776 [2:11:56<01:18,  4.42it/s]                                                        97%|█████████▋| 12427/12776 [2:11:56<01:18,  4.42it/s] 97%|█████████▋| 12428/12776 [2:11:56<01:14,  4.64it/s]                                                        97%|█████████▋| 12428/12776 [2:11:56<01:14,  4.64it/s] 97%|█████████▋| 12429/12776 [2:11:57<01:27,  3.98it/s]                                                        97%|█████████▋| 12429/12776 [2:11:57<01:27,  3.98it/s] 97%|█████████▋| 12430/12776 [2:11:57<01:20,  4.32it/s]                                                        97%|█████████▋| 12430/12776 [2:11:57<01:20,  4.32it/s] 97%|█████████▋| 12431/12776 [2:11:57<01:14,  4.60it/s]                                                        97%|█████████▋| 12431/12776 [2:11:57<01:14,  4.60it/s] 97%|█████████▋| 12432/12776 [2:11:57<01:10,  4.85it/s]                                                        97%|█████████▋| 12432/12776 [2:11:57<01:10,  4.85it/s] 97%|█████████▋| 12433/12776 [2:11:57<01:07,  5.06it/s]                                                        97%|█████████▋| 12433/12776 [2:11:57<01:07,  5.06it/s] 97%|█████████▋| 12434/12776 [2:11:58<01:05,  5.24it/s]                                                        97%|█████████▋| 12434/12776 [2:11:58<01:05,  5.24it/s] 97%|█████████▋| 12435/12776 [2:11:58<01:12,  4.70it/s]                                                        97%|█████████▋| 12435/12776 [2:11:58<01:12,  4.70it/s] 97%|█████████▋| 12436/12776 [2:11:58<01:07,  5.00it/s]                                                        97%|█████████▋| 12436/12776 [2:11:58<01:07,  5.00it/s] 97%|█████████▋| 12437/12776 [2:11:58<01:04,  5.30it/s]                                                        97%|█████████▋| 12437/12776 [2:11:58<01:04,  5.30it/s] 97%|█████████▋| 12438/12776 [2:11:59<02:03,  2.74it/s]                                                        97%|█████████▋| 12438/12776 [2:11:59<02:03,  2.74it/s] 97%|█████████▋| 12439/12776 [2:12:01<03:54,  1.44it/s]                                                        97%|█████████▋| 12439/12776 [2:12:01<03:54,  1.44it/s] 97%|█████████▋| 12440/12776 [2:12:01<04:18,  1.30it/s]                                                        97%|█████████▋| 12440/12776 [2:12:01<04:18,  1.30it/s] 97%|█████████▋| 12441/12776 [2:12:02<04:24,  1.27it/s]                                                        97%|█████████▋| 12441/12776 [2:12:02<04:24,  1.27it/s] 97%|█████████▋| 12442/12776 [2:12:03<04:22,  1.27it/s]                                                        97%|█████████▋| 12442/12776 [2:12:03<04:22,  1.27it/s] 97%|█████████▋| 12443/12776 [2:12:04<04:26,  1.25it/s]                                                        97%|█████████▋| 12443/12776 [2:12:04<04:26,  1.25it/s] 97%|█████████▋| 12444/12776 [2:12:05<04:14,  1.30it/s]                                                        97%|█████████▋| 12444/12776 [2:12:05<04:14,  1.30it/s] 97%|█████████▋| 12445/12776 [2:12:05<04:00,  1.38it/s]                                                        97%|█████████▋| 12445/12776 [2:12:05<04:00,  1.38it/s] 97%|█████████▋| 12446/12776 [2:12:06<04:00,  1.37it/s]                                                        97%|█████████▋| 12446/12776 [2:12:06<04:00,  1.37it/s] 97%|█████████▋| 12447/12776 [2:12:07<03:44,  1.46it/s]                                                        97%|█████████▋| 12447/12776 [2:12:07<03:44,  1.46it/s] 97%|█████████▋| 12448/12776 [2:12:07<03:33,  1.54it/s]                                                        97%|█████████▋| 12448/12776 [2:12:07<03:33,  1.54it/s] 97%|█████████▋| 12449/12776 [2:12:08<03:18,  1.65it/s]                                                        97%|█████████▋| 12449/12776 [2:12:08<03:18,  1.65it/s] 97%|█████████▋| 12450/12776 [2:12:08<03:04,  1.77it/s]                                                        97%|█████████▋| 12450/12776 [2:12:08<03:04,  1.77it/s] 97%|█████████▋| 12451/12776 [2:12:09<02:56,  1.84it/s]                                                        97%|█████████▋| 12451/12776 [2:12:09<02:56,  1.84it/s] 97%|█████████▋| 12452/12776 [2:12:09<02:42,  2.00it/s]                                                        97%|█████████▋| 12452/12776 [2:12:09<02:42,  2.00it/s] 97%|█████████▋| 12453/12776 [2:12:09<02:38,  2.03it/s]                                                        97%|█████████▋| 12453/12776 [2:12:09<02:38,  2.03it/s] 97%|█████████▋| 12454/12776 [2:12:10<02:27,  2.18it/s]                                                        97%|█████████▋| 12454/12776 [2:12:10<02:27,  2.18it/s] 97%|█████████▋| 12455/12776 [2:12:10<02:18,  2.31it/s]                                                        97%|█████████▋| 12455/12776 [2:12:10<02:18,  2.31it/s] 97%|█████████▋| 12456/12776 [2:12:11<02:16,  2.35it/s]                                                        97%|█████████▋| 12456/12776 [2:12:11<02:16,  2.35it/s] 98%|█████████▊| 12457/12776 [2:12:11<02:07,  2.50it/s]                                                        98%|█████████▊| 12457/12776 [2:12:11<02:07,  2.50it/s] 98%|█████████▊| 12458/12776 [2:12:11<02:00,  2.63it/s]                                                        98%|█████████▊| 12458/12776 [2:12:11<02:00,  2.63it/s] 98%|█████████▊| 12459/12776 [2:12:12<01:54,  2.76it/s]                                                        98%|█████████▊| 12459/12776 [2:12:12<01:54,  2.76it/s] 98%|█████████▊| 12460/12776 [2:12:12<01:53,  2.78it/s]                                                        98%|█████████▊| 12460/12776 [2:12:12<01:53,  2.78it/s] 98%|█████████▊| 12461/12776 [2:12:12<01:47,  2.92it/s]                                                        98%|█████████▊| 12461/12776 [2:12:12<01:47,  2.92it/s] 98%|█████████▊| 12462/12776 [2:12:13<01:42,  3.05it/s]                                                        98%|█████████▊| 12462/12776 [2:12:13<01:42,  3.05it/s] 98%|█████████▊| 12463/12776 [2:12:13<01:44,  3.00it/s]                                                        98%|█████████▊| 12463/12776 [2:12:13<01:44,  3.00it/s] 98%|█████████▊| 12464/12776 [2:12:13<01:40,  3.10it/s]                                                        98%|█████████▊| 12464/12776 [2:12:13<01:40,  3.10it/s] 98%|█████████▊| 12465/12776 [2:12:13<01:38,  3.17it/s]                                                       {'loss': 0.3869, 'grad_norm': 1.4978660345077515, 'learning_rate': 9.92179863147605e-06, 'epoch': 1.94}
+{'loss': 1.1001, 'grad_norm': 2.3386178016662598, 'learning_rate': 9.897360703812315e-06, 'epoch': 1.94}
+{'loss': 1.5033, 'grad_norm': 0.8785395622253418, 'learning_rate': 9.872922776148582e-06, 'epoch': 1.94}
+{'loss': 1.4967, 'grad_norm': 0.8468477725982666, 'learning_rate': 9.848484848484848e-06, 'epoch': 1.94}
+{'loss': 1.4881, 'grad_norm': 0.7762795686721802, 'learning_rate': 9.824046920821113e-06, 'epoch': 1.94}
+{'loss': 1.3553, 'grad_norm': 2.385578155517578, 'learning_rate': 9.79960899315738e-06, 'epoch': 1.94}
+{'loss': 1.4372, 'grad_norm': 1.221888542175293, 'learning_rate': 9.775171065493646e-06, 'epoch': 1.94}
+{'loss': 1.4205, 'grad_norm': 3.5840225219726562, 'learning_rate': 9.750733137829911e-06, 'epoch': 1.94}
+{'loss': 1.3966, 'grad_norm': 0.9820275902748108, 'learning_rate': 9.726295210166177e-06, 'epoch': 1.94}
+{'loss': 1.4722, 'grad_norm': 1.1478222608566284, 'learning_rate': 9.701857282502442e-06, 'epoch': 1.94}
+{'loss': 1.4673, 'grad_norm': 2.0976521968841553, 'learning_rate': 9.677419354838709e-06, 'epoch': 1.94}
+{'loss': 1.5253, 'grad_norm': 1.5438917875289917, 'learning_rate': 9.652981427174975e-06, 'epoch': 1.94}
+{'loss': 1.4075, 'grad_norm': 1.2291433811187744, 'learning_rate': 9.62854349951124e-06, 'epoch': 1.94}
+{'loss': 1.3334, 'grad_norm': 1.6943989992141724, 'learning_rate': 9.604105571847507e-06, 'epoch': 1.94}
+{'loss': 1.4683, 'grad_norm': 1.9269627332687378, 'learning_rate': 9.579667644183773e-06, 'epoch': 1.94}
+{'loss': 1.5016, 'grad_norm': 1.9164787530899048, 'learning_rate': 9.555229716520038e-06, 'epoch': 1.94}
+{'loss': 1.2375, 'grad_norm': 1.3316177129745483, 'learning_rate': 9.530791788856304e-06, 'epoch': 1.94}
+{'loss': 1.2165, 'grad_norm': 1.476271390914917, 'learning_rate': 9.50635386119257e-06, 'epoch': 1.94}
+{'loss': 1.2883, 'grad_norm': 1.4863686561584473, 'learning_rate': 9.481915933528836e-06, 'epoch': 1.94}
+{'loss': 1.4016, 'grad_norm': 1.1693488359451294, 'learning_rate': 9.457478005865102e-06, 'epoch': 1.94}
+{'loss': 1.2879, 'grad_norm': 2.0392889976501465, 'learning_rate': 9.433040078201369e-06, 'epoch': 1.94}
+{'loss': 1.3189, 'grad_norm': 2.9882824420928955, 'learning_rate': 9.408602150537633e-06, 'epoch': 1.94}
+{'loss': 1.2713, 'grad_norm': 1.9439655542373657, 'learning_rate': 9.3841642228739e-06, 'epoch': 1.94}
+{'loss': 1.4599, 'grad_norm': 2.5149528980255127, 'learning_rate': 9.359726295210165e-06, 'epoch': 1.94}
+{'loss': 1.2472, 'grad_norm': 1.612697720527649, 'learning_rate': 9.335288367546431e-06, 'epoch': 1.94}
+{'loss': 1.208, 'grad_norm': 4.500641345977783, 'learning_rate': 9.310850439882698e-06, 'epoch': 1.94}
+{'loss': 1.5878, 'grad_norm': 1.9317268133163452, 'learning_rate': 9.286412512218962e-06, 'epoch': 1.94}
+{'loss': 1.4057, 'grad_norm': 6.950214862823486, 'learning_rate': 9.261974584555229e-06, 'epoch': 1.94}
+{'loss': 1.3945, 'grad_norm': 4.08366584777832, 'learning_rate': 9.237536656891495e-06, 'epoch': 1.94}
+{'loss': 1.0823, 'grad_norm': 1.9059251546859741, 'learning_rate': 9.21309872922776e-06, 'epoch': 1.94}
+{'loss': 1.4755, 'grad_norm': 4.59588098526001, 'learning_rate': 9.188660801564027e-06, 'epoch': 1.94}
+{'loss': 1.1854, 'grad_norm': 3.578123092651367, 'learning_rate': 9.164222873900293e-06, 'epoch': 1.94}
+{'loss': 1.2789, 'grad_norm': 1.8698164224624634, 'learning_rate': 9.139784946236558e-06, 'epoch': 1.94}
+{'loss': 1.3508, 'grad_norm': 2.4767820835113525, 'learning_rate': 9.115347018572825e-06, 'epoch': 1.94}
+{'loss': 0.9728, 'grad_norm': 1.976932406425476, 'learning_rate': 9.090909090909091e-06, 'epoch': 1.94}
+{'loss': 1.157, 'grad_norm': 2.43166184425354, 'learning_rate': 9.066471163245356e-06, 'epoch': 1.94}
+{'loss': 0.9677, 'grad_norm': 1.8392789363861084, 'learning_rate': 9.042033235581622e-06, 'epoch': 1.94}
+{'loss': 1.2045, 'grad_norm': 6.830517768859863, 'learning_rate': 9.017595307917889e-06, 'epoch': 1.94}
+{'loss': 1.522, 'grad_norm': 4.780092239379883, 'learning_rate': 8.993157380254154e-06, 'epoch': 1.95}
+{'loss': 1.4294, 'grad_norm': 6.710925102233887, 'learning_rate': 8.968719452590418e-06, 'epoch': 1.95}
+{'loss': 1.1853, 'grad_norm': 4.024580478668213, 'learning_rate': 8.944281524926687e-06, 'epoch': 1.95}
+{'loss': 1.195, 'grad_norm': 5.138179302215576, 'learning_rate': 8.919843597262951e-06, 'epoch': 1.95}
+{'loss': 1.5426, 'grad_norm': 4.0508270263671875, 'learning_rate': 8.895405669599216e-06, 'epoch': 1.95}
+{'loss': 1.3934, 'grad_norm': 5.0509538650512695, 'learning_rate': 8.870967741935483e-06, 'epoch': 1.95}
+{'loss': 1.361, 'grad_norm': 7.239458084106445, 'learning_rate': 8.84652981427175e-06, 'epoch': 1.95}
+{'loss': 1.1353, 'grad_norm': 4.996304988861084, 'learning_rate': 8.822091886608014e-06, 'epoch': 1.95}
+{'loss': 1.052, 'grad_norm': 1.4053064584732056, 'learning_rate': 8.79765395894428e-06, 'epoch': 1.95}
+{'loss': 1.125, 'grad_norm': 4.434309959411621, 'learning_rate': 8.773216031280547e-06, 'epoch': 1.95}
+{'loss': 0.8836, 'grad_norm': 2.092318058013916, 'learning_rate': 8.748778103616812e-06, 'epoch': 1.95}
+{'loss': 0.9855, 'grad_norm': 2.4105422496795654, 'learning_rate': 8.724340175953078e-06, 'epoch': 1.95}
+{'loss': 1.01, 'grad_norm': 3.4820504188537598, 'learning_rate': 8.699902248289345e-06, 'epoch': 1.95}
+{'loss': 0.9834, 'grad_norm': 2.5442054271698, 'learning_rate': 8.67546432062561e-06, 'epoch': 1.95}
+{'loss': 1.4554, 'grad_norm': 0.9357713460922241, 'learning_rate': 8.651026392961876e-06, 'epoch': 1.95}
+{'loss': 1.4628, 'grad_norm': 0.6884608268737793, 'learning_rate': 8.626588465298143e-06, 'epoch': 1.95}
+{'loss': 1.3942, 'grad_norm': 1.8610217571258545, 'learning_rate': 8.602150537634407e-06, 'epoch': 1.95}
+{'loss': 1.4434, 'grad_norm': 1.9256157875061035, 'learning_rate': 8.577712609970674e-06, 'epoch': 1.95}
+{'loss': 1.4785, 'grad_norm': 1.0926450490951538, 'learning_rate': 8.55327468230694e-06, 'epoch': 1.95}
+{'loss': 1.4919, 'grad_norm': 1.835080623626709, 'learning_rate': 8.528836754643205e-06, 'epoch': 1.95}
+{'loss': 1.485, 'grad_norm': 0.9636733531951904, 'learning_rate': 8.504398826979472e-06, 'epoch': 1.95}
+{'loss': 1.5098, 'grad_norm': 1.1740062236785889, 'learning_rate': 8.479960899315738e-06, 'epoch': 1.95}
+{'loss': 1.4772, 'grad_norm': 1.6411094665527344, 'learning_rate': 8.455522971652003e-06, 'epoch': 1.95}
+{'loss': 1.4486, 'grad_norm': 1.150183916091919, 'learning_rate': 8.43108504398827e-06, 'epoch': 1.95}
+{'loss': 1.4765, 'grad_norm': 2.3827710151672363, 'learning_rate': 8.406647116324536e-06, 'epoch': 1.95}
+{'loss': 1.2571, 'grad_norm': 1.509945034980774, 'learning_rate': 8.3822091886608e-06, 'epoch': 1.95}
+{'loss': 1.3709, 'grad_norm': 1.6995009183883667, 'learning_rate': 8.357771260997067e-06, 'epoch': 1.95}
+{'loss': 1.4892, 'grad_norm': 1.6770323514938354, 'learning_rate': 8.333333333333332e-06, 'epoch': 1.95}
+{'loss': 1.4031, 'grad_norm': 1.3949050903320312, 'learning_rate': 8.308895405669599e-06, 'epoch': 1.95}
+{'loss': 1.3407, 'grad_norm': 4.16380500793457, 'learning_rate': 8.284457478005865e-06, 'epoch': 1.95}
+{'loss': 1.6301, 'grad_norm': 1.7018452882766724, 'learning_rate': 8.26001955034213e-06, 'epoch': 1.95}
+{'loss': 1.4144, 'grad_norm': 1.3921611309051514, 'learning_rate': 8.235581622678396e-06, 'epoch': 1.95}
+{'loss': 1.4583, 'grad_norm': 2.7200441360473633, 'learning_rate': 8.211143695014663e-06, 'epoch': 1.95}
+{'loss': 1.4575, 'grad_norm': 1.539747714996338, 'learning_rate': 8.186705767350928e-06, 'epoch': 1.95}
+{'loss': 1.266, 'grad_norm': 1.5239930152893066, 'learning_rate': 8.162267839687194e-06, 'epoch': 1.95}
+{'loss': 1.2497, 'grad_norm': 2.7010016441345215, 'learning_rate': 8.137829912023459e-06, 'epoch': 1.95}
+{'loss': 1.537, 'grad_norm': 3.007016658782959, 'learning_rate': 8.113391984359725e-06, 'epoch': 1.95}
+{'loss': 1.4236, 'grad_norm': 1.432244896888733, 'learning_rate': 8.088954056695992e-06, 'epoch': 1.95}
+{'loss': 1.5051, 'grad_norm': 1.9789886474609375, 'learning_rate': 8.064516129032257e-06, 'epoch': 1.95}
+{'loss': 1.3133, 'grad_norm': 1.625560998916626, 'learning_rate': 8.040078201368523e-06, 'epoch': 1.95}
+ 98%|█████████▊| 12465/12776 [2:12:13<01:38,  3.17it/s] 98%|█████████▊| 12466/12776 [2:12:14<01:34,  3.29it/s]                                                        98%|█████████▊| 12466/12776 [2:12:14<01:34,  3.29it/s] 98%|█████████▊| 12467/12776 [2:12:14<01:39,  3.11it/s]                                                        98%|█████████▊| 12467/12776 [2:12:14<01:39,  3.11it/s] 98%|█████████▊| 12468/12776 [2:12:14<01:33,  3.29it/s]                                                        98%|█████████▊| 12468/12776 [2:12:14<01:33,  3.29it/s] 98%|█████████▊| 12469/12776 [2:12:15<01:28,  3.45it/s]                                                        98%|█████████▊| 12469/12776 [2:12:15<01:28,  3.45it/s] 98%|█████████▊| 12470/12776 [2:12:15<01:24,  3.61it/s]                                                        98%|█████████▊| 12470/12776 [2:12:15<01:24,  3.61it/s] 98%|█████████▊| 12471/12776 [2:12:15<01:31,  3.32it/s]                                                        98%|█████████▊| 12471/12776 [2:12:15<01:31,  3.32it/s] 98%|█████████▊| 12472/12776 [2:12:15<01:25,  3.54it/s]                                                        98%|█████████▊| 12472/12776 [2:12:15<01:25,  3.54it/s] 98%|█████████▊| 12473/12776 [2:12:16<01:21,  3.73it/s]                                                        98%|█████████▊| 12473/12776 [2:12:16<01:21,  3.73it/s] 98%|█████████▊| 12474/12776 [2:12:16<01:17,  3.91it/s]                                                        98%|█████████▊| 12474/12776 [2:12:16<01:17,  3.91it/s] 98%|█████████▊| 12475/12776 [2:12:16<01:23,  3.62it/s]                                                        98%|█████████▊| 12475/12776 [2:12:16<01:23,  3.62it/s] 98%|█████████▊| 12476/12776 [2:12:16<01:17,  3.88it/s]                                                        98%|█████████▊| 12476/12776 [2:12:16<01:17,  3.88it/s] 98%|█████████▊| 12477/12776 [2:12:17<01:12,  4.11it/s]                                                        98%|█████████▊| 12477/12776 [2:12:17<01:12,  4.11it/s] 98%|█████████▊| 12478/12776 [2:12:17<01:09,  4.30it/s]                                                        98%|█████████▊| 12478/12776 [2:12:17<01:09,  4.30it/s] 98%|█████████▊| 12479/12776 [2:12:17<01:07,  4.43it/s]                                                        98%|█████████▊| 12479/12776 [2:12:17<01:07,  4.43it/s] 98%|█████████▊| 12480/12776 [2:12:17<01:12,  4.09it/s]                                                        98%|█████████▊| 12480/12776 [2:12:17<01:12,  4.09it/s] 98%|█████████▊| 12481/12776 [2:12:18<01:06,  4.46it/s]                                                        98%|█████████▊| 12481/12776 [2:12:18<01:06,  4.46it/s] 98%|█████████▊| 12482/12776 [2:12:18<01:01,  4.77it/s]                                                        98%|█████████▊| 12482/12776 [2:12:18<01:01,  4.77it/s] 98%|█████████▊| 12483/12776 [2:12:18<00:58,  5.04it/s]                                                        98%|█████████▊| 12483/12776 [2:12:18<00:58,  5.04it/s] 98%|█████████▊| 12484/12776 [2:12:18<00:55,  5.25it/s]                                                        98%|█████████▊| 12484/12776 [2:12:18<00:55,  5.25it/s] 98%|█████████▊| 12485/12776 [2:12:18<00:53,  5.40it/s]                                                        98%|█████████▊| 12485/12776 [2:12:18<00:53,  5.40it/s] 98%|█████████▊| 12486/12776 [2:12:19<01:00,  4.78it/s]                                                        98%|█████████▊| 12486/12776 [2:12:19<01:00,  4.78it/s] 98%|█████████▊| 12487/12776 [2:12:19<00:56,  5.13it/s]                                                        98%|█████████▊| 12487/12776 [2:12:19<00:56,  5.13it/s] 98%|█████████▊| 12488/12776 [2:12:19<01:39,  2.91it/s]                                                        98%|█████████▊| 12488/12776 [2:12:19<01:39,  2.91it/s] 98%|█████████▊| 12489/12776 [2:12:21<03:16,  1.46it/s]                                                        98%|█████████▊| 12489/12776 [2:12:21<03:16,  1.46it/s] 98%|█████████▊| 12490/12776 [2:12:22<03:34,  1.33it/s]                                                        98%|█████████▊| 12490/12776 [2:12:22<03:34,  1.33it/s] 98%|█████████▊| 12491/12776 [2:12:23<03:41,  1.29it/s]                                                        98%|█████████▊| 12491/12776 [2:12:23<03:41,  1.29it/s] 98%|█████████▊| 12492/12776 [2:12:23<03:44,  1.27it/s]                                                        98%|█████████▊| 12492/12776 [2:12:23<03:44,  1.27it/s] 98%|█████████▊| 12493/12776 [2:12:24<03:49,  1.23it/s]                                                        98%|█████████▊| 12493/12776 [2:12:24<03:49,  1.23it/s] 98%|█████████▊| 12494/12776 [2:12:25<03:40,  1.28it/s]                                                        98%|█████████▊| 12494/12776 [2:12:25<03:40,  1.28it/s] 98%|█████████▊| 12495/12776 [2:12:26<03:35,  1.31it/s]                                                        98%|█████████▊| 12495/12776 [2:12:26<03:35,  1.31it/s] 98%|█████████▊| 12496/12776 [2:12:26<03:23,  1.38it/s]                                                        98%|█████████▊| 12496/12776 [2:12:26<03:23,  1.38it/s] 98%|█████████▊| 12497/12776 [2:12:27<03:12,  1.45it/s]                                                        98%|█████████▊| 12497/12776 [2:12:27<03:12,  1.45it/s] 98%|█████████▊| 12498/12776 [2:12:28<03:01,  1.53it/s]                                                        98%|█████████▊| 12498/12776 [2:12:28<03:01,  1.53it/s] 98%|█████████▊| 12499/12776 [2:12:28<02:53,  1.60it/s]                                                        98%|█████████▊| 12499/12776 [2:12:28<02:53,  1.60it/s] 98%|█████████▊| 12500/12776 [2:12:29<02:44,  1.67it/s]                                                        98%|█████████▊| 12500/12776 [2:12:29<02:44,  1.67it/s] 98%|█████████▊| 12501/12776 [2:12:29<02:45,  1.66it/s]                                                        98%|█████████▊| 12501/12776 [2:12:29<02:45,  1.66it/s] 98%|█████████▊| 12502/12776 [2:12:30<02:35,  1.76it/s]                                                        98%|█████████▊| 12502/12776 [2:12:30<02:35,  1.76it/s] 98%|█████████▊| 12503/12776 [2:12:30<02:34,  1.77it/s]                                                        98%|█████████▊| 12503/12776 [2:12:30<02:34,  1.77it/s] 98%|█████████▊| 12504/12776 [2:12:31<02:22,  1.91it/s]                                                        98%|█████████▊| 12504/12776 [2:12:31<02:22,  1.91it/s] 98%|█████████▊| 12505/12776 [2:12:31<02:20,  1.93it/s]                                                        98%|█████████▊| 12505/12776 [2:12:31<02:20,  1.93it/s] 98%|█████████▊| 12506/12776 [2:12:32<02:10,  2.07it/s]                                                        98%|█████████▊| 12506/12776 [2:12:32<02:10,  2.07it/s] 98%|█████████▊| 12507/12776 [2:12:32<02:02,  2.19it/s]                                                        98%|█████████▊| 12507/12776 [2:12:32<02:02,  2.19it/s] 98%|█████████▊| 12508/12776 [2:12:32<01:56,  2.31it/s]                                                        98%|█████████▊| 12508/12776 [2:12:32<01:56,  2.31it/s] 98%|█████████▊| 12509/12776 [2:12:33<01:49,  2.44it/s]                                                        98%|█████████▊| 12509/12776 [2:12:33<01:49,  2.44it/s] 98%|█████████▊| 12510/12776 [2:12:33<01:44,  2.55it/s]                                                        98%|█████████▊| 12510/12776 [2:12:33<01:44,  2.55it/s] 98%|█████████▊| 12511/12776 [2:12:34<01:46,  2.49it/s]                                                        98%|█████████▊| 12511/12776 [2:12:34<01:46,  2.49it/s] 98%|█████████▊| 12512/12776 [2:12:34<01:40,  2.63it/s]                                                        98%|█████████▊| 12512/12776 [2:12:34<01:40,  2.63it/s] 98%|█████████▊| 12513/12776 [2:12:34<01:35,  2.75it/s]                                                        98%|█████████▊| 12513/12776 [2:12:34<01:35,  2.75it/s] 98%|█████████▊| 12514/12776 [2:12:35<01:30,  2.89it/s]                                                        98%|█████████▊| 12514/12776 [2:12:35<01:30,  2.89it/s] 98%|█████████▊| 12515/12776 [2:12:35<01:27,  2.98it/s]                                                        98%|█████████▊| 12515/12776 [2:12:35<01:27,  2.98it/s] 98%|█████████▊| 12516/12776 [2:12:35<01:23,  3.11it/s]                                                        98%|█████████▊| 12516/12776 [2:12:35<01:23,  3.11it/s] 98%|█████████▊| 12517/12776 [2:12:35<01:20,  3.24it/s]                                                        98%|█████████▊| 12517/12776 [2:12:35<01:20,  3.24it/s] 98%|█████████▊| 12518/12776 [2:12:36<01:16,  3.37it/s]                                                        98%|█████████▊| 12518/12776 [2:12:36<01:16,  3.37it/s] 98%|█████████▊| 12519/12776 [2:12:36<01:14,  3.44it/s]                                                        98%|█████████▊| 12519/12776 [2:12:36<01:14,  3.44it/s] 98%|█████████▊| 12520/12776 [2:12:36<01:11,  3.60it/s]                                                        98%|█████████▊| 12520/12776 [2:12:36<01:11,  3.60it/s] 98%|█████████▊| 12521/12776 [2:12:36<01:08,  3.72it/s]                                                        98%|█████████▊| 12521/12776 [2:12:36<01:08,  3.72it/s] 98%|█████████▊| 12522/12776 [2:12:37<01:06,  3.83it/s]                                                        98%|█████████▊| 12522/12776 [2:12:37<01:06,  3.83it/s] 98%|█████████▊| 12523/12776 [2:12:37<01:14,  3.39it/s]                                                        98%|█████████▊| 12523/12776 [2:12:37<01:14,  3.39it/s] 98%|█████████▊| 12524/12776 [2:12:37<01:09,  3.63it/s]                                                        98%|█████████▊| 12524/12776 [2:12:37<01:09,  3.63it/s] 98%|█████████▊| 12525/12776 [2:12:37<01:05,  3.85it/s]                                                        98%|█████████▊| 12525/12776 [2:12:37<01:05,  3.85it/s] 98%|█████████▊| 12526/12776 [2:12:38<01:01,  4.04it/s]                                                        98%|█████████▊| 12526/12776 [2:12:38<01:01,  4.04it/s] 98%|████��████▊| 12527/12776 [2:12:38<01:09,  3.60it/s]                                                        98%|█████████▊| 12527/12776 [2:12:38<01:09,  3.60it/s] 98%|█████████▊| 12528/12776 [2:12:38<01:03,  3.89it/s]                                                        98%|█████████▊| 12528/12776 [2:12:38<01:03,  3.89it/s] 98%|█████████▊| 12529/12776 [2:12:38<01:00,  4.12it/s]                                                        98%|█████████▊| 12529/12776 [2:12:38<01:00,  4.12it/s] 98%|█████████▊| 12530/12776 [2:12:39<00:56,  4.32it/s]                                                        98%|█████████▊| 12530/12776 [2:12:39<00:56,  4.32it/s] 98%|█████████▊| 12531/12776 [2:12:39<00:54,  4.50it/s]                                                        98%|█████████▊| 12531/12776 [2:12:39<00:54,  4.50it/s] 98%|█████████▊| 12532/12776 [2:12:39<01:00,  4.06it/s]                                                        98%|█████████▊| 12532/12776 [2:12:39<01:00,  4.06it/s] 98%|█████████▊| 12533/12776 [2:12:39<00:56,  4.31it/s]                                                        98%|█████████▊| 12533/12776 [2:12:39<00:56,  4.31it/s] 98%|█████████▊| 12534/12776 [2:12:40<00:53,  4.51it/s]                                                        98%|█████████▊| 12534/12776 [2:12:40<00:53,  4.51it/s] 98%|█████████▊| 12535/12776 [2:12:40<00:51,  4.69it/s]                                                        98%|█████████▊| 12535/12776 [2:12:40<00:51,  4.69it/s] 98%|█████████▊| 12536/12776 [2:12:40<00:49,  4.82it/s]                                                        98%|█████████▊| 12536/12776 [2:12:40<00:49,  4.82it/s] 98%|█████████▊| 12537/12776 [2:12:40<00:48,  4.96it/s]                                                        98%|█████████▊| 12537/12776 [2:12:40<00:48,  4.96it/s] 98%|█████████▊| 12538/12776 [2:12:41<01:23,  2.86it/s]                                                        98%|█████████▊| 12538/12776 [2:12:41<01:23,  2.86it/s] 98%|█████████▊| 12539/12776 [2:12:42<02:39,  1.49it/s]                                                        98%|█████████▊| 12539/12776 [2:12:42<02:39,  1.49it/s] 98%|█████████▊| 12540/12776 [2:12:43<02:58,  1.32it/s]                                                        98%|█████████▊| 12540/12776 [2:12:43<02:58,  1.32it/s] 98%|█████████▊| 12541/12776 [2:12:44<03:10,  1.23it/s]                                                        98%|█████████▊| 12541/12776 [2:12:44<03:10,  1.23it/s] 98%|█████████▊| 12542/12776 [2:12:45<03:08,  1.24it/s]                                                        98%|█████████▊| 12542/12776 [2:12:45<03:08,  1.24it/s] 98%|█████████▊| 12543/12776 [2:12:46<03:01,  1.28it/s]                                                       {'loss': 1.3131, 'grad_norm': 5.823700428009033, 'learning_rate': 8.01564027370479e-06, 'epoch': 1.95}
+{'loss': 1.2024, 'grad_norm': 1.7935535907745361, 'learning_rate': 7.991202346041054e-06, 'epoch': 1.95}
+{'loss': 1.0846, 'grad_norm': 2.3289988040924072, 'learning_rate': 7.966764418377321e-06, 'epoch': 1.95}
+{'loss': 1.53, 'grad_norm': 7.128284454345703, 'learning_rate': 7.942326490713586e-06, 'epoch': 1.95}
+{'loss': 1.2947, 'grad_norm': 1.9482355117797852, 'learning_rate': 7.917888563049852e-06, 'epoch': 1.95}
+{'loss': 1.2437, 'grad_norm': 1.5053616762161255, 'learning_rate': 7.893450635386119e-06, 'epoch': 1.95}
+{'loss': 1.5358, 'grad_norm': 8.75932502746582, 'learning_rate': 7.869012707722384e-06, 'epoch': 1.95}
+{'loss': 0.8694, 'grad_norm': 2.771195411682129, 'learning_rate': 7.84457478005865e-06, 'epoch': 1.95}
+{'loss': 1.6965, 'grad_norm': 5.4038262367248535, 'learning_rate': 7.820136852394917e-06, 'epoch': 1.95}
+{'loss': 1.3113, 'grad_norm': 2.8601737022399902, 'learning_rate': 7.795698924731181e-06, 'epoch': 1.95}
+{'loss': 1.778, 'grad_norm': 13.03695011138916, 'learning_rate': 7.771260997067448e-06, 'epoch': 1.95}
+{'loss': 0.9247, 'grad_norm': 4.585375785827637, 'learning_rate': 7.746823069403714e-06, 'epoch': 1.95}
+{'loss': 1.1538, 'grad_norm': 2.3865814208984375, 'learning_rate': 7.72238514173998e-06, 'epoch': 1.95}
+{'loss': 1.0104, 'grad_norm': 4.630260944366455, 'learning_rate': 7.697947214076246e-06, 'epoch': 1.95}
+{'loss': 1.0723, 'grad_norm': 2.182964563369751, 'learning_rate': 7.673509286412512e-06, 'epoch': 1.95}
+{'loss': 0.6017, 'grad_norm': 4.5911760330200195, 'learning_rate': 7.649071358748777e-06, 'epoch': 1.95}
+{'loss': 0.8234, 'grad_norm': 3.150099754333496, 'learning_rate': 7.6246334310850434e-06, 'epoch': 1.95}
+{'loss': 1.0587, 'grad_norm': 1.7521438598632812, 'learning_rate': 7.600195503421309e-06, 'epoch': 1.95}
+{'loss': 1.5662, 'grad_norm': 3.584641933441162, 'learning_rate': 7.575757575757575e-06, 'epoch': 1.95}
+{'loss': 1.109, 'grad_norm': 3.3541533946990967, 'learning_rate': 7.55131964809384e-06, 'epoch': 1.95}
+{'loss': 1.2002, 'grad_norm': 7.451817512512207, 'learning_rate': 7.526881720430107e-06, 'epoch': 1.95}
+{'loss': 0.6299, 'grad_norm': 6.928215503692627, 'learning_rate': 7.5024437927663725e-06, 'epoch': 1.95}
+{'loss': 1.0142, 'grad_norm': 3.0479836463928223, 'learning_rate': 7.478005865102638e-06, 'epoch': 1.95}
+{'loss': 0.4941, 'grad_norm': 1.0150418281555176, 'learning_rate': 7.453567937438905e-06, 'epoch': 1.95}
+{'loss': 1.4971, 'grad_norm': 1.5538381338119507, 'learning_rate': 7.42913000977517e-06, 'epoch': 1.96}
+{'loss': 1.4253, 'grad_norm': 0.806178867816925, 'learning_rate': 7.404692082111436e-06, 'epoch': 1.96}
+{'loss': 1.2646, 'grad_norm': 1.2289999723434448, 'learning_rate': 7.3802541544477025e-06, 'epoch': 1.96}
+{'loss': 1.4464, 'grad_norm': 1.0183123350143433, 'learning_rate': 7.355816226783968e-06, 'epoch': 1.96}
+{'loss': 1.3698, 'grad_norm': 0.9893926382064819, 'learning_rate': 7.331378299120234e-06, 'epoch': 1.96}
+{'loss': 1.507, 'grad_norm': 1.2529398202896118, 'learning_rate': 7.3069403714565e-06, 'epoch': 1.96}
+{'loss': 1.3113, 'grad_norm': 0.9131374955177307, 'learning_rate': 7.282502443792766e-06, 'epoch': 1.96}
+{'loss': 1.3692, 'grad_norm': 0.9663822650909424, 'learning_rate': 7.2580645161290315e-06, 'epoch': 1.96}
+{'loss': 1.4265, 'grad_norm': 4.0027265548706055, 'learning_rate': 7.233626588465298e-06, 'epoch': 1.96}
+{'loss': 1.4167, 'grad_norm': 1.6491836309432983, 'learning_rate': 7.209188660801564e-06, 'epoch': 1.96}
+{'loss': 1.5605, 'grad_norm': 1.1959609985351562, 'learning_rate': 7.184750733137829e-06, 'epoch': 1.96}
+{'loss': 1.417, 'grad_norm': 2.8737237453460693, 'learning_rate': 7.160312805474095e-06, 'epoch': 1.96}
+{'loss': 1.3524, 'grad_norm': 2.0030691623687744, 'learning_rate': 7.1358748778103615e-06, 'epoch': 1.96}
+{'loss': 1.3771, 'grad_norm': 1.234277606010437, 'learning_rate': 7.111436950146627e-06, 'epoch': 1.96}
+{'loss': 1.4366, 'grad_norm': 3.153789520263672, 'learning_rate': 7.086999022482893e-06, 'epoch': 1.96}
+{'loss': 1.5128, 'grad_norm': 2.164249897003174, 'learning_rate': 7.062561094819159e-06, 'epoch': 1.96}
+{'loss': 1.3222, 'grad_norm': 2.0796103477478027, 'learning_rate': 7.038123167155425e-06, 'epoch': 1.96}
+{'loss': 1.4278, 'grad_norm': 10.709492683410645, 'learning_rate': 7.0136852394916906e-06, 'epoch': 1.96}
+{'loss': 1.3899, 'grad_norm': 3.223146915435791, 'learning_rate': 6.989247311827957e-06, 'epoch': 1.96}
+{'loss': 1.3115, 'grad_norm': 1.8081053495407104, 'learning_rate': 6.964809384164223e-06, 'epoch': 1.96}
+{'loss': 1.1652, 'grad_norm': 2.1025900840759277, 'learning_rate': 6.9403714565004875e-06, 'epoch': 1.96}
+{'loss': 1.33, 'grad_norm': 2.087188720703125, 'learning_rate': 6.915933528836755e-06, 'epoch': 1.96}
+{'loss': 1.3163, 'grad_norm': 1.6247758865356445, 'learning_rate': 6.89149560117302e-06, 'epoch': 1.96}
+{'loss': 1.6139, 'grad_norm': 5.537015914916992, 'learning_rate': 6.867057673509285e-06, 'epoch': 1.96}
+{'loss': 1.3413, 'grad_norm': 5.143603801727295, 'learning_rate': 6.842619745845553e-06, 'epoch': 1.96}
+{'loss': 1.2207, 'grad_norm': 6.505813121795654, 'learning_rate': 6.8181818181818174e-06, 'epoch': 1.96}
+{'loss': 1.3264, 'grad_norm': 3.945194721221924, 'learning_rate': 6.793743890518083e-06, 'epoch': 1.96}
+{'loss': 1.1143, 'grad_norm': 3.9010744094848633, 'learning_rate': 6.769305962854349e-06, 'epoch': 1.96}
+{'loss': 1.3981, 'grad_norm': 3.1134190559387207, 'learning_rate': 6.744868035190615e-06, 'epoch': 1.96}
+{'loss': 0.932, 'grad_norm': 1.898162841796875, 'learning_rate': 6.720430107526881e-06, 'epoch': 1.96}
+{'loss': 1.4597, 'grad_norm': 4.892297744750977, 'learning_rate': 6.6959921798631465e-06, 'epoch': 1.96}
+{'loss': 1.2172, 'grad_norm': 2.914050340652466, 'learning_rate': 6.671554252199413e-06, 'epoch': 1.96}
+{'loss': 1.0094, 'grad_norm': 2.2120187282562256, 'learning_rate': 6.647116324535679e-06, 'epoch': 1.96}
+{'loss': 1.2513, 'grad_norm': 3.932375907897949, 'learning_rate': 6.622678396871944e-06, 'epoch': 1.96}
+{'loss': 1.0083, 'grad_norm': 2.4831714630126953, 'learning_rate': 6.598240469208211e-06, 'epoch': 1.96}
+{'loss': 1.1801, 'grad_norm': 1.6757017374038696, 'learning_rate': 6.5738025415444764e-06, 'epoch': 1.96}
+{'loss': 1.4077, 'grad_norm': 4.2157745361328125, 'learning_rate': 6.549364613880742e-06, 'epoch': 1.96}
+{'loss': 1.1976, 'grad_norm': 7.748180866241455, 'learning_rate': 6.524926686217009e-06, 'epoch': 1.96}
+{'loss': 0.8903, 'grad_norm': 2.130333423614502, 'learning_rate': 6.500488758553274e-06, 'epoch': 1.96}
+{'loss': 1.4919, 'grad_norm': 11.016569137573242, 'learning_rate': 6.47605083088954e-06, 'epoch': 1.96}
+{'loss': 1.0693, 'grad_norm': 3.6152400970458984, 'learning_rate': 6.451612903225806e-06, 'epoch': 1.96}
+{'loss': 0.9952, 'grad_norm': 5.328700542449951, 'learning_rate': 6.427174975562072e-06, 'epoch': 1.96}
+{'loss': 1.0022, 'grad_norm': 4.276764869689941, 'learning_rate': 6.402737047898338e-06, 'epoch': 1.96}
+{'loss': 1.5189, 'grad_norm': 5.339593887329102, 'learning_rate': 6.378299120234603e-06, 'epoch': 1.96}
+{'loss': 0.8523, 'grad_norm': 3.1604864597320557, 'learning_rate': 6.35386119257087e-06, 'epoch': 1.96}
+{'loss': 1.2364, 'grad_norm': 2.233135461807251, 'learning_rate': 6.3294232649071355e-06, 'epoch': 1.96}
+{'loss': 0.8228, 'grad_norm': 9.808837890625, 'learning_rate': 6.304985337243401e-06, 'epoch': 1.96}
+{'loss': 0.731, 'grad_norm': 4.273067474365234, 'learning_rate': 6.280547409579668e-06, 'epoch': 1.96}
+{'loss': 0.725, 'grad_norm': 4.046677112579346, 'learning_rate': 6.256109481915933e-06, 'epoch': 1.96}
+{'loss': 0.5296, 'grad_norm': 3.113222599029541, 'learning_rate': 6.231671554252199e-06, 'epoch': 1.96}
+{'loss': 1.5546, 'grad_norm': 1.053043007850647, 'learning_rate': 6.207233626588465e-06, 'epoch': 1.96}
+{'loss': 1.3777, 'grad_norm': 1.1030488014221191, 'learning_rate': 6.182795698924731e-06, 'epoch': 1.96}
+{'loss': 1.3897, 'grad_norm': 0.8204811215400696, 'learning_rate': 6.158357771260996e-06, 'epoch': 1.96}
+{'loss': 1.4116, 'grad_norm': 0.8840574622154236, 'learning_rate': 6.133919843597263e-06, 'epoch': 1.96}
+ 98%|█████████▊| 12543/12776 [2:12:46<03:01,  1.28it/s] 98%|█████████▊| 12544/12776 [2:12:46<02:54,  1.33it/s]                                                        98%|█████████▊| 12544/12776 [2:12:46<02:54,  1.33it/s] 98%|█████████▊| 12545/12776 [2:12:47<02:45,  1.39it/s]                                                        98%|█████████▊| 12545/12776 [2:12:47<02:45,  1.39it/s] 98%|█████████▊| 12546/12776 [2:12:48<02:38,  1.46it/s]                                                        98%|█████████▊| 12546/12776 [2:12:48<02:38,  1.46it/s] 98%|█████████▊| 12547/12776 [2:12:48<02:30,  1.52it/s]                                                        98%|█████████▊| 12547/12776 [2:12:48<02:30,  1.52it/s] 98%|█████████▊| 12548/12776 [2:12:49<02:22,  1.59it/s]                                                        98%|█████████▊| 12548/12776 [2:12:49<02:22,  1.59it/s] 98%|█████████▊| 12549/12776 [2:12:49<02:16,  1.67it/s]                                                        98%|█████████▊| 12549/12776 [2:12:49<02:16,  1.67it/s] 98%|█████████▊| 12550/12776 [2:12:50<02:13,  1.70it/s]                                                        98%|█████████▊| 12550/12776 [2:12:50<02:13,  1.70it/s] 98%|█████████▊| 12551/12776 [2:12:50<02:06,  1.79it/s]                                                        98%|█████████▊| 12551/12776 [2:12:50<02:06,  1.79it/s] 98%|█████████▊| 12552/12776 [2:12:51<02:03,  1.82it/s]                                                        98%|█████████▊| 12552/12776 [2:12:51<02:03,  1.82it/s] 98%|█████████▊| 12553/12776 [2:12:51<01:55,  1.93it/s]                                                        98%|█████████▊| 12553/12776 [2:12:51<01:55,  1.93it/s] 98%|█████████▊| 12554/12776 [2:12:52<01:54,  1.95it/s]                                                        98%|█████████▊| 12554/12776 [2:12:52<01:54,  1.95it/s] 98%|█████████▊| 12555/12776 [2:12:52<01:46,  2.07it/s]                                                        98%|█████████▊| 12555/12776 [2:12:52<01:46,  2.07it/s] 98%|█████████▊| 12556/12776 [2:12:53<01:40,  2.18it/s]                                                        98%|█████████▊| 12556/12776 [2:12:53<01:40,  2.18it/s] 98%|█████████▊| 12557/12776 [2:12:53<01:42,  2.14it/s]                                                        98%|█████████▊| 12557/12776 [2:12:53<01:42,  2.14it/s] 98%|█████████▊| 12558/12776 [2:12:54<01:36,  2.27it/s]                                                        98%|█████████▊| 12558/12776 [2:12:54<01:36,  2.27it/s] 98%|█████████▊| 12559/12776 [2:12:54<01:30,  2.41it/s]                                                        98%|█████████▊| 12559/12776 [2:12:54<01:30,  2.41it/s] 98%|█████████▊| 12560/12776 [2:12:54<01:30,  2.39it/s]                                                        98%|█████████▊| 12560/12776 [2:12:54<01:30,  2.39it/s] 98%|█████████▊| 12561/12776 [2:12:55<01:24,  2.54it/s]                                                        98%|█████████▊| 12561/12776 [2:12:55<01:24,  2.54it/s] 98%|█████████▊| 12562/12776 [2:12:55<01:20,  2.67it/s]                                                        98%|█████████▊| 12562/12776 [2:12:55<01:20,  2.67it/s] 98%|█████████▊| 12563/12776 [2:12:55<01:20,  2.65it/s]                                                        98%|█████████▊| 12563/12776 [2:12:55<01:20,  2.65it/s] 98%|█████████▊| 12564/12776 [2:12:56<01:15,  2.81it/s]                                                        98%|█████████▊| 12564/12776 [2:12:56<01:15,  2.81it/s] 98%|█████████▊| 12565/12776 [2:12:56<01:11,  2.96it/s]                                                        98%|█████████▊| 12565/12776 [2:12:56<01:11,  2.96it/s] 98%|█████████▊| 12566/12776 [2:12:56<01:12,  2.88it/s]                                                        98%|█████████▊| 12566/12776 [2:12:56<01:12,  2.88it/s] 98%|█████████▊| 12567/12776 [2:12:57<01:08,  3.07it/s]                                                        98%|█████████▊| 12567/12776 [2:12:57<01:08,  3.07it/s] 98%|█████████▊| 12568/12776 [2:12:57<01:04,  3.23it/s]                                                        98%|█████████▊| 12568/12776 [2:12:57<01:04,  3.23it/s] 98%|█████████▊| 12569/12776 [2:12:57<01:01,  3.37it/s]                                                        98%|█████████▊| 12569/12776 [2:12:57<01:01,  3.37it/s] 98%|█████████▊| 12570/12776 [2:12:57<01:02,  3.28it/s]                                                        98%|█████████▊| 12570/12776 [2:12:57<01:02,  3.28it/s] 98%|█████████▊| 12571/12776 [2:12:58<00:59,  3.46it/s]                                                        98%|█████████▊| 12571/12776 [2:12:58<00:59,  3.46it/s] 98%|█████████▊| 12572/12776 [2:12:58<00:56,  3.63it/s]                                                        98%|█████████▊| 12572/12776 [2:12:58<00:56,  3.63it/s] 98%|█████████▊| 12573/12776 [2:12:58<00:53,  3.78it/s]                                                        98%|█████████▊| 12573/12776 [2:12:58<00:53,  3.78it/s] 98%|█████████▊| 12574/12776 [2:12:58<00:51,  3.94it/s]                                                        98%|█████████▊| 12574/12776 [2:12:58<00:51,  3.94it/s] 98%|█████████▊| 12575/12776 [2:12:59<00:52,  3.80it/s]                                                        98%|█████████▊| 12575/12776 [2:12:59<00:52,  3.80it/s] 98%|█████████▊| 12576/12776 [2:12:59<00:50,  4.00it/s]                                                        98%|█████████▊| 12576/12776 [2:12:59<00:50,  4.00it/s] 98%|█████████▊| 12577/12776 [2:12:59<00:47,  4.18it/s]                                                        98%|█████████▊| 12577/12776 [2:12:59<00:47,  4.18it/s] 98%|█████████▊| 12578/12776 [2:12:59<00:45,  4.35it/s]                                                        98%|█████████▊| 12578/12776 [2:12:59<00:45,  4.35it/s] 98%|█████████▊| 12579/12776 [2:13:00<00:44,  4.45it/s]                                                        98%|█████████▊| 12579/12776 [2:13:00<00:44,  4.45it/s] 98%|█████████▊| 12580/12776 [2:13:00<00:47,  4.12it/s]                                                        98%|█████████▊| 12580/12776 [2:13:00<00:47,  4.12it/s] 98%|█████████▊| 12581/12776 [2:13:00<00:45,  4.32it/s]                                                        98%|█████████▊| 12581/12776 [2:13:00<00:45,  4.32it/s] 98%|█████████▊| 12582/12776 [2:13:00<00:43,  4.50it/s]                                                        98%|█████████▊| 12582/12776 [2:13:00<00:43,  4.50it/s] 98%|█████████▊| 12583/12776 [2:13:00<00:41,  4.65it/s]                                                        98%|█████████▊| 12583/12776 [2:13:00<00:41,  4.65it/s] 98%|█████████▊| 12584/12776 [2:13:01<00:40,  4.79it/s]                                                        98%|█████████▊| 12584/12776 [2:13:01<00:40,  4.79it/s] 99%|█████████▊| 12585/12776 [2:13:01<00:47,  4.05it/s]                                                        99%|███████���█▊| 12585/12776 [2:13:01<00:47,  4.05it/s] 99%|█████████▊| 12586/12776 [2:13:01<00:43,  4.36it/s]                                                        99%|█████████▊| 12586/12776 [2:13:01<00:43,  4.36it/s] 99%|█████████▊| 12587/12776 [2:13:01<00:40,  4.64it/s]                                                        99%|█████████▊| 12587/12776 [2:13:01<00:40,  4.64it/s] 99%|█████████▊| 12588/12776 [2:13:02<01:10,  2.65it/s]                                                        99%|█████████▊| 12588/12776 [2:13:02<01:10,  2.65it/s] 99%|█████████▊| 12589/12776 [2:13:04<02:12,  1.41it/s]                                                        99%|█████████▊| 12589/12776 [2:13:04<02:12,  1.41it/s] 99%|█████████▊| 12590/12776 [2:13:05<02:23,  1.29it/s]                                                        99%|█████████▊| 12590/12776 [2:13:05<02:23,  1.29it/s] 99%|█████████▊| 12591/12776 [2:13:05<02:34,  1.20it/s]                                                        99%|█████████▊| 12591/12776 [2:13:05<02:34,  1.20it/s] 99%|█████████▊| 12592/12776 [2:13:06<02:29,  1.23it/s]                                                        99%|█████████▊| 12592/12776 [2:13:06<02:29,  1.23it/s] 99%|█████████▊| 12593/12776 [2:13:07<02:27,  1.24it/s]                                                        99%|█████████▊| 12593/12776 [2:13:07<02:27,  1.24it/s] 99%|█████████▊| 12594/12776 [2:13:08<02:22,  1.28it/s]                                                        99%|█████████▊| 12594/12776 [2:13:08<02:22,  1.28it/s] 99%|█████████▊| 12595/12776 [2:13:08<02:14,  1.35it/s]                                                        99%|█████████▊| 12595/12776 [2:13:08<02:14,  1.35it/s] 99%|█████████▊| 12596/12776 [2:13:09<02:14,  1.34it/s]                                                        99%|█████████▊| 12596/12776 [2:13:09<02:14,  1.34it/s] 99%|█████████▊| 12597/12776 [2:13:10<02:04,  1.44it/s]                                                        99%|█████████▊| 12597/12776 [2:13:10<02:04,  1.44it/s] 99%|█████████▊| 12598/12776 [2:13:10<02:01,  1.47it/s]                                                        99%|█████████▊| 12598/12776 [2:13:10<02:01,  1.47it/s] 99%|█████████▊| 12599/12776 [2:13:11<01:52,  1.57it/s]                                                        99%|█████████▊| 12599/12776 [2:13:11<01:52,  1.57it/s] 99%|█████████▊| 12600/12776 [2:13:12<01:50,  1.60it/s]                                                        99%|█████████▊| 12600/12776 [2:13:12<01:50,  1.60it/s] 99%|█████████▊| 12601/12776 [2:13:12<01:42,  1.70it/s]                                                        99%|█████████▊| 12601/12776 [2:13:12<01:42,  1.70it/s] 99%|█████████▊| 12602/12776 [2:13:13<01:38,  1.77it/s]                                                        99%|█████████▊| 12602/12776 [2:13:13<01:38,  1.77it/s] 99%|█████████▊| 12603/12776 [2:13:13<01:30,  1.90it/s]                                                        99%|█████████▊| 12603/12776 [2:13:13<01:30,  1.90it/s] 99%|█████████▊| 12604/12776 [2:13:13<01:28,  1.94it/s]                                                        99%|█████████▊| 12604/12776 [2:13:13<01:28,  1.94it/s] 99%|█████████▊| 12605/12776 [2:13:14<01:22,  2.06it/s]                                                        99%|█████████▊| 12605/12776 [2:13:14<01:22,  2.06it/s] 99%|█████████▊| 12606/12776 [2:13:14<01:17,  2.19it/s]                                                        99%|█████████▊| 12606/12776 [2:13:14<01:17,  2.19it/s] 99%|█████████▊| 12607/12776 [2:13:15<01:18,  2.15it/s]                                                        99%|█████████▊| 12607/12776 [2:13:15<01:18,  2.15it/s] 99%|█████████▊| 12608/12776 [2:13:15<01:12,  2.31it/s]                                                        99%|█████████▊| 12608/12776 [2:13:15<01:12,  2.31it/s] 99%|█████████▊| 12609/12776 [2:13:15<01:07,  2.47it/s]                                                        99%|█████████▊| 12609/12776 [2:13:15<01:07,  2.47it/s] 99%|█████████▊| 12610/12776 [2:13:16<01:08,  2.42it/s]                                                        99%|█████████▊| 12610/12776 [2:13:16<01:08,  2.42it/s] 99%|█████████▊| 12611/12776 [2:13:16<01:04,  2.54it/s]                                                        99%|█████████▊| 12611/12776 [2:13:16<01:04,  2.54it/s] 99%|█████████▊| 12612/12776 [2:13:17<01:00,  2.71it/s]                                                        99%|█████████▊| 12612/12776 [2:13:17<01:00,  2.71it/s] 99%|█████████▊| 12613/12776 [2:13:17<00:57,  2.82it/s]                                                        99%|█████████▊| 12613/12776 [2:13:17<00:57,  2.82it/s] 99%|█████████▊| 12614/12776 [2:13:17<00:54,  2.95it/s]                                                        99%|█████████▊| 12614/12776 [2:13:17<00:54,  2.95it/s] 99%|█████████▊| 12615/12776 [2:13:17<00:52,  3.08it/s]                                                        99%|█████████▊| 12615/12776 [2:13:17<00:52,  3.08it/s] 99%|█████████▊| 12616/12776 [2:13:18<00:51,  3.12it/s]                                                        99%|█████████▊| 12616/12776 [2:13:18<00:51,  3.12it/s] 99%|█████████▉| 12617/12776 [2:13:18<00:54,  2.93it/s]                                                        99%|█████████▉| 12617/12776 [2:13:18<00:54,  2.93it/s] 99%|█████████▉| 12618/12776 [2:13:18<00:49,  3.19it/s]                                                        99%|█████████▉| 12618/12776 [2:13:18<00:49,  3.19it/s] 99%|█████████▉| 12619/12776 [2:13:19<00:46,  3.40it/s]                                                        99%|█████████▉| 12619/12776 [2:13:19<00:46,  3.40it/s] 99%|█████████▉| 12620/12776 [2:13:19<00:43,  3.59it/s]                                                        99%|█████████▉| 12620/12776 [2:13:19<00:43,  3.59it/s] 99%|█████████▉| 12621/12776 [2:13:19<00:42,  3.63it/s]                                                       {'loss': 1.4974, 'grad_norm': 1.045058012008667, 'learning_rate': 6.109481915933529e-06, 'epoch': 1.96}
+{'loss': 1.5181, 'grad_norm': 2.036625623703003, 'learning_rate': 6.085043988269794e-06, 'epoch': 1.96}
+{'loss': 1.4962, 'grad_norm': 1.2316803932189941, 'learning_rate': 6.060606060606061e-06, 'epoch': 1.96}
+{'loss': 1.4232, 'grad_norm': 1.3155983686447144, 'learning_rate': 6.036168132942326e-06, 'epoch': 1.96}
+{'loss': 1.3122, 'grad_norm': 1.7271487712860107, 'learning_rate': 6.011730205278591e-06, 'epoch': 1.96}
+{'loss': 1.4496, 'grad_norm': 2.852642774581909, 'learning_rate': 5.987292277614857e-06, 'epoch': 1.96}
+{'loss': 1.4374, 'grad_norm': 1.5159505605697632, 'learning_rate': 5.9628543499511236e-06, 'epoch': 1.96}
+{'loss': 1.4034, 'grad_norm': 0.9547608494758606, 'learning_rate': 5.938416422287389e-06, 'epoch': 1.96}
+{'loss': 1.428, 'grad_norm': 2.335413694381714, 'learning_rate': 5.913978494623655e-06, 'epoch': 1.96}
+{'loss': 1.5437, 'grad_norm': 1.940859317779541, 'learning_rate': 5.889540566959921e-06, 'epoch': 1.96}
+{'loss': 1.5615, 'grad_norm': 1.7635548114776611, 'learning_rate': 5.865102639296187e-06, 'epoch': 1.97}
+{'loss': 1.3193, 'grad_norm': 1.3363462686538696, 'learning_rate': 5.840664711632453e-06, 'epoch': 1.97}
+{'loss': 1.4132, 'grad_norm': 1.3077514171600342, 'learning_rate': 5.816226783968719e-06, 'epoch': 1.97}
+{'loss': 1.3992, 'grad_norm': 1.0712963342666626, 'learning_rate': 5.791788856304985e-06, 'epoch': 1.97}
+{'loss': 1.4594, 'grad_norm': 1.7005242109298706, 'learning_rate': 5.7673509286412504e-06, 'epoch': 1.97}
+{'loss': 1.3866, 'grad_norm': 1.507432460784912, 'learning_rate': 5.742913000977517e-06, 'epoch': 1.97}
+{'loss': 1.5171, 'grad_norm': 2.3320140838623047, 'learning_rate': 5.718475073313783e-06, 'epoch': 1.97}
+{'loss': 1.3281, 'grad_norm': 1.6303120851516724, 'learning_rate': 5.694037145650048e-06, 'epoch': 1.97}
+{'loss': 1.3929, 'grad_norm': 9.600627899169922, 'learning_rate': 5.669599217986315e-06, 'epoch': 1.97}
+{'loss': 1.2811, 'grad_norm': 2.142137050628662, 'learning_rate': 5.64516129032258e-06, 'epoch': 1.97}
+{'loss': 1.3192, 'grad_norm': 3.0277087688446045, 'learning_rate': 5.620723362658846e-06, 'epoch': 1.97}
+{'loss': 1.5502, 'grad_norm': 7.777707576751709, 'learning_rate': 5.596285434995112e-06, 'epoch': 1.97}
+{'loss': 1.2499, 'grad_norm': 2.6765975952148438, 'learning_rate': 5.571847507331378e-06, 'epoch': 1.97}
+{'loss': 1.1884, 'grad_norm': 2.20479416847229, 'learning_rate': 5.547409579667644e-06, 'epoch': 1.97}
+{'loss': 1.1944, 'grad_norm': 2.699751377105713, 'learning_rate': 5.5229716520039095e-06, 'epoch': 1.97}
+{'loss': 1.5012, 'grad_norm': 10.28531265258789, 'learning_rate': 5.498533724340176e-06, 'epoch': 1.97}
+{'loss': 1.1851, 'grad_norm': 4.221407890319824, 'learning_rate': 5.474095796676442e-06, 'epoch': 1.97}
+{'loss': 1.0217, 'grad_norm': 4.658267498016357, 'learning_rate': 5.449657869012707e-06, 'epoch': 1.97}
+{'loss': 1.545, 'grad_norm': 7.977578163146973, 'learning_rate': 5.425219941348974e-06, 'epoch': 1.97}
+{'loss': 1.2254, 'grad_norm': 2.949538469314575, 'learning_rate': 5.400782013685239e-06, 'epoch': 1.97}
+{'loss': 1.1792, 'grad_norm': 5.280106067657471, 'learning_rate': 5.376344086021505e-06, 'epoch': 1.97}
+{'loss': 1.3272, 'grad_norm': 6.7711710929870605, 'learning_rate': 5.3519061583577715e-06, 'epoch': 1.97}
+{'loss': 1.4548, 'grad_norm': 5.601173400878906, 'learning_rate': 5.327468230694037e-06, 'epoch': 1.97}
+{'loss': 0.905, 'grad_norm': 2.446711540222168, 'learning_rate': 5.303030303030302e-06, 'epoch': 1.97}
+{'loss': 1.2203, 'grad_norm': 2.5086371898651123, 'learning_rate': 5.278592375366569e-06, 'epoch': 1.97}
+{'loss': 0.9831, 'grad_norm': 2.3690879344940186, 'learning_rate': 5.254154447702834e-06, 'epoch': 1.97}
+{'loss': 1.2816, 'grad_norm': 4.901893615722656, 'learning_rate': 5.2297165200391e-06, 'epoch': 1.97}
+{'loss': 0.8846, 'grad_norm': 3.7496819496154785, 'learning_rate': 5.205278592375365e-06, 'epoch': 1.97}
+{'loss': 1.17, 'grad_norm': 6.612450122833252, 'learning_rate': 5.180840664711632e-06, 'epoch': 1.97}
+{'loss': 1.9362, 'grad_norm': 5.878314018249512, 'learning_rate': 5.1564027370478976e-06, 'epoch': 1.97}
+{'loss': 1.3451, 'grad_norm': 4.377343654632568, 'learning_rate': 5.131964809384163e-06, 'epoch': 1.97}
+{'loss': 0.6299, 'grad_norm': 2.8641626834869385, 'learning_rate': 5.10752688172043e-06, 'epoch': 1.97}
+{'loss': 0.5064, 'grad_norm': 2.5689475536346436, 'learning_rate': 5.083088954056695e-06, 'epoch': 1.97}
+{'loss': 0.6611, 'grad_norm': 2.1100947856903076, 'learning_rate': 5.058651026392961e-06, 'epoch': 1.97}
+{'loss': 0.8358, 'grad_norm': 2.1359899044036865, 'learning_rate': 5.0342130987292275e-06, 'epoch': 1.97}
+{'loss': 0.864, 'grad_norm': 2.32698392868042, 'learning_rate': 5.009775171065493e-06, 'epoch': 1.97}
+{'loss': 1.5319, 'grad_norm': 0.9872207641601562, 'learning_rate': 4.985337243401759e-06, 'epoch': 1.97}
+{'loss': 1.4884, 'grad_norm': 1.0228644609451294, 'learning_rate': 4.960899315738025e-06, 'epoch': 1.97}
+{'loss': 1.5075, 'grad_norm': 0.822214663028717, 'learning_rate': 4.936461388074291e-06, 'epoch': 1.97}
+{'loss': 1.6097, 'grad_norm': 1.3645509481430054, 'learning_rate': 4.9120234604105566e-06, 'epoch': 1.97}
+{'loss': 1.5494, 'grad_norm': 1.2646301984786987, 'learning_rate': 4.887585532746823e-06, 'epoch': 1.97}
+{'loss': 1.4176, 'grad_norm': 1.6212059259414673, 'learning_rate': 4.863147605083089e-06, 'epoch': 1.97}
+{'loss': 1.444, 'grad_norm': 2.566788673400879, 'learning_rate': 4.838709677419354e-06, 'epoch': 1.97}
+{'loss': 1.3891, 'grad_norm': 1.1608645915985107, 'learning_rate': 4.81427174975562e-06, 'epoch': 1.97}
+{'loss': 1.3828, 'grad_norm': 2.177610397338867, 'learning_rate': 4.7898338220918865e-06, 'epoch': 1.97}
+{'loss': 1.3869, 'grad_norm': 5.3571977615356445, 'learning_rate': 4.765395894428152e-06, 'epoch': 1.97}
+{'loss': 1.5305, 'grad_norm': 1.231022834777832, 'learning_rate': 4.740957966764418e-06, 'epoch': 1.97}
+{'loss': 1.4779, 'grad_norm': 1.084704875946045, 'learning_rate': 4.716520039100684e-06, 'epoch': 1.97}
+{'loss': 1.4689, 'grad_norm': 2.484323024749756, 'learning_rate': 4.69208211143695e-06, 'epoch': 1.97}
+{'loss': 1.3976, 'grad_norm': 6.887755870819092, 'learning_rate': 4.667644183773216e-06, 'epoch': 1.97}
+{'loss': 1.4639, 'grad_norm': 2.1070051193237305, 'learning_rate': 4.643206256109481e-06, 'epoch': 1.97}
+{'loss': 1.6144, 'grad_norm': 8.759196281433105, 'learning_rate': 4.618768328445748e-06, 'epoch': 1.97}
+{'loss': 1.4443, 'grad_norm': 2.5562074184417725, 'learning_rate': 4.594330400782013e-06, 'epoch': 1.97}
+{'loss': 1.4747, 'grad_norm': 1.4550449848175049, 'learning_rate': 4.569892473118279e-06, 'epoch': 1.97}
+{'loss': 1.3716, 'grad_norm': 3.1739792823791504, 'learning_rate': 4.5454545454545455e-06, 'epoch': 1.97}
+{'loss': 1.4844, 'grad_norm': 2.384730577468872, 'learning_rate': 4.521016617790811e-06, 'epoch': 1.97}
+{'loss': 1.5907, 'grad_norm': 1.8984757661819458, 'learning_rate': 4.496578690127077e-06, 'epoch': 1.97}
+{'loss': 1.5429, 'grad_norm': 2.8245761394500732, 'learning_rate': 4.472140762463343e-06, 'epoch': 1.97}
+{'loss': 1.408, 'grad_norm': 3.4093105792999268, 'learning_rate': 4.447702834799608e-06, 'epoch': 1.97}
+{'loss': 1.2528, 'grad_norm': 2.0443670749664307, 'learning_rate': 4.423264907135875e-06, 'epoch': 1.97}
+{'loss': 1.1372, 'grad_norm': 2.573211193084717, 'learning_rate': 4.39882697947214e-06, 'epoch': 1.97}
+{'loss': 1.1394, 'grad_norm': 3.4609265327453613, 'learning_rate': 4.374389051808406e-06, 'epoch': 1.97}
+{'loss': 1.2423, 'grad_norm': 5.100785255432129, 'learning_rate': 4.349951124144672e-06, 'epoch': 1.97}
+{'loss': 1.1019, 'grad_norm': 2.3350820541381836, 'learning_rate': 4.325513196480938e-06, 'epoch': 1.97}
+{'loss': 1.3405, 'grad_norm': 6.419863224029541, 'learning_rate': 4.301075268817204e-06, 'epoch': 1.98}
+{'loss': 1.5057, 'grad_norm': 2.2042758464813232, 'learning_rate': 4.27663734115347e-06, 'epoch': 1.98}
+{'loss': 1.1672, 'grad_norm': 4.460781574249268, 'learning_rate': 4.252199413489736e-06, 'epoch': 1.98}
+{'loss': 1.1246, 'grad_norm': 3.557311773300171, 'learning_rate': 4.2277614858260015e-06, 'epoch': 1.98}
+ 99%|█████████▉| 12621/12776 [2:13:19<00:42,  3.63it/s] 99%|█████████▉| 12622/12776 [2:13:19<00:40,  3.76it/s]                                                        99%|█████████▉| 12622/12776 [2:13:19<00:40,  3.76it/s] 99%|█████████▉| 12623/12776 [2:13:20<00:38,  3.94it/s]                                                        99%|█████████▉| 12623/12776 [2:13:20<00:38,  3.94it/s] 99%|█████████▉| 12624/12776 [2:13:20<00:36,  4.12it/s]                                                        99%|█████████▉| 12624/12776 [2:13:20<00:36,  4.12it/s] 99%|█████████▉| 12625/12776 [2:13:20<00:35,  4.28it/s]                                                        99%|█████████▉| 12625/12776 [2:13:20<00:35,  4.28it/s] 99%|█████████▉| 12626/12776 [2:13:20<00:39,  3.83it/s]                                                        99%|█████████▉| 12626/12776 [2:13:20<00:39,  3.83it/s] 99%|█████████▉| 12627/12776 [2:13:21<00:36,  4.07it/s]                                                        99%|█████████▉| 12627/12776 [2:13:21<00:36,  4.07it/s] 99%|█████████▉| 12628/12776 [2:13:21<00:34,  4.29it/s]                                                        99%|█████████▉| 12628/12776 [2:13:21<00:34,  4.29it/s] 99%|█████████▉| 12629/12776 [2:13:21<00:32,  4.47it/s]                                                        99%|█████████▉| 12629/12776 [2:13:21<00:32,  4.47it/s] 99%|█████████▉| 12630/12776 [2:13:21<00:31,  4.62it/s]                                                        99%|█████████▉| 12630/12776 [2:13:21<00:31,  4.62it/s] 99%|█████████▉| 12631/12776 [2:13:22<00:36,  3.98it/s]                                                        99%|█████████▉| 12631/12776 [2:13:22<00:36,  3.98it/s] 99%|█████████▉| 12632/12776 [2:13:22<00:33,  4.26it/s]                                                        99%|█████████▉| 12632/12776 [2:13:22<00:33,  4.26it/s] 99%|█████████▉| 12633/12776 [2:13:22<00:31,  4.49it/s]                                                        99%|█████████▉| 12633/12776 [2:13:22<00:31,  4.49it/s] 99%|█████████▉| 12634/12776 [2:13:22<00:30,  4.67it/s]                                                        99%|█████████▉| 12634/12776 [2:13:22<00:30,  4.67it/s] 99%|█████████▉| 12635/12776 [2:13:22<00:29,  4.84it/s]                                                        99%|█████████▉| 12635/12776 [2:13:22<00:29,  4.84it/s] 99%|█████████▉| 12636/12776 [2:13:23<00:30,  4.59it/s]                                                        99%|█████████▉| 12636/12776 [2:13:23<00:30,  4.59it/s] 99%|█████████▉| 12637/12776 [2:13:23<00:28,  4.83it/s]                                                        99%|█████████▉| 12637/12776 [2:13:23<00:28,  4.83it/s] 99%|█████████▉| 12638/12776 [2:13:23<00:47,  2.89it/s]                                                        99%|█████████▉| 12638/12776 [2:13:23<00:47,  2.89it/s] 99%|█████████▉| 12639/12776 [2:13:25<01:32,  1.48it/s]                                                        99%|█████████▉| 12639/12776 [2:13:25<01:32,  1.48it/s] 99%|█████████▉| 12640/12776 [2:13:26<01:42,  1.33it/s]                                                        99%|█████████▉| 12640/12776 [2:13:26<01:42,  1.33it/s] 99%|█████████▉| 12641/12776 [2:13:27<01:44,  1.29it/s]                                                        99%|█████████▉| 12641/12776 [2:13:27<01:44,  1.29it/s] 99%|█████████▉| 12642/12776 [2:13:27<01:45,  1.27it/s]                                                        99%|█████████▉| 12642/12776 [2:13:27<01:45,  1.27it/s] 99%|█████████▉| 12643/12776 [2:13:28<01:46,  1.25it/s]                                                        99%|█████████▉| 12643/12776 [2:13:28<01:46,  1.25it/s] 99%|█████████▉| 12644/12776 [2:13:29<01:41,  1.30it/s]                                                        99%|█████████▉| 12644/12776 [2:13:29<01:41,  1.30it/s] 99%|█████████▉| 12645/12776 [2:13:30<01:38,  1.33it/s]                                                        99%|█████████▉| 12645/12776 [2:13:30<01:38,  1.33it/s] 99%|█████████▉| 12646/12776 [2:13:30<01:32,  1.40it/s]                                                        99%|█████████▉| 12646/12776 [2:13:30<01:32,  1.40it/s] 99%|█████████▉| 12647/12776 [2:13:31<01:28,  1.46it/s]                                                        99%|█████████▉| 12647/12776 [2:13:31<01:28,  1.46it/s] 99%|█████████▉| 12648/12776 [2:13:31<01:23,  1.54it/s]                                                        99%|█████████▉| 12648/12776 [2:13:31<01:23,  1.54it/s] 99%|█████████▉| 12649/12776 [2:13:32<01:18,  1.61it/s]                                                        99%|█████████▉| 12649/12776 [2:13:32<01:18,  1.61it/s] 99%|█████████▉| 12650/12776 [2:13:33<01:14,  1.69it/s]                                                        99%|█████████▉| 12650/12776 [2:13:33<01:14,  1.69it/s] 99%|█████████▉| 12651/12776 [2:13:33<01:13,  1.70it/s]                                                        99%|█████████▉| 12651/12776 [2:13:33<01:13,  1.70it/s] 99%|█████████▉| 12652/12776 [2:13:34<01:08,  1.82it/s]                                                        99%|█████████▉| 12652/12776 [2:13:34<01:08,  1.82it/s] 99%|█████████▉| 12653/12776 [2:13:34<01:07,  1.81it/s]                                                        99%|█████████▉| 12653/12776 [2:13:34<01:07,  1.81it/s] 99%|█████████▉| 12654/12776 [2:13:35<01:03,  1.93it/s]                                                        99%|█████████▉| 12654/12776 [2:13:35<01:03,  1.93it/s] 99%|█████████▉| 12655/12776 [2:13:35<01:01,  1.96it/s]                                                        99%|█████████▉| 12655/12776 [2:13:35<01:01,  1.96it/s] 99%|█████████▉| 12656/12776 [2:13:35<00:57,  2.09it/s]                                                        99%|█████████▉| 12656/12776 [2:13:35<00:57,  2.09it/s] 99%|█████████▉| 12657/12776 [2:13:36<00:53,  2.21it/s]                                                        99%|█████████▉| 12657/12776 [2:13:36<00:53,  2.21it/s] 99%|█████████▉| 12658/12776 [2:13:36<00:56,  2.10it/s]                                                        99%|█████████▉| 12658/12776 [2:13:36<00:56,  2.10it/s] 99%|█████████▉| 12659/12776 [2:13:37<00:51,  2.26it/s]                                                        99%|█████████▉| 12659/12776 [2:13:37<00:51,  2.26it/s] 99%|█████████▉| 12660/12776 [2:13:37<00:48,  2.41it/s]                                                        99%|█████████▉| 12660/12776 [2:13:37<00:48,  2.41it/s] 99%|█████████▉| 12661/12776 [2:13:38<00:47,  2.40it/s]                                                        99%|█████████▉| 12661/12776 [2:13:38<00:47,  2.40it/s] 99%|█████████▉| 12662/12776 [2:13:38<00:44,  2.56it/s]                                                        99%|█████████▉| 12662/12776 [2:13:38<00:44,  2.56it/s] 99%|█████████▉| 12663/12776 [2:13:38<00:41,  2.70it/s]                                                        99%|█████████▉| 12663/12776 [2:13:38<00:41,  2.70it/s] 99%|█████████▉| 12664/12776 [2:13:38<00:39,  2.83it/s]                                                        99%|█████████▉| 12664/12776 [2:13:38<00:39,  2.83it/s] 99%|█████████▉| 12665/12776 [2:13:39<00:37,  2.96it/s]                                                        99%|█████████▉| 12665/12776 [2:13:39<00:37,  2.96it/s] 99%|█████████▉| 12666/12776 [2:13:39<00:35,  3.08it/s]                                                        99%|█████████▉| 12666/12776 [2:13:39<00:35,  3.08it/s] 99%|█████████▉| 12667/12776 [2:13:39<00:34,  3.20it/s]                                                        99%|█████████▉| 12667/12776 [2:13:39<00:34,  3.20it/s] 99%|█████████▉| 12668/12776 [2:13:40<00:35,  3.03it/s]                                                        99%|█████████▉| 12668/12776 [2:13:40<00:35,  3.03it/s] 99%|█████████▉| 12669/12776 [2:13:40<00:33,  3.21it/s]                                                        99%|█████████▉| 12669/12776 [2:13:40<00:33,  3.21it/s] 99%|█████████▉| 12670/12776 [2:13:40<00:31,  3.40it/s]                                                        99%|█████████▉| 12670/12776 [2:13:40<00:31,  3.40it/s] 99%|█████████▉| 12671/12776 [2:13:41<00:29,  3.56it/s]                                                        99%|█████████▉| 12671/12776 [2:13:41<00:29,  3.56it/s] 99%|█████████▉| 12672/12776 [2:13:41<00:30,  3.40it/s]                                                        99%|█████████▉| 12672/12776 [2:13:41<00:30,  3.40it/s] 99%|█████████▉| 12673/12776 [2:13:41<00:28,  3.60it/s]                                                        99%|█████████▉| 12673/12776 [2:13:41<00:28,  3.60it/s] 99%|█████████▉| 12674/12776 [2:13:41<00:26,  3.78it/s]                                                        99%|█████████▉| 12674/12776 [2:13:41<00:26,  3.78it/s] 99%|█████████▉| 12675/12776 [2:13:42<00:25,  3.94it/s]                                                        99%|█████████▉| 12675/12776 [2:13:42<00:25,  3.94it/s] 99%|█████████▉| 12676/12776 [2:13:42<00:26,  3.78it/s]                                                        99%|█████████▉| 12676/12776 [2:13:42<00:26,  3.78it/s] 99%|█████████▉| 12677/12776 [2:13:42<00:24,  4.00it/s]                                                        99%|█████████▉| 12677/12776 [2:13:42<00:24,  4.00it/s] 99%|█████████▉| 12678/12776 [2:13:42<00:23,  4.19it/s]                                                        99%|█████████▉| 12678/12776 [2:13:42<00:23,  4.19it/s] 99%|█████████▉| 12679/12776 [2:13:42<00:22,  4.34it/s]                                                        99%|█████████▉| 12679/12776 [2:13:42<00:22,  4.34it/s] 99%|█████████▉| 12680/12776 [2:13:43<00:21,  4.49it/s]                                                        99%|█████████▉| 12680/12776 [2:13:43<00:21,  4.49it/s] 99%|█████████▉| 12681/12776 [2:13:43<00:23,  4.09it/s]                                                        99%|█████████▉| 12681/12776 [2:13:43<00:23,  4.09it/s] 99%|█████████▉| 12682/12776 [2:13:43<00:21,  4.33it/s]                                                        99%|█████████▉| 12682/12776 [2:13:43<00:21,  4.33it/s] 99%|█████████▉| 12683/12776 [2:13:43<00:20,  4.54it/s]                                                        99%|█████████▉| 12683/12776 [2:13:43<00:20,  4.54it/s] 99%|█████████▉| 12684/12776 [2:13:44<00:19,  4.71it/s]                                                        99%|█████████▉| 12684/12776 [2:13:44<00:19,  4.71it/s] 99%|█████████▉| 12685/12776 [2:13:44<00:18,  4.84it/s]                                                        99%|█████████▉| 12685/12776 [2:13:44<00:18,  4.84it/s] 99%|█████████▉| 12686/12776 [2:13:44<00:18,  4.99it/s]                                                        99%|█████████▉| 12686/12776 [2:13:44<00:18,  4.99it/s] 99%|█████████▉| 12687/12776 [2:13:44<00:20,  4.39it/s]                                                        99%|█████████▉| 12687/12776 [2:13:44<00:20,  4.39it/s] 99%|█████████▉| 12688/12776 [2:13:45<00:32,  2.67it/s]                                                        99%|█████████▉| 12688/12776 [2:13:45<00:32,  2.67it/s] 99%|█████████▉| 12689/12776 [2:13:46<01:02,  1.40it/s]                                                        99%|█████████▉| 12689/12776 [2:13:46<01:02,  1.40it/s] 99%|█████████▉| 12690/12776 [2:13:48<01:11,  1.21it/s]                                                        99%|█████████▉| 12690/12776 [2:13:48<01:11,  1.21it/s] 99%|█████████▉| 12691/12776 [2:13:48<01:11,  1.19it/s]                                                        99%|█████████▉| 12691/12776 [2:13:48<01:11,  1.19it/s] 99%|█████████▉| 12692/12776 [2:13:49<01:08,  1.22it/s]                                                        99%|█████████▉| 12692/12776 [2:13:49<01:08,  1.22it/s] 99%|█████████▉| 12693/12776 [2:13:50<01:05,  1.27it/s]                                                        99%|█████████▉| 12693/12776 [2:13:50<01:05,  1.27it/s] 99%|█████████▉| 12694/12776 [2:13:51<01:01,  1.33it/s]                                                        99%|█████████▉| 12694/12776 [2:13:51<01:01,  1.33it/s] 99%|█████████▉| 12695/12776 [2:13:51<01:00,  1.34it/s]                                                        99%|█████████▉| 12695/12776 [2:13:51<01:00,  1.34it/s] 99%|█████████▉| 12696/12776 [2:13:52<00:56,  1.42it/s]                                                        99%|█████████▉| 12696/12776 [2:13:52<00:56,  1.42it/s] 99%|█████████▉| 12697/12776 [2:13:53<00:52,  1.49it/s]                                                        99%|█████████▉| 12697/12776 [2:13:53<00:52,  1.49it/s] 99%|█████████▉| 12698/12776 [2:13:53<00:49,  1.58it/s]                                                        99%|█████████▉| 12698/12776 [2:13:53<00:49,  1.58it/s] 99%|█████████▉| 12699/12776 [2:13:54<00:48,  1.60it/s]                                                       {'loss': 1.149, 'grad_norm': 1.4838236570358276, 'learning_rate': 4.203323558162268e-06, 'epoch': 1.98}
+{'loss': 1.1258, 'grad_norm': 6.371537685394287, 'learning_rate': 4.178885630498534e-06, 'epoch': 1.98}
+{'loss': 1.1318, 'grad_norm': 4.541806221008301, 'learning_rate': 4.154447702834799e-06, 'epoch': 1.98}
+{'loss': 1.4951, 'grad_norm': 4.556163787841797, 'learning_rate': 4.130009775171065e-06, 'epoch': 1.98}
+{'loss': 1.6353, 'grad_norm': 6.643764495849609, 'learning_rate': 4.105571847507331e-06, 'epoch': 1.98}
+{'loss': 1.3794, 'grad_norm': 3.9981839656829834, 'learning_rate': 4.081133919843597e-06, 'epoch': 1.98}
+{'loss': 1.2775, 'grad_norm': 2.1788060665130615, 'learning_rate': 4.056695992179863e-06, 'epoch': 1.98}
+{'loss': 1.3801, 'grad_norm': 3.0667903423309326, 'learning_rate': 4.032258064516128e-06, 'epoch': 1.98}
+{'loss': 1.6699, 'grad_norm': 5.267396926879883, 'learning_rate': 4.007820136852395e-06, 'epoch': 1.98}
+{'loss': 1.2728, 'grad_norm': 3.0507497787475586, 'learning_rate': 3.9833822091886605e-06, 'epoch': 1.98}
+{'loss': 0.9162, 'grad_norm': 2.740083694458008, 'learning_rate': 3.958944281524926e-06, 'epoch': 1.98}
+{'loss': 1.2745, 'grad_norm': 3.808227300643921, 'learning_rate': 3.934506353861192e-06, 'epoch': 1.98}
+{'loss': 1.1256, 'grad_norm': 2.6046314239501953, 'learning_rate': 3.910068426197458e-06, 'epoch': 1.98}
+{'loss': 0.5949, 'grad_norm': 2.029060125350952, 'learning_rate': 3.885630498533724e-06, 'epoch': 1.98}
+{'loss': 0.8376, 'grad_norm': 4.188864707946777, 'learning_rate': 3.86119257086999e-06, 'epoch': 1.98}
+{'loss': 0.4762, 'grad_norm': 1.602839708328247, 'learning_rate': 3.836754643206256e-06, 'epoch': 1.98}
+{'loss': 0.7491, 'grad_norm': 1.9798146486282349, 'learning_rate': 3.8123167155425217e-06, 'epoch': 1.98}
+{'loss': 0.9718, 'grad_norm': 7.58799409866333, 'learning_rate': 3.7878787878787874e-06, 'epoch': 1.98}
+{'loss': 1.5842, 'grad_norm': 0.6994940638542175, 'learning_rate': 3.7634408602150534e-06, 'epoch': 1.98}
+{'loss': 1.6148, 'grad_norm': 1.8685157299041748, 'learning_rate': 3.739002932551319e-06, 'epoch': 1.98}
+{'loss': 1.5231, 'grad_norm': 0.7222037315368652, 'learning_rate': 3.714565004887585e-06, 'epoch': 1.98}
+{'loss': 1.5259, 'grad_norm': 0.7957490682601929, 'learning_rate': 3.6901270772238512e-06, 'epoch': 1.98}
+{'loss': 1.5504, 'grad_norm': 0.8446835875511169, 'learning_rate': 3.665689149560117e-06, 'epoch': 1.98}
+{'loss': 1.5892, 'grad_norm': 1.2421315908432007, 'learning_rate': 3.641251221896383e-06, 'epoch': 1.98}
+{'loss': 1.4458, 'grad_norm': 1.1285243034362793, 'learning_rate': 3.616813294232649e-06, 'epoch': 1.98}
+{'loss': 1.4354, 'grad_norm': 0.9664294123649597, 'learning_rate': 3.5923753665689147e-06, 'epoch': 1.98}
+{'loss': 1.4355, 'grad_norm': 1.5649850368499756, 'learning_rate': 3.5679374389051807e-06, 'epoch': 1.98}
+{'loss': 1.6038, 'grad_norm': 1.3277194499969482, 'learning_rate': 3.5434995112414464e-06, 'epoch': 1.98}
+{'loss': 1.4237, 'grad_norm': 1.0830150842666626, 'learning_rate': 3.5190615835777125e-06, 'epoch': 1.98}
+{'loss': 1.4098, 'grad_norm': 1.434964895248413, 'learning_rate': 3.4946236559139785e-06, 'epoch': 1.98}
+{'loss': 1.5271, 'grad_norm': 1.861877202987671, 'learning_rate': 3.4701857282502437e-06, 'epoch': 1.98}
+{'loss': 1.3852, 'grad_norm': 2.560774326324463, 'learning_rate': 3.44574780058651e-06, 'epoch': 1.98}
+{'loss': 1.3889, 'grad_norm': 1.684672236442566, 'learning_rate': 3.4213098729227763e-06, 'epoch': 1.98}
+{'loss': 1.3657, 'grad_norm': 1.0975278615951538, 'learning_rate': 3.3968719452590415e-06, 'epoch': 1.98}
+{'loss': 1.2904, 'grad_norm': 1.292589783668518, 'learning_rate': 3.3724340175953076e-06, 'epoch': 1.98}
+{'loss': 1.4146, 'grad_norm': 4.135151386260986, 'learning_rate': 3.3479960899315733e-06, 'epoch': 1.98}
+{'loss': 1.4241, 'grad_norm': 2.1695103645324707, 'learning_rate': 3.3235581622678393e-06, 'epoch': 1.98}
+{'loss': 1.5548, 'grad_norm': 5.103790283203125, 'learning_rate': 3.2991202346041054e-06, 'epoch': 1.98}
+{'loss': 1.4314, 'grad_norm': 3.985460042953491, 'learning_rate': 3.274682306940371e-06, 'epoch': 1.98}
+{'loss': 1.3154, 'grad_norm': 1.8362746238708496, 'learning_rate': 3.250244379276637e-06, 'epoch': 1.98}
+{'loss': 1.4386, 'grad_norm': 5.180610179901123, 'learning_rate': 3.225806451612903e-06, 'epoch': 1.98}
+{'loss': 1.3667, 'grad_norm': 3.1565520763397217, 'learning_rate': 3.201368523949169e-06, 'epoch': 1.98}
+{'loss': 1.1725, 'grad_norm': 1.2697768211364746, 'learning_rate': 3.176930596285435e-06, 'epoch': 1.98}
+{'loss': 1.7743, 'grad_norm': 12.818161964416504, 'learning_rate': 3.1524926686217006e-06, 'epoch': 1.98}
+{'loss': 1.4785, 'grad_norm': 4.739766597747803, 'learning_rate': 3.1280547409579666e-06, 'epoch': 1.98}
+{'loss': 1.4496, 'grad_norm': 2.6713364124298096, 'learning_rate': 3.1036168132942327e-06, 'epoch': 1.98}
+{'loss': 1.255, 'grad_norm': 2.46557879447937, 'learning_rate': 3.079178885630498e-06, 'epoch': 1.98}
+{'loss': 1.3316, 'grad_norm': 3.2623789310455322, 'learning_rate': 3.0547409579667644e-06, 'epoch': 1.98}
+{'loss': 1.1054, 'grad_norm': 3.391611337661743, 'learning_rate': 3.0303030303030305e-06, 'epoch': 1.98}
+{'loss': 1.035, 'grad_norm': 2.285482168197632, 'learning_rate': 3.0058651026392957e-06, 'epoch': 1.98}
+{'loss': 0.9496, 'grad_norm': 1.301674246788025, 'learning_rate': 2.9814271749755618e-06, 'epoch': 1.98}
+{'loss': 1.2513, 'grad_norm': 21.29591178894043, 'learning_rate': 2.9569892473118274e-06, 'epoch': 1.98}
+{'loss': 0.8991, 'grad_norm': 3.679635763168335, 'learning_rate': 2.9325513196480935e-06, 'epoch': 1.98}
+{'loss': 1.2406, 'grad_norm': 3.6654906272888184, 'learning_rate': 2.9081133919843596e-06, 'epoch': 1.98}
+{'loss': 1.1958, 'grad_norm': 3.2650372982025146, 'learning_rate': 2.8836754643206252e-06, 'epoch': 1.98}
+{'loss': 1.1306, 'grad_norm': 4.416281223297119, 'learning_rate': 2.8592375366568913e-06, 'epoch': 1.98}
+{'loss': 1.1018, 'grad_norm': 2.6011712551116943, 'learning_rate': 2.8347996089931574e-06, 'epoch': 1.98}
+{'loss': 0.9713, 'grad_norm': 3.921834945678711, 'learning_rate': 2.810361681329423e-06, 'epoch': 1.98}
+{'loss': 1.1969, 'grad_norm': 5.239248275756836, 'learning_rate': 2.785923753665689e-06, 'epoch': 1.98}
+{'loss': 0.8116, 'grad_norm': 7.275485038757324, 'learning_rate': 2.7614858260019547e-06, 'epoch': 1.98}
+{'loss': 1.3322, 'grad_norm': 1.9333630800247192, 'learning_rate': 2.737047898338221e-06, 'epoch': 1.99}
+{'loss': 1.0275, 'grad_norm': 3.5819573402404785, 'learning_rate': 2.712609970674487e-06, 'epoch': 1.99}
+{'loss': 0.8067, 'grad_norm': 3.910200834274292, 'learning_rate': 2.6881720430107525e-06, 'epoch': 1.99}
+{'loss': 0.7034, 'grad_norm': 3.8609650135040283, 'learning_rate': 2.6637341153470186e-06, 'epoch': 1.99}
+{'loss': 0.8698, 'grad_norm': 2.1950485706329346, 'learning_rate': 2.6392961876832847e-06, 'epoch': 1.99}
+{'loss': 0.9502, 'grad_norm': 3.267005205154419, 'learning_rate': 2.61485826001955e-06, 'epoch': 1.99}
+{'loss': 0.6124, 'grad_norm': 1.1924513578414917, 'learning_rate': 2.590420332355816e-06, 'epoch': 1.99}
+{'loss': 0.7941, 'grad_norm': 1.8844116926193237, 'learning_rate': 2.5659824046920816e-06, 'epoch': 1.99}
+{'loss': 1.4848, 'grad_norm': 0.9816786050796509, 'learning_rate': 2.5415444770283477e-06, 'epoch': 1.99}
+{'loss': 1.4116, 'grad_norm': 0.7692421078681946, 'learning_rate': 2.5171065493646137e-06, 'epoch': 1.99}
+{'loss': 1.4803, 'grad_norm': 0.8593037724494934, 'learning_rate': 2.4926686217008794e-06, 'epoch': 1.99}
+{'loss': 1.4621, 'grad_norm': 1.4960730075836182, 'learning_rate': 2.4682306940371455e-06, 'epoch': 1.99}
+{'loss': 1.3931, 'grad_norm': 1.2484159469604492, 'learning_rate': 2.4437927663734115e-06, 'epoch': 1.99}
+{'loss': 1.4726, 'grad_norm': 2.241774320602417, 'learning_rate': 2.419354838709677e-06, 'epoch': 1.99}
+{'loss': 1.3704, 'grad_norm': 1.0046658515930176, 'learning_rate': 2.3949169110459433e-06, 'epoch': 1.99}
+{'loss': 1.4287, 'grad_norm': 1.600348949432373, 'learning_rate': 2.370478983382209e-06, 'epoch': 1.99}
+{'loss': 1.3028, 'grad_norm': 0.8872906565666199, 'learning_rate': 2.346041055718475e-06, 'epoch': 1.99}
+{'loss': 1.4942, 'grad_norm': 1.608568787574768, 'learning_rate': 2.3216031280547406e-06, 'epoch': 1.99}
+ 99%|█████████▉| 12699/12776 [2:13:54<00:48,  1.60it/s] 99%|█████████▉| 12700/12776 [2:13:54<00:44,  1.71it/s]                                                        99%|█████████▉| 12700/12776 [2:13:54<00:44,  1.71it/s] 99%|█████████▉| 12701/12776 [2:13:55<00:42,  1.76it/s]                                                        99%|█████████▉| 12701/12776 [2:13:55<00:42,  1.76it/s] 99%|█████████▉| 12702/12776 [2:13:55<00:39,  1.88it/s]                                                        99%|█████████▉| 12702/12776 [2:13:55<00:39,  1.88it/s] 99%|█████████▉| 12703/12776 [2:13:56<00:38,  1.92it/s]                                                        99%|█████████▉| 12703/12776 [2:13:56<00:38,  1.92it/s] 99%|█████████▉| 12704/12776 [2:13:56<00:35,  2.04it/s]                                                        99%|█████████▉| 12704/12776 [2:13:56<00:35,  2.04it/s] 99%|█████████▉| 12705/12776 [2:13:56<00:32,  2.15it/s]                                                        99%|█████████▉| 12705/12776 [2:13:56<00:32,  2.15it/s] 99%|█████████▉| 12706/12776 [2:13:57<00:33,  2.10it/s]                                                        99%|█████████▉| 12706/12776 [2:13:57<00:33,  2.10it/s] 99%|█████████▉| 12707/12776 [2:13:57<00:30,  2.24it/s]                                                        99%|█████████▉| 12707/12776 [2:13:57<00:30,  2.24it/s] 99%|█████████▉| 12708/12776 [2:13:58<00:28,  2.38it/s]                                                        99%|█████████▉| 12708/12776 [2:13:58<00:28,  2.38it/s] 99%|█████████▉| 12709/12776 [2:13:58<00:28,  2.37it/s]                                                        99%|█████████▉| 12709/12776 [2:13:58<00:28,  2.37it/s] 99%|█████████▉| 12710/12776 [2:13:58<00:26,  2.52it/s]                                                        99%|█████████▉| 12710/12776 [2:13:58<00:26,  2.52it/s] 99%|█████████▉| 12711/12776 [2:13:59<00:24,  2.65it/s]                                                        99%|█████████▉| 12711/12776 [2:13:59<00:24,  2.65it/s] 99%|█████████▉| 12712/12776 [2:13:59<00:24,  2.60it/s]                                                        99%|█████████▉| 12712/12776 [2:13:59<00:24,  2.60it/s]100%|█████████▉| 12713/12776 [2:13:59<00:22,  2.77it/s]                                                       100%|█████████▉| 12713/12776 [2:13:59<00:22,  2.77it/s]100%|█████████▉| 12714/12776 [2:14:00<00:21,  2.93it/s]                                                       100%|█████████▉| 12714/12776 [2:14:00<00:21,  2.93it/s]100%|█████████▉| 12715/12776 [2:14:00<00:21,  2.89it/s]                                                       100%|█████████▉| 12715/12776 [2:14:00<00:21,  2.89it/s]100%|█████████▉| 12716/12776 [2:14:00<00:19,  3.07it/s]                                                       100%|█████████▉| 12716/12776 [2:14:00<00:19,  3.07it/s]100%|█████████▉| 12717/12776 [2:14:01<00:18,  3.24it/s]                                                       100%|█████████▉| 12717/12776 [2:14:01<00:18,  3.24it/s]100%|█████████▉| 12718/12776 [2:14:01<00:17,  3.40it/s]                                                       100%|█████████▉| 12718/12776 [2:14:01<00:17,  3.40it/s]100%|█████████▉| 12719/12776 [2:14:01<00:17,  3.29it/s]                                                       100%|█████████▉| 12719/12776 [2:14:01<00:17,  3.29it/s]100%|█████████▉| 12720/12776 [2:14:02<00:16,  3.49it/s]                                                       100%|█████████▉| 12720/12776 [2:14:02<00:16,  3.49it/s]100%|█████████▉| 12721/12776 [2:14:02<00:15,  3.65it/s]                                                       100%|█████████▉| 12721/12776 [2:14:02<00:15,  3.65it/s]100%|█████████▉| 12722/12776 [2:14:02<00:14,  3.80it/s]                                                       100%|█████████▉| 12722/12776 [2:14:02<00:14,  3.80it/s]100%|█████████▉| 12723/12776 [2:14:02<00:13,  3.95it/s]                                                       100%|█████████▉| 12723/12776 [2:14:02<00:13,  3.95it/s]100%|█████████▉| 12724/12776 [2:14:03<00:13,  3.76it/s]                                                       100%|█████████▉| 12724/12776 [2:14:03<00:13,  3.76it/s]100%|█████████▉| 12725/12776 [2:14:03<00:12,  3.98it/s]                                                       100%|█████████▉| 12725/12776 [2:14:03<00:12,  3.98it/s]100%|█████████▉| 12726/12776 [2:14:03<00:12,  4.16it/s]                                                       100%|█████████▉| 12726/12776 [2:14:03<00:12,  4.16it/s]100%|█████████▉| 12727/12776 [2:14:03<00:11,  4.33it/s]                                                       100%|█████████▉| 12727/12776 [2:14:03<00:11,  4.33it/s]100%|█████████▉| 12728/12776 [2:14:03<00:10,  4.44it/s]                                                       100%|█████████▉| 12728/12776 [2:14:03<00:10,  4.44it/s]100%|█████████▉| 12729/12776 [2:14:04<00:11,  4.06it/s]                                                       100%|█████████▉| 12729/12776 [2:14:04<00:11,  4.06it/s]100%|█████████▉| 12730/12776 [2:14:04<00:10,  4.30it/s]                                                       100%|█████████▉| 12730/12776 [2:14:04<00:10,  4.30it/s]100%|█████████▉| 12731/12776 [2:14:04<00:10,  4.49it/s]                                                       100%|█████████▉| 12731/12776 [2:14:04<00:10,  4.49it/s]100%|█████████▉| 12732/12776 [2:14:04<00:09,  4.65it/s]                                                       100%|█████████▉| 12732/12776 [2:14:04<00:09,  4.65it/s]100%|█████████▉| 12733/12776 [2:14:04<00:08,  5.08it/s]                                                       100%|█████████▉| 12733/12776 [2:14:04<00:08,  5.08it/s]100%|█████████▉| 12734/12776 [2:14:05<00:09,  4.58it/s]                                                       100%|█████████▉| 12734/12776 [2:14:05<00:09,  4.58it/s]100%|█████████▉| 12735/12776 [2:14:05<00:08,  4.76it/s]                                                       100%|█████████▉| 12735/12776 [2:14:05<00:08,  4.76it/s]100%|█████████▉| 12736/12776 [2:14:05<00:08,  4.93it/s]                                                       100%|█████████▉| 12736/12776 [2:14:05<00:08,  4.93it/s]100%|█████████▉| 12737/12776 [2:14:05<00:07,  5.08it/s]                                                       100%|█████████▉| 12737/12776 [2:14:05<00:07,  5.08it/s]100%|█████████▉| 12738/12776 [2:14:06<00:13,  2.90it/s]                                                       100%|█████████▉| 12738/12776 [2:14:06<00:13,  2.90it/s]100%|█████████▉| 12739/12776 [2:14:07<00:25,  1.44it/s]                                                       100%|█████████▉| 12739/12776 [2:14:07<00:25,  1.44it/s]100%|█████████▉| 12740/12776 [2:14:08<00:26,  1.34it/s]                                                       100%|█████████▉| 12740/12776 [2:14:08<00:26,  1.34it/s]100%|█████████▉| 12741/12776 [2:14:09<00:26,  1.32it/s]                                                       100%|█████████▉| 12741/12776 [2:14:09<00:26,  1.32it/s]100%|█████████▉| 12742/12776 [2:14:10<00:25,  1.33it/s]                                                       100%|█████████▉| 12742/12776 [2:14:10<00:25,  1.33it/s]100%|█████████▉| 12743/12776 [2:14:11<00:23,  1.38it/s]                                                       100%|█████████▉| 12743/12776 [2:14:11<00:23,  1.38it/s]100%|█████████▉| 12744/12776 [2:14:11<00:22,  1.43it/s]                                                       100%|█████████▉| 12744/12776 [2:14:11<00:22,  1.43it/s]100%|█████████▉| 12745/12776 [2:14:12<00:22,  1.38it/s]                                                       100%|█████████▉| 12745/12776 [2:14:12<00:22,  1.38it/s]100%|█████████▉| 12746/12776 [2:14:13<00:20,  1.48it/s]                                                       100%|█████████▉| 12746/12776 [2:14:13<00:20,  1.48it/s]100%|█████████▉| 12747/12776 [2:14:13<00:19,  1.50it/s]                                                       100%|█████████▉| 12747/12776 [2:14:13<00:19,  1.50it/s]100%|█████████▉| 12748/12776 [2:14:14<00:17,  1.61it/s]                                                       100%|█████████▉| 12748/12776 [2:14:14<00:17,  1.61it/s]100%|█████████▉| 12749/12776 [2:14:14<00:16,  1.67it/s]                                                       100%|█████████▉| 12749/12776 [2:14:14<00:16,  1.67it/s]100%|█████████▉| 12750/12776 [2:14:15<00:14,  1.81it/s]                                                       100%|█████████▉| 12750/12776 [2:14:15<00:14,  1.81it/s]100%|█████████▉| 12751/12776 [2:14:15<00:13,  1.87it/s]                                                       100%|█████████▉| 12751/12776 [2:14:15<00:13,  1.87it/s]100%|█████████▉| 12752/12776 [2:14:16<00:11,  2.01it/s]                                                       100%|█████████▉| 12752/12776 [2:14:16<00:11,  2.01it/s]100%|█████████▉| 12753/12776 [2:14:16<00:10,  2.15it/s]                                                       100%|█████████▉| 12753/12776 [2:14:16<00:10,  2.15it/s]100%|█████████▉| 12754/12776 [2:14:16<00:10,  2.10it/s]                                                       100%|█████████▉| 12754/12776 [2:14:16<00:10,  2.10it/s]100%|█████████▉| 12755/12776 [2:14:17<00:09,  2.28it/s]                                                       100%|█████████▉| 12755/12776 [2:14:17<00:09,  2.28it/s]100%|█████████▉| 12756/12776 [2:14:17<00:08,  2.46it/s]                                                       100%|█████████▉| 12756/12776 [2:14:17<00:08,  2.46it/s]100%|█████████▉| 12757/12776 [2:14:18<00:07,  2.42it/s]                                                       100%|█████████▉| 12757/12776 [2:14:18<00:07,  2.42it/s]100%|█████████▉| 12758/12776 [2:14:18<00:06,  2.63it/s]                                                       100%|█████████▉| 12758/12776 [2:14:18<00:06,  2.63it/s]100%|█████████▉| 12759/12776 [2:14:18<00:06,  2.82it/s]                                                       100%|█████████▉| 12759/12776 [2:14:18<00:06,  2.82it/s]100%|█████████▉| 12760/12776 [2:14:19<00:05,  2.75it/s]                                                       100%|█████████▉| 12760/12776 [2:14:19<00:05,  2.75it/s]100%|█████████▉| 12761/12776 [2:14:19<00:05,  2.99it/s]                                                       100%|█████████▉| 12761/12776 [2:14:19<00:05,  2.99it/s]100%|█████████▉| 12762/12776 [2:14:19<00:04,  3.23it/s]                                                       100%|█████████▉| 12762/12776 [2:14:19<00:04,  3.23it/s]100%|█████████▉| 12763/12776 [2:14:19<00:03,  3.44it/s]                                                       100%|█████████▉| 12763/12776 [2:14:19<00:03,  3.44it/s]100%|█████████▉| 12764/12776 [2:14:20<00:03,  3.26it/s]                                                       100%|█████████▉| 12764/12776 [2:14:20<00:03,  3.26it/s]100%|█████████▉| 12765/12776 [2:14:20<00:03,  3.54it/s]                                                       100%|█████████▉| 12765/12776 [2:14:20<00:03,  3.54it/s]100%|█████████▉| 12766/12776 [2:14:20<00:02,  3.80it/s]                                                       100%|█████████▉| 12766/12776 [2:14:20<00:02,  3.80it/s]100%|█████████▉| 12767/12776 [2:14:20<00:02,  4.04it/s]                                                       100%|█████████▉| 12767/12776 [2:14:20<00:02,  4.04it/s]100%|█████████▉| 12768/12776 [2:14:21<00:01,  4.21it/s]                                                       100%|█████████▉| 12768/12776 [2:14:21<00:01,  4.21it/s]100%|█████████▉| 12769/12776 [2:14:21<00:01,  4.11it/s]                                                       100%|█████████▉| 12769/12776 [2:14:21<00:01,  4.11it/s]100%|█████████▉| 12770/12776 [2:14:21<00:01,  4.34it/s]                                                       100%|█████████▉| 12770/12776 [2:14:21<00:01,  4.34it/s]100%|█████████▉| 12771/12776 [2:14:21<00:01,  4.52it/s]                                                       100%|█████████▉| 12771/12776 [2:14:21<00:01,  4.52it/s]100%|█████████▉| 12772/12776 [2:14:21<00:00,  4.68it/s]                                                       100%|█████████▉| 12772/12776 [2:14:21<00:00,  4.68it/s]100%|█████████▉| 12773/12776 [2:14:22<00:00,  4.82it/s]                                                       100%|█████████▉| 12773/12776 [2:14:22<00:00,  4.82it/s]100%|█████████▉| 12774/12776 [2:14:22<00:00,  4.94it/s]                                                       100%|█████████▉| 12774/12776 [2:14:22<00:00,  4.94it/s]100%|█████████▉| 12775/12776 [2:14:22<00:00,  4.52it/s]                                                       100%|█████████▉| 12775/12776 [2:14:22<00:00,  4.52it/s]100%|██████████| 12776/12776 [2:14:22<00:00,  4.88it/s]                                                       100%|██████████| 12776/12776 [2:14:22<00:00,  4.88it/s]Saving model checkpoint to ./checkpoint-12776
+Configuration saved in ./checkpoint-12776/config.json
+Model weights saved in ./checkpoint-12776/model.safetensors
+Feature extractor saved in ./checkpoint-12776/preprocessor_config.json
+tokenizer config file saved in ./checkpoint-12776/tokenizer_config.json
+Special tokens file saved in ./checkpoint-12776/special_tokens_map.json
+added tokens file saved in ./checkpoint-12776/added_tokens.json
+Feature extractor saved in ./preprocessor_config.json
+tokenizer config file saved in ./tokenizer_config.json
+Special tokens file saved in ./special_tokens_map.json
+added tokens file saved in ./added_tokens.json
+Deleting older checkpoint [checkpoint-11600] due to args.save_total_limit
+
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+                                                       {'loss': 1.3595, 'grad_norm': 2.1755223274230957, 'learning_rate': 2.2971652003910067e-06, 'epoch': 1.99}
+{'loss': 1.396, 'grad_norm': 1.844046711921692, 'learning_rate': 2.2727272727272728e-06, 'epoch': 1.99}
+{'loss': 1.4409, 'grad_norm': 1.6896350383758545, 'learning_rate': 2.2482893450635384e-06, 'epoch': 1.99}
+{'loss': 1.3609, 'grad_norm': 2.31750226020813, 'learning_rate': 2.223851417399804e-06, 'epoch': 1.99}
+{'loss': 1.3918, 'grad_norm': 2.511352062225342, 'learning_rate': 2.19941348973607e-06, 'epoch': 1.99}
+{'loss': 1.3564, 'grad_norm': 1.6797364950180054, 'learning_rate': 2.174975562072336e-06, 'epoch': 1.99}
+{'loss': 1.4078, 'grad_norm': 3.3981409072875977, 'learning_rate': 2.150537634408602e-06, 'epoch': 1.99}
+{'loss': 1.2969, 'grad_norm': 1.422497272491455, 'learning_rate': 2.126099706744868e-06, 'epoch': 1.99}
+{'loss': 1.3743, 'grad_norm': 6.27196741104126, 'learning_rate': 2.101661779081134e-06, 'epoch': 1.99}
+{'loss': 1.1895, 'grad_norm': 2.9742422103881836, 'learning_rate': 2.0772238514173996e-06, 'epoch': 1.99}
+{'loss': 1.3513, 'grad_norm': 2.017789363861084, 'learning_rate': 2.0527859237536657e-06, 'epoch': 1.99}
+{'loss': 1.5357, 'grad_norm': 4.888723850250244, 'learning_rate': 2.0283479960899314e-06, 'epoch': 1.99}
+{'loss': 1.4441, 'grad_norm': 2.907987594604492, 'learning_rate': 2.0039100684261974e-06, 'epoch': 1.99}
+{'loss': 1.3473, 'grad_norm': 4.175799369812012, 'learning_rate': 1.979472140762463e-06, 'epoch': 1.99}
+{'loss': 1.4132, 'grad_norm': 1.9999713897705078, 'learning_rate': 1.955034213098729e-06, 'epoch': 1.99}
+{'loss': 1.2115, 'grad_norm': 2.5126242637634277, 'learning_rate': 1.930596285434995e-06, 'epoch': 1.99}
+{'loss': 1.1961, 'grad_norm': 1.8449431657791138, 'learning_rate': 1.9061583577712609e-06, 'epoch': 1.99}
+{'loss': 1.2299, 'grad_norm': 8.649978637695312, 'learning_rate': 1.8817204301075267e-06, 'epoch': 1.99}
+{'loss': 1.2318, 'grad_norm': 12.748600006103516, 'learning_rate': 1.8572825024437926e-06, 'epoch': 1.99}
+{'loss': 1.4941, 'grad_norm': 9.06901741027832, 'learning_rate': 1.8328445747800584e-06, 'epoch': 1.99}
+{'loss': 1.0969, 'grad_norm': 6.057063102722168, 'learning_rate': 1.8084066471163245e-06, 'epoch': 1.99}
+{'loss': 1.2691, 'grad_norm': 3.5154621601104736, 'learning_rate': 1.7839687194525904e-06, 'epoch': 1.99}
+{'loss': 1.0831, 'grad_norm': 3.144012451171875, 'learning_rate': 1.7595307917888562e-06, 'epoch': 1.99}
+{'loss': 1.0199, 'grad_norm': 7.510293483734131, 'learning_rate': 1.7350928641251219e-06, 'epoch': 1.99}
+{'loss': 1.3326, 'grad_norm': 3.6680872440338135, 'learning_rate': 1.7106549364613882e-06, 'epoch': 1.99}
+{'loss': 1.1472, 'grad_norm': 7.296219348907471, 'learning_rate': 1.6862170087976538e-06, 'epoch': 1.99}
+{'loss': 0.9459, 'grad_norm': 3.799044132232666, 'learning_rate': 1.6617790811339197e-06, 'epoch': 1.99}
+{'loss': 1.1241, 'grad_norm': 4.250785827636719, 'learning_rate': 1.6373411534701855e-06, 'epoch': 1.99}
+{'loss': 1.1708, 'grad_norm': 5.237523078918457, 'learning_rate': 1.6129032258064516e-06, 'epoch': 1.99}
+{'loss': 1.2067, 'grad_norm': 2.55849552154541, 'learning_rate': 1.5884652981427175e-06, 'epoch': 1.99}
+{'loss': 1.8229, 'grad_norm': 4.564910411834717, 'learning_rate': 1.5640273704789833e-06, 'epoch': 1.99}
+{'loss': 1.2861, 'grad_norm': 3.0857863426208496, 'learning_rate': 1.539589442815249e-06, 'epoch': 1.99}
+{'loss': 1.1077, 'grad_norm': 3.102818489074707, 'learning_rate': 1.5151515151515152e-06, 'epoch': 1.99}
+{'loss': 0.6339, 'grad_norm': 1.7498348951339722, 'learning_rate': 1.4907135874877809e-06, 'epoch': 1.99}
+{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.4907135874877809e-06, 'epoch': 1.99}
+{'loss': 0.9257, 'grad_norm': 8.903223991394043, 'learning_rate': 1.4662756598240468e-06, 'epoch': 1.99}
+{'loss': 1.1337, 'grad_norm': 9.738998413085938, 'learning_rate': 1.4418377321603126e-06, 'epoch': 1.99}
+{'loss': 0.5903, 'grad_norm': 2.1444084644317627, 'learning_rate': 1.4173998044965787e-06, 'epoch': 1.99}
+{'loss': 1.2444, 'grad_norm': 3.0119729042053223, 'learning_rate': 1.3929618768328445e-06, 'epoch': 1.99}
+{'loss': 1.2683, 'grad_norm': 4.827343940734863, 'learning_rate': 1.3685239491691104e-06, 'epoch': 1.99}
+{'loss': 1.4892, 'grad_norm': 1.7269351482391357, 'learning_rate': 1.3440860215053763e-06, 'epoch': 1.99}
+{'loss': 1.4841, 'grad_norm': 1.236871600151062, 'learning_rate': 1.3196480938416423e-06, 'epoch': 1.99}
+{'loss': 1.4703, 'grad_norm': 1.0202114582061768, 'learning_rate': 1.295210166177908e-06, 'epoch': 1.99}
+{'loss': 1.4379, 'grad_norm': 0.5978159308433533, 'learning_rate': 1.2707722385141738e-06, 'epoch': 1.99}
+{'loss': 1.3344, 'grad_norm': 1.8672462701797485, 'learning_rate': 1.2463343108504397e-06, 'epoch': 1.99}
+{'loss': 1.4433, 'grad_norm': 0.9997517466545105, 'learning_rate': 1.2218963831867058e-06, 'epoch': 1.99}
+{'loss': 1.4282, 'grad_norm': 1.3582754135131836, 'learning_rate': 1.1974584555229716e-06, 'epoch': 2.0}
+{'loss': 1.4486, 'grad_norm': 1.0937734842300415, 'learning_rate': 1.1730205278592375e-06, 'epoch': 2.0}
+{'loss': 1.3394, 'grad_norm': 0.7297368049621582, 'learning_rate': 1.1485826001955033e-06, 'epoch': 2.0}
+{'loss': 1.3208, 'grad_norm': 1.304893136024475, 'learning_rate': 1.1241446725317692e-06, 'epoch': 2.0}
+{'loss': 1.3768, 'grad_norm': 0.9732430577278137, 'learning_rate': 1.099706744868035e-06, 'epoch': 2.0}
+{'loss': 1.3748, 'grad_norm': 2.0447745323181152, 'learning_rate': 1.075268817204301e-06, 'epoch': 2.0}
+{'loss': 1.4384, 'grad_norm': 1.8304771184921265, 'learning_rate': 1.050830889540567e-06, 'epoch': 2.0}
+{'loss': 1.3156, 'grad_norm': 1.1020729541778564, 'learning_rate': 1.0263929618768329e-06, 'epoch': 2.0}
+{'loss': 1.2381, 'grad_norm': 2.1293833255767822, 'learning_rate': 1.0019550342130987e-06, 'epoch': 2.0}
+{'loss': 1.2771, 'grad_norm': 2.4541094303131104, 'learning_rate': 9.775171065493646e-07, 'epoch': 2.0}
+{'loss': 1.3604, 'grad_norm': 3.102612257003784, 'learning_rate': 9.530791788856304e-07, 'epoch': 2.0}
+{'loss': 1.2247, 'grad_norm': 1.3522253036499023, 'learning_rate': 9.286412512218963e-07, 'epoch': 2.0}
+{'loss': 1.3129, 'grad_norm': 4.309826850891113, 'learning_rate': 9.042033235581623e-07, 'epoch': 2.0}
+{'loss': 1.1901, 'grad_norm': 1.946575403213501, 'learning_rate': 8.797653958944281e-07, 'epoch': 2.0}
+{'loss': 1.0399, 'grad_norm': 2.3310208320617676, 'learning_rate': 8.553274682306941e-07, 'epoch': 2.0}
+{'loss': 1.1825, 'grad_norm': 1.9439860582351685, 'learning_rate': 8.308895405669598e-07, 'epoch': 2.0}
+{'loss': 1.2622, 'grad_norm': 2.59775972366333, 'learning_rate': 8.064516129032258e-07, 'epoch': 2.0}
+{'loss': 1.4661, 'grad_norm': 4.020702838897705, 'learning_rate': 7.820136852394917e-07, 'epoch': 2.0}
+{'loss': 1.2114, 'grad_norm': 4.491816997528076, 'learning_rate': 7.575757575757576e-07, 'epoch': 2.0}
+{'loss': 1.169, 'grad_norm': 3.0681772232055664, 'learning_rate': 7.331378299120234e-07, 'epoch': 2.0}
+{'loss': 1.5927, 'grad_norm': 11.430411338806152, 'learning_rate': 7.086999022482893e-07, 'epoch': 2.0}
+{'loss': 1.3744, 'grad_norm': 8.486896514892578, 'learning_rate': 6.842619745845552e-07, 'epoch': 2.0}
+{'loss': 1.1301, 'grad_norm': 14.378063201904297, 'learning_rate': 6.598240469208212e-07, 'epoch': 2.0}
+{'loss': 1.7693, 'grad_norm': 15.947986602783203, 'learning_rate': 6.353861192570869e-07, 'epoch': 2.0}
+{'loss': 1.2029, 'grad_norm': 10.535764694213867, 'learning_rate': 6.109481915933529e-07, 'epoch': 2.0}
+{'loss': 1.4126, 'grad_norm': 3.89440655708313, 'learning_rate': 5.865102639296187e-07, 'epoch': 2.0}
+{'loss': 1.1434, 'grad_norm': 2.050438642501831, 'learning_rate': 5.620723362658846e-07, 'epoch': 2.0}
+{'loss': 1.0368, 'grad_norm': 1.9969635009765625, 'learning_rate': 5.376344086021505e-07, 'epoch': 2.0}
+{'loss': 0.965, 'grad_norm': 4.676234722137451, 'learning_rate': 5.131964809384164e-07, 'epoch': 2.0}
+{'loss': 0.5821, 'grad_norm': 1.3593485355377197, 'learning_rate': 4.887585532746823e-07, 'epoch': 2.0}
+{'loss': 1.5815, 'grad_norm': 2.8748373985290527, 'learning_rate': 4.6432062561094814e-07, 'epoch': 2.0}
+{'loss': 1.2623, 'grad_norm': 4.058191299438477, 'learning_rate': 4.3988269794721406e-07, 'epoch': 2.0}
+100%|██████████| 12776/12776 [2:14:28<00:00,  4.88it/s]100%|██████████| 12776/12776 [2:14:28<00:00,  1.58it/s]
+Waiting for the current checkpoint push to be finished, this might take a couple of minutes.