pythia-70m_sciq / train_job_output.txt
Tristan's picture
Training in progress, epoch 0
cd5c9f8 verified
raw
history blame
60.2 kB
slurm submission log: 2024-05-26 22:30:16.581914
created following sbatch script:
###############################
#!/bin/bash
#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7653570
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-3137501
#SBATCH --mem=100G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0
# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
# cd to working directory
cd .
# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
###############################
submission to slurm complete!
###############################
slurm submission output
Submitted batch job 7653571
###############################
slurm submission log: 2024-05-26 22:32:57.495347
created following sbatch script:
###############################
#!/bin/bash
#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7653600
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-3075134
#SBATCH --mem=100G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0
# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
# cd to working directory
cd .
# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
###############################
submission to slurm complete!
###############################
slurm submission output
Submitted batch job 7653601
###############################
slurm submission log: 2024-05-26 22:58:09.787171
created following sbatch script:
###############################
#!/bin/bash
#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7653655
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-3775598
#SBATCH --mem=100G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0
# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
# cd to working directory
cd .
# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
###############################
submission to slurm complete!
###############################
slurm submission output
Submitted batch job 7653656
###############################
slurm submission log: 2024-05-26 23:16:43.398883
created following sbatch script:
###############################
#!/bin/bash
#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7653712
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-3360635
#SBATCH --mem=100G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0
# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
# cd to working directory
cd .
# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
###############################
submission to slurm complete!
###############################
slurm submission output
Submitted batch job 7653713
###############################
###############################
start time: 2024-05-27 10:05:46.837699
machine: sphinx2
conda env: pretraining-coreset-selection
###############################
running following processes
torchrun --master_port 29509 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14
###############################
command outputs:
[2024-05-27 10:05:53,482] torch.distributed.run: [WARNING]
[2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] *****************************************
[2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-05-27 10:05:53,482] torch.distributed.run: [WARNING] *****************************************
05/27/2024 10:06:12 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
05/27/2024 10:06:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_projection/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
0%| | 0/10682 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
0%| | 1/10682 [00:10<31:15:36, 10.54s/it] 0%| | 2/10682 [00:14<20:14:46, 6.82s/it] 0%| | 3/10682 [00:18<16:10:01, 5.45s/it] 0%| | 4/10682 [00:20<12:28:20, 4.20s/it] 0%| | 5/10682 [00:23<10:46:51, 3.64s/it] 0%| | 6/10682 [00:25<9:36:45, 3.24s/it] 0%| | 7/10682 [00:27<8:11:25, 2.76s/it] 0%| | 8/10682 [00:29<7:34:52, 2.56s/it] 0%| | 9/10682 [00:31<6:42:37, 2.26s/it] 0%| | 10/10682 [00:33<6:07:18, 2.07s/it] 0%| | 11/10682 [00:34<5:22:06, 1.81s/it] 0%| | 12/10682 [00:35<5:02:01, 1.70s/it] 0%| | 13/10682 [00:36<4:34:37, 1.54s/it] 0%| | 14/10682 [00:38<4:13:15, 1.42s/it] 0%| | 15/10682 [00:39<3:53:25, 1.31s/it] 0%| | 16/10682 [00:40<3:32:44, 1.20s/it] 0%| | 17/10682 [00:41<3:18:22, 1.12s/it] 0%| | 18/10682 [00:41<3:00:59, 1.02s/it] 0%| | 19/10682 [00:42<3:03:58, 1.04s/it] 0%| | 20/10682 [00:43<2:51:46, 1.03it/s] 0%| | 21/10682 [00:44<2:42:19, 1.09it/s] 0%| | 22/10682 [00:45<2:33:12, 1.16it/s] 0%| | 23/10682 [00:45<2:24:02, 1.23it/s] 0%| | 24/10682 [00:46<2:16:31, 1.30it/s] 0%| | 25/10682 [00:47<2:14:11, 1.32it/s]{'loss': 10.6474, 'grad_norm': 1.4571192264556885, 'learning_rate': 2.3386342376052384e-05, 'epoch': 0.03}
0%| | 25/10682 [00:47<2:14:11, 1.32it/s] 0%| | 26/10682 [00:48<2:19:36, 1.27it/s] 0%| | 27/10682 [00:48<2:11:34, 1.35it/s] 0%| | 28/10682 [00:49<2:07:07, 1.40it/s] 0%| | 29/10682 [00:50<2:02:26, 1.45it/s] 0%| | 30/10682 [00:50<1:55:59, 1.53it/s] 0%| | 31/10682 [00:51<1:51:27, 1.59it/s] 0%| | 32/10682 [00:51<1:49:26, 1.62it/s] 0%| | 33/10682 [00:52<1:50:08, 1.61it/s] 0%| | 34/10682 [00:53<1:48:17, 1.64it/s] 0%| | 35/10682 [00:53<1:57:44, 1.51it/s] 0%| | 36/10682 [00:54<1:53:55, 1.56it/s] 0%| | 37/10682 [00:54<1:48:19, 1.64it/s] 0%| | 38/10682 [00:55<1:47:36, 1.65it/s] 0%| | 39/10682 [00:56<1:45:07, 1.69it/s] 0%| | 40/10682 [00:56<1:43:20, 1.72it/s] 0%| | 41/10682 [00:57<1:40:35, 1.76it/s] 0%| | 42/10682 [00:57<1:42:26, 1.73it/s] 0%| | 43/10682 [00:58<1:40:02, 1.77it/s] 0%| | 44/10682 [00:58<1:38:14, 1.80it/s] 0%| | 45/10682 [00:59<1:38:18, 1.80it/s] 0%| | 46/10682 [00:59<1:36:53, 1.83it/s] 0%| | 47/10682 [01:00<1:34:36, 1.87it/s] 0%| | 48/10682 [01:00<1:32:40, 1.91it/s] 0%| | 49/10682 [01:01<1:31:16, 1.94it/s] 0%| | 50/10682 [01:01<1:30:30, 1.96it/s] {'loss': 9.9024, 'grad_norm': 1.3314062356948853, 'learning_rate': 4.677268475210477e-05, 'epoch': 0.07}
0%| | 50/10682 [01:01<1:30:30, 1.96it/s] 0%| | 51/10682 [01:02<1:29:48, 1.97it/s] 0%| | 52/10682 [01:02<1:30:13, 1.96it/s] 0%| | 53/10682 [01:03<1:31:21, 1.94it/s] 1%| | 54/10682 [01:04<1:31:20, 1.94it/s] 1%| | 55/10682 [01:04<1:31:27, 1.94it/s] 1%| | 56/10682 [01:05<1:32:24, 1.92it/s] 1%| | 57/10682 [01:05<1:32:34, 1.91it/s] 1%| | 58/10682 [01:06<1:32:51, 1.91it/s] 1%| | 59/10682 [01:06<1:33:54, 1.89it/s] 1%| | 60/10682 [01:07<1:32:35, 1.91it/s] 1%| | 61/10682 [01:07<1:32:23, 1.92it/s] 1%| | 62/10682 [01:08<1:32:03, 1.92it/s] 1%| | 63/10682 [01:08<1:31:01, 1.94it/s] 1%| | 64/10682 [01:09<1:30:28, 1.96it/s] 1%| | 65/10682 [01:09<1:31:04, 1.94it/s] 1%| | 66/10682 [01:10<1:30:05, 1.96it/s] 1%| | 67/10682 [01:10<1:29:44, 1.97it/s] 1%| | 68/10682 [01:11<1:29:09, 1.98it/s] 1%| | 69/10682 [01:11<1:29:27, 1.98it/s] 1%| | 70/10682 [01:12<1:29:05, 1.99it/s] 1%| | 71/10682 [01:12<1:28:39, 1.99it/s] 1%| | 72/10682 [01:13<1:28:23, 2.00it/s] 1%| | 73/10682 [01:13<1:28:11, 2.00it/s] 1%| | 74/10682 [01:14<1:27:54, 2.01it/s] 1%| | 75/10682 [01:14<1:27:45, 2.01it/s]{'loss': 9.1814, 'grad_norm': 1.1333130598068237, 'learning_rate': 7.015902712815715e-05, 'epoch': 0.1}
1%| | 75/10682 [01:14<1:27:45, 2.01it/s] 1%| | 76/10682 [01:15<1:27:54, 2.01it/s] 1%| | 77/10682 [01:15<1:27:58, 2.01it/s] 1%| | 78/10682 [01:16<1:27:47, 2.01it/s] 1%| | 79/10682 [01:16<1:28:03, 2.01it/s] 1%| | 80/10682 [01:17<1:28:44, 1.99it/s] 1%| | 81/10682 [01:17<1:28:29, 2.00it/s] 1%| | 82/10682 [01:18<1:28:28, 2.00it/s] 1%| | 83/10682 [01:18<1:28:05, 2.01it/s] 1%| | 84/10682 [01:19<1:27:51, 2.01it/s] 1%| | 85/10682 [01:19<1:28:10, 2.00it/s] 1%| | 86/10682 [01:20<1:27:57, 2.01it/s] 1%| | 87/10682 [01:20<1:27:43, 2.01it/s] 1%| | 88/10682 [01:21<1:27:33, 2.02it/s] 1%| | 89/10682 [01:21<1:27:19, 2.02it/s] 1%| | 90/10682 [01:22<1:27:12, 2.02it/s] 1%| | 91/10682 [01:22<1:27:49, 2.01it/s] 1%| | 92/10682 [01:23<1:28:05, 2.00it/s] 1%| | 93/10682 [01:23<1:28:10, 2.00it/s] 1%| | 94/10682 [01:24<1:28:24, 2.00it/s] 1%| | 95/10682 [01:24<1:28:01, 2.00it/s] 1%| | 96/10682 [01:25<1:27:40, 2.01it/s] 1%| | 97/10682 [01:25<1:27:25, 2.02it/s] 1%| | 98/10682 [01:26<1:27:21, 2.02it/s] 1%| | 99/10682 [01:26<1:27:12, 2.02it/s] 1%| | 100/10682 [01:27<1:27:06, 2.02it/s]{'loss': 8.3876, 'grad_norm': 0.7866231203079224, 'learning_rate': 9.354536950420954e-05, 'epoch': 0.13}
1%| | 100/10682 [01:27<1:27:06, 2.02it/s] 1%| | 101/10682 [01:27<1:27:09, 2.02it/s] 1%| | 102/10682 [01:28<1:27:08, 2.02it/s] 1%| | 103/10682 [01:28<1:27:30, 2.01it/s] 1%| | 104/10682 [01:29<1:27:34, 2.01it/s] 1%| | 105/10682 [01:29<1:27:19, 2.02it/s] 1%| | 106/10682 [01:30<1:27:22, 2.02it/s] 1%| | 107/10682 [01:30<1:27:09, 2.02it/s] 1%| | 108/10682 [01:31<1:27:05, 2.02it/s] 1%| | 109/10682 [01:31<1:27:29, 2.01it/s] 1%| | 110/10682 [01:32<1:27:16, 2.02it/s] 1%| | 111/10682 [01:32<1:27:07, 2.02it/s] 1%| | 112/10682 [01:33<1:27:02, 2.02it/s] 1%| | 113/10682 [01:33<1:26:58, 2.03it/s] 1%| | 114/10682 [01:34<1:26:56, 2.03it/s] 1%| | 115/10682 [01:34<1:26:54, 2.03it/s] 1%| | 116/10682 [01:35<1:26:57, 2.03it/s] 1%| | 117/10682 [01:35<1:26:48, 2.03it/s] 1%| | 118/10682 [01:36<1:26:48, 2.03it/s] 1%| | 119/10682 [01:36<1:26:53, 2.03it/s] 1%| | 120/10682 [01:37<1:26:55, 2.03it/s] 1%| | 121/10682 [01:37<1:26:50, 2.03it/s] 1%| | 122/10682 [01:38<1:26:44, 2.03it/s] 1%| | 123/10682 [01:38<1:26:48, 2.03it/s] 1%| | 124/10682 [01:39<1:26:53, 2.03it/s] 1%| | 125/10682 [01:39<1:26:54, 2.02it/s] {'loss': 7.7383, 'grad_norm': 0.45621567964553833, 'learning_rate': 0.00011693171188026193, 'epoch': 0.16}
1%| | 125/10682 [01:39<1:26:54, 2.02it/s] 1%| | 126/10682 [01:40<1:27:25, 2.01it/s] 1%| | 127/10682 [01:40<1:27:14, 2.02it/s] 1%| | 128/10682 [01:40<1:27:05, 2.02it/s] 1%| | 129/10682 [01:41<1:27:24, 2.01it/s] 1%| | 130/10682 [01:41<1:27:12, 2.02it/s] 1%| | 131/10682 [01:42<1:26:58, 2.02it/s] 1%| | 132/10682 [01:42<1:26:57, 2.02it/s] 1%| | 133/10682 [01:43<1:26:47, 2.03it/s] 1%|▏ | 134/10682 [01:43<1:26:42, 2.03it/s] 1%|▏ | 135/10682 [01:44<1:26:39, 2.03it/s] 1%|▏ | 136/10682 [01:44<1:26:37, 2.03it/s] 1%|▏ | 137/10682 [01:45<1:26:48, 2.02it/s] 1%|▏ | 138/10682 [01:45<1:26:44, 2.03it/s] 1%|▏ | 139/10682 [01:46<1:26:42, 2.03it/s] 1%|▏ | 140/10682 [01:46<1:26:41, 2.03it/s] 1%|▏ | 141/10682 [01:47<1:26:36, 2.03it/s] 1%|▏ | 142/10682 [01:47<1:26:37, 2.03it/s] 1%|▏ | 143/10682 [01:48<1:26:41, 2.03it/s] 1%|▏ | 144/10682 [01:48<1:26:36, 2.03it/s] 1%|▏ | 145/10682 [01:49<1:26:27, 2.03it/s] 1%|▏ | 146/10682 [01:49<1:26:27, 2.03it/s] 1%|▏ | 147/10682 [01:50<1:26:27, 2.03it/s] 1%|▏ | 148/10682 [01:50<1:26:20, 2.03it/s] 1%|▏ | 149/10682 [01:51<1:26:20, 2.03it/s] 1%|▏ | 150/10682 [01:51<1:26:23, 2.03it/s] {'loss': 7.2714, 'grad_norm': 0.5800231099128723, 'learning_rate': 0.0001403180542563143, 'epoch': 0.2}
1%|▏ | 150/10682 [01:51<1:26:23, 2.03it/s] 1%|▏ | 151/10682 [01:52<1:26:26, 2.03it/s] 1%|▏ | 152/10682 [01:52<1:26:25, 2.03it/s] 1%|▏ | 153/10682 [01:53<1:26:23, 2.03it/s] 1%|▏ | 154/10682 [01:53<1:26:22, 2.03it/s] 1%|▏ | 155/10682 [01:54<1:26:24, 2.03it/s] 1%|▏ | 156/10682 [01:54<1:26:22, 2.03it/s] 1%|▏ | 157/10682 [01:55<1:26:22, 2.03it/s] 1%|▏ | 158/10682 [01:55<1:26:15, 2.03it/s] 1%|▏ | 159/10682 [01:56<1:26:19, 2.03it/s] 1%|▏ | 160/10682 [01:56<1:26:19, 2.03it/s] 2%|▏ | 161/10682 [01:57<1:26:12, 2.03it/s] 2%|▏ | 162/10682 [01:57<1:26:21, 2.03it/s] 2%|▏ | 163/10682 [01:58<1:26:19, 2.03it/s] 2%|▏ | 164/10682 [01:58<1:26:15, 2.03it/s] 2%|▏ | 165/10682 [01:59<1:26:17, 2.03it/s] 2%|▏ | 166/10682 [01:59<1:26:13, 2.03it/s] 2%|▏ | 167/10682 [02:00<1:26:11, 2.03it/s] 2%|▏ | 168/10682 [02:00<1:26:15, 2.03it/s] 2%|▏ | 169/10682 [02:01<1:26:10, 2.03it/s] 2%|▏ | 170/10682 [02:01<1:26:08, 2.03it/s] 2%|▏ | 171/10682 [02:02<1:26:09, 2.03it/s] 2%|▏ | 172/10682 [02:02<1:26:10, 2.03it/s] 2%|▏ | 173/10682 [02:03<1:26:14, 2.03it/s] 2%|▏ | 174/10682 [02:03<1:26:14, 2.03it/s] 2%|▏ | 175/10682 [02:04<1:26:13, 2.03it/s] {'loss': 6.8893, 'grad_norm': 0.4449046552181244, 'learning_rate': 0.00016370439663236668, 'epoch': 0.23}
2%|▏ | 175/10682 [02:04<1:26:13, 2.03it/s] 2%|▏ | 176/10682 [02:04<1:26:19, 2.03it/s] 2%|▏ | 177/10682 [02:05<1:26:15, 2.03it/s] 2%|▏ | 178/10682 [02:05<1:26:20, 2.03it/s] 2%|▏ | 179/10682 [02:06<1:26:15, 2.03it/s] 2%|▏ | 180/10682 [02:06<1:26:18, 2.03it/s] 2%|▏ | 181/10682 [02:07<1:26:12, 2.03it/s] 2%|▏ | 182/10682 [02:07<1:26:06, 2.03it/s] 2%|▏ | 183/10682 [02:08<1:26:10, 2.03it/s] 2%|▏ | 184/10682 [02:08<1:26:05, 2.03it/s] 2%|▏ | 185/10682 [02:09<1:26:03, 2.03it/s] 2%|▏ | 186/10682 [02:09<1:26:07, 2.03it/s] 2%|▏ | 187/10682 [02:10<1:26:09, 2.03it/s] 2%|▏ | 188/10682 [02:10<1:26:09, 2.03it/s] 2%|▏ | 189/10682 [02:11<1:26:04, 2.03it/s] 2%|▏ | 190/10682 [02:11<1:26:04, 2.03it/s] 2%|▏ | 191/10682 [02:12<1:26:02, 2.03it/s] 2%|▏ | 192/10682 [02:12<1:26:08, 2.03it/s] 2%|▏ | 193/10682 [02:13<1:26:08, 2.03it/s] 2%|▏ | 194/10682 [02:13<1:26:04, 2.03it/s] 2%|▏ | 195/10682 [02:14<1:26:03, 2.03it/s] 2%|▏ | 196/10682 [02:14<1:26:04, 2.03it/s] 2%|▏ | 197/10682 [02:14<1:26:01, 2.03it/s] 2%|▏ | 198/10682 [02:15<1:26:02, 2.03it/s] 2%|▏ | 199/10682 [02:15<1:26:00, 2.03it/s] 2%|▏ | 200/10682 [02:16<1:26:03, 2.03it/s]{'loss': 6.5708, 'grad_norm': 0.5720486640930176, 'learning_rate': 0.00018709073900841907, 'epoch': 0.26}
2%|▏ | 200/10682 [02:16<1:26:03, 2.03it/s] 2%|▏ | 201/10682 [02:16<1:26:30, 2.02it/s] 2%|▏ | 202/10682 [02:17<1:26:20, 2.02it/s] 2%|▏ | 203/10682 [02:17<1:26:18, 2.02it/s] 2%|▏ | 204/10682 [02:18<1:26:14, 2.02it/s] 2%|▏ | 205/10682 [02:18<1:26:12, 2.03it/s] 2%|▏ | 206/10682 [02:19<1:26:07, 2.03it/s] 2%|▏ | 207/10682 [02:19<1:26:08, 2.03it/s] 2%|▏ | 208/10682 [02:20<1:26:04, 2.03it/s] 2%|▏ | 209/10682 [02:20<1:25:59, 2.03it/s] 2%|▏ | 210/10682 [02:21<1:25:59, 2.03it/s] 2%|▏ | 211/10682 [02:21<1:25:57, 2.03it/s] 2%|▏ | 212/10682 [02:22<1:25:55, 2.03it/s] 2%|▏ | 213/10682 [02:22<1:25:53, 2.03it/s] 2%|▏ | 214/10682 [02:23<1:26:00, 2.03it/s] 2%|▏ | 215/10682 [02:23<1:25:59, 2.03it/s] 2%|▏ | 216/10682 [02:24<1:25:58, 2.03it/s] 2%|▏ | 217/10682 [02:24<1:26:01, 2.03it/s] 2%|▏ | 218/10682 [02:25<1:25:56, 2.03it/s] 2%|▏ | 219/10682 [02:25<1:25:57, 2.03it/s] 2%|▏ | 220/10682 [02:26<1:25:53, 2.03it/s] 2%|▏ | 221/10682 [02:26<1:25:57, 2.03it/s] 2%|▏ | 222/10682 [02:27<1:25:52, 2.03it/s] 2%|▏ | 223/10682 [02:27<1:25:49, 2.03it/s] 2%|▏ | 224/10682 [02:28<1:25:52, 2.03it/s] 2%|▏ | 225/10682 [02:28<1:25:53, 2.03it/s]{'loss': 6.3201, 'grad_norm': 0.6047703623771667, 'learning_rate': 0.00021047708138447147, 'epoch': 0.29}
2%|▏ | 225/10682 [02:28<1:25:53, 2.03it/s] 2%|▏ | 226/10682 [02:29<1:26:00, 2.03it/s] 2%|▏ | 227/10682 [02:29<1:25:56, 2.03it/s] 2%|▏ | 228/10682 [02:30<1:25:59, 2.03it/s] 2%|▏ | 229/10682 [02:30<1:25:56, 2.03it/s] 2%|▏ | 230/10682 [02:31<1:25:58, 2.03it/s] 2%|▏ | 231/10682 [02:31<1:25:57, 2.03it/s] 2%|▏ | 232/10682 [02:32<1:25:55, 2.03it/s] 2%|▏ | 233/10682 [02:32<1:25:58, 2.03it/s] 2%|▏ | 234/10682 [02:33<1:25:51, 2.03it/s] 2%|▏ | 235/10682 [02:33<1:25:50, 2.03it/s] 2%|▏ | 236/10682 [02:34<1:25:44, 2.03it/s] 2%|▏ | 237/10682 [02:34<1:25:48, 2.03it/s] 2%|▏ | 238/10682 [02:35<1:25:45, 2.03it/s] 2%|▏ | 239/10682 [02:35<1:25:43, 2.03it/s] 2%|▏ | 240/10682 [02:36<1:25:47, 2.03it/s] 2%|▏ | 241/10682 [02:36<1:25:39, 2.03it/s] 2%|▏ | 242/10682 [02:37<1:25:40, 2.03it/s] 2%|▏ | 243/10682 [02:37<1:25:38, 2.03it/s] 2%|▏ | 244/10682 [02:38<1:25:36, 2.03it/s] 2%|▏ | 245/10682 [02:38<1:25:39, 2.03it/s] 2%|▏ | 246/10682 [02:39<1:25:40, 2.03it/s] 2%|▏ | 247/10682 [02:39<1:25:42, 2.03it/s] 2%|▏ | 248/10682 [02:40<1:25:40, 2.03it/s] 2%|▏ | 249/10682 [02:40<1:25:39, 2.03it/s] 2%|▏ | 250/10682 [02:41<1:25:40, 2.03it/s]{'loss': 6.1126, 'grad_norm': 0.669146716594696, 'learning_rate': 0.00023386342376052386, 'epoch': 0.33}
2%|▏ | 250/10682 [02:41<1:25:40, 2.03it/s] 2%|▏ | 251/10682 [02:41<1:25:41, 2.03it/s] 2%|▏ | 252/10682 [02:42<1:25:50, 2.03it/s] 2%|▏ | 253/10682 [02:42<1:25:42, 2.03it/s] 2%|▏ | 254/10682 [02:43<1:25:40, 2.03it/s] 2%|▏ | 255/10682 [02:43<1:25:42, 2.03it/s] 2%|▏ | 256/10682 [02:44<1:25:43, 2.03it/s] 2%|▏ | 257/10682 [02:44<1:25:40, 2.03it/s] 2%|▏ | 258/10682 [02:45<1:25:35, 2.03it/s] 2%|▏ | 259/10682 [02:45<1:25:32, 2.03it/s] 2%|▏ | 260/10682 [02:46<1:25:35, 2.03it/s] 2%|▏ | 261/10682 [02:46<1:25:36, 2.03it/s] 2%|▏ | 262/10682 [02:47<1:25:38, 2.03it/s] 2%|▏ | 263/10682 [02:47<1:25:35, 2.03it/s] 2%|▏ | 264/10682 [02:48<1:25:35, 2.03it/s] 2%|▏ | 265/10682 [02:48<1:25:32, 2.03it/s] 2%|▏ | 266/10682 [02:49<1:25:32, 2.03it/s] 2%|▏ | 267/10682 [02:49<1:25:34, 2.03it/s] 3%|β–Ž | 268/10682 [02:49<1:25:38, 2.03it/s] 3%|β–Ž | 269/10682 [02:50<1:25:30, 2.03it/s] 3%|β–Ž | 270/10682 [02:50<1:25:32, 2.03it/s] 3%|β–Ž | 271/10682 [02:51<1:25:29, 2.03it/s] 3%|β–Ž | 272/10682 [02:51<1:25:34, 2.03it/s] 3%|β–Ž | 273/10682 [02:52<1:25:29, 2.03it/s] 3%|β–Ž | 274/10682 [02:52<1:25:25, 2.03it/s] 3%|β–Ž | 275/10682 [02:53<1:25:27, 2.03it/s]{'loss': 5.9336, 'grad_norm': 1.123867154121399, 'learning_rate': 0.00025724976613657625, 'epoch': 0.36}
3%|β–Ž | 275/10682 [02:53<1:25:27, 2.03it/s] 3%|β–Ž | 276/10682 [02:53<1:25:30, 2.03it/s] 3%|β–Ž | 277/10682 [02:54<1:25:29, 2.03it/s] 3%|β–Ž | 278/10682 [02:54<1:25:25, 2.03it/s] 3%|β–Ž | 279/10682 [02:55<1:25:35, 2.03it/s] 3%|β–Ž | 280/10682 [02:55<1:25:30, 2.03it/s] 3%|β–Ž | 281/10682 [02:56<1:25:26, 2.03it/s] 3%|β–Ž | 282/10682 [02:56<1:25:31, 2.03it/s] 3%|β–Ž | 283/10682 [02:57<1:25:28, 2.03it/s] 3%|β–Ž | 284/10682 [02:57<1:25:32, 2.03it/s] 3%|β–Ž | 285/10682 [02:58<1:25:27, 2.03it/s] 3%|β–Ž | 286/10682 [02:58<1:25:30, 2.03it/s] 3%|β–Ž | 287/10682 [02:59<1:25:26, 2.03it/s] 3%|β–Ž | 288/10682 [02:59<1:25:30, 2.03it/s] 3%|β–Ž | 289/10682 [03:00<1:25:25, 2.03it/s] 3%|β–Ž | 290/10682 [03:00<1:25:25, 2.03it/s] 3%|β–Ž | 291/10682 [03:01<1:25:35, 2.02it/s] 3%|β–Ž | 292/10682 [03:01<1:25:33, 2.02it/s] 3%|β–Ž | 293/10682 [03:02<1:25:30, 2.02it/s] 3%|β–Ž | 294/10682 [03:02<1:25:23, 2.03it/s] 3%|β–Ž | 295/10682 [03:03<1:25:25, 2.03it/s] 3%|β–Ž | 296/10682 [03:03<1:25:20, 2.03it/s] 3%|β–Ž | 297/10682 [03:04<1:25:19, 2.03it/s] 3%|β–Ž | 298/10682 [03:04<1:25:20, 2.03it/s] 3%|β–Ž | 299/10682 [03:05<1:25:15, 2.03it/s] 3%|β–Ž | 300/10682 [03:05<1:25:17, 2.03it/s]{'loss': 5.7914, 'grad_norm': 0.8035856485366821, 'learning_rate': 0.0002806361085126286, 'epoch': 0.39}
3%|β–Ž | 300/10682 [03:05<1:25:17, 2.03it/s] 3%|β–Ž | 301/10682 [03:06<1:25:18, 2.03it/s] 3%|β–Ž | 302/10682 [03:06<1:25:20, 2.03it/s] 3%|β–Ž | 303/10682 [03:07<1:25:21, 2.03it/s] 3%|β–Ž | 304/10682 [03:07<1:25:21, 2.03it/s] 3%|β–Ž | 305/10682 [03:08<1:25:25, 2.02it/s] 3%|β–Ž | 306/10682 [03:08<1:25:18, 2.03it/s] 3%|β–Ž | 307/10682 [03:09<1:25:21, 2.03it/s] 3%|β–Ž | 308/10682 [03:09<1:25:23, 2.02it/s] 3%|β–Ž | 309/10682 [03:10<1:25:23, 2.02it/s] 3%|β–Ž | 310/10682 [03:10<1:25:18, 2.03it/s] 3%|β–Ž | 311/10682 [03:11<1:25:10, 2.03it/s] 3%|β–Ž | 312/10682 [03:11<1:25:10, 2.03it/s] 3%|β–Ž | 313/10682 [03:12<1:25:12, 2.03it/s] 3%|β–Ž | 314/10682 [03:12<1:25:18, 2.03it/s] 3%|β–Ž | 315/10682 [03:13<1:25:17, 2.03it/s] 3%|β–Ž | 316/10682 [03:13<1:25:18, 2.03it/s] 3%|β–Ž | 317/10682 [03:14<1:25:13, 2.03it/s] 3%|β–Ž | 318/10682 [03:14<1:25:15, 2.03it/s] 3%|β–Ž | 319/10682 [03:15<1:25:10, 2.03it/s] 3%|β–Ž | 320/10682 [03:15<1:25:11, 2.03it/s] 3%|β–Ž | 321/10682 [03:16<1:25:08, 2.03it/s] 3%|β–Ž | 322/10682 [03:16<1:25:05, 2.03it/s] 3%|β–Ž | 323/10682 [03:17<1:25:09, 2.03it/s] 3%|β–Ž | 324/10682 [03:17<1:25:04, 2.03it/s] 3%|β–Ž | 325/10682 [03:18<1:25:09, 2.03it/s] {'loss': 5.6603, 'grad_norm': 0.6922128200531006, 'learning_rate': 0.00030402245088868103, 'epoch': 0.43}
3%|β–Ž | 325/10682 [03:18<1:25:09, 2.03it/s] 3%|β–Ž | 326/10682 [03:18<1:25:11, 2.03it/s] 3%|β–Ž | 327/10682 [03:19<1:25:12, 2.03it/s] 3%|β–Ž | 328/10682 [03:19<1:25:06, 2.03it/s] 3%|β–Ž | 329/10682 [03:20<1:25:06, 2.03it/s] 3%|β–Ž | 330/10682 [03:20<1:25:07, 2.03it/s] 3%|β–Ž | 331/10682 [03:21<1:25:14, 2.02it/s] 3%|β–Ž | 332/10682 [03:21<1:25:10, 2.03it/s] 3%|β–Ž | 333/10682 [03:22<1:25:08, 2.03it/s] 3%|β–Ž | 334/10682 [03:22<1:25:05, 2.03it/s] 3%|β–Ž | 335/10682 [03:23<1:25:04, 2.03it/s] 3%|β–Ž | 336/10682 [03:23<1:25:05, 2.03it/s] 3%|β–Ž | 337/10682 [03:24<1:25:01, 2.03it/s] 3%|β–Ž | 338/10682 [03:24<1:25:01, 2.03it/s] 3%|β–Ž | 339/10682 [03:25<1:24:55, 2.03it/s] 3%|β–Ž | 340/10682 [03:25<1:24:59, 2.03it/s] 3%|β–Ž | 341/10682 [03:25<1:24:55, 2.03it/s] 3%|β–Ž | 342/10682 [03:26<1:24:58, 2.03it/s] 3%|β–Ž | 343/10682 [03:26<1:24:58, 2.03it/s] 3%|β–Ž | 344/10682 [03:27<1:24:57, 2.03it/s] 3%|β–Ž | 345/10682 [03:27<1:24:53, 2.03it/s] 3%|β–Ž | 346/10682 [03:28<1:24:51, 2.03it/s] 3%|β–Ž | 347/10682 [03:28<1:24:51, 2.03it/s] 3%|β–Ž | 348/10682 [03:29<1:24:55, 2.03it/s] 3%|β–Ž | 349/10682 [03:29<1:24:55, 2.03it/s] 3%|β–Ž | 350/10682 [03:30<1:24:57, 2.03it/s] {'loss': 5.5384, 'grad_norm': 0.8799753785133362, 'learning_rate': 0.00032740879326473337, 'epoch': 0.46}
3%|β–Ž | 350/10682 [03:30<1:24:57, 2.03it/s] 3%|β–Ž | 351/10682 [03:30<1:25:01, 2.03it/s] 3%|β–Ž | 352/10682 [03:31<1:25:03, 2.02it/s] 3%|β–Ž | 353/10682 [03:31<1:25:00, 2.03it/s] 3%|β–Ž | 354/10682 [03:32<1:24:54, 2.03it/s] 3%|β–Ž | 355/10682 [03:32<1:24:53, 2.03it/s] 3%|β–Ž | 356/10682 [03:33<1:24:48, 2.03it/s] 3%|β–Ž | 357/10682 [03:33<1:24:49, 2.03it/s] 3%|β–Ž | 358/10682 [03:34<1:24:48, 2.03it/s] 3%|β–Ž | 359/10682 [03:34<1:24:44, 2.03it/s] 3%|β–Ž | 360/10682 [03:35<1:24:46, 2.03it/s] 3%|β–Ž | 361/10682 [03:35<1:24:45, 2.03it/s] 3%|β–Ž | 362/10682 [03:36<1:24:46, 2.03it/s] 3%|β–Ž | 363/10682 [03:36<1:24:49, 2.03it/s] 3%|β–Ž | 364/10682 [03:37<1:24:47, 2.03it/s] 3%|β–Ž | 365/10682 [03:37<1:24:46, 2.03it/s] 3%|β–Ž | 366/10682 [03:38<1:24:47, 2.03it/s] 3%|β–Ž | 367/10682 [03:38<1:24:45, 2.03it/s] 3%|β–Ž | 368/10682 [03:39<1:24:40, 2.03it/s] 3%|β–Ž | 369/10682 [03:39<1:24:45, 2.03it/s] 3%|β–Ž | 370/10682 [03:40<1:24:43, 2.03it/s] 3%|β–Ž | 371/10682 [03:40<1:24:47, 2.03it/s] 3%|β–Ž | 372/10682 [03:41<1:24:44, 2.03it/s] 3%|β–Ž | 373/10682 [03:41<1:24:42, 2.03it/s] 4%|β–Ž | 374/10682 [03:42<1:24:46, 2.03it/s] 4%|β–Ž | 375/10682 [03:42<1:24:38, 2.03it/s] {'loss': 5.4296, 'grad_norm': 0.642095685005188, 'learning_rate': 0.0003507951356407858, 'epoch': 0.49}
4%|β–Ž | 375/10682 [03:42<1:24:38, 2.03it/s] 4%|β–Ž | 376/10682 [03:43<1:24:48, 2.03it/s] 4%|β–Ž | 377/10682 [03:43<1:24:40, 2.03it/s] 4%|β–Ž | 378/10682 [03:44<1:24:43, 2.03it/s] 4%|β–Ž | 379/10682 [03:44<1:24:40, 2.03it/s] 4%|β–Ž | 380/10682 [03:45<1:24:33, 2.03it/s] 4%|β–Ž | 381/10682 [03:45<1:24:36, 2.03it/s] 4%|β–Ž | 382/10682 [03:46<1:24:33, 2.03it/s] 4%|β–Ž | 383/10682 [03:46<1:24:38, 2.03it/s] 4%|β–Ž | 384/10682 [03:47<1:24:40, 2.03it/s] 4%|β–Ž | 385/10682 [03:47<1:24:45, 2.02it/s] 4%|β–Ž | 386/10682 [03:48<1:24:40, 2.03it/s] 4%|β–Ž | 387/10682 [03:48<1:24:33, 2.03it/s] 4%|β–Ž | 388/10682 [03:49<1:24:38, 2.03it/s] 4%|β–Ž | 389/10682 [03:49<1:24:35, 2.03it/s] 4%|β–Ž | 390/10682 [03:50<1:24:37, 2.03it/s] 4%|β–Ž | 391/10682 [03:50<1:24:33, 2.03it/s] 4%|β–Ž | 392/10682 [03:51<1:24:34, 2.03it/s] 4%|β–Ž | 393/10682 [03:51<1:24:30, 2.03it/s] 4%|β–Ž | 394/10682 [03:52<1:24:27, 2.03it/s] 4%|β–Ž | 395/10682 [03:52<1:24:31, 2.03it/s] 4%|β–Ž | 396/10682 [03:53<1:24:28, 2.03it/s] 4%|β–Ž | 397/10682 [03:53<1:24:33, 2.03it/s] 4%|β–Ž | 398/10682 [03:54<1:24:31, 2.03it/s] 4%|β–Ž | 399/10682 [03:54<1:24:30, 2.03it/s] 4%|β–Ž | 400/10682 [03:55<1:24:33, 2.03it/s] {'loss': 5.327, 'grad_norm': 0.7051675319671631, 'learning_rate': 0.00037418147801683815, 'epoch': 0.52}
4%|β–Ž | 400/10682 [03:55<1:24:33, 2.03it/s] 4%|▍ | 401/10682 [03:55<1:24:36, 2.03it/s] 4%|▍ | 402/10682 [03:56<1:24:37, 2.02it/s] 4%|▍ | 403/10682 [03:56<1:24:34, 2.03it/s] 4%|▍ | 404/10682 [03:57<1:24:34, 2.03it/s] 4%|▍ | 405/10682 [03:57<1:24:32, 2.03it/s] 4%|▍ | 406/10682 [03:58<1:24:27, 2.03it/s] 4%|▍ | 407/10682 [03:58<1:24:29, 2.03it/s] 4%|▍ | 408/10682 [03:59<1:24:28, 2.03it/s] 4%|▍ | 409/10682 [03:59<1:24:26, 2.03it/s] 4%|▍ | 410/10682 [04:00<1:24:25, 2.03it/s] 4%|▍ | 411/10682 [04:00<1:24:16, 2.03it/s] 4%|▍ | 412/10682 [04:01<1:24:18, 2.03it/s] 4%|▍ | 413/10682 [04:01<1:24:18, 2.03it/s] 4%|▍ | 414/10682 [04:01<1:24:23, 2.03it/s] 4%|▍ | 415/10682 [04:02<1:24:21, 2.03it/s] 4%|▍ | 416/10682 [04:02<1:24:13, 2.03it/s] 4%|▍ | 417/10682 [04:03<1:24:16, 2.03it/s] 4%|▍ | 418/10682 [04:03<1:24:16, 2.03it/s] 4%|▍ | 419/10682 [04:04<1:24:17, 2.03it/s] 4%|▍ | 420/10682 [04:04<1:24:15, 2.03it/s] 4%|▍ | 421/10682 [04:05<1:24:08, 2.03it/s] 4%|▍ | 422/10682 [04:05<1:24:10, 2.03it/s] 4%|▍ | 423/10682 [04:06<1:24:07, 2.03it/s] 4%|▍ | 424/10682 [04:06<1:24:09, 2.03it/s] 4%|▍ | 425/10682 [04:07<1:24:10, 2.03it/s]{'loss': 5.234, 'grad_norm': 0.7084140777587891, 'learning_rate': 0.0003975678203928906, 'epoch': 0.56}
4%|▍ | 425/10682 [04:07<1:24:10, 2.03it/s] 4%|▍ | 426/10682 [04:07<1:24:12, 2.03it/s] 4%|▍ | 427/10682 [04:08<1:24:14, 2.03it/s] 4%|▍ | 428/10682 [04:08<1:24:11, 2.03it/s] 4%|▍ | 429/10682 [04:09<1:24:12, 2.03it/s] 4%|▍ | 430/10682 [04:09<1:24:10, 2.03it/s] 4%|▍ | 431/10682 [04:10<1:24:08, 2.03it/s] 4%|▍ | 432/10682 [04:10<1:24:14, 2.03it/s] 4%|▍ | 433/10682 [04:11<1:24:09, 2.03it/s] 4%|▍ | 434/10682 [04:11<1:24:04, 2.03it/s] 4%|▍ | 435/10682 [04:12<1:24:05, 2.03it/s] 4%|▍ | 436/10682 [04:12<1:24:02, 2.03it/s] 4%|▍ | 437/10682 [04:13<1:24:05, 2.03it/s] 4%|▍ | 438/10682 [04:13<1:24:12, 2.03it/s] 4%|▍ | 439/10682 [04:14<1:24:06, 2.03it/s] 4%|▍ | 440/10682 [04:14<1:24:09, 2.03it/s] 4%|▍ | 441/10682 [04:15<1:24:05, 2.03it/s] 4%|▍ | 442/10682 [04:15<1:23:59, 2.03it/s] 4%|▍ | 443/10682 [04:16<1:24:00, 2.03it/s] 4%|▍ | 444/10682 [04:16<1:23:58, 2.03it/s] 4%|▍ | 445/10682 [04:17<1:23:55, 2.03it/s] 4%|▍ | 446/10682 [04:17<1:23:59, 2.03it/s] 4%|▍ | 447/10682 [04:18<1:24:00, 2.03it/s] 4%|▍ | 448/10682 [04:18<1:23:56, 2.03it/s] 4%|▍ | 449/10682 [04:19<1:23:56, 2.03it/s] 4%|▍ | 450/10682 [04:19<1:24:00, 2.03it/s]{'loss': 5.1535, 'grad_norm': 0.733258068561554, 'learning_rate': 0.00042095416276894293, 'epoch': 0.59}
4%|▍ | 450/10682 [04:19<1:24:00, 2.03it/s] 4%|▍ | 451/10682 [04:20<1:24:02, 2.03it/s] 4%|▍ | 452/10682 [04:20<1:23:59, 2.03it/s] 4%|▍ | 453/10682 [04:21<1:23:59, 2.03it/s] 4%|▍ | 454/10682 [04:21<1:23:54, 2.03it/s] 4%|▍ | 455/10682 [04:22<1:23:55, 2.03it/s] 4%|▍ | 456/10682 [04:22<1:23:54, 2.03it/s] 4%|▍ | 457/10682 [04:23<1:23:53, 2.03it/s] 4%|▍ | 458/10682 [04:23<1:23:55, 2.03it/s] 4%|▍ | 459/10682 [04:24<1:23:52, 2.03it/s] 4%|▍ | 460/10682 [04:24<1:23:51, 2.03it/s] 4%|▍ | 461/10682 [04:25<1:23:55, 2.03it/s] 4%|▍ | 462/10682 [04:25<1:23:53, 2.03it/s] 4%|▍ | 463/10682 [04:26<1:23:56, 2.03it/s] 4%|▍ | 464/10682 [04:26<1:23:51, 2.03it/s] 4%|▍ | 465/10682 [04:27<1:23:58, 2.03it/s] 4%|▍ | 466/10682 [04:27<1:23:58, 2.03it/s] 4%|▍ | 467/10682 [04:28<1:23:57, 2.03it/s] 4%|▍ | 468/10682 [04:28<1:23:52, 2.03it/s] 4%|▍ | 469/10682 [04:29<1:23:51, 2.03it/s] 4%|▍ | 470/10682 [04:29<1:23:55, 2.03it/s] 4%|▍ | 471/10682 [04:30<1:23:52, 2.03it/s] 4%|▍ | 472/10682 [04:30<1:23:48, 2.03it/s] 4%|▍ | 473/10682 [04:31<1:23:54, 2.03it/s] 4%|▍ | 474/10682 [04:31<1:23:51, 2.03it/s] 4%|▍ | 475/10682 [04:32<1:23:54, 2.03it/s]{'loss': 5.0819, 'grad_norm': 0.5916937589645386, 'learning_rate': 0.0004443405051449954, 'epoch': 0.62}
4%|▍ | 475/10682 [04:32<1:23:54, 2.03it/s] 4%|▍ | 476/10682 [04:32<1:23:55, 2.03it/s] 4%|▍ | 477/10682 [04:33<1:23:56, 2.03it/s] 4%|▍ | 478/10682 [04:33<1:23:52, 2.03it/s] 4%|▍ | 479/10682 [04:34<1:23:56, 2.03it/s] 4%|▍ | 480/10682 [04:34<1:23:49, 2.03it/s] 5%|▍ | 481/10682 [04:35<1:23:50, 2.03it/s] 5%|▍ | 482/10682 [04:35<1:23:50, 2.03it/s] 5%|▍ | 483/10682 [04:35<1:23:51, 2.03it/s] 5%|▍ | 484/10682 [04:36<1:23:49, 2.03it/s] 5%|▍ | 485/10682 [04:36<1:23:48, 2.03it/s] 5%|▍ | 486/10682 [04:37<1:23:50, 2.03it/s] 5%|▍ | 487/10682 [04:37<1:23:44, 2.03it/s] 5%|▍ | 488/10682 [04:38<1:23:45, 2.03it/s] 5%|▍ | 489/10682 [04:38<1:23:44, 2.03it/s] 5%|▍ | 490/10682 [04:39<1:23:47, 2.03it/s] 5%|▍ | 491/10682 [04:39<1:23:50, 2.03it/s] 5%|▍ | 492/10682 [04:40<1:23:48, 2.03it/s] 5%|▍ | 493/10682 [04:40<1:23:45, 2.03it/s] 5%|▍ | 494/10682 [04:41<1:23:38, 2.03it/s] 5%|▍ | 495/10682 [04:41<1:23:42, 2.03it/s] 5%|▍ | 496/10682 [04:42<1:23:40, 2.03it/s] 5%|▍ | 497/10682 [04:42<1:23:40, 2.03it/s] 5%|▍ | 498/10682 [04:43<1:23:41, 2.03it/s] 5%|▍ | 499/10682 [04:43<1:23:41, 2.03it/s] 5%|▍ | 500/10682 [04:44<1:23:37, 2.03it/s]{'loss': 5.0165, 'grad_norm': 0.5962570309638977, 'learning_rate': 0.0004677268475210477, 'epoch': 0.65}
5%|▍ | 500/10682 [04:44<1:23:37, 2.03it/s] 5%|▍ | 501/10682 [04:44<1:23:41, 2.03it/s] 5%|▍ | 502/10682 [04:45<1:23:42, 2.03it/s] 5%|▍ | 503/10682 [04:45<1:23:38, 2.03it/s] 5%|▍ | 504/10682 [04:46<1:23:37, 2.03it/s] 5%|▍ | 505/10682 [04:46<1:23:37, 2.03it/s] 5%|▍ | 506/10682 [04:47<1:23:39, 2.03it/s] 5%|▍ | 507/10682 [04:47<1:23:39, 2.03it/s] 5%|▍ | 508/10682 [04:48<1:23:38, 2.03it/s] 5%|▍ | 509/10682 [04:48<1:23:44, 2.02it/s] 5%|▍ | 510/10682 [04:49<1:23:43, 2.02it/s] 5%|▍ | 511/10682 [04:49<1:23:42, 2.02it/s] 5%|▍ | 512/10682 [04:50<1:23:37, 2.03it/s] 5%|▍ | 513/10682 [04:50<1:23:37, 2.03it/s] 5%|▍ | 514/10682 [04:51<1:23:35, 2.03it/s] 5%|▍ | 515/10682 [04:51<1:23:38, 2.03it/s] 5%|▍ | 516/10682 [04:52<1:23:37, 2.03it/s] 5%|▍ | 517/10682 [04:52<1:23:36, 2.03it/s] 5%|▍ | 518/10682 [04:53<1:23:35, 2.03it/s] 5%|▍ | 519/10682 [04:53<1:23:36, 2.03it/s] 5%|▍ | 520/10682 [04:54<1:23:35, 2.03it/s] 5%|▍ | 521/10682 [04:54<1:23:32, 2.03it/s] 5%|▍ | 522/10682 [04:55<1:23:31, 2.03it/s] 5%|▍ | 523/10682 [04:55<1:23:30, 2.03it/s] 5%|▍ | 524/10682 [04:56<1:23:36, 2.03it/s] 5%|▍ | 525/10682 [04:56<1:23:35, 2.03it/s]{'loss': 4.9437, 'grad_norm': 0.6917767524719238, 'learning_rate': 0.0004911131898971, 'epoch': 0.69}
5%|▍ | 525/10682 [04:56<1:23:35, 2.03it/s] 5%|▍ | 526/10682 [04:57<1:23:41, 2.02it/s] 5%|▍ | 527/10682 [04:57<1:23:40, 2.02it/s] 5%|▍ | 528/10682 [04:58<1:23:36, 2.02it/s] 5%|▍ | 529/10682 [04:58<1:23:36, 2.02it/s] 5%|▍ | 530/10682 [04:59<1:23:34, 2.02it/s] 5%|▍ | 531/10682 [04:59<1:23:30, 2.03it/s] 5%|▍ | 532/10682 [05:00<1:23:30, 2.03it/s] 5%|▍ | 533/10682 [05:00<1:23:26, 2.03it/s] 5%|▍ | 534/10682 [05:01<1:23:31, 2.02it/s] 5%|β–Œ | 535/10682 [05:01<1:23:26, 2.03it/s] 5%|β–Œ | 536/10682 [05:02<1:23:28, 2.03it/s] 5%|β–Œ | 537/10682 [05:02<1:23:24, 2.03it/s] 5%|β–Œ | 538/10682 [05:03<1:23:24, 2.03it/s] 5%|β–Œ | 539/10682 [05:03<1:23:25, 2.03it/s] 5%|β–Œ | 540/10682 [05:04<1:23:19, 2.03it/s] 5%|β–Œ | 541/10682 [05:04<1:23:22, 2.03it/s] 5%|β–Œ | 542/10682 [05:05<1:23:20, 2.03it/s] 5%|β–Œ | 543/10682 [05:05<1:23:18, 2.03it/s] 5%|β–Œ | 544/10682 [05:06<1:23:16, 2.03it/s] 5%|β–Œ | 545/10682 [05:06<1:23:20, 2.03it/s] 5%|β–Œ | 546/10682 [05:07<1:23:20, 2.03it/s] 5%|β–Œ | 547/10682 [05:07<1:23:19, 2.03it/s] 5%|β–Œ | 548/10682 [05:08<1:23:20, 2.03it/s] 5%|β–Œ | 549/10682 [05:08<1:23:16, 2.03it/s] 5%|β–Œ | 550/10682 [05:09<1:23:17, 2.03it/s] {'loss': 4.8784, 'grad_norm': 0.6977190971374512, 'learning_rate': 0.0005144995322731525, 'epoch': 0.72}
5%|β–Œ | 550/10682 [05:09<1:23:17, 2.03it/s] 5%|β–Œ | 551/10682 [05:09<1:23:22, 2.03it/s] 5%|β–Œ | 552/10682 [05:10<1:23:18, 2.03it/s] 5%|β–Œ | 553/10682 [05:10<1:23:17, 2.03it/s] 5%|β–Œ | 554/10682 [05:11<1:23:18, 2.03it/s] 5%|β–Œ | 555/10682 [05:11<1:23:18, 2.03it/s] 5%|β–Œ | 556/10682 [05:12<1:23:23, 2.02it/s] 5%|β–Œ | 557/10682 [05:12<1:23:21, 2.02it/s] 5%|β–Œ | 558/10682 [05:12<1:23:17, 2.03it/s] 5%|β–Œ | 559/10682 [05:13<1:23:18, 2.03it/s] 5%|β–Œ | 560/10682 [05:13<1:23:15, 2.03it/s] 5%|β–Œ | 561/10682 [05:14<1:23:11, 2.03it/s] 5%|β–Œ | 562/10682 [05:14<1:23:09, 2.03it/s] 5%|β–Œ | 563/10682 [05:15<1:23:11, 2.03it/s] 5%|β–Œ | 564/10682 [05:15<1:23:09, 2.03it/s] 5%|β–Œ | 565/10682 [05:16<1:23:08, 2.03it/s] 5%|β–Œ | 566/10682 [05:16<1:23:10, 2.03it/s] 5%|β–Œ | 567/10682 [05:17<1:23:08, 2.03it/s] 5%|β–Œ | 568/10682 [05:17<1:23:13, 2.03it/s] 5%|β–Œ | 569/10682 [05:18<1:23:15, 2.02it/s] 5%|β–Œ | 570/10682 [05:18<1:23:10, 2.03it/s] 5%|β–Œ | 571/10682 [05:19<1:23:11, 2.03it/s] 5%|β–Œ | 572/10682 [05:19<1:23:10, 2.03it/s] 5%|β–Œ | 573/10682 [05:20<1:23:09, 2.03it/s] 5%|β–Œ | 574/10682 [05:20<1:23:06, 2.03it/s] 5%|β–Œ | 575/10682 [05:21<1:23:13, 2.02it/s]{'loss': 4.8253, 'grad_norm': 0.582194447517395, 'learning_rate': 0.0005378858746492049, 'epoch': 0.75}
5%|β–Œ | 575/10682 [05:21<1:23:13, 2.02it/s] 5%|β–Œ | 576/10682 [05:21<1:23:15, 2.02it/s] 5%|β–Œ | 577/10682 [05:22<1:23:11, 2.02it/s] 5%|β–Œ | 578/10682 [05:22<1:23:14, 2.02it/s] 5%|β–Œ | 579/10682 [05:23<1:23:07, 2.03it/s] 5%|β–Œ | 580/10682 [05:23<1:23:11, 2.02it/s] 5%|β–Œ | 581/10682 [05:24<1:23:03, 2.03it/s] 5%|β–Œ | 582/10682 [05:24<1:23:00, 2.03it/s] 5%|β–Œ | 583/10682 [05:25<1:22:59, 2.03it/s] 5%|β–Œ | 584/10682 [05:25<1:22:57, 2.03it/s] 5%|β–Œ | 585/10682 [05:26<1:22:54, 2.03it/s] 5%|β–Œ | 586/10682 [05:26<1:22:56, 2.03it/s] 5%|β–Œ | 587/10682 [05:27<1:22:58, 2.03it/s] 6%|β–Œ | 588/10682 [05:27<1:22:57, 2.03it/s] 6%|β–Œ | 589/10682 [05:28<1:22:59, 2.03it/s] 6%|β–Œ | 590/10682 [05:28<1:22:55, 2.03it/s] 6%|β–Œ | 591/10682 [05:29<1:23:04, 2.02it/s] 6%|β–Œ | 592/10682 [05:29<1:22:57, 2.03it/s] 6%|β–Œ | 593/10682 [05:30<1:22:55, 2.03it/s] 6%|β–Œ | 594/10682 [05:30<1:22:54, 2.03it/s] 6%|β–Œ | 595/10682 [05:31<1:22:49, 2.03it/s] 6%|β–Œ | 596/10682 [05:31<1:22:53, 2.03it/s] 6%|β–Œ | 597/10682 [05:32<1:22:48, 2.03it/s] 6%|β–Œ | 598/10682 [05:32<1:22:54, 2.03it/s] 6%|β–Œ | 599/10682 [05:33<1:22:51, 2.03it/s] 6%|β–Œ | 600/10682 [05:33<1:22:50, 2.03it/s]{'loss': 4.7621, 'grad_norm': 0.5218554735183716, 'learning_rate': 0.0005612722170252572, 'epoch': 0.79}
6%|β–Œ | 600/10682 [05:33<1:22:50, 2.03it/s] 6%|β–Œ | 601/10682 [05:34<1:22:54, 2.03it/s] 6%|β–Œ | 602/10682 [05:34<1:22:55, 2.03it/s] 6%|β–Œ | 603/10682 [05:35<1:22:53, 2.03it/s] 6%|β–Œ | 604/10682 [05:35<1:22:52, 2.03it/s] 6%|β–Œ | 605/10682 [05:36<1:23:00, 2.02it/s] 6%|β–Œ | 606/10682 [05:36<1:22:58, 2.02it/s] 6%|β–Œ | 607/10682 [05:37<1:23:01, 2.02it/s] 6%|β–Œ | 608/10682 [05:37<1:22:54, 2.03it/s] 6%|β–Œ | 609/10682 [05:38<1:22:54, 2.03it/s] 6%|β–Œ | 610/10682 [05:38<1:22:50, 2.03it/s] 6%|β–Œ | 611/10682 [05:39<1:22:54, 2.02it/s] 6%|β–Œ | 612/10682 [05:39<1:22:52, 2.02it/s] 6%|β–Œ | 613/10682 [05:40<1:22:52, 2.03it/s] 6%|β–Œ | 614/10682 [05:40<1:22:54, 2.02it/s] 6%|β–Œ | 615/10682 [05:41<1:29:54, 1.87it/s] 6%|β–Œ | 616/10682 [05:41<1:34:42, 1.77it/s] 6%|β–Œ | 617/10682 [05:42<1:31:07, 1.84it/s] 6%|β–Œ | 618/10682 [05:42<1:28:38, 1.89it/s] 6%|β–Œ | 619/10682 [05:43<1:26:49, 1.93it/s] 6%|β–Œ | 620/10682 [05:43<1:25:31, 1.96it/s] 6%|β–Œ | 621/10682 [05:44<1:24:42, 1.98it/s] 6%|β–Œ | 622/10682 [05:44<1:24:00, 2.00it/s] 6%|β–Œ | 623/10682 [05:45<1:23:36, 2.00it/s] 6%|β–Œ | 624/10682 [05:45<1:23:18, 2.01it/s] 6%|β–Œ | 625/10682 [05:46<1:23:03, 2.02it/s]{'loss': 4.7137, 'grad_norm': 0.5765935182571411, 'learning_rate': 0.0005846585594013096, 'epoch': 0.82}
6%|β–Œ | 625/10682 [05:46<1:23:03, 2.02it/s] 6%|β–Œ | 626/10682 [05:46<1:23:00, 2.02it/s] 6%|β–Œ | 627/10682 [05:47<1:22:48, 2.02it/s] 6%|β–Œ | 628/10682 [05:47<1:22:48, 2.02it/s] 6%|β–Œ | 629/10682 [05:48<1:22:41, 2.03it/s] 6%|β–Œ | 630/10682 [05:48<1:22:44, 2.02it/s] 6%|β–Œ | 631/10682 [05:49<1:22:36, 2.03it/s] 6%|β–Œ | 632/10682 [05:49<1:22:35, 2.03it/s] 6%|β–Œ | 633/10682 [05:50<1:22:35, 2.03it/s] 6%|β–Œ | 634/10682 [05:50<1:22:31, 2.03it/s] 6%|β–Œ | 635/10682 [05:51<1:22:31, 2.03it/s] 6%|β–Œ | 636/10682 [05:51<1:22:28, 2.03it/s] 6%|β–Œ | 637/10682 [05:52<1:22:26, 2.03it/s] 6%|β–Œ | 638/10682 [05:52<1:22:27, 2.03it/s] 6%|β–Œ | 639/10682 [05:53<1:22:26, 2.03it/s] 6%|β–Œ | 640/10682 [05:53<1:22:31, 2.03it/s] 6%|β–Œ | 641/10682 [05:54<1:22:29, 2.03it/s] 6%|β–Œ | 642/10682 [05:54<1:22:30, 2.03it/s] 6%|β–Œ | 643/10682 [05:55<1:22:29, 2.03it/s] 6%|β–Œ | 644/10682 [05:55<1:22:27, 2.03it/s] 6%|β–Œ | 645/10682 [05:56<1:22:31, 2.03it/s] 6%|β–Œ | 646/10682 [05:56<1:22:28, 2.03it/s] 6%|β–Œ | 647/10682 [05:57<1:22:31, 2.03it/s] 6%|β–Œ | 648/10682 [05:57<1:22:22, 2.03it/s] 6%|β–Œ | 649/10682 [05:58<1:22:24, 2.03it/s] 6%|β–Œ | 650/10682 [05:58<1:22:25, 2.03it/s]{'loss': 4.663, 'grad_norm': 0.4962884187698364, 'learning_rate': 0.0006080449017773621, 'epoch': 0.85}
6%|β–Œ | 650/10682 [05:58<1:22:25, 2.03it/s] 6%|β–Œ | 651/10682 [05:59<1:22:37, 2.02it/s] 6%|β–Œ | 652/10682 [05:59<1:22:35, 2.02it/s] 6%|β–Œ | 653/10682 [06:00<1:22:29, 2.03it/s] 6%|β–Œ | 654/10682 [06:00<1:22:32, 2.02it/s] 6%|β–Œ | 655/10682 [06:01<1:22:30, 2.03it/s] 6%|β–Œ | 656/10682 [06:01<1:22:31, 2.02it/s] 6%|β–Œ | 657/10682 [06:02<1:22:23, 2.03it/s] 6%|β–Œ | 658/10682 [06:02<1:22:27, 2.03it/s] 6%|β–Œ | 659/10682 [06:03<1:22:24, 2.03it/s] 6%|β–Œ | 660/10682 [06:03<1:22:21, 2.03it/s] 6%|β–Œ | 661/10682 [06:04<1:22:25, 2.03it/s] 6%|β–Œ | 662/10682 [06:04<1:22:22, 2.03it/s] 6%|β–Œ | 663/10682 [06:05<1:22:24, 2.03it/s] 6%|β–Œ | 664/10682 [06:05<1:22:22, 2.03it/s] 6%|β–Œ | 665/10682 [06:06<1:22:19, 2.03it/s] 6%|β–Œ | 666/10682 [06:06<1:22:17, 2.03it/s] 6%|β–Œ | 667/10682 [06:07<1:22:19, 2.03it/s] 6%|β–‹ | 668/10682 [06:07<1:22:20, 2.03it/s] 6%|β–‹ | 669/10682 [06:08<1:22:20, 2.03it/s] 6%|β–‹ | 670/10682 [06:08<1:22:15, 2.03it/s] 6%|β–‹ | 671/10682 [06:09<1:22:16, 2.03it/s] 6%|β–‹ | 672/10682 [06:09<1:22:10, 2.03it/s] 6%|β–‹ | 673/10682 [06:10<1:22:13, 2.03it/s] 6%|β–‹ | 674/10682 [06:10<1:22:10, 2.03it/s] 6%|β–‹ | 675/10682 [06:10<1:22:08, 2.03it/s] {'loss': 4.622, 'grad_norm': 0.5398389101028442, 'learning_rate': 0.0006314312441534145, 'epoch': 0.88}
6%|β–‹ | 675/10682 [06:10<1:22:08, 2.03it/s] 6%|β–‹ | 676/10682 [06:11<1:22:17, 2.03it/s] 6%|β–‹ | 677/10682 [06:11<1:22:12, 2.03it/s] 6%|β–‹ | 678/10682 [06:12<1:22:15, 2.03it/s] 6%|β–‹ | 679/10682 [06:12<1:22:13, 2.03it/s] 6%|β–‹ | 680/10682 [06:13<1:22:18, 2.03it/s] 6%|β–‹ | 681/10682 [06:13<1:22:16, 2.03it/s] 6%|β–‹ | 682/10682 [06:14<1:22:18, 2.02it/s] 6%|β–‹ | 683/10682 [06:14<1:22:14, 2.03it/s] 6%|β–‹ | 684/10682 [06:15<1:22:13, 2.03it/s] 6%|β–‹ | 685/10682 [06:15<1:22:15, 2.03it/s] 6%|β–‹ | 686/10682 [06:16<1:22:09, 2.03it/s] 6%|β–‹ | 687/10682 [06:16<1:22:22, 2.02it/s] 6%|β–‹ | 688/10682 [06:17<1:22:14, 2.03it/s] 6%|β–‹ | 689/10682 [06:17<1:22:16, 2.02it/s] 6%|β–‹ | 690/10682 [06:18<1:22:09, 2.03it/s] 6%|β–‹ | 691/10682 [06:18<1:22:07, 2.03it/s] 6%|β–‹ | 692/10682 [06:19<1:22:08, 2.03it/s] 6%|β–‹ | 693/10682 [06:19<1:22:11, 2.03it/s] 6%|β–‹ | 694/10682 [06:20<1:22:08, 2.03it/s] 7%|β–‹ | 695/10682 [06:20<1:22:08, 2.03it/s] 7%|β–‹ | 696/10682 [06:21<1:22:13, 2.02it/s] 7%|β–‹ | 697/10682 [06:21<1:22:07, 2.03it/s] 7%|β–‹ | 698/10682 [06:22<1:22:18, 2.02it/s] 7%|β–‹ | 699/10682 [06:22<1:22:12, 2.02it/s] 7%|β–‹ | 700/10682 [06:23<1:22:12, 2.02it/s] {'loss': 4.5632, 'grad_norm': 0.4890291392803192, 'learning_rate': 0.0006548175865294667, 'epoch': 0.92}
7%|β–‹ | 700/10682 [06:23<1:22:12, 2.02it/s] 7%|β–‹ | 701/10682 [06:23<1:22:18, 2.02it/s] 7%|β–‹ | 702/10682 [06:24<1:22:17, 2.02it/s] 7%|β–‹ | 703/10682 [06:24<1:22:14, 2.02it/s] 7%|β–‹ | 704/10682 [06:25<1:22:13, 2.02it/s] 7%|β–‹ | 705/10682 [06:25<1:22:06, 2.03it/s] 7%|β–‹ | 706/10682 [06:26<1:22:11, 2.02it/s] 7%|β–‹ | 707/10682 [06:26<1:22:09, 2.02it/s] 7%|β–‹ | 708/10682 [06:27<1:22:13, 2.02it/s] 7%|β–‹ | 709/10682 [06:27<1:22:02, 2.03it/s] 7%|β–‹ | 710/10682 [06:28<1:22:06, 2.02it/s] 7%|β–‹ | 711/10682 [06:28<1:22:09, 2.02it/s] 7%|β–‹ | 712/10682 [06:29<1:22:05, 2.02it/s] 7%|β–‹ | 713/10682 [06:29<1:22:01, 2.03it/s] 7%|β–‹ | 714/10682 [06:30<1:21:53, 2.03it/s] 7%|β–‹ | 715/10682 [06:30<1:21:53, 2.03it/s] 7%|β–‹ | 716/10682 [06:31<1:21:55, 2.03it/s] 7%|β–‹ | 717/10682 [06:31<1:21:50, 2.03it/s] 7%|β–‹ | 718/10682 [06:32<1:21:53, 2.03it/s] 7%|β–‹ | 719/10682 [06:32<1:21:53, 2.03it/s] 7%|β–‹ | 720/10682 [06:33<1:21:57, 2.03it/s] 7%|β–‹ | 721/10682 [06:33<1:21:55, 2.03it/s] 7%|β–‹ | 722/10682 [06:34<1:21:52, 2.03it/s] 7%|β–‹ | 723/10682 [06:34<1:21:51, 2.03it/s] 7%|β–‹ | 724/10682 [06:35<1:21:51, 2.03it/s] 7%|β–‹ | 725/10682 [06:35<1:21:47, 2.03it/s]{'loss': 4.5343, 'grad_norm': 0.4946634769439697, 'learning_rate': 0.0006782039289055192, 'epoch': 0.95}
7%|β–‹ | 725/10682 [06:35<1:21:47, 2.03it/s] 7%|β–‹ | 726/10682 [06:36<1:21:55, 2.03it/s] 7%|β–‹ | 727/10682 [06:36<1:21:52, 2.03it/s] 7%|β–‹ | 728/10682 [06:37<1:21:49, 2.03it/s] 7%|β–‹ | 729/10682 [06:37<1:21:52, 2.03it/s] 7%|β–‹ | 730/10682 [06:38<1:21:48, 2.03it/s] 7%|β–‹ | 731/10682 [06:38<1:21:51, 2.03it/s] 7%|β–‹ | 732/10682 [06:39<1:21:53, 2.03it/s] 7%|β–‹ | 733/10682 [06:39<1:21:56, 2.02it/s] 7%|β–‹ | 734/10682 [06:40<1:21:53, 2.02it/s] 7%|β–‹ | 735/10682 [06:40<1:21:55, 2.02it/s] 7%|β–‹ | 736/10682 [06:41<1:21:50, 2.03it/s] 7%|β–‹ | 737/10682 [06:41<1:21:54, 2.02it/s] 7%|β–‹ | 738/10682 [06:42<1:21:51, 2.02it/s] 7%|β–‹ | 739/10682 [06:42<1:21:50, 2.02it/s] 7%|β–‹ | 740/10682 [06:43<1:21:51, 2.02it/s] 7%|β–‹ | 741/10682 [06:43<1:21:49, 2.02it/s] 7%|β–‹ | 742/10682 [06:44<1:21:48, 2.03it/s] 7%|β–‹ | 743/10682 [06:44<1:21:46, 2.03it/s] 7%|β–‹ | 744/10682 [06:45<1:21:49, 2.02it/s] 7%|β–‹ | 745/10682 [06:45<1:21:52, 2.02it/s] 7%|β–‹ | 746/10682 [06:46<1:21:53, 2.02it/s] 7%|β–‹ | 747/10682 [06:46<1:21:51, 2.02it/s] 7%|β–‹ | 748/10682 [06:47<1:21:50, 2.02it/s] 7%|β–‹ | 749/10682 [06:47<1:21:46, 2.02it/s] 7%|β–‹ | 750/10682 [06:48<1:21:42, 2.03it/s] {'loss': 4.4964, 'grad_norm': 0.47002631425857544, 'learning_rate': 0.0007015902712815716, 'epoch': 0.98}
7%|β–‹ | 750/10682 [06:48<1:21:42, 2.03it/s] 7%|β–‹ | 751/10682 [06:48<1:21:47, 2.02it/s] 7%|β–‹ | 752/10682 [06:49<1:21:46, 2.02it/s] 7%|β–‹ | 753/10682 [06:49<1:21:47, 2.02it/s] 7%|β–‹ | 754/10682 [06:49<1:21:48, 2.02it/s] 7%|β–‹ | 755/10682 [06:50<1:21:40, 2.03it/s] 7%|β–‹ | 756/10682 [06:50<1:21:38, 2.03it/s] 7%|β–‹ | 757/10682 [06:51<1:21:41, 2.02it/s] 7%|β–‹ | 758/10682 [06:51<1:21:35, 2.03it/s] 7%|β–‹ | 759/10682 [06:52<1:21:36, 2.03it/s] 7%|β–‹ | 760/10682 [06:52<1:21:34, 2.03it/s] 7%|β–‹ | 761/10682 [06:53<1:21:41, 2.02it/s] 7%|β–‹ | 762/10682 [06:53<1:21:32, 2.03it/s] 7%|β–‹ | 763/10682 [06:54<1:23:40, 1.98it/s] 7%|β–‹ | 764/10682 [07:06<10:54:54, 3.96s/it] 7%|β–‹ | 765/10682 [07:07<8:02:51, 2.92s/it] 7%|β–‹ | 766/10682 [07:07<6:02:50, 2.20s/it] 7%|β–‹ | 767/10682 [07:07<4:38:27, 1.69s/it] 7%|β–‹ | 768/10682 [07:08<3:39:18, 1.33s/it] 7%|β–‹ | 769/10682 [07:08<2:57:58, 1.08s/it] 7%|β–‹ | 770/10682 [07:09<2:29:06, 1.11it/s] 7%|β–‹ | 771/10682 [07:09<2:08:50, 1.28it/s] 7%|β–‹ | 772/10682 [07:10<1:54:38, 1.44it/s] 7%|β–‹ | 773/10682 [07:10<1:44:47, 1.58it/s] 7%|β–‹ | 774/10682 [07:11<1:37:40, 1.69it/s] 7%|β–‹ | 775/10682 [07:11<1:32:55, 1.78it/s]{'loss': 4.4347, 'grad_norm': 0.5852263569831848, 'learning_rate': 0.0007249766136576241, 'epoch': 1.02}
7%|β–‹ | 775/10682 [07:11<1:32:55, 1.78it/s] 7%|β–‹ | 776/10682 [07:12<1:29:35, 1.84it/s] 7%|β–‹ | 777/10682 [07:12<1:27:13, 1.89it/s] 7%|β–‹ | 778/10682 [07:13<1:25:21, 1.93it/s] 7%|β–‹ | 779/10682 [07:13<1:24:15, 1.96it/s] 7%|β–‹ | 780/10682 [07:14<1:23:24, 1.98it/s] 7%|β–‹ | 781/10682 [07:14<1:22:52, 1.99it/s] 7%|β–‹ | 782/10682 [07:15<1:22:26, 2.00it/s] 7%|β–‹ | 783/10682 [07:15<1:22:11, 2.01it/s] 7%|β–‹ | 784/10682 [07:16<1:21:56, 2.01it/s] 7%|β–‹ | 785/10682 [07:16<1:21:49, 2.02it/s] 7%|β–‹ | 786/10682 [07:17<1:21:40, 2.02it/s] 7%|β–‹ | 787/10682 [07:17<1:21:33, 2.02it/s]