File size: 61,567 Bytes
b845aee 0729830 b845aee 0729830 b845aee a3bdea5 e98066d cf4c4a9 b845aee e82b954 4683a9a e82b954 b845aee e82b954 b845aee e82b954 b845aee 4683a9a b845aee 4683a9a b845aee 4683a9a b845aee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
slurm submission log: 2024-05-24 23:54:02.551434 created following sbatch script: ############################### #!/bin/bash #SBATCH --account=nlp #SBATCH --cpus-per-task=16 #SBATCH --dependency=afterok:7649440 #SBATCH --gres=gpu:2 #SBATCH --job-name=tthrush-job-2884917 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' ############################### submission to slurm complete! ############################### slurm submission output Submitted batch job 7649441 ############################### /var/lib/slurm/slurmd/job7649441/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. To initialize your shell, run $ conda init <SHELL_NAME> Currently supported shells are: - bash - fish - tcsh - xonsh - zsh - powershell See 'conda init --help' for more information and options. IMPORTANT: You may need to close and restart your shell after running 'conda init'. ############################### start time: 2024-05-25 04:55:04.852621 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 ############################### command outputs: [2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] [2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] ***************************************** [2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. [2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] ***************************************** 05/25/2024 04:55:12 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) 05/25/2024 04:55:17 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) 0%| | 0/11788 [00:00<?, ?it/s][rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) 0%| | 1/11788 [00:05<18:49:07, 5.75s/it] 0%| | 2/11788 [00:07<11:47:29, 3.60s/it] 0%| | 3/11788 [00:09<8:39:23, 2.64s/it] 0%| | 4/11788 [00:10<6:48:15, 2.08s/it] 0%| | 5/11788 [00:11<5:34:19, 1.70s/it] 0%| | 6/11788 [00:12<4:45:06, 1.45s/it] 0%| | 7/11788 [00:13<4:09:48, 1.27s/it] 0%| | 8/11788 [00:14<3:41:36, 1.13s/it] 0%| | 9/11788 [00:15<3:19:46, 1.02s/it] 0%| | 10/11788 [00:15<3:02:25, 1.08it/s] 0%| | 11/11788 [00:16<2:47:48, 1.17it/s] 0%| | 12/11788 [00:17<2:36:12, 1.26it/s] 0%| | 13/11788 [00:17<2:26:16, 1.34it/s] 0%| | 14/11788 [00:18<2:19:18, 1.41it/s] 0%| | 15/11788 [00:19<2:13:46, 1.47it/s] 0%| | 16/11788 [00:19<2:08:28, 1.53it/s] 0%| | 17/11788 [00:20<2:03:53, 1.58it/s] 0%| | 18/11788 [00:20<1:59:53, 1.64it/s] 0%| | 19/11788 [00:21<1:56:58, 1.68it/s] 0%| | 20/11788 [00:21<1:54:07, 1.72it/s] 0%| | 21/11788 [00:22<1:51:35, 1.76it/s] 0%| | 22/11788 [00:22<1:50:02, 1.78it/s] 0%| | 23/11788 [00:23<1:48:10, 1.81it/s] 0%| | 24/11788 [00:24<1:47:46, 1.82it/s] 0%| | 25/11788 [00:24<1:46:51, 1.83it/s]{'loss': 10.6806, 'grad_norm': 1.3700144290924072, 'learning_rate': 2.1204410517387616e-05, 'epoch': 0.03} 0%| | 25/11788 [00:24<1:46:51, 1.83it/s] 0%| | 26/11788 [00:25<1:45:46, 1.85it/s] 0%| | 27/11788 [00:25<1:45:10, 1.86it/s] 0%| | 28/11788 [00:26<1:43:34, 1.89it/s] 0%| | 29/11788 [00:26<1:43:10, 1.90it/s] 0%| | 30/11788 [00:27<1:42:20, 1.91it/s] 0%| | 31/11788 [00:27<1:41:31, 1.93it/s] 0%| | 32/11788 [00:28<1:41:29, 1.93it/s] 0%| | 33/11788 [00:28<1:41:01, 1.94it/s] 0%| | 34/11788 [00:29<1:40:26, 1.95it/s] 0%| | 35/11788 [00:29<1:40:16, 1.95it/s] 0%| | 36/11788 [00:30<1:39:46, 1.96it/s] 0%| | 37/11788 [00:30<1:40:09, 1.96it/s] 0%| | 38/11788 [00:31<1:39:47, 1.96it/s] 0%| | 39/11788 [00:31<1:39:09, 1.97it/s] 0%| | 40/11788 [00:32<1:39:11, 1.97it/s] 0%| | 41/11788 [00:32<1:38:51, 1.98it/s] 0%| | 42/11788 [00:33<1:38:23, 1.99it/s] 0%| | 43/11788 [00:33<1:39:25, 1.97it/s] 0%| | 44/11788 [00:34<1:39:01, 1.98it/s] 0%| | 45/11788 [00:34<1:38:28, 1.99it/s] 0%| | 46/11788 [00:35<1:38:13, 1.99it/s] 0%| | 47/11788 [00:35<1:38:16, 1.99it/s] 0%| | 48/11788 [00:36<1:37:59, 2.00it/s] 0%| | 49/11788 [00:36<1:38:07, 1.99it/s] 0%| | 50/11788 [00:37<1:38:03, 2.00it/s]{'loss': 9.9893, 'grad_norm': 1.2721635103225708, 'learning_rate': 4.240882103477523e-05, 'epoch': 0.06} 0%| | 50/11788 [00:37<1:38:03, 2.00it/s] 0%| | 51/11788 [00:37<1:38:04, 1.99it/s] 0%| | 52/11788 [00:38<1:38:29, 1.99it/s] 0%| | 53/11788 [00:38<1:38:21, 1.99it/s] 0%| | 54/11788 [00:39<1:38:25, 1.99it/s] 0%| | 55/11788 [00:39<1:38:40, 1.98it/s] 0%| | 56/11788 [00:40<1:38:25, 1.99it/s] 0%| | 57/11788 [00:40<1:38:13, 1.99it/s] 0%| | 58/11788 [00:41<1:37:50, 2.00it/s] 1%| | 59/11788 [00:41<1:37:50, 2.00it/s] 1%| | 60/11788 [00:42<1:37:37, 2.00it/s] 1%| | 61/11788 [00:42<1:37:36, 2.00it/s] 1%| | 62/11788 [00:43<1:37:30, 2.00it/s] 1%| | 63/11788 [00:43<1:37:27, 2.01it/s] 1%| | 64/11788 [00:44<1:37:09, 2.01it/s] 1%| | 65/11788 [00:44<1:37:08, 2.01it/s] 1%| | 66/11788 [00:45<1:37:05, 2.01it/s] 1%| | 67/11788 [00:45<1:37:22, 2.01it/s] 1%| | 68/11788 [00:46<1:37:16, 2.01it/s] 1%| | 69/11788 [00:46<1:36:56, 2.01it/s] 1%| | 70/11788 [00:47<1:37:04, 2.01it/s] 1%| | 71/11788 [00:47<1:37:11, 2.01it/s] 1%| | 72/11788 [00:48<1:36:56, 2.01it/s] 1%| | 73/11788 [00:48<1:36:56, 2.01it/s] 1%| | 74/11788 [00:49<1:36:53, 2.01it/s] 1%| | 75/11788 [00:49<1:36:52, 2.02it/s]{'loss': 9.3183, 'grad_norm': 1.0619311332702637, 'learning_rate': 6.361323155216285e-05, 'epoch': 0.09} 1%| | 75/11788 [00:49<1:36:52, 2.02it/s] 1%| | 76/11788 [00:50<1:37:04, 2.01it/s] 1%| | 77/11788 [00:50<1:36:47, 2.02it/s] 1%| | 78/11788 [00:51<1:36:52, 2.01it/s] 1%| | 79/11788 [00:51<1:36:50, 2.02it/s] 1%| | 80/11788 [00:52<1:36:44, 2.02it/s] 1%| | 81/11788 [00:52<1:36:48, 2.02it/s] 1%| | 82/11788 [00:53<1:36:42, 2.02it/s] 1%| | 83/11788 [00:53<1:36:37, 2.02it/s] 1%| | 84/11788 [00:54<1:36:31, 2.02it/s] 1%| | 85/11788 [00:54<1:36:30, 2.02it/s] 1%| | 86/11788 [00:55<1:36:29, 2.02it/s] 1%| | 87/11788 [00:55<1:36:23, 2.02it/s] 1%| | 88/11788 [00:56<1:36:17, 2.02it/s] 1%| | 89/11788 [00:56<1:36:12, 2.03it/s] 1%| | 90/11788 [00:57<1:36:13, 2.03it/s] 1%| | 91/11788 [00:57<1:36:12, 2.03it/s] 1%| | 92/11788 [00:58<1:36:16, 2.02it/s] 1%| | 93/11788 [00:58<1:36:13, 2.03it/s] 1%| | 94/11788 [00:59<1:36:11, 2.03it/s] 1%| | 95/11788 [00:59<1:36:05, 2.03it/s] 1%| | 96/11788 [01:00<1:36:11, 2.03it/s] 1%| | 97/11788 [01:00<1:36:05, 2.03it/s] 1%| | 98/11788 [01:01<1:36:08, 2.03it/s] 1%| | 99/11788 [01:01<1:36:02, 2.03it/s] 1%| | 100/11788 [01:02<1:36:07, 2.03it/s]{'loss': 8.5608, 'grad_norm': 0.8029088377952576, 'learning_rate': 8.481764206955047e-05, 'epoch': 0.12} 1%| | 100/11788 [01:02<1:36:07, 2.03it/s] 1%| | 101/11788 [01:02<1:36:10, 2.03it/s] 1%| | 102/11788 [01:03<1:36:05, 2.03it/s] 1%| | 103/11788 [01:03<1:36:03, 2.03it/s] 1%| | 104/11788 [01:04<1:36:02, 2.03it/s] 1%| | 105/11788 [01:04<1:36:04, 2.03it/s] 1%| | 106/11788 [01:05<1:35:56, 2.03it/s] 1%| | 107/11788 [01:05<1:36:01, 2.03it/s] 1%| | 108/11788 [01:06<1:36:00, 2.03it/s] 1%| | 109/11788 [01:06<1:35:58, 2.03it/s] 1%| | 110/11788 [01:07<1:35:57, 2.03it/s] 1%| | 111/11788 [01:07<1:35:51, 2.03it/s] 1%| | 112/11788 [01:07<1:36:00, 2.03it/s] 1%| | 113/11788 [01:08<1:35:56, 2.03it/s] 1%| | 114/11788 [01:08<1:35:57, 2.03it/s] 1%| | 115/11788 [01:09<1:35:53, 2.03it/s] 1%| | 116/11788 [01:09<1:35:55, 2.03it/s] 1%| | 117/11788 [01:10<1:35:52, 2.03it/s] 1%| | 118/11788 [01:10<1:35:54, 2.03it/s] 1%| | 119/11788 [01:11<1:35:55, 2.03it/s] 1%| | 120/11788 [01:11<1:35:54, 2.03it/s] 1%| | 121/11788 [01:12<1:35:56, 2.03it/s] 1%| | 122/11788 [01:12<1:35:55, 2.03it/s] 1%| | 123/11788 [01:13<1:35:56, 2.03it/s] 1%| | 124/11788 [01:13<1:35:50, 2.03it/s] 1%| | 125/11788 [01:14<1:35:57, 2.03it/s] {'loss': 7.9105, 'grad_norm': 0.5431452393531799, 'learning_rate': 0.0001060220525869381, 'epoch': 0.15} 1%| | 125/11788 [01:14<1:35:57, 2.03it/s] 1%| | 126/11788 [01:14<1:35:58, 2.03it/s] 1%| | 127/11788 [01:15<1:36:01, 2.02it/s] 1%| | 128/11788 [01:15<1:35:53, 2.03it/s] 1%| | 129/11788 [01:16<1:35:51, 2.03it/s] 1%| | 130/11788 [01:16<1:35:52, 2.03it/s] 1%| | 131/11788 [01:17<1:35:49, 2.03it/s] 1%| | 132/11788 [01:17<1:35:51, 2.03it/s] 1%| | 133/11788 [01:18<1:35:54, 2.03it/s] 1%| | 134/11788 [01:18<1:35:48, 2.03it/s] 1%| | 135/11788 [01:19<1:35:46, 2.03it/s] 1%| | 136/11788 [01:19<1:35:47, 2.03it/s] 1%| | 137/11788 [01:20<1:35:43, 2.03it/s] 1%| | 138/11788 [01:20<1:35:40, 2.03it/s] 1%| | 139/11788 [01:21<1:35:39, 2.03it/s] 1%| | 140/11788 [01:21<1:35:37, 2.03it/s] 1%| | 141/11788 [01:22<1:35:37, 2.03it/s] 1%| | 142/11788 [01:22<1:35:39, 2.03it/s] 1%| | 143/11788 [01:23<1:35:43, 2.03it/s] 1%| | 144/11788 [01:23<1:35:44, 2.03it/s] 1%| | 145/11788 [01:24<1:35:40, 2.03it/s] 1%| | 146/11788 [01:24<1:35:43, 2.03it/s] 1%| | 147/11788 [01:25<1:35:49, 2.02it/s] 1%|β | 148/11788 [01:25<1:35:46, 2.03it/s] 1%|β | 149/11788 [01:26<1:35:50, 2.02it/s] 1%|β | 150/11788 [01:26<1:35:48, 2.02it/s]{'loss': 7.4247, 'grad_norm': 0.4856749475002289, 'learning_rate': 0.0001272264631043257, 'epoch': 0.18} 1%|β | 150/11788 [01:26<1:35:48, 2.02it/s] 1%|β | 151/11788 [01:27<1:35:48, 2.02it/s] 1%|β | 152/11788 [01:27<1:35:47, 2.02it/s] 1%|β | 153/11788 [01:28<1:35:41, 2.03it/s] 1%|β | 154/11788 [01:28<1:35:39, 2.03it/s] 1%|β | 155/11788 [01:29<1:35:40, 2.03it/s] 1%|β | 156/11788 [01:29<1:35:40, 2.03it/s] 1%|β | 157/11788 [01:30<1:35:40, 2.03it/s] 1%|β | 158/11788 [01:30<1:35:33, 2.03it/s] 1%|β | 159/11788 [01:31<1:35:34, 2.03it/s] 1%|β | 160/11788 [01:31<1:35:31, 2.03it/s] 1%|β | 161/11788 [01:32<1:35:27, 2.03it/s] 1%|β | 162/11788 [01:32<1:35:29, 2.03it/s] 1%|β | 163/11788 [01:33<1:35:26, 2.03it/s] 1%|β | 164/11788 [01:33<1:35:31, 2.03it/s] 1%|β | 165/11788 [01:34<1:35:32, 2.03it/s] 1%|β | 166/11788 [01:34<1:35:34, 2.03it/s] 1%|β | 167/11788 [01:35<1:35:34, 2.03it/s] 1%|β | 168/11788 [01:35<1:35:32, 2.03it/s] 1%|β | 169/11788 [01:36<1:35:33, 2.03it/s] 1%|β | 170/11788 [01:36<1:35:28, 2.03it/s] 1%|β | 171/11788 [01:37<1:35:28, 2.03it/s] 1%|β | 172/11788 [01:37<1:35:26, 2.03it/s] 1%|β | 173/11788 [01:38<1:35:26, 2.03it/s] 1%|β | 174/11788 [01:38<1:35:26, 2.03it/s] 1%|β | 175/11788 [01:39<1:35:21, 2.03it/s]{'loss': 7.0223, 'grad_norm': 0.5401294231414795, 'learning_rate': 0.00014843087362171331, 'epoch': 0.21} 1%|β | 175/11788 [01:39<1:35:21, 2.03it/s] 1%|β | 176/11788 [01:39<1:35:33, 2.03it/s] 2%|β | 177/11788 [01:40<1:35:27, 2.03it/s] 2%|β | 178/11788 [01:40<1:35:31, 2.03it/s] 2%|β | 179/11788 [01:41<1:35:23, 2.03it/s] 2%|β | 180/11788 [01:41<1:35:27, 2.03it/s] 2%|β | 181/11788 [01:42<1:35:21, 2.03it/s] 2%|β | 182/11788 [01:42<1:35:14, 2.03it/s] 2%|β | 183/11788 [01:43<1:35:19, 2.03it/s] 2%|β | 184/11788 [01:43<1:35:22, 2.03it/s] 2%|β | 185/11788 [01:44<1:35:25, 2.03it/s] 2%|β | 186/11788 [01:44<1:35:18, 2.03it/s] 2%|β | 187/11788 [01:44<1:35:23, 2.03it/s] 2%|β | 188/11788 [01:45<1:35:23, 2.03it/s] 2%|β | 189/11788 [01:45<1:35:26, 2.03it/s] 2%|β | 190/11788 [01:46<1:35:28, 2.02it/s] 2%|β | 191/11788 [01:46<1:35:26, 2.02it/s] 2%|β | 192/11788 [01:47<1:35:31, 2.02it/s] 2%|β | 193/11788 [01:47<1:35:29, 2.02it/s] 2%|β | 194/11788 [01:48<1:35:29, 2.02it/s] 2%|β | 195/11788 [01:48<1:35:37, 2.02it/s] 2%|β | 196/11788 [01:49<1:35:31, 2.02it/s] 2%|β | 197/11788 [01:49<1:35:27, 2.02it/s] 2%|β | 198/11788 [01:50<1:35:24, 2.02it/s] 2%|β | 199/11788 [01:50<1:35:27, 2.02it/s] 2%|β | 200/11788 [01:51<1:35:18, 2.03it/s]{'loss': 6.6636, 'grad_norm': 0.38549456000328064, 'learning_rate': 0.00016963528413910093, 'epoch': 0.24} 2%|β | 200/11788 [01:51<1:35:18, 2.03it/s] 2%|β | 201/11788 [01:51<1:35:27, 2.02it/s] 2%|β | 202/11788 [01:52<1:35:25, 2.02it/s] 2%|β | 203/11788 [01:52<1:35:21, 2.02it/s] 2%|β | 204/11788 [01:53<1:35:22, 2.02it/s] 2%|β | 205/11788 [01:53<1:35:15, 2.03it/s] 2%|β | 206/11788 [01:54<1:35:15, 2.03it/s] 2%|β | 207/11788 [01:54<1:35:15, 2.03it/s] 2%|β | 208/11788 [01:55<1:35:34, 2.02it/s] 2%|β | 209/11788 [01:55<1:35:37, 2.02it/s] 2%|β | 210/11788 [01:56<1:35:41, 2.02it/s] 2%|β | 211/11788 [01:56<1:35:37, 2.02it/s] 2%|β | 212/11788 [01:57<1:35:31, 2.02it/s] 2%|β | 213/11788 [01:57<1:35:25, 2.02it/s] 2%|β | 214/11788 [01:58<1:35:18, 2.02it/s] 2%|β | 215/11788 [01:58<1:35:17, 2.02it/s] 2%|β | 216/11788 [01:59<1:35:12, 2.03it/s] 2%|β | 217/11788 [01:59<1:35:05, 2.03it/s] 2%|β | 218/11788 [02:00<1:35:10, 2.03it/s] 2%|β | 219/11788 [02:00<1:35:04, 2.03it/s] 2%|β | 220/11788 [02:01<1:35:11, 2.03it/s] 2%|β | 221/11788 [02:01<1:35:02, 2.03it/s] 2%|β | 222/11788 [02:02<1:35:00, 2.03it/s] 2%|β | 223/11788 [02:02<1:34:59, 2.03it/s] 2%|β | 224/11788 [02:03<1:34:54, 2.03it/s] 2%|β | 225/11788 [02:03<1:35:02, 2.03it/s]{'loss': 6.3808, 'grad_norm': 0.6071425080299377, 'learning_rate': 0.00019083969465648857, 'epoch': 0.27} 2%|β | 225/11788 [02:03<1:35:02, 2.03it/s] 2%|β | 226/11788 [02:04<1:35:06, 2.03it/s] 2%|β | 227/11788 [02:04<1:35:12, 2.02it/s] 2%|β | 228/11788 [02:05<1:35:04, 2.03it/s] 2%|β | 229/11788 [02:05<1:35:02, 2.03it/s] 2%|β | 230/11788 [02:06<1:34:57, 2.03it/s] 2%|β | 231/11788 [02:06<1:34:54, 2.03it/s] 2%|β | 232/11788 [02:07<1:35:00, 2.03it/s] 2%|β | 233/11788 [02:07<1:34:59, 2.03it/s] 2%|β | 234/11788 [02:08<1:35:03, 2.03it/s] 2%|β | 235/11788 [02:08<1:34:59, 2.03it/s] 2%|β | 236/11788 [02:09<1:35:01, 2.03it/s] 2%|β | 237/11788 [02:09<1:34:58, 2.03it/s] 2%|β | 238/11788 [02:10<1:34:56, 2.03it/s] 2%|β | 239/11788 [02:10<1:34:56, 2.03it/s] 2%|β | 240/11788 [02:11<1:34:49, 2.03it/s] 2%|β | 241/11788 [02:11<1:34:53, 2.03it/s] 2%|β | 242/11788 [02:12<1:34:51, 2.03it/s] 2%|β | 243/11788 [02:12<1:34:50, 2.03it/s] 2%|β | 244/11788 [02:13<1:34:50, 2.03it/s] 2%|β | 245/11788 [02:13<1:34:49, 2.03it/s] 2%|β | 246/11788 [02:14<1:34:51, 2.03it/s] 2%|β | 247/11788 [02:14<1:34:48, 2.03it/s] 2%|β | 248/11788 [02:15<1:34:48, 2.03it/s] 2%|β | 249/11788 [02:15<1:34:48, 2.03it/s] 2%|β | 250/11788 [02:16<1:34:49, 2.03it/s]{'loss': 6.1534, 'grad_norm': 0.6674228310585022, 'learning_rate': 0.0002120441051738762, 'epoch': 0.3} 2%|β | 250/11788 [02:16<1:34:49, 2.03it/s] 2%|β | 251/11788 [02:16<1:34:59, 2.02it/s] 2%|β | 252/11788 [02:17<1:34:55, 2.03it/s] 2%|β | 253/11788 [02:17<1:34:49, 2.03it/s] 2%|β | 254/11788 [02:18<1:34:46, 2.03it/s] 2%|β | 255/11788 [02:18<1:34:49, 2.03it/s] 2%|β | 256/11788 [02:19<1:34:43, 2.03it/s] 2%|β | 257/11788 [02:19<1:34:49, 2.03it/s] 2%|β | 258/11788 [02:20<1:34:40, 2.03it/s] 2%|β | 259/11788 [02:20<1:34:46, 2.03it/s] 2%|β | 260/11788 [02:21<1:34:46, 2.03it/s] 2%|β | 261/11788 [02:21<1:34:45, 2.03it/s] 2%|β | 262/11788 [02:22<1:34:45, 2.03it/s] 2%|β | 263/11788 [02:22<1:34:45, 2.03it/s] 2%|β | 264/11788 [02:22<1:34:46, 2.03it/s] 2%|β | 265/11788 [02:23<1:34:42, 2.03it/s] 2%|β | 266/11788 [02:23<1:34:50, 2.02it/s] 2%|β | 267/11788 [02:24<1:34:43, 2.03it/s] 2%|β | 268/11788 [02:24<1:34:46, 2.03it/s] 2%|β | 269/11788 [02:25<1:34:44, 2.03it/s] 2%|β | 270/11788 [02:25<1:34:44, 2.03it/s] 2%|β | 271/11788 [02:26<1:34:41, 2.03it/s] 2%|β | 272/11788 [02:26<1:34:37, 2.03it/s] 2%|β | 273/11788 [02:27<1:34:36, 2.03it/s] 2%|β | 274/11788 [02:27<1:34:30, 2.03it/s] 2%|β | 275/11788 [02:28<1:34:34, 2.03it/s]{'loss': 5.957, 'grad_norm': 0.5068850517272949, 'learning_rate': 0.00023324851569126378, 'epoch': 0.33} 2%|β | 275/11788 [02:28<1:34:34, 2.03it/s] 2%|β | 276/11788 [02:28<1:34:39, 2.03it/s] 2%|β | 277/11788 [02:29<1:34:40, 2.03it/s] 2%|β | 278/11788 [02:29<1:34:37, 2.03it/s] 2%|β | 279/11788 [02:30<1:34:36, 2.03it/s] 2%|β | 280/11788 [02:30<1:34:42, 2.03it/s] 2%|β | 281/11788 [02:31<1:34:36, 2.03it/s] 2%|β | 282/11788 [02:31<1:34:34, 2.03it/s] 2%|β | 283/11788 [02:32<1:34:31, 2.03it/s] 2%|β | 284/11788 [02:32<1:34:27, 2.03it/s] 2%|β | 285/11788 [02:33<1:34:32, 2.03it/s] 2%|β | 286/11788 [02:33<1:34:25, 2.03it/s] 2%|β | 287/11788 [02:34<1:34:24, 2.03it/s] 2%|β | 288/11788 [02:34<1:34:25, 2.03it/s] 2%|β | 289/11788 [02:35<1:34:21, 2.03it/s] 2%|β | 290/11788 [02:35<1:34:24, 2.03it/s] 2%|β | 291/11788 [02:36<1:34:29, 2.03it/s] 2%|β | 292/11788 [02:36<1:34:28, 2.03it/s] 2%|β | 293/11788 [02:37<1:34:30, 2.03it/s] 2%|β | 294/11788 [02:37<1:34:24, 2.03it/s] 3%|β | 295/11788 [02:38<1:34:20, 2.03it/s] 3%|β | 296/11788 [02:38<1:34:18, 2.03it/s] 3%|β | 297/11788 [02:39<1:34:16, 2.03it/s] 3%|β | 298/11788 [02:39<1:34:19, 2.03it/s] 3%|β | 299/11788 [02:40<1:34:23, 2.03it/s] 3%|β | 300/11788 [02:40<1:34:21, 2.03it/s]{'loss': 5.8146, 'grad_norm': 0.6466884613037109, 'learning_rate': 0.0002544529262086514, 'epoch': 0.36} 3%|β | 300/11788 [02:40<1:34:21, 2.03it/s] 3%|β | 301/11788 [02:41<1:34:34, 2.02it/s] 3%|β | 302/11788 [02:41<1:34:32, 2.02it/s] 3%|β | 303/11788 [02:42<1:34:30, 2.03it/s] 3%|β | 304/11788 [02:42<1:34:28, 2.03it/s] 3%|β | 305/11788 [02:43<1:34:23, 2.03it/s] 3%|β | 306/11788 [02:43<1:34:28, 2.03it/s] 3%|β | 307/11788 [02:44<1:34:19, 2.03it/s] 3%|β | 308/11788 [02:44<1:34:18, 2.03it/s] 3%|β | 309/11788 [02:45<1:34:18, 2.03it/s] 3%|β | 310/11788 [02:45<1:34:19, 2.03it/s] 3%|β | 311/11788 [02:46<1:34:19, 2.03it/s] 3%|β | 312/11788 [02:46<1:34:24, 2.03it/s] 3%|β | 313/11788 [02:47<1:34:21, 2.03it/s] 3%|β | 314/11788 [02:47<1:34:16, 2.03it/s] 3%|β | 315/11788 [02:48<1:34:18, 2.03it/s] 3%|β | 316/11788 [02:48<1:34:17, 2.03it/s] 3%|β | 317/11788 [02:49<1:34:23, 2.03it/s] 3%|β | 318/11788 [02:49<1:34:21, 2.03it/s] 3%|β | 319/11788 [02:50<1:34:20, 2.03it/s] 3%|β | 320/11788 [02:50<1:34:23, 2.02it/s] 3%|β | 321/11788 [02:51<1:34:22, 2.02it/s] 3%|β | 322/11788 [02:51<1:34:21, 2.03it/s] 3%|β | 323/11788 [02:52<1:34:18, 2.03it/s] 3%|β | 324/11788 [02:52<1:34:15, 2.03it/s] 3%|β | 325/11788 [02:53<1:34:14, 2.03it/s]{'loss': 5.6587, 'grad_norm': 0.8380582332611084, 'learning_rate': 0.00027565733672603904, 'epoch': 0.39} 3%|β | 325/11788 [02:53<1:34:14, 2.03it/s] 3%|β | 326/11788 [02:53<1:34:21, 2.02it/s] 3%|β | 327/11788 [02:54<1:34:20, 2.02it/s] 3%|β | 328/11788 [02:54<1:34:15, 2.03it/s] 3%|β | 329/11788 [02:55<1:34:09, 2.03it/s] 3%|β | 330/11788 [02:55<1:34:14, 2.03it/s] 3%|β | 331/11788 [02:56<1:34:11, 2.03it/s] 3%|β | 332/11788 [02:56<1:34:14, 2.03it/s] 3%|β | 333/11788 [02:57<1:34:14, 2.03it/s] 3%|β | 334/11788 [02:57<1:34:24, 2.02it/s] 3%|β | 335/11788 [02:58<1:34:20, 2.02it/s] 3%|β | 336/11788 [02:58<1:34:22, 2.02it/s] 3%|β | 337/11788 [02:59<1:34:20, 2.02it/s] 3%|β | 338/11788 [02:59<1:34:17, 2.02it/s] 3%|β | 339/11788 [02:59<1:34:12, 2.03it/s] 3%|β | 340/11788 [03:00<1:34:07, 2.03it/s] 3%|β | 341/11788 [03:00<1:34:05, 2.03it/s] 3%|β | 342/11788 [03:01<1:34:05, 2.03it/s] 3%|β | 343/11788 [03:01<1:34:09, 2.03it/s] 3%|β | 344/11788 [03:02<1:34:06, 2.03it/s] 3%|β | 345/11788 [03:02<1:34:08, 2.03it/s] 3%|β | 346/11788 [03:03<1:34:03, 2.03it/s] 3%|β | 347/11788 [03:03<1:34:04, 2.03it/s] 3%|β | 348/11788 [03:04<1:33:57, 2.03it/s] 3%|β | 349/11788 [03:04<1:33:57, 2.03it/s] 3%|β | 350/11788 [03:05<1:33:56, 2.03it/s]{'loss': 5.5439, 'grad_norm': 0.7836480736732483, 'learning_rate': 0.00029686174724342663, 'epoch': 0.42} 3%|β | 350/11788 [03:05<1:33:56, 2.03it/s] 3%|β | 351/11788 [03:05<1:34:08, 2.02it/s] 3%|β | 352/11788 [03:06<1:34:03, 2.03it/s] 3%|β | 353/11788 [03:06<1:33:57, 2.03it/s] 3%|β | 354/11788 [03:07<1:33:58, 2.03it/s] 3%|β | 355/11788 [03:07<1:33:55, 2.03it/s] 3%|β | 356/11788 [03:08<1:33:56, 2.03it/s] 3%|β | 357/11788 [03:08<1:33:55, 2.03it/s] 3%|β | 358/11788 [03:09<1:33:53, 2.03it/s] 3%|β | 359/11788 [03:09<1:33:54, 2.03it/s] 3%|β | 360/11788 [03:10<1:33:50, 2.03it/s] 3%|β | 361/11788 [03:10<1:33:50, 2.03it/s] 3%|β | 362/11788 [03:11<1:33:49, 2.03it/s] 3%|β | 363/11788 [03:11<1:33:52, 2.03it/s] 3%|β | 364/11788 [03:12<1:33:54, 2.03it/s] 3%|β | 365/11788 [03:12<1:33:48, 2.03it/s] 3%|β | 366/11788 [03:13<1:33:54, 2.03it/s] 3%|β | 367/11788 [03:13<1:33:49, 2.03it/s] 3%|β | 368/11788 [03:14<1:33:49, 2.03it/s] 3%|β | 369/11788 [03:14<1:33:51, 2.03it/s] 3%|β | 370/11788 [03:15<1:33:51, 2.03it/s] 3%|β | 371/11788 [03:15<1:33:53, 2.03it/s] 3%|β | 372/11788 [03:16<1:33:48, 2.03it/s] 3%|β | 373/11788 [03:16<1:33:51, 2.03it/s] 3%|β | 374/11788 [03:17<1:33:49, 2.03it/s] 3%|β | 375/11788 [03:17<1:33:50, 2.03it/s]{'loss': 5.4288, 'grad_norm': 0.8087937235832214, 'learning_rate': 0.0003180661577608143, 'epoch': 0.45} 3%|β | 375/11788 [03:17<1:33:50, 2.03it/s] 3%|β | 376/11788 [03:18<1:33:52, 2.03it/s] 3%|β | 377/11788 [03:18<1:33:55, 2.02it/s] 3%|β | 378/11788 [03:19<1:33:50, 2.03it/s] 3%|β | 379/11788 [03:19<1:33:52, 2.03it/s] 3%|β | 380/11788 [03:20<1:33:51, 2.03it/s] 3%|β | 381/11788 [03:20<1:33:47, 2.03it/s] 3%|β | 382/11788 [03:21<1:33:49, 2.03it/s] 3%|β | 383/11788 [03:21<1:33:41, 2.03it/s] 3%|β | 384/11788 [03:22<1:33:39, 2.03it/s] 3%|β | 385/11788 [03:22<1:33:41, 2.03it/s] 3%|β | 386/11788 [03:23<1:33:45, 2.03it/s] 3%|β | 387/11788 [03:23<1:33:44, 2.03it/s] 3%|β | 388/11788 [03:24<1:33:38, 2.03it/s] 3%|β | 389/11788 [03:24<1:33:37, 2.03it/s] 3%|β | 390/11788 [03:25<1:33:36, 2.03it/s] 3%|β | 391/11788 [03:25<1:33:37, 2.03it/s] 3%|β | 392/11788 [03:26<1:33:36, 2.03it/s] 3%|β | 393/11788 [03:26<1:33:28, 2.03it/s] 3%|β | 394/11788 [03:27<1:33:35, 2.03it/s] 3%|β | 395/11788 [03:27<1:33:32, 2.03it/s] 3%|β | 396/11788 [03:28<1:33:34, 2.03it/s] 3%|β | 397/11788 [03:28<1:33:31, 2.03it/s] 3%|β | 398/11788 [03:29<1:33:33, 2.03it/s] 3%|β | 399/11788 [03:29<1:33:37, 2.03it/s] 3%|β | 400/11788 [03:30<1:33:31, 2.03it/s]{'loss': 5.3226, 'grad_norm': 0.8001272082328796, 'learning_rate': 0.00033927056827820186, 'epoch': 0.48} 3%|β | 400/11788 [03:30<1:33:31, 2.03it/s] 3%|β | 401/11788 [03:30<1:33:42, 2.03it/s] 3%|β | 402/11788 [03:31<1:33:35, 2.03it/s] 3%|β | 403/11788 [03:31<1:33:37, 2.03it/s] 3%|β | 404/11788 [03:32<1:33:34, 2.03it/s] 3%|β | 405/11788 [03:32<1:33:41, 2.02it/s] 3%|β | 406/11788 [03:33<1:33:35, 2.03it/s] 3%|β | 407/11788 [03:33<1:33:34, 2.03it/s] 3%|β | 408/11788 [03:34<1:33:34, 2.03it/s] 3%|β | 409/11788 [03:34<1:33:34, 2.03it/s] 3%|β | 410/11788 [03:35<1:33:35, 2.03it/s] 3%|β | 411/11788 [03:35<1:33:28, 2.03it/s] 3%|β | 412/11788 [03:35<1:33:32, 2.03it/s] 4%|β | 413/11788 [03:36<1:33:29, 2.03it/s] 4%|β | 414/11788 [03:36<1:33:28, 2.03it/s] 4%|β | 415/11788 [03:37<1:33:25, 2.03it/s] 4%|β | 416/11788 [03:37<1:33:24, 2.03it/s] 4%|β | 417/11788 [03:38<1:33:24, 2.03it/s] 4%|β | 418/11788 [03:38<1:33:23, 2.03it/s] 4%|β | 419/11788 [03:39<1:33:33, 2.03it/s] 4%|β | 420/11788 [03:39<1:33:31, 2.03it/s] 4%|β | 421/11788 [03:40<1:33:29, 2.03it/s] 4%|β | 422/11788 [03:40<1:33:27, 2.03it/s] 4%|β | 423/11788 [03:41<1:33:22, 2.03it/s] 4%|β | 424/11788 [03:41<1:33:23, 2.03it/s] 4%|β | 425/11788 [03:42<1:33:21, 2.03it/s] {'loss': 5.2315, 'grad_norm': 0.8854663968086243, 'learning_rate': 0.0003604749787955895, 'epoch': 0.5} 4%|β | 425/11788 [03:42<1:33:21, 2.03it/s] 4%|β | 426/11788 [03:42<1:33:34, 2.02it/s] 4%|β | 427/11788 [03:43<1:33:27, 2.03it/s] 4%|β | 428/11788 [03:43<1:33:21, 2.03it/s] 4%|β | 429/11788 [03:44<1:33:24, 2.03it/s] 4%|β | 430/11788 [03:44<1:33:27, 2.03it/s] 4%|β | 431/11788 [03:45<1:33:26, 2.03it/s] 4%|β | 432/11788 [03:45<1:33:23, 2.03it/s] 4%|β | 433/11788 [03:46<1:33:17, 2.03it/s] 4%|β | 434/11788 [03:46<1:33:20, 2.03it/s] 4%|β | 435/11788 [03:47<1:33:15, 2.03it/s] 4%|β | 436/11788 [03:47<1:33:16, 2.03it/s] 4%|β | 437/11788 [03:48<1:33:14, 2.03it/s] 4%|β | 438/11788 [03:48<1:33:05, 2.03it/s] 4%|β | 439/11788 [03:49<1:33:14, 2.03it/s] 4%|β | 440/11788 [03:49<1:33:13, 2.03it/s] 4%|β | 441/11788 [03:50<1:33:15, 2.03it/s] 4%|β | 442/11788 [03:50<1:33:16, 2.03it/s] 4%|β | 443/11788 [03:51<1:33:11, 2.03it/s] 4%|β | 444/11788 [03:51<1:33:16, 2.03it/s] 4%|β | 445/11788 [03:52<1:33:08, 2.03it/s] 4%|β | 446/11788 [03:52<1:33:05, 2.03it/s] 4%|β | 447/11788 [03:53<1:33:02, 2.03it/s] 4%|β | 448/11788 [03:53<1:33:01, 2.03it/s] 4%|β | 449/11788 [03:54<1:33:03, 2.03it/s] 4%|β | 450/11788 [03:54<1:33:04, 2.03it/s] {'loss': 5.1333, 'grad_norm': 0.8772043585777283, 'learning_rate': 0.00038167938931297715, 'epoch': 0.53} 4%|β | 450/11788 [03:54<1:33:04, 2.03it/s] 4%|β | 451/11788 [03:55<1:33:09, 2.03it/s] 4%|β | 452/11788 [03:55<1:33:07, 2.03it/s] 4%|β | 453/11788 [03:56<1:33:00, 2.03it/s] 4%|β | 454/11788 [03:56<1:33:02, 2.03it/s] 4%|β | 455/11788 [03:57<1:32:57, 2.03it/s] 4%|β | 456/11788 [03:57<1:33:01, 2.03it/s] 4%|β | 457/11788 [03:58<1:33:02, 2.03it/s] 4%|β | 458/11788 [03:58<1:33:00, 2.03it/s] 4%|β | 459/11788 [03:59<1:33:06, 2.03it/s] 4%|β | 460/11788 [03:59<1:33:01, 2.03it/s] 4%|β | 461/11788 [04:00<1:33:01, 2.03it/s] 4%|β | 462/11788 [04:00<1:33:01, 2.03it/s] 4%|β | 463/11788 [04:01<1:32:59, 2.03it/s] 4%|β | 464/11788 [04:01<1:33:03, 2.03it/s] 4%|β | 465/11788 [04:02<1:33:02, 2.03it/s] 4%|β | 466/11788 [04:02<1:33:04, 2.03it/s] 4%|β | 467/11788 [04:03<1:33:04, 2.03it/s] 4%|β | 468/11788 [04:03<1:33:05, 2.03it/s] 4%|β | 469/11788 [04:04<1:32:59, 2.03it/s] 4%|β | 470/11788 [04:04<1:33:01, 2.03it/s] 4%|β | 471/11788 [04:05<1:32:58, 2.03it/s] 4%|β | 472/11788 [04:05<1:32:55, 2.03it/s] 4%|β | 473/11788 [04:06<1:33:01, 2.03it/s] 4%|β | 474/11788 [04:06<1:32:54, 2.03it/s] 4%|β | 475/11788 [04:07<1:32:56, 2.03it/s]{'loss': 5.0491, 'grad_norm': 0.8502684831619263, 'learning_rate': 0.00040288379983036474, 'epoch': 0.56} 4%|β | 475/11788 [04:07<1:32:56, 2.03it/s] 4%|β | 476/11788 [04:07<1:32:57, 2.03it/s] 4%|β | 477/11788 [04:08<1:32:58, 2.03it/s] 4%|β | 478/11788 [04:08<1:32:56, 2.03it/s] 4%|β | 479/11788 [04:09<1:32:53, 2.03it/s] 4%|β | 480/11788 [04:09<1:33:00, 2.03it/s] 4%|β | 481/11788 [04:10<1:32:59, 2.03it/s] 4%|β | 482/11788 [04:10<1:33:06, 2.02it/s] 4%|β | 483/11788 [04:11<1:33:02, 2.03it/s] 4%|β | 484/11788 [04:11<1:33:03, 2.02it/s] 4%|β | 485/11788 [04:11<1:32:55, 2.03it/s] 4%|β | 486/11788 [04:12<1:33:00, 2.03it/s] 4%|β | 487/11788 [04:12<1:33:01, 2.02it/s] 4%|β | 488/11788 [04:13<1:33:03, 2.02it/s] 4%|β | 489/11788 [04:13<1:32:57, 2.03it/s] 4%|β | 490/11788 [04:14<1:32:59, 2.03it/s] 4%|β | 491/11788 [04:14<1:32:55, 2.03it/s] 4%|β | 492/11788 [04:15<1:32:51, 2.03it/s] 4%|β | 493/11788 [04:15<1:32:57, 2.03it/s] 4%|β | 494/11788 [04:16<1:32:57, 2.03it/s] 4%|β | 495/11788 [04:16<1:32:54, 2.03it/s] 4%|β | 496/11788 [04:17<1:32:52, 2.03it/s] 4%|β | 497/11788 [04:17<1:32:45, 2.03it/s] 4%|β | 498/11788 [04:18<1:32:48, 2.03it/s] 4%|β | 499/11788 [04:18<1:32:43, 2.03it/s] 4%|β | 500/11788 [04:19<1:32:42, 2.03it/s] {'loss': 4.9798, 'grad_norm': 0.6902145147323608, 'learning_rate': 0.0004240882103477524, 'epoch': 0.59} 4%|β | 500/11788 [04:19<1:32:42, 2.03it/s] 4%|β | 501/11788 [04:19<1:32:46, 2.03it/s] 4%|β | 502/11788 [04:20<1:32:44, 2.03it/s] 4%|β | 503/11788 [04:20<1:32:49, 2.03it/s] 4%|β | 504/11788 [04:21<1:32:43, 2.03it/s] 4%|β | 505/11788 [04:21<1:32:47, 2.03it/s] 4%|β | 506/11788 [04:22<1:32:43, 2.03it/s] 4%|β | 507/11788 [04:22<1:32:45, 2.03it/s] 4%|β | 508/11788 [04:23<1:32:42, 2.03it/s] 4%|β | 509/11788 [04:23<1:32:43, 2.03it/s] 4%|β | 510/11788 [04:24<1:32:43, 2.03it/s] 4%|β | 511/11788 [04:24<1:32:35, 2.03it/s] 4%|β | 512/11788 [04:25<1:32:40, 2.03it/s] 4%|β | 513/11788 [04:25<1:32:37, 2.03it/s] 4%|β | 514/11788 [04:26<1:32:43, 2.03it/s] 4%|β | 515/11788 [04:26<1:32:41, 2.03it/s] 4%|β | 516/11788 [04:27<1:32:41, 2.03it/s] 4%|β | 517/11788 [04:27<1:32:40, 2.03it/s] 4%|β | 518/11788 [04:28<1:32:33, 2.03it/s] 4%|β | 519/11788 [04:28<1:32:38, 2.03it/s] 4%|β | 520/11788 [04:29<1:32:30, 2.03it/s] 4%|β | 521/11788 [04:29<1:32:30, 2.03it/s] 4%|β | 522/11788 [04:30<1:32:30, 2.03it/s] 4%|β | 523/11788 [04:30<1:32:22, 2.03it/s] 4%|β | 524/11788 [04:31<1:32:24, 2.03it/s] 4%|β | 525/11788 [04:31<1:32:23, 2.03it/s]{'loss': 4.9114, 'grad_norm': 0.6238113045692444, 'learning_rate': 0.0004452926208651399, 'epoch': 0.62} 4%|β | 525/11788 [04:31<1:32:23, 2.03it/s] 4%|β | 526/11788 [04:32<1:32:33, 2.03it/s] 4%|β | 527/11788 [04:32<1:32:33, 2.03it/s] 4%|β | 528/11788 [04:33<1:32:26, 2.03it/s] 4%|β | 529/11788 [04:33<1:32:34, 2.03it/s] 4%|β | 530/11788 [04:34<1:32:26, 2.03it/s] 5%|β | 531/11788 [04:34<1:32:29, 2.03it/s] 5%|β | 532/11788 [04:35<1:32:27, 2.03it/s] 5%|β | 533/11788 [04:35<1:32:29, 2.03it/s] 5%|β | 534/11788 [04:36<1:32:29, 2.03it/s] 5%|β | 535/11788 [04:36<1:32:21, 2.03it/s] 5%|β | 536/11788 [04:37<1:32:28, 2.03it/s] 5%|β | 537/11788 [04:37<1:32:26, 2.03it/s] 5%|β | 538/11788 [04:38<1:32:29, 2.03it/s] 5%|β | 539/11788 [04:38<1:32:25, 2.03it/s] 5%|β | 540/11788 [04:39<1:32:21, 2.03it/s] 5%|β | 541/11788 [04:39<1:32:23, 2.03it/s] 5%|β | 542/11788 [04:40<1:32:18, 2.03it/s] 5%|β | 543/11788 [04:40<1:32:20, 2.03it/s] 5%|β | 544/11788 [04:41<1:32:18, 2.03it/s] 5%|β | 545/11788 [04:41<1:32:11, 2.03it/s] 5%|β | 546/11788 [04:42<1:32:22, 2.03it/s] 5%|β | 547/11788 [04:42<1:32:20, 2.03it/s] 5%|β | 548/11788 [04:43<1:32:25, 2.03it/s] 5%|β | 549/11788 [04:43<1:32:21, 2.03it/s] 5%|β | 550/11788 [04:44<1:32:24, 2.03it/s]{'loss': 4.8494, 'grad_norm': 0.5650936365127563, 'learning_rate': 0.00046649703138252756, 'epoch': 0.65} 5%|β | 550/11788 [04:44<1:32:24, 2.03it/s] 5%|β | 551/11788 [04:44<1:32:26, 2.03it/s] 5%|β | 552/11788 [04:45<1:32:24, 2.03it/s] 5%|β | 553/11788 [04:45<1:32:21, 2.03it/s] 5%|β | 554/11788 [04:46<1:32:17, 2.03it/s] 5%|β | 555/11788 [04:46<1:32:22, 2.03it/s] 5%|β | 556/11788 [04:46<1:32:15, 2.03it/s] 5%|β | 557/11788 [04:47<1:32:14, 2.03it/s] 5%|β | 558/11788 [04:47<1:32:19, 2.03it/s] 5%|β | 559/11788 [04:48<1:32:15, 2.03it/s] 5%|β | 560/11788 [04:48<1:32:14, 2.03it/s] 5%|β | 561/11788 [04:49<1:32:16, 2.03it/s] 5%|β | 562/11788 [04:49<1:32:14, 2.03it/s] 5%|β | 563/11788 [04:50<1:32:19, 2.03it/s] 5%|β | 564/11788 [04:50<1:32:15, 2.03it/s] 5%|β | 565/11788 [04:51<1:32:18, 2.03it/s] 5%|β | 566/11788 [04:51<1:32:18, 2.03it/s] 5%|β | 567/11788 [04:52<1:32:12, 2.03it/s] 5%|β | 568/11788 [04:52<1:32:16, 2.03it/s] 5%|β | 569/11788 [04:53<1:32:15, 2.03it/s] 5%|β | 570/11788 [04:53<1:32:17, 2.03it/s] 5%|β | 571/11788 [04:54<1:32:21, 2.02it/s] 5%|β | 572/11788 [04:54<1:32:17, 2.03it/s] 5%|β | 573/11788 [04:55<1:32:13, 2.03it/s] 5%|β | 574/11788 [04:55<1:32:07, 2.03it/s] 5%|β | 575/11788 [04:56<1:32:04, 2.03it/s] {'loss': 4.784, 'grad_norm': 0.7106149792671204, 'learning_rate': 0.00048770144189991515, 'epoch': 0.68} 5%|β | 575/11788 [04:56<1:32:04, 2.03it/s] 5%|β | 576/11788 [04:56<1:32:11, 2.03it/s] 5%|β | 577/11788 [04:57<1:32:11, 2.03it/s] 5%|β | 578/11788 [04:57<1:32:11, 2.03it/s] 5%|β | 579/11788 [04:58<1:32:09, 2.03it/s] 5%|β | 580/11788 [04:58<1:32:07, 2.03it/s] 5%|β | 581/11788 [04:59<1:32:07, 2.03it/s] 5%|β | 582/11788 [04:59<1:32:05, 2.03it/s] 5%|β | 583/11788 [05:00<1:32:07, 2.03it/s] 5%|β | 584/11788 [05:00<1:32:07, 2.03it/s] 5%|β | 585/11788 [05:01<1:32:07, 2.03it/s] 5%|β | 586/11788 [05:01<1:32:07, 2.03it/s] 5%|β | 587/11788 [05:02<1:32:10, 2.03it/s] 5%|β | 588/11788 [05:02<1:32:07, 2.03it/s] 5%|β | 589/11788 [05:03<1:32:09, 2.03it/s] 5%|β | 590/11788 [05:03<1:32:12, 2.02it/s] 5%|β | 591/11788 [05:04<1:32:09, 2.02it/s] 5%|β | 592/11788 [05:04<1:32:08, 2.02it/s] 5%|β | 593/11788 [05:05<1:32:12, 2.02it/s] 5%|β | 594/11788 [05:05<1:32:12, 2.02it/s] 5%|β | 595/11788 [05:06<1:32:11, 2.02it/s] 5%|β | 596/11788 [05:06<1:32:14, 2.02it/s] 5%|β | 597/11788 [05:07<1:32:13, 2.02it/s] 5%|β | 598/11788 [05:07<1:32:07, 2.02it/s] 5%|β | 599/11788 [05:08<1:32:06, 2.02it/s] 5%|β | 600/11788 [05:08<1:31:58, 2.03it/s]{'loss': 4.7204, 'grad_norm': 0.5446744561195374, 'learning_rate': 0.0005089058524173028, 'epoch': 0.71} 5%|β | 600/11788 [05:08<1:31:58, 2.03it/s] 5%|β | 601/11788 [05:09<1:32:12, 2.02it/s] 5%|β | 602/11788 [05:09<1:32:02, 2.03it/s] 5%|β | 603/11788 [05:10<1:32:00, 2.03it/s] 5%|β | 604/11788 [05:10<1:31:58, 2.03it/s] 5%|β | 605/11788 [05:11<1:31:55, 2.03it/s] 5%|β | 606/11788 [05:11<1:31:56, 2.03it/s] 5%|β | 607/11788 [05:12<1:31:56, 2.03it/s] 5%|β | 608/11788 [05:12<1:31:55, 2.03it/s] 5%|β | 609/11788 [05:13<1:31:54, 2.03it/s] 5%|β | 610/11788 [05:13<1:31:53, 2.03it/s] 5%|β | 611/11788 [05:14<1:31:51, 2.03it/s] 5%|β | 612/11788 [05:14<1:31:48, 2.03it/s] 5%|β | 613/11788 [05:15<1:31:48, 2.03it/s] 5%|β | 614/11788 [05:15<1:31:40, 2.03it/s] 5%|β | 615/11788 [05:16<1:39:29, 1.87it/s] 5%|β | 616/11788 [05:16<1:44:59, 1.77it/s] 5%|β | 617/11788 [05:17<1:41:01, 1.84it/s] 5%|β | 618/11788 [05:17<1:38:18, 1.89it/s] 5%|β | 619/11788 [05:18<1:36:17, 1.93it/s] 5%|β | 620/11788 [05:18<1:34:59, 1.96it/s] 5%|β | 621/11788 [05:19<1:34:02, 1.98it/s] 5%|β | 622/11788 [05:19<1:33:28, 1.99it/s] 5%|β | 623/11788 [05:20<1:32:53, 2.00it/s] 5%|β | 624/11788 [05:20<1:32:32, 2.01it/s] 5%|β | 625/11788 [05:21<1:32:19, 2.02it/s]{'loss': 4.6719, 'grad_norm': 0.6386901140213013, 'learning_rate': 0.0005301102629346905, 'epoch': 0.74} 5%|β | 625/11788 [05:21<1:32:19, 2.02it/s] 5%|β | 626/11788 [05:21<1:32:13, 2.02it/s] 5%|β | 627/11788 [05:22<1:32:05, 2.02it/s] 5%|β | 628/11788 [05:22<1:31:56, 2.02it/s] 5%|β | 629/11788 [05:23<1:31:54, 2.02it/s] 5%|β | 630/11788 [05:23<1:31:44, 2.03it/s] 5%|β | 631/11788 [05:24<1:31:45, 2.03it/s] 5%|β | 632/11788 [05:24<1:31:46, 2.03it/s] 5%|β | 633/11788 [05:25<1:31:38, 2.03it/s] 5%|β | 634/11788 [05:25<1:31:44, 2.03it/s] 5%|β | 635/11788 [05:26<1:31:36, 2.03it/s] 5%|β | 636/11788 [05:26<1:31:41, 2.03it/s] 5%|β | 637/11788 [05:27<1:31:41, 2.03it/s] 5%|β | 638/11788 [05:27<1:31:41, 2.03it/s] 5%|β | 639/11788 [05:28<1:31:38, 2.03it/s] 5%|β | 640/11788 [05:28<1:31:37, 2.03it/s] 5%|β | 641/11788 [05:29<1:31:34, 2.03it/s] 5%|β | 642/11788 [05:29<1:31:34, 2.03it/s] 5%|β | 643/11788 [05:30<1:31:43, 2.02it/s] 5%|β | 644/11788 [05:30<1:31:39, 2.03it/s] 5%|β | 645/11788 [05:31<1:31:35, 2.03it/s] 5%|β | 646/11788 [05:31<1:31:37, 2.03it/s] 5%|β | 647/11788 [05:32<1:31:36, 2.03it/s] 5%|β | 648/11788 [05:32<1:31:37, 2.03it/s] 6%|β | 649/11788 [05:33<1:31:32, 2.03it/s] 6%|β | 650/11788 [05:33<1:31:34, 2.03it/s]{'loss': 4.6171, 'grad_norm': 0.6312897205352783, 'learning_rate': 0.0005513146734520781, 'epoch': 0.77} 6%|β | 650/11788 [05:33<1:31:34, 2.03it/s] 6%|β | 651/11788 [05:34<1:31:33, 2.03it/s] 6%|β | 652/11788 [05:34<1:31:35, 2.03it/s] 6%|β | 653/11788 [05:35<1:31:32, 2.03it/s] 6%|β | 654/11788 [05:35<1:31:36, 2.03it/s] 6%|β | 655/11788 [05:36<1:31:29, 2.03it/s] 6%|β | 656/11788 [05:36<1:31:27, 2.03it/s] 6%|β | 657/11788 [05:37<1:31:28, 2.03it/s] 6%|β | 658/11788 [05:37<1:31:26, 2.03it/s] 6%|β | 659/11788 [05:38<1:31:29, 2.03it/s] 6%|β | 660/11788 [05:38<1:31:26, 2.03it/s] 6%|β | 661/11788 [05:39<1:31:29, 2.03it/s] 6%|β | 662/11788 [05:39<1:31:27, 2.03it/s] 6%|β | 663/11788 [05:40<1:31:33, 2.03it/s] 6%|β | 664/11788 [05:40<1:31:30, 2.03it/s] 6%|β | 665/11788 [05:41<1:31:29, 2.03it/s] 6%|β | 666/11788 [05:41<1:31:28, 2.03it/s] 6%|β | 667/11788 [05:42<1:31:22, 2.03it/s] 6%|β | 668/11788 [05:42<1:31:24, 2.03it/s] 6%|β | 669/11788 [05:43<1:31:23, 2.03it/s] 6%|β | 670/11788 [05:43<1:31:27, 2.03it/s] 6%|β | 671/11788 [05:44<1:31:20, 2.03it/s] 6%|β | 672/11788 [05:44<1:31:19, 2.03it/s] 6%|β | 673/11788 [05:44<1:31:21, 2.03it/s] 6%|β | 674/11788 [05:45<1:31:13, 2.03it/s] 6%|β | 675/11788 [05:45<1:31:20, 2.03it/s]{'loss': 4.5837, 'grad_norm': 0.7919646501541138, 'learning_rate': 0.0005725190839694656, 'epoch': 0.8} 6%|β | 675/11788 [05:45<1:31:20, 2.03it/s] 6%|β | 676/11788 [05:46<1:31:25, 2.03it/s] 6%|β | 677/11788 [05:46<1:31:29, 2.02it/s] 6%|β | 678/11788 [05:47<1:31:24, 2.03it/s] 6%|β | 679/11788 [05:47<1:31:25, 2.03it/s] 6%|β | 680/11788 [05:48<1:31:22, 2.03it/s] 6%|β | 681/11788 [05:48<1:31:14, 2.03it/s] 6%|β | 682/11788 [05:49<1:31:16, 2.03it/s] 6%|β | 683/11788 [05:49<1:31:13, 2.03it/s] 6%|β | 684/11788 [05:50<1:31:16, 2.03it/s] 6%|β | 685/11788 [05:50<1:31:13, 2.03it/s] 6%|β | 686/11788 [05:51<1:31:11, 2.03it/s] 6%|β | 687/11788 [05:51<1:31:16, 2.03it/s] 6%|β | 688/11788 [05:52<1:31:11, 2.03it/s] 6%|β | 689/11788 [05:52<1:31:13, 2.03it/s] 6%|β | 690/11788 [05:53<1:31:10, 2.03it/s] 6%|β | 691/11788 [05:53<1:31:07, 2.03it/s] 6%|β | 692/11788 [05:54<1:31:10, 2.03it/s] 6%|β | 693/11788 [05:54<1:31:05, 2.03it/s] 6%|β | 694/11788 [05:55<1:31:08, 2.03it/s] 6%|β | 695/11788 [05:55<1:31:05, 2.03it/s] 6%|β | 696/11788 [05:56<1:31:08, 2.03it/s] 6%|β | 697/11788 [05:56<1:31:15, 2.03it/s] 6%|β | 698/11788 [05:57<1:31:08, 2.03it/s] 6%|β | 699/11788 [05:57<1:31:07, 2.03it/s] 6%|β | 700/11788 [05:58<1:31:05, 2.03it/s]{'loss': 4.5283, 'grad_norm': 0.6023926734924316, 'learning_rate': 0.0005937234944868533, 'epoch': 0.83} 6%|β | 700/11788 [05:58<1:31:05, 2.03it/s] 6%|β | 701/11788 [05:58<1:31:13, 2.03it/s] 6%|β | 702/11788 [05:59<1:31:13, 2.03it/s] 6%|β | 703/11788 [05:59<1:31:12, 2.03it/s] 6%|β | 704/11788 [06:00<1:31:10, 2.03it/s] 6%|β | 705/11788 [06:00<1:31:12, 2.03it/s] 6%|β | 706/11788 [06:01<1:31:08, 2.03it/s] 6%|β | 707/11788 [06:01<1:31:08, 2.03it/s] 6%|β | 708/11788 [06:02<1:31:11, 2.03it/s] 6%|β | 709/11788 [06:02<1:31:04, 2.03it/s] 6%|β | 710/11788 [06:03<1:31:12, 2.02it/s] 6%|β | 711/11788 [06:03<1:31:04, 2.03it/s] 6%|β | 712/11788 [06:04<1:31:06, 2.03it/s] 6%|β | 713/11788 [06:04<1:30:59, 2.03it/s] 6%|β | 714/11788 [06:05<1:31:02, 2.03it/s] 6%|β | 715/11788 [06:05<1:30:59, 2.03it/s] 6%|β | 716/11788 [06:06<1:30:53, 2.03it/s] 6%|β | 717/11788 [06:06<1:30:57, 2.03it/s] 6%|β | 718/11788 [06:07<1:31:01, 2.03it/s] 6%|β | 719/11788 [06:07<1:31:02, 2.03it/s] 6%|β | 720/11788 [06:08<1:30:54, 2.03it/s] 6%|β | 721/11788 [06:08<1:30:58, 2.03it/s] 6%|β | 722/11788 [06:09<1:30:58, 2.03it/s] 6%|β | 723/11788 [06:09<1:30:52, 2.03it/s] 6%|β | 724/11788 [06:10<1:30:56, 2.03it/s] 6%|β | 725/11788 [06:10<1:30:50, 2.03it/s]{'loss': 4.4724, 'grad_norm': 0.6503545641899109, 'learning_rate': 0.0006149279050042408, 'epoch': 0.86} 6%|β | 725/11788 [06:10<1:30:50, 2.03it/s] 6%|β | 726/11788 [06:11<1:31:03, 2.02it/s] 6%|β | 727/11788 [06:11<1:30:57, 2.03it/s] 6%|β | 728/11788 [06:12<1:30:59, 2.03it/s] 6%|β | 729/11788 [06:12<1:31:04, 2.02it/s] 6%|β | 730/11788 [06:13<1:31:03, 2.02it/s] 6%|β | 731/11788 [06:13<1:30:57, 2.03it/s] 6%|β | 732/11788 [06:14<1:30:57, 2.03it/s] 6%|β | 733/11788 [06:14<1:30:56, 2.03it/s] 6%|β | 734/11788 [06:15<1:30:54, 2.03it/s] 6%|β | 735/11788 [06:15<1:30:51, 2.03it/s] 6%|β | 736/11788 [06:16<1:30:47, 2.03it/s] 6%|β | 737/11788 [06:16<1:30:47, 2.03it/s] 6%|β | 738/11788 [06:17<1:30:49, 2.03it/s] 6%|β | 739/11788 [06:17<1:30:50, 2.03it/s] 6%|β | 740/11788 [06:18<1:30:47, 2.03it/s] 6%|β | 741/11788 [06:18<1:30:47, 2.03it/s] 6%|β | 742/11788 [06:19<1:30:43, 2.03it/s] 6%|β | 743/11788 [06:19<1:30:39, 2.03it/s] 6%|β | 744/11788 [06:20<1:30:49, 2.03it/s] 6%|β | 745/11788 [06:20<1:30:44, 2.03it/s] 6%|β | 746/11788 [06:21<1:30:48, 2.03it/s] 6%|β | 747/11788 [06:21<1:30:45, 2.03it/s] 6%|β | 748/11788 [06:21<1:30:46, 2.03it/s] 6%|β | 749/11788 [06:22<1:30:47, 2.03it/s] 6%|β | 750/11788 [06:22<1:30:49, 2.03it/s]{'loss': 4.4458, 'grad_norm': 0.5474122762680054, 'learning_rate': 0.0006361323155216285, 'epoch': 0.89} 6%|β | 750/11788 [06:22<1:30:49, 2.03it/s] 6%|β | 751/11788 [06:23<1:30:59, 2.02it/s] 6%|β | 752/11788 [06:23<1:30:55, 2.02it/s] 6%|β | 753/11788 [06:24<1:30:53, 2.02it/s] 6%|β | 754/11788 [06:24<1:30:45, 2.03it/s] 6%|β | 755/11788 [06:25<1:30:44, 2.03it/s] 6%|β | 756/11788 [06:25<1:30:43, 2.03it/s] 6%|β | 757/11788 [06:26<1:30:39, 2.03it/s] 6%|β | 758/11788 [06:26<1:30:40, 2.03it/s] 6%|β | 759/11788 [06:27<1:30:42, 2.03it/s] 6%|β | 760/11788 [06:27<1:30:41, 2.03it/s] 6%|β | 761/11788 [06:28<1:30:48, 2.02it/s] 6%|β | 762/11788 [06:28<1:30:48, 2.02it/s] 6%|β | 763/11788 [06:29<1:30:42, 2.03it/s] 6%|β | 764/11788 [06:29<1:30:43, 2.03it/s] 6%|β | 765/11788 [06:30<1:30:36, 2.03it/s] 6%|β | 766/11788 [06:30<1:30:38, 2.03it/s] 7%|β | 767/11788 [06:31<1:30:37, 2.03it/s] 7%|β | 768/11788 [06:31<1:30:38, 2.03it/s] 7%|β | 769/11788 [06:32<1:30:40, 2.03it/s] 7%|β | 770/11788 [06:32<1:30:35, 2.03it/s] 7%|β | 771/11788 [06:33<1:30:37, 2.03it/s] 7%|β | 772/11788 [06:33<1:30:34, 2.03it/s] 7%|β | 773/11788 [06:34<1:30:38, 2.03it/s] 7%|β | 774/11788 [06:34<1:30:34, 2.03it/s] 7%|β | 775/11788 [06:35<1:30:36, 2.03it/s]{'loss': 4.4025, 'grad_norm': 0.6143103241920471, 'learning_rate': 0.0006573367260390161, 'epoch': 0.92} 7%|β | 775/11788 [06:35<1:30:36, 2.03it/s] 7%|β | 776/11788 [06:35<1:30:33, 2.03it/s] 7%|β | 777/11788 [06:36<1:30:31, 2.03it/s] 7%|β | 778/11788 [06:36<1:30:31, 2.03it/s] 7%|β | 779/11788 [06:37<1:30:26, 2.03it/s] 7%|β | 780/11788 [06:37<1:30:30, 2.03it/s] 7%|β | 781/11788 [06:38<1:30:23, 2.03it/s] 7%|β | 782/11788 [06:38<1:30:34, 2.03it/s] 7%|β | 783/11788 [06:39<1:30:32, 2.03it/s] 7%|β | 784/11788 [06:39<1:30:33, 2.03it/s] 7%|β | 785/11788 [06:40<1:30:29, 2.03it/s] 7%|β | 786/11788 [06:40<1:30:23, 2.03it/s] 7%|β | 787/11788 [06:41<1:30:29, 2.03it/s] 7%|β | 788/11788 [06:41<1:30:24, 2.03it/s] 7%|β | 789/11788 [06:42<1:30:25, 2.03it/s] 7%|β | 790/11788 [06:42<1:30:18, 2.03it/s] 7%|β | 791/11788 [06:43<1:30:17, 2.03it/s] 7%|β | 792/11788 [06:43<1:30:17, 2.03it/s] 7%|β | 793/11788 [06:44<1:30:17, 2.03it/s] 7%|β | 794/11788 [06:44<1:30:20, 2.03it/s] 7%|β | 795/11788 [06:45<1:30:15, 2.03it/s] 7%|β | 796/11788 [06:45<1:30:15, 2.03it/s] 7%|β | 797/11788 [06:46<1:30:14, 2.03it/s] 7%|β | 798/11788 [06:46<1:30:12, 2.03it/s] 7%|β | 799/11788 [06:47<1:30:17, 2.03it/s] 7%|β | 800/11788 [06:47<1:30:12, 2.03it/s]{'loss': 4.369, 'grad_norm': 0.45020633935928345, 'learning_rate': 0.0006785411365564037, 'epoch': 0.95} 7%|β | 800/11788 [06:47<1:30:12, 2.03it/s] 7%|β | 801/11788 [06:48<1:30:19, 2.03it/s] 7%|β | 802/11788 [06:48<1:30:14, 2.03it/s] 7%|β | 803/11788 [06:49<1:30:17, 2.03it/s] 7%|β | 804/11788 [06:49<1:30:20, 2.03it/s] 7%|β | 805/11788 [06:50<1:30:15, 2.03it/s] 7%|β | 806/11788 [06:50<1:30:16, 2.03it/s] 7%|β | 807/11788 [06:51<1:30:11, 2.03it/s] 7%|β | 808/11788 [06:51<1:30:17, 2.03it/s] 7%|β | 809/11788 [06:52<1:30:09, 2.03it/s] 7%|β | 810/11788 [06:52<1:30:08, 2.03it/s] 7%|β | 811/11788 [06:53<1:30:09, 2.03it/s] 7%|β | 812/11788 [06:53<1:30:02, 2.03it/s] 7%|β | 813/11788 [06:54<1:30:03, 2.03it/s] 7%|β | 814/11788 [06:54<1:30:05, 2.03it/s] 7%|β | 815/11788 [06:55<1:30:05, 2.03it/s] 7%|β | 816/11788 [06:55<1:30:10, 2.03it/s] 7%|β | 817/11788 [06:56<1:30:10, 2.03it/s] 7%|β | 818/11788 [06:56<1:30:13, 2.03it/s] 7%|β | 819/11788 [06:57<1:30:05, 2.03it/s] 7%|β | 820/11788 [06:57<1:30:07, 2.03it/s] 7%|β | 821/11788 [06:57<1:30:02, 2.03it/s] 7%|β | 822/11788 [06:58<1:30:01, 2.03it/s] 7%|β | 823/11788 [06:58<1:30:04, 2.03it/s] 7%|β | 824/11788 [06:59<1:30:04, 2.03it/s] 7%|β | 825/11788 [06:59<1:30:06, 2.03it/s]{'loss': 4.3353, 'grad_norm': 0.46021631360054016, 'learning_rate': 0.0006997455470737913, 'epoch': 0.98} 7%|β | 825/11788 [06:59<1:30:06, 2.03it/s] 7%|β | 826/11788 [07:00<1:30:08, 2.03it/s] 7%|β | 827/11788 [07:00<1:30:12, 2.03it/s] 7%|β | 828/11788 [07:01<1:30:10, 2.03it/s] 7%|β | 829/11788 [07:01<1:30:03, 2.03it/s] 7%|β | 830/11788 [07:02<1:30:06, 2.03it/s] 7%|β | 831/11788 [07:02<1:29:59, 2.03it/s] 7%|β | 832/11788 [07:03<1:30:02, 2.03it/s] 7%|β | 833/11788 [07:03<1:30:01, 2.03it/s] 7%|β | 834/11788 [07:04<1:30:01, 2.03it/s] 7%|β | 835/11788 [07:04<1:30:02, 2.03it/s] 7%|β | 836/11788 [07:05<1:30:03, 2.03it/s] 7%|β | 837/11788 [07:05<1:30:09, 2.02it/s] 7%|β | 838/11788 [07:06<1:30:07, 2.02it/s] 7%|β | 839/11788 [07:06<1:30:14, 2.02it/s] 7%|β | 840/11788 [07:07<1:30:13, 2.02it/s] 7%|β | 841/11788 [07:07<1:30:15, 2.02it/s] 7%|β | 842/11788 [07:08<1:29:46, 2.03it/s] 7%|β | 843/11788 [07:20<12:01:12, 3.95s/it] 7%|β | 844/11788 [07:20<8:52:19, 2.92s/it] 7%|β | 845/11788 [07:21<6:39:37, 2.19s/it] 7%|β | 846/11788 [07:21<5:06:43, 1.68s/it] 7%|β | 847/11788 [07:22<4:01:43, 1.33s/it] 7%|β | 848/11788 [07:22<3:16:30, 1.08s/it] 7%|β | 849/11788 [07:23<2:45:20, 1.10it/s] 7%|β | 850/11788 [07:23<2:22:58, 1.28it/s] {'loss': 4.2742, 'grad_norm': 0.6338014006614685, 'learning_rate': 0.000720949957591179, 'epoch': 1.01} 7%|β | 850/11788 [07:23<2:22:58, 1.28it/s] 7%|β | 851/11788 [07:24<2:07:20, 1.43it/s] 7%|β | 852/11788 [07:24<1:56:07, 1.57it/s] 7%|β | 853/11788 [07:25<1:48:15, 1.68it/s] 7%|β | 854/11788 [07:25<1:42:54, 1.77it/s] 7%|β | 855/11788 [07:26<1:38:57, 1.84it/s] 7%|β | 856/11788 [07:26<1:36:14, 1.89it/s] 7%|β | 857/11788 [07:27<1:34:22, 1.93it/s] 7%|β | 858/11788 [07:27<1:32:56, 1.96it/s] 7%|β | 859/11788 [07:28<1:31:59, 1.98it/s] 7%|β | 860/11788 [07:28<1:31:17, 2.00it/s] 7%|β | 861/11788 [07:29<1:30:47, 2.01it/s] 7%|β | 862/11788 [07:29<1:30:40, 2.01it/s] 7%|β | 863/11788 [07:30<1:30:19, 2.02it/s] 7%|β | 864/11788 [07:30<1:30:07, 2.02it/s] 7%|β | 865/11788 [07:31<1:30:08, 2.02it/s] 7%|β | 866/11788 [07:31<1:30:00, 2.02it/s] 7%|β | 867/11788 [07:32<1:29:57, 2.02it/s] 7%|β | 868/11788 [07:32<1:29:47, 2.03it/s] |