File size: 61,567 Bytes
b845aee
0729830
 
 
 
 
 
 
 
b845aee
0729830
b845aee
a3bdea5
e98066d
cf4c4a9
b845aee
e82b954
 
 
 
4683a9a
e82b954
 
 
 
 
b845aee
e82b954
 
 
 
 
 
 
 
 
b845aee
e82b954
 
 
 
 
b845aee
4683a9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b845aee
4683a9a
 
 
 
 
b845aee
4683a9a
 
 
 
 
 
b845aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
slurm submission log: 2024-05-24 23:54:02.551434
created following sbatch script: 

###############################

#!/bin/bash

#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7649440
#SBATCH --gres=gpu:2
#SBATCH --job-name=tthrush-job-2884917
#SBATCH --mem=100G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1/train_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0

# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection

# cd to working directory
cd .

# launch commands
srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'

###############################

submission to slurm complete!


###############################
slurm submission output

Submitted batch job 7649441



###############################

/var/lib/slurm/slurmd/job7649441/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory

CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
To initialize your shell, run

    $ conda init <SHELL_NAME>

Currently supported shells are:
  - bash
  - fish
  - tcsh
  - xonsh
  - zsh
  - powershell

See 'conda init --help' for more information and options.

IMPORTANT: You may need to close and restart your shell after running 'conda init'.


###############################
start time: 2024-05-25 04:55:04.852621
machine: sphinx2
conda env: pretraining-coreset-selection
###############################
running following processes

	torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14


###############################
command outputs: 


[2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] 
[2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] *****************************************
[2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
[2024-05-25 04:55:06,922] torch.distributed.run: [WARNING] *****************************************
05/25/2024 04:55:12 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
05/25/2024 04:55:17 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)

  0%|          | 0/11788 [00:00<?, ?it/s][rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())

  0%|          | 1/11788 [00:05<18:49:07,  5.75s/it]
  0%|          | 2/11788 [00:07<11:47:29,  3.60s/it]
  0%|          | 3/11788 [00:09<8:39:23,  2.64s/it] 
  0%|          | 4/11788 [00:10<6:48:15,  2.08s/it]
  0%|          | 5/11788 [00:11<5:34:19,  1.70s/it]
  0%|          | 6/11788 [00:12<4:45:06,  1.45s/it]
  0%|          | 7/11788 [00:13<4:09:48,  1.27s/it]
  0%|          | 8/11788 [00:14<3:41:36,  1.13s/it]
  0%|          | 9/11788 [00:15<3:19:46,  1.02s/it]
  0%|          | 10/11788 [00:15<3:02:25,  1.08it/s]
  0%|          | 11/11788 [00:16<2:47:48,  1.17it/s]
  0%|          | 12/11788 [00:17<2:36:12,  1.26it/s]
  0%|          | 13/11788 [00:17<2:26:16,  1.34it/s]
  0%|          | 14/11788 [00:18<2:19:18,  1.41it/s]
  0%|          | 15/11788 [00:19<2:13:46,  1.47it/s]
  0%|          | 16/11788 [00:19<2:08:28,  1.53it/s]
  0%|          | 17/11788 [00:20<2:03:53,  1.58it/s]
  0%|          | 18/11788 [00:20<1:59:53,  1.64it/s]
  0%|          | 19/11788 [00:21<1:56:58,  1.68it/s]
  0%|          | 20/11788 [00:21<1:54:07,  1.72it/s]
  0%|          | 21/11788 [00:22<1:51:35,  1.76it/s]
  0%|          | 22/11788 [00:22<1:50:02,  1.78it/s]
  0%|          | 23/11788 [00:23<1:48:10,  1.81it/s]
  0%|          | 24/11788 [00:24<1:47:46,  1.82it/s]
  0%|          | 25/11788 [00:24<1:46:51,  1.83it/s]{'loss': 10.6806, 'grad_norm': 1.3700144290924072, 'learning_rate': 2.1204410517387616e-05, 'epoch': 0.03}

                                                    

  0%|          | 25/11788 [00:24<1:46:51,  1.83it/s]
  0%|          | 26/11788 [00:25<1:45:46,  1.85it/s]
  0%|          | 27/11788 [00:25<1:45:10,  1.86it/s]
  0%|          | 28/11788 [00:26<1:43:34,  1.89it/s]
  0%|          | 29/11788 [00:26<1:43:10,  1.90it/s]
  0%|          | 30/11788 [00:27<1:42:20,  1.91it/s]
  0%|          | 31/11788 [00:27<1:41:31,  1.93it/s]
  0%|          | 32/11788 [00:28<1:41:29,  1.93it/s]
  0%|          | 33/11788 [00:28<1:41:01,  1.94it/s]
  0%|          | 34/11788 [00:29<1:40:26,  1.95it/s]
  0%|          | 35/11788 [00:29<1:40:16,  1.95it/s]
  0%|          | 36/11788 [00:30<1:39:46,  1.96it/s]
  0%|          | 37/11788 [00:30<1:40:09,  1.96it/s]
  0%|          | 38/11788 [00:31<1:39:47,  1.96it/s]
  0%|          | 39/11788 [00:31<1:39:09,  1.97it/s]
  0%|          | 40/11788 [00:32<1:39:11,  1.97it/s]
  0%|          | 41/11788 [00:32<1:38:51,  1.98it/s]
  0%|          | 42/11788 [00:33<1:38:23,  1.99it/s]
  0%|          | 43/11788 [00:33<1:39:25,  1.97it/s]
  0%|          | 44/11788 [00:34<1:39:01,  1.98it/s]
  0%|          | 45/11788 [00:34<1:38:28,  1.99it/s]
  0%|          | 46/11788 [00:35<1:38:13,  1.99it/s]
  0%|          | 47/11788 [00:35<1:38:16,  1.99it/s]
  0%|          | 48/11788 [00:36<1:37:59,  2.00it/s]
  0%|          | 49/11788 [00:36<1:38:07,  1.99it/s]
  0%|          | 50/11788 [00:37<1:38:03,  2.00it/s]{'loss': 9.9893, 'grad_norm': 1.2721635103225708, 'learning_rate': 4.240882103477523e-05, 'epoch': 0.06}

                                                    

  0%|          | 50/11788 [00:37<1:38:03,  2.00it/s]
  0%|          | 51/11788 [00:37<1:38:04,  1.99it/s]
  0%|          | 52/11788 [00:38<1:38:29,  1.99it/s]
  0%|          | 53/11788 [00:38<1:38:21,  1.99it/s]
  0%|          | 54/11788 [00:39<1:38:25,  1.99it/s]
  0%|          | 55/11788 [00:39<1:38:40,  1.98it/s]
  0%|          | 56/11788 [00:40<1:38:25,  1.99it/s]
  0%|          | 57/11788 [00:40<1:38:13,  1.99it/s]
  0%|          | 58/11788 [00:41<1:37:50,  2.00it/s]
  1%|          | 59/11788 [00:41<1:37:50,  2.00it/s]
  1%|          | 60/11788 [00:42<1:37:37,  2.00it/s]
  1%|          | 61/11788 [00:42<1:37:36,  2.00it/s]
  1%|          | 62/11788 [00:43<1:37:30,  2.00it/s]
  1%|          | 63/11788 [00:43<1:37:27,  2.01it/s]
  1%|          | 64/11788 [00:44<1:37:09,  2.01it/s]
  1%|          | 65/11788 [00:44<1:37:08,  2.01it/s]
  1%|          | 66/11788 [00:45<1:37:05,  2.01it/s]
  1%|          | 67/11788 [00:45<1:37:22,  2.01it/s]
  1%|          | 68/11788 [00:46<1:37:16,  2.01it/s]
  1%|          | 69/11788 [00:46<1:36:56,  2.01it/s]
  1%|          | 70/11788 [00:47<1:37:04,  2.01it/s]
  1%|          | 71/11788 [00:47<1:37:11,  2.01it/s]
  1%|          | 72/11788 [00:48<1:36:56,  2.01it/s]
  1%|          | 73/11788 [00:48<1:36:56,  2.01it/s]
  1%|          | 74/11788 [00:49<1:36:53,  2.01it/s]
  1%|          | 75/11788 [00:49<1:36:52,  2.02it/s]{'loss': 9.3183, 'grad_norm': 1.0619311332702637, 'learning_rate': 6.361323155216285e-05, 'epoch': 0.09}

                                                    

  1%|          | 75/11788 [00:49<1:36:52,  2.02it/s]
  1%|          | 76/11788 [00:50<1:37:04,  2.01it/s]
  1%|          | 77/11788 [00:50<1:36:47,  2.02it/s]
  1%|          | 78/11788 [00:51<1:36:52,  2.01it/s]
  1%|          | 79/11788 [00:51<1:36:50,  2.02it/s]
  1%|          | 80/11788 [00:52<1:36:44,  2.02it/s]
  1%|          | 81/11788 [00:52<1:36:48,  2.02it/s]
  1%|          | 82/11788 [00:53<1:36:42,  2.02it/s]
  1%|          | 83/11788 [00:53<1:36:37,  2.02it/s]
  1%|          | 84/11788 [00:54<1:36:31,  2.02it/s]
  1%|          | 85/11788 [00:54<1:36:30,  2.02it/s]
  1%|          | 86/11788 [00:55<1:36:29,  2.02it/s]
  1%|          | 87/11788 [00:55<1:36:23,  2.02it/s]
  1%|          | 88/11788 [00:56<1:36:17,  2.02it/s]
  1%|          | 89/11788 [00:56<1:36:12,  2.03it/s]
  1%|          | 90/11788 [00:57<1:36:13,  2.03it/s]
  1%|          | 91/11788 [00:57<1:36:12,  2.03it/s]
  1%|          | 92/11788 [00:58<1:36:16,  2.02it/s]
  1%|          | 93/11788 [00:58<1:36:13,  2.03it/s]
  1%|          | 94/11788 [00:59<1:36:11,  2.03it/s]
  1%|          | 95/11788 [00:59<1:36:05,  2.03it/s]
  1%|          | 96/11788 [01:00<1:36:11,  2.03it/s]
  1%|          | 97/11788 [01:00<1:36:05,  2.03it/s]
  1%|          | 98/11788 [01:01<1:36:08,  2.03it/s]
  1%|          | 99/11788 [01:01<1:36:02,  2.03it/s]
  1%|          | 100/11788 [01:02<1:36:07,  2.03it/s]{'loss': 8.5608, 'grad_norm': 0.8029088377952576, 'learning_rate': 8.481764206955047e-05, 'epoch': 0.12}

                                                     

  1%|          | 100/11788 [01:02<1:36:07,  2.03it/s]
  1%|          | 101/11788 [01:02<1:36:10,  2.03it/s]
  1%|          | 102/11788 [01:03<1:36:05,  2.03it/s]
  1%|          | 103/11788 [01:03<1:36:03,  2.03it/s]
  1%|          | 104/11788 [01:04<1:36:02,  2.03it/s]
  1%|          | 105/11788 [01:04<1:36:04,  2.03it/s]
  1%|          | 106/11788 [01:05<1:35:56,  2.03it/s]
  1%|          | 107/11788 [01:05<1:36:01,  2.03it/s]
  1%|          | 108/11788 [01:06<1:36:00,  2.03it/s]
  1%|          | 109/11788 [01:06<1:35:58,  2.03it/s]
  1%|          | 110/11788 [01:07<1:35:57,  2.03it/s]
  1%|          | 111/11788 [01:07<1:35:51,  2.03it/s]
  1%|          | 112/11788 [01:07<1:36:00,  2.03it/s]
  1%|          | 113/11788 [01:08<1:35:56,  2.03it/s]
  1%|          | 114/11788 [01:08<1:35:57,  2.03it/s]
  1%|          | 115/11788 [01:09<1:35:53,  2.03it/s]
  1%|          | 116/11788 [01:09<1:35:55,  2.03it/s]
  1%|          | 117/11788 [01:10<1:35:52,  2.03it/s]
  1%|          | 118/11788 [01:10<1:35:54,  2.03it/s]
  1%|          | 119/11788 [01:11<1:35:55,  2.03it/s]
  1%|          | 120/11788 [01:11<1:35:54,  2.03it/s]
  1%|          | 121/11788 [01:12<1:35:56,  2.03it/s]
  1%|          | 122/11788 [01:12<1:35:55,  2.03it/s]
  1%|          | 123/11788 [01:13<1:35:56,  2.03it/s]
  1%|          | 124/11788 [01:13<1:35:50,  2.03it/s]
  1%|          | 125/11788 [01:14<1:35:57,  2.03it/s]
                                                     
{'loss': 7.9105, 'grad_norm': 0.5431452393531799, 'learning_rate': 0.0001060220525869381, 'epoch': 0.15}

  1%|          | 125/11788 [01:14<1:35:57,  2.03it/s]
  1%|          | 126/11788 [01:14<1:35:58,  2.03it/s]
  1%|          | 127/11788 [01:15<1:36:01,  2.02it/s]
  1%|          | 128/11788 [01:15<1:35:53,  2.03it/s]
  1%|          | 129/11788 [01:16<1:35:51,  2.03it/s]
  1%|          | 130/11788 [01:16<1:35:52,  2.03it/s]
  1%|          | 131/11788 [01:17<1:35:49,  2.03it/s]
  1%|          | 132/11788 [01:17<1:35:51,  2.03it/s]
  1%|          | 133/11788 [01:18<1:35:54,  2.03it/s]
  1%|          | 134/11788 [01:18<1:35:48,  2.03it/s]
  1%|          | 135/11788 [01:19<1:35:46,  2.03it/s]
  1%|          | 136/11788 [01:19<1:35:47,  2.03it/s]
  1%|          | 137/11788 [01:20<1:35:43,  2.03it/s]
  1%|          | 138/11788 [01:20<1:35:40,  2.03it/s]
  1%|          | 139/11788 [01:21<1:35:39,  2.03it/s]
  1%|          | 140/11788 [01:21<1:35:37,  2.03it/s]
  1%|          | 141/11788 [01:22<1:35:37,  2.03it/s]
  1%|          | 142/11788 [01:22<1:35:39,  2.03it/s]
  1%|          | 143/11788 [01:23<1:35:43,  2.03it/s]
  1%|          | 144/11788 [01:23<1:35:44,  2.03it/s]
  1%|          | 145/11788 [01:24<1:35:40,  2.03it/s]
  1%|          | 146/11788 [01:24<1:35:43,  2.03it/s]
  1%|          | 147/11788 [01:25<1:35:49,  2.02it/s]
  1%|▏         | 148/11788 [01:25<1:35:46,  2.03it/s]
  1%|▏         | 149/11788 [01:26<1:35:50,  2.02it/s]
  1%|▏         | 150/11788 [01:26<1:35:48,  2.02it/s]{'loss': 7.4247, 'grad_norm': 0.4856749475002289, 'learning_rate': 0.0001272264631043257, 'epoch': 0.18}

                                                     

  1%|▏         | 150/11788 [01:26<1:35:48,  2.02it/s]
  1%|▏         | 151/11788 [01:27<1:35:48,  2.02it/s]
  1%|▏         | 152/11788 [01:27<1:35:47,  2.02it/s]
  1%|▏         | 153/11788 [01:28<1:35:41,  2.03it/s]
  1%|▏         | 154/11788 [01:28<1:35:39,  2.03it/s]
  1%|▏         | 155/11788 [01:29<1:35:40,  2.03it/s]
  1%|▏         | 156/11788 [01:29<1:35:40,  2.03it/s]
  1%|▏         | 157/11788 [01:30<1:35:40,  2.03it/s]
  1%|▏         | 158/11788 [01:30<1:35:33,  2.03it/s]
  1%|▏         | 159/11788 [01:31<1:35:34,  2.03it/s]
  1%|▏         | 160/11788 [01:31<1:35:31,  2.03it/s]
  1%|▏         | 161/11788 [01:32<1:35:27,  2.03it/s]
  1%|▏         | 162/11788 [01:32<1:35:29,  2.03it/s]
  1%|▏         | 163/11788 [01:33<1:35:26,  2.03it/s]
  1%|▏         | 164/11788 [01:33<1:35:31,  2.03it/s]
  1%|▏         | 165/11788 [01:34<1:35:32,  2.03it/s]
  1%|▏         | 166/11788 [01:34<1:35:34,  2.03it/s]
  1%|▏         | 167/11788 [01:35<1:35:34,  2.03it/s]
  1%|▏         | 168/11788 [01:35<1:35:32,  2.03it/s]
  1%|▏         | 169/11788 [01:36<1:35:33,  2.03it/s]
  1%|▏         | 170/11788 [01:36<1:35:28,  2.03it/s]
  1%|▏         | 171/11788 [01:37<1:35:28,  2.03it/s]
  1%|▏         | 172/11788 [01:37<1:35:26,  2.03it/s]
  1%|▏         | 173/11788 [01:38<1:35:26,  2.03it/s]
  1%|▏         | 174/11788 [01:38<1:35:26,  2.03it/s]
  1%|▏         | 175/11788 [01:39<1:35:21,  2.03it/s]{'loss': 7.0223, 'grad_norm': 0.5401294231414795, 'learning_rate': 0.00014843087362171331, 'epoch': 0.21}

                                                     

  1%|▏         | 175/11788 [01:39<1:35:21,  2.03it/s]
  1%|▏         | 176/11788 [01:39<1:35:33,  2.03it/s]
  2%|▏         | 177/11788 [01:40<1:35:27,  2.03it/s]
  2%|▏         | 178/11788 [01:40<1:35:31,  2.03it/s]
  2%|▏         | 179/11788 [01:41<1:35:23,  2.03it/s]
  2%|▏         | 180/11788 [01:41<1:35:27,  2.03it/s]
  2%|▏         | 181/11788 [01:42<1:35:21,  2.03it/s]
  2%|▏         | 182/11788 [01:42<1:35:14,  2.03it/s]
  2%|▏         | 183/11788 [01:43<1:35:19,  2.03it/s]
  2%|▏         | 184/11788 [01:43<1:35:22,  2.03it/s]
  2%|▏         | 185/11788 [01:44<1:35:25,  2.03it/s]
  2%|▏         | 186/11788 [01:44<1:35:18,  2.03it/s]
  2%|▏         | 187/11788 [01:44<1:35:23,  2.03it/s]
  2%|▏         | 188/11788 [01:45<1:35:23,  2.03it/s]
  2%|▏         | 189/11788 [01:45<1:35:26,  2.03it/s]
  2%|▏         | 190/11788 [01:46<1:35:28,  2.02it/s]
  2%|▏         | 191/11788 [01:46<1:35:26,  2.02it/s]
  2%|▏         | 192/11788 [01:47<1:35:31,  2.02it/s]
  2%|▏         | 193/11788 [01:47<1:35:29,  2.02it/s]
  2%|▏         | 194/11788 [01:48<1:35:29,  2.02it/s]
  2%|▏         | 195/11788 [01:48<1:35:37,  2.02it/s]
  2%|▏         | 196/11788 [01:49<1:35:31,  2.02it/s]
  2%|▏         | 197/11788 [01:49<1:35:27,  2.02it/s]
  2%|▏         | 198/11788 [01:50<1:35:24,  2.02it/s]
  2%|▏         | 199/11788 [01:50<1:35:27,  2.02it/s]
  2%|▏         | 200/11788 [01:51<1:35:18,  2.03it/s]{'loss': 6.6636, 'grad_norm': 0.38549456000328064, 'learning_rate': 0.00016963528413910093, 'epoch': 0.24}

                                                     

  2%|▏         | 200/11788 [01:51<1:35:18,  2.03it/s]
  2%|▏         | 201/11788 [01:51<1:35:27,  2.02it/s]
  2%|▏         | 202/11788 [01:52<1:35:25,  2.02it/s]
  2%|▏         | 203/11788 [01:52<1:35:21,  2.02it/s]
  2%|▏         | 204/11788 [01:53<1:35:22,  2.02it/s]
  2%|▏         | 205/11788 [01:53<1:35:15,  2.03it/s]
  2%|▏         | 206/11788 [01:54<1:35:15,  2.03it/s]
  2%|▏         | 207/11788 [01:54<1:35:15,  2.03it/s]
  2%|▏         | 208/11788 [01:55<1:35:34,  2.02it/s]
  2%|▏         | 209/11788 [01:55<1:35:37,  2.02it/s]
  2%|▏         | 210/11788 [01:56<1:35:41,  2.02it/s]
  2%|▏         | 211/11788 [01:56<1:35:37,  2.02it/s]
  2%|▏         | 212/11788 [01:57<1:35:31,  2.02it/s]
  2%|▏         | 213/11788 [01:57<1:35:25,  2.02it/s]
  2%|▏         | 214/11788 [01:58<1:35:18,  2.02it/s]
  2%|▏         | 215/11788 [01:58<1:35:17,  2.02it/s]
  2%|▏         | 216/11788 [01:59<1:35:12,  2.03it/s]
  2%|▏         | 217/11788 [01:59<1:35:05,  2.03it/s]
  2%|▏         | 218/11788 [02:00<1:35:10,  2.03it/s]
  2%|▏         | 219/11788 [02:00<1:35:04,  2.03it/s]
  2%|▏         | 220/11788 [02:01<1:35:11,  2.03it/s]
  2%|▏         | 221/11788 [02:01<1:35:02,  2.03it/s]
  2%|▏         | 222/11788 [02:02<1:35:00,  2.03it/s]
  2%|▏         | 223/11788 [02:02<1:34:59,  2.03it/s]
  2%|▏         | 224/11788 [02:03<1:34:54,  2.03it/s]
  2%|▏         | 225/11788 [02:03<1:35:02,  2.03it/s]{'loss': 6.3808, 'grad_norm': 0.6071425080299377, 'learning_rate': 0.00019083969465648857, 'epoch': 0.27}

                                                     

  2%|▏         | 225/11788 [02:03<1:35:02,  2.03it/s]
  2%|▏         | 226/11788 [02:04<1:35:06,  2.03it/s]
  2%|▏         | 227/11788 [02:04<1:35:12,  2.02it/s]
  2%|▏         | 228/11788 [02:05<1:35:04,  2.03it/s]
  2%|▏         | 229/11788 [02:05<1:35:02,  2.03it/s]
  2%|▏         | 230/11788 [02:06<1:34:57,  2.03it/s]
  2%|▏         | 231/11788 [02:06<1:34:54,  2.03it/s]
  2%|▏         | 232/11788 [02:07<1:35:00,  2.03it/s]
  2%|▏         | 233/11788 [02:07<1:34:59,  2.03it/s]
  2%|▏         | 234/11788 [02:08<1:35:03,  2.03it/s]
  2%|▏         | 235/11788 [02:08<1:34:59,  2.03it/s]
  2%|▏         | 236/11788 [02:09<1:35:01,  2.03it/s]
  2%|▏         | 237/11788 [02:09<1:34:58,  2.03it/s]
  2%|▏         | 238/11788 [02:10<1:34:56,  2.03it/s]
  2%|▏         | 239/11788 [02:10<1:34:56,  2.03it/s]
  2%|▏         | 240/11788 [02:11<1:34:49,  2.03it/s]
  2%|▏         | 241/11788 [02:11<1:34:53,  2.03it/s]
  2%|▏         | 242/11788 [02:12<1:34:51,  2.03it/s]
  2%|▏         | 243/11788 [02:12<1:34:50,  2.03it/s]
  2%|▏         | 244/11788 [02:13<1:34:50,  2.03it/s]
  2%|▏         | 245/11788 [02:13<1:34:49,  2.03it/s]
  2%|▏         | 246/11788 [02:14<1:34:51,  2.03it/s]
  2%|▏         | 247/11788 [02:14<1:34:48,  2.03it/s]
  2%|▏         | 248/11788 [02:15<1:34:48,  2.03it/s]
  2%|▏         | 249/11788 [02:15<1:34:48,  2.03it/s]
  2%|▏         | 250/11788 [02:16<1:34:49,  2.03it/s]{'loss': 6.1534, 'grad_norm': 0.6674228310585022, 'learning_rate': 0.0002120441051738762, 'epoch': 0.3}

                                                     

  2%|▏         | 250/11788 [02:16<1:34:49,  2.03it/s]
  2%|▏         | 251/11788 [02:16<1:34:59,  2.02it/s]
  2%|▏         | 252/11788 [02:17<1:34:55,  2.03it/s]
  2%|▏         | 253/11788 [02:17<1:34:49,  2.03it/s]
  2%|▏         | 254/11788 [02:18<1:34:46,  2.03it/s]
  2%|▏         | 255/11788 [02:18<1:34:49,  2.03it/s]
  2%|▏         | 256/11788 [02:19<1:34:43,  2.03it/s]
  2%|▏         | 257/11788 [02:19<1:34:49,  2.03it/s]
  2%|▏         | 258/11788 [02:20<1:34:40,  2.03it/s]
  2%|▏         | 259/11788 [02:20<1:34:46,  2.03it/s]
  2%|▏         | 260/11788 [02:21<1:34:46,  2.03it/s]
  2%|▏         | 261/11788 [02:21<1:34:45,  2.03it/s]
  2%|▏         | 262/11788 [02:22<1:34:45,  2.03it/s]
  2%|▏         | 263/11788 [02:22<1:34:45,  2.03it/s]
  2%|▏         | 264/11788 [02:22<1:34:46,  2.03it/s]
  2%|▏         | 265/11788 [02:23<1:34:42,  2.03it/s]
  2%|▏         | 266/11788 [02:23<1:34:50,  2.02it/s]
  2%|▏         | 267/11788 [02:24<1:34:43,  2.03it/s]
  2%|▏         | 268/11788 [02:24<1:34:46,  2.03it/s]
  2%|▏         | 269/11788 [02:25<1:34:44,  2.03it/s]
  2%|▏         | 270/11788 [02:25<1:34:44,  2.03it/s]
  2%|▏         | 271/11788 [02:26<1:34:41,  2.03it/s]
  2%|▏         | 272/11788 [02:26<1:34:37,  2.03it/s]
  2%|▏         | 273/11788 [02:27<1:34:36,  2.03it/s]
  2%|▏         | 274/11788 [02:27<1:34:30,  2.03it/s]
  2%|▏         | 275/11788 [02:28<1:34:34,  2.03it/s]{'loss': 5.957, 'grad_norm': 0.5068850517272949, 'learning_rate': 0.00023324851569126378, 'epoch': 0.33}

                                                     

  2%|▏         | 275/11788 [02:28<1:34:34,  2.03it/s]
  2%|▏         | 276/11788 [02:28<1:34:39,  2.03it/s]
  2%|▏         | 277/11788 [02:29<1:34:40,  2.03it/s]
  2%|▏         | 278/11788 [02:29<1:34:37,  2.03it/s]
  2%|▏         | 279/11788 [02:30<1:34:36,  2.03it/s]
  2%|▏         | 280/11788 [02:30<1:34:42,  2.03it/s]
  2%|▏         | 281/11788 [02:31<1:34:36,  2.03it/s]
  2%|▏         | 282/11788 [02:31<1:34:34,  2.03it/s]
  2%|▏         | 283/11788 [02:32<1:34:31,  2.03it/s]
  2%|▏         | 284/11788 [02:32<1:34:27,  2.03it/s]
  2%|▏         | 285/11788 [02:33<1:34:32,  2.03it/s]
  2%|▏         | 286/11788 [02:33<1:34:25,  2.03it/s]
  2%|▏         | 287/11788 [02:34<1:34:24,  2.03it/s]
  2%|▏         | 288/11788 [02:34<1:34:25,  2.03it/s]
  2%|▏         | 289/11788 [02:35<1:34:21,  2.03it/s]
  2%|▏         | 290/11788 [02:35<1:34:24,  2.03it/s]
  2%|▏         | 291/11788 [02:36<1:34:29,  2.03it/s]
  2%|▏         | 292/11788 [02:36<1:34:28,  2.03it/s]
  2%|▏         | 293/11788 [02:37<1:34:30,  2.03it/s]
  2%|▏         | 294/11788 [02:37<1:34:24,  2.03it/s]
  3%|β–Ž         | 295/11788 [02:38<1:34:20,  2.03it/s]
  3%|β–Ž         | 296/11788 [02:38<1:34:18,  2.03it/s]
  3%|β–Ž         | 297/11788 [02:39<1:34:16,  2.03it/s]
  3%|β–Ž         | 298/11788 [02:39<1:34:19,  2.03it/s]
  3%|β–Ž         | 299/11788 [02:40<1:34:23,  2.03it/s]
  3%|β–Ž         | 300/11788 [02:40<1:34:21,  2.03it/s]{'loss': 5.8146, 'grad_norm': 0.6466884613037109, 'learning_rate': 0.0002544529262086514, 'epoch': 0.36}
                                                     

  3%|β–Ž         | 300/11788 [02:40<1:34:21,  2.03it/s]
  3%|β–Ž         | 301/11788 [02:41<1:34:34,  2.02it/s]
  3%|β–Ž         | 302/11788 [02:41<1:34:32,  2.02it/s]
  3%|β–Ž         | 303/11788 [02:42<1:34:30,  2.03it/s]
  3%|β–Ž         | 304/11788 [02:42<1:34:28,  2.03it/s]
  3%|β–Ž         | 305/11788 [02:43<1:34:23,  2.03it/s]
  3%|β–Ž         | 306/11788 [02:43<1:34:28,  2.03it/s]
  3%|β–Ž         | 307/11788 [02:44<1:34:19,  2.03it/s]
  3%|β–Ž         | 308/11788 [02:44<1:34:18,  2.03it/s]
  3%|β–Ž         | 309/11788 [02:45<1:34:18,  2.03it/s]
  3%|β–Ž         | 310/11788 [02:45<1:34:19,  2.03it/s]
  3%|β–Ž         | 311/11788 [02:46<1:34:19,  2.03it/s]
  3%|β–Ž         | 312/11788 [02:46<1:34:24,  2.03it/s]
  3%|β–Ž         | 313/11788 [02:47<1:34:21,  2.03it/s]
  3%|β–Ž         | 314/11788 [02:47<1:34:16,  2.03it/s]
  3%|β–Ž         | 315/11788 [02:48<1:34:18,  2.03it/s]
  3%|β–Ž         | 316/11788 [02:48<1:34:17,  2.03it/s]
  3%|β–Ž         | 317/11788 [02:49<1:34:23,  2.03it/s]
  3%|β–Ž         | 318/11788 [02:49<1:34:21,  2.03it/s]
  3%|β–Ž         | 319/11788 [02:50<1:34:20,  2.03it/s]
  3%|β–Ž         | 320/11788 [02:50<1:34:23,  2.02it/s]
  3%|β–Ž         | 321/11788 [02:51<1:34:22,  2.02it/s]
  3%|β–Ž         | 322/11788 [02:51<1:34:21,  2.03it/s]
  3%|β–Ž         | 323/11788 [02:52<1:34:18,  2.03it/s]
  3%|β–Ž         | 324/11788 [02:52<1:34:15,  2.03it/s]
  3%|β–Ž         | 325/11788 [02:53<1:34:14,  2.03it/s]{'loss': 5.6587, 'grad_norm': 0.8380582332611084, 'learning_rate': 0.00027565733672603904, 'epoch': 0.39}
                                                     

  3%|β–Ž         | 325/11788 [02:53<1:34:14,  2.03it/s]
  3%|β–Ž         | 326/11788 [02:53<1:34:21,  2.02it/s]
  3%|β–Ž         | 327/11788 [02:54<1:34:20,  2.02it/s]
  3%|β–Ž         | 328/11788 [02:54<1:34:15,  2.03it/s]
  3%|β–Ž         | 329/11788 [02:55<1:34:09,  2.03it/s]
  3%|β–Ž         | 330/11788 [02:55<1:34:14,  2.03it/s]
  3%|β–Ž         | 331/11788 [02:56<1:34:11,  2.03it/s]
  3%|β–Ž         | 332/11788 [02:56<1:34:14,  2.03it/s]
  3%|β–Ž         | 333/11788 [02:57<1:34:14,  2.03it/s]
  3%|β–Ž         | 334/11788 [02:57<1:34:24,  2.02it/s]
  3%|β–Ž         | 335/11788 [02:58<1:34:20,  2.02it/s]
  3%|β–Ž         | 336/11788 [02:58<1:34:22,  2.02it/s]
  3%|β–Ž         | 337/11788 [02:59<1:34:20,  2.02it/s]
  3%|β–Ž         | 338/11788 [02:59<1:34:17,  2.02it/s]
  3%|β–Ž         | 339/11788 [02:59<1:34:12,  2.03it/s]
  3%|β–Ž         | 340/11788 [03:00<1:34:07,  2.03it/s]
  3%|β–Ž         | 341/11788 [03:00<1:34:05,  2.03it/s]
  3%|β–Ž         | 342/11788 [03:01<1:34:05,  2.03it/s]
  3%|β–Ž         | 343/11788 [03:01<1:34:09,  2.03it/s]
  3%|β–Ž         | 344/11788 [03:02<1:34:06,  2.03it/s]
  3%|β–Ž         | 345/11788 [03:02<1:34:08,  2.03it/s]
  3%|β–Ž         | 346/11788 [03:03<1:34:03,  2.03it/s]
  3%|β–Ž         | 347/11788 [03:03<1:34:04,  2.03it/s]
  3%|β–Ž         | 348/11788 [03:04<1:33:57,  2.03it/s]
  3%|β–Ž         | 349/11788 [03:04<1:33:57,  2.03it/s]
  3%|β–Ž         | 350/11788 [03:05<1:33:56,  2.03it/s]{'loss': 5.5439, 'grad_norm': 0.7836480736732483, 'learning_rate': 0.00029686174724342663, 'epoch': 0.42}
                                                     

  3%|β–Ž         | 350/11788 [03:05<1:33:56,  2.03it/s]
  3%|β–Ž         | 351/11788 [03:05<1:34:08,  2.02it/s]
  3%|β–Ž         | 352/11788 [03:06<1:34:03,  2.03it/s]
  3%|β–Ž         | 353/11788 [03:06<1:33:57,  2.03it/s]
  3%|β–Ž         | 354/11788 [03:07<1:33:58,  2.03it/s]
  3%|β–Ž         | 355/11788 [03:07<1:33:55,  2.03it/s]
  3%|β–Ž         | 356/11788 [03:08<1:33:56,  2.03it/s]
  3%|β–Ž         | 357/11788 [03:08<1:33:55,  2.03it/s]
  3%|β–Ž         | 358/11788 [03:09<1:33:53,  2.03it/s]
  3%|β–Ž         | 359/11788 [03:09<1:33:54,  2.03it/s]
  3%|β–Ž         | 360/11788 [03:10<1:33:50,  2.03it/s]
  3%|β–Ž         | 361/11788 [03:10<1:33:50,  2.03it/s]
  3%|β–Ž         | 362/11788 [03:11<1:33:49,  2.03it/s]
  3%|β–Ž         | 363/11788 [03:11<1:33:52,  2.03it/s]
  3%|β–Ž         | 364/11788 [03:12<1:33:54,  2.03it/s]
  3%|β–Ž         | 365/11788 [03:12<1:33:48,  2.03it/s]
  3%|β–Ž         | 366/11788 [03:13<1:33:54,  2.03it/s]
  3%|β–Ž         | 367/11788 [03:13<1:33:49,  2.03it/s]
  3%|β–Ž         | 368/11788 [03:14<1:33:49,  2.03it/s]
  3%|β–Ž         | 369/11788 [03:14<1:33:51,  2.03it/s]
  3%|β–Ž         | 370/11788 [03:15<1:33:51,  2.03it/s]
  3%|β–Ž         | 371/11788 [03:15<1:33:53,  2.03it/s]
  3%|β–Ž         | 372/11788 [03:16<1:33:48,  2.03it/s]
  3%|β–Ž         | 373/11788 [03:16<1:33:51,  2.03it/s]
  3%|β–Ž         | 374/11788 [03:17<1:33:49,  2.03it/s]
  3%|β–Ž         | 375/11788 [03:17<1:33:50,  2.03it/s]{'loss': 5.4288, 'grad_norm': 0.8087937235832214, 'learning_rate': 0.0003180661577608143, 'epoch': 0.45}

                                                     

  3%|β–Ž         | 375/11788 [03:17<1:33:50,  2.03it/s]
  3%|β–Ž         | 376/11788 [03:18<1:33:52,  2.03it/s]
  3%|β–Ž         | 377/11788 [03:18<1:33:55,  2.02it/s]
  3%|β–Ž         | 378/11788 [03:19<1:33:50,  2.03it/s]
  3%|β–Ž         | 379/11788 [03:19<1:33:52,  2.03it/s]
  3%|β–Ž         | 380/11788 [03:20<1:33:51,  2.03it/s]
  3%|β–Ž         | 381/11788 [03:20<1:33:47,  2.03it/s]
  3%|β–Ž         | 382/11788 [03:21<1:33:49,  2.03it/s]
  3%|β–Ž         | 383/11788 [03:21<1:33:41,  2.03it/s]
  3%|β–Ž         | 384/11788 [03:22<1:33:39,  2.03it/s]
  3%|β–Ž         | 385/11788 [03:22<1:33:41,  2.03it/s]
  3%|β–Ž         | 386/11788 [03:23<1:33:45,  2.03it/s]
  3%|β–Ž         | 387/11788 [03:23<1:33:44,  2.03it/s]
  3%|β–Ž         | 388/11788 [03:24<1:33:38,  2.03it/s]
  3%|β–Ž         | 389/11788 [03:24<1:33:37,  2.03it/s]
  3%|β–Ž         | 390/11788 [03:25<1:33:36,  2.03it/s]
  3%|β–Ž         | 391/11788 [03:25<1:33:37,  2.03it/s]
  3%|β–Ž         | 392/11788 [03:26<1:33:36,  2.03it/s]
  3%|β–Ž         | 393/11788 [03:26<1:33:28,  2.03it/s]
  3%|β–Ž         | 394/11788 [03:27<1:33:35,  2.03it/s]
  3%|β–Ž         | 395/11788 [03:27<1:33:32,  2.03it/s]
  3%|β–Ž         | 396/11788 [03:28<1:33:34,  2.03it/s]
  3%|β–Ž         | 397/11788 [03:28<1:33:31,  2.03it/s]
  3%|β–Ž         | 398/11788 [03:29<1:33:33,  2.03it/s]
  3%|β–Ž         | 399/11788 [03:29<1:33:37,  2.03it/s]
  3%|β–Ž         | 400/11788 [03:30<1:33:31,  2.03it/s]{'loss': 5.3226, 'grad_norm': 0.8001272082328796, 'learning_rate': 0.00033927056827820186, 'epoch': 0.48}
                                                     

  3%|β–Ž         | 400/11788 [03:30<1:33:31,  2.03it/s]
  3%|β–Ž         | 401/11788 [03:30<1:33:42,  2.03it/s]
  3%|β–Ž         | 402/11788 [03:31<1:33:35,  2.03it/s]
  3%|β–Ž         | 403/11788 [03:31<1:33:37,  2.03it/s]
  3%|β–Ž         | 404/11788 [03:32<1:33:34,  2.03it/s]
  3%|β–Ž         | 405/11788 [03:32<1:33:41,  2.02it/s]
  3%|β–Ž         | 406/11788 [03:33<1:33:35,  2.03it/s]
  3%|β–Ž         | 407/11788 [03:33<1:33:34,  2.03it/s]
  3%|β–Ž         | 408/11788 [03:34<1:33:34,  2.03it/s]
  3%|β–Ž         | 409/11788 [03:34<1:33:34,  2.03it/s]
  3%|β–Ž         | 410/11788 [03:35<1:33:35,  2.03it/s]
  3%|β–Ž         | 411/11788 [03:35<1:33:28,  2.03it/s]
  3%|β–Ž         | 412/11788 [03:35<1:33:32,  2.03it/s]
  4%|β–Ž         | 413/11788 [03:36<1:33:29,  2.03it/s]
  4%|β–Ž         | 414/11788 [03:36<1:33:28,  2.03it/s]
  4%|β–Ž         | 415/11788 [03:37<1:33:25,  2.03it/s]
  4%|β–Ž         | 416/11788 [03:37<1:33:24,  2.03it/s]
  4%|β–Ž         | 417/11788 [03:38<1:33:24,  2.03it/s]
  4%|β–Ž         | 418/11788 [03:38<1:33:23,  2.03it/s]
  4%|β–Ž         | 419/11788 [03:39<1:33:33,  2.03it/s]
  4%|β–Ž         | 420/11788 [03:39<1:33:31,  2.03it/s]
  4%|β–Ž         | 421/11788 [03:40<1:33:29,  2.03it/s]
  4%|β–Ž         | 422/11788 [03:40<1:33:27,  2.03it/s]
  4%|β–Ž         | 423/11788 [03:41<1:33:22,  2.03it/s]
  4%|β–Ž         | 424/11788 [03:41<1:33:23,  2.03it/s]
  4%|β–Ž         | 425/11788 [03:42<1:33:21,  2.03it/s]
                                                     {'loss': 5.2315, 'grad_norm': 0.8854663968086243, 'learning_rate': 0.0003604749787955895, 'epoch': 0.5}

  4%|β–Ž         | 425/11788 [03:42<1:33:21,  2.03it/s]
  4%|β–Ž         | 426/11788 [03:42<1:33:34,  2.02it/s]
  4%|β–Ž         | 427/11788 [03:43<1:33:27,  2.03it/s]
  4%|β–Ž         | 428/11788 [03:43<1:33:21,  2.03it/s]
  4%|β–Ž         | 429/11788 [03:44<1:33:24,  2.03it/s]
  4%|β–Ž         | 430/11788 [03:44<1:33:27,  2.03it/s]
  4%|β–Ž         | 431/11788 [03:45<1:33:26,  2.03it/s]
  4%|β–Ž         | 432/11788 [03:45<1:33:23,  2.03it/s]
  4%|β–Ž         | 433/11788 [03:46<1:33:17,  2.03it/s]
  4%|β–Ž         | 434/11788 [03:46<1:33:20,  2.03it/s]
  4%|β–Ž         | 435/11788 [03:47<1:33:15,  2.03it/s]
  4%|β–Ž         | 436/11788 [03:47<1:33:16,  2.03it/s]
  4%|β–Ž         | 437/11788 [03:48<1:33:14,  2.03it/s]
  4%|β–Ž         | 438/11788 [03:48<1:33:05,  2.03it/s]
  4%|β–Ž         | 439/11788 [03:49<1:33:14,  2.03it/s]
  4%|β–Ž         | 440/11788 [03:49<1:33:13,  2.03it/s]
  4%|β–Ž         | 441/11788 [03:50<1:33:15,  2.03it/s]
  4%|β–Ž         | 442/11788 [03:50<1:33:16,  2.03it/s]
  4%|▍         | 443/11788 [03:51<1:33:11,  2.03it/s]
  4%|▍         | 444/11788 [03:51<1:33:16,  2.03it/s]
  4%|▍         | 445/11788 [03:52<1:33:08,  2.03it/s]
  4%|▍         | 446/11788 [03:52<1:33:05,  2.03it/s]
  4%|▍         | 447/11788 [03:53<1:33:02,  2.03it/s]
  4%|▍         | 448/11788 [03:53<1:33:01,  2.03it/s]
  4%|▍         | 449/11788 [03:54<1:33:03,  2.03it/s]
  4%|▍         | 450/11788 [03:54<1:33:04,  2.03it/s]
                                                     {'loss': 5.1333, 'grad_norm': 0.8772043585777283, 'learning_rate': 0.00038167938931297715, 'epoch': 0.53}

  4%|▍         | 450/11788 [03:54<1:33:04,  2.03it/s]
  4%|▍         | 451/11788 [03:55<1:33:09,  2.03it/s]
  4%|▍         | 452/11788 [03:55<1:33:07,  2.03it/s]
  4%|▍         | 453/11788 [03:56<1:33:00,  2.03it/s]
  4%|▍         | 454/11788 [03:56<1:33:02,  2.03it/s]
  4%|▍         | 455/11788 [03:57<1:32:57,  2.03it/s]
  4%|▍         | 456/11788 [03:57<1:33:01,  2.03it/s]
  4%|▍         | 457/11788 [03:58<1:33:02,  2.03it/s]
  4%|▍         | 458/11788 [03:58<1:33:00,  2.03it/s]
  4%|▍         | 459/11788 [03:59<1:33:06,  2.03it/s]
  4%|▍         | 460/11788 [03:59<1:33:01,  2.03it/s]
  4%|▍         | 461/11788 [04:00<1:33:01,  2.03it/s]
  4%|▍         | 462/11788 [04:00<1:33:01,  2.03it/s]
  4%|▍         | 463/11788 [04:01<1:32:59,  2.03it/s]
  4%|▍         | 464/11788 [04:01<1:33:03,  2.03it/s]
  4%|▍         | 465/11788 [04:02<1:33:02,  2.03it/s]
  4%|▍         | 466/11788 [04:02<1:33:04,  2.03it/s]
  4%|▍         | 467/11788 [04:03<1:33:04,  2.03it/s]
  4%|▍         | 468/11788 [04:03<1:33:05,  2.03it/s]
  4%|▍         | 469/11788 [04:04<1:32:59,  2.03it/s]
  4%|▍         | 470/11788 [04:04<1:33:01,  2.03it/s]
  4%|▍         | 471/11788 [04:05<1:32:58,  2.03it/s]
  4%|▍         | 472/11788 [04:05<1:32:55,  2.03it/s]
  4%|▍         | 473/11788 [04:06<1:33:01,  2.03it/s]
  4%|▍         | 474/11788 [04:06<1:32:54,  2.03it/s]
  4%|▍         | 475/11788 [04:07<1:32:56,  2.03it/s]{'loss': 5.0491, 'grad_norm': 0.8502684831619263, 'learning_rate': 0.00040288379983036474, 'epoch': 0.56}

                                                     

  4%|▍         | 475/11788 [04:07<1:32:56,  2.03it/s]
  4%|▍         | 476/11788 [04:07<1:32:57,  2.03it/s]
  4%|▍         | 477/11788 [04:08<1:32:58,  2.03it/s]
  4%|▍         | 478/11788 [04:08<1:32:56,  2.03it/s]
  4%|▍         | 479/11788 [04:09<1:32:53,  2.03it/s]
  4%|▍         | 480/11788 [04:09<1:33:00,  2.03it/s]
  4%|▍         | 481/11788 [04:10<1:32:59,  2.03it/s]
  4%|▍         | 482/11788 [04:10<1:33:06,  2.02it/s]
  4%|▍         | 483/11788 [04:11<1:33:02,  2.03it/s]
  4%|▍         | 484/11788 [04:11<1:33:03,  2.02it/s]
  4%|▍         | 485/11788 [04:11<1:32:55,  2.03it/s]
  4%|▍         | 486/11788 [04:12<1:33:00,  2.03it/s]
  4%|▍         | 487/11788 [04:12<1:33:01,  2.02it/s]
  4%|▍         | 488/11788 [04:13<1:33:03,  2.02it/s]
  4%|▍         | 489/11788 [04:13<1:32:57,  2.03it/s]
  4%|▍         | 490/11788 [04:14<1:32:59,  2.03it/s]
  4%|▍         | 491/11788 [04:14<1:32:55,  2.03it/s]
  4%|▍         | 492/11788 [04:15<1:32:51,  2.03it/s]
  4%|▍         | 493/11788 [04:15<1:32:57,  2.03it/s]
  4%|▍         | 494/11788 [04:16<1:32:57,  2.03it/s]
  4%|▍         | 495/11788 [04:16<1:32:54,  2.03it/s]
  4%|▍         | 496/11788 [04:17<1:32:52,  2.03it/s]
  4%|▍         | 497/11788 [04:17<1:32:45,  2.03it/s]
  4%|▍         | 498/11788 [04:18<1:32:48,  2.03it/s]
  4%|▍         | 499/11788 [04:18<1:32:43,  2.03it/s]
  4%|▍         | 500/11788 [04:19<1:32:42,  2.03it/s]
                                                     
{'loss': 4.9798, 'grad_norm': 0.6902145147323608, 'learning_rate': 0.0004240882103477524, 'epoch': 0.59}

  4%|▍         | 500/11788 [04:19<1:32:42,  2.03it/s]
  4%|▍         | 501/11788 [04:19<1:32:46,  2.03it/s]
  4%|▍         | 502/11788 [04:20<1:32:44,  2.03it/s]
  4%|▍         | 503/11788 [04:20<1:32:49,  2.03it/s]
  4%|▍         | 504/11788 [04:21<1:32:43,  2.03it/s]
  4%|▍         | 505/11788 [04:21<1:32:47,  2.03it/s]
  4%|▍         | 506/11788 [04:22<1:32:43,  2.03it/s]
  4%|▍         | 507/11788 [04:22<1:32:45,  2.03it/s]
  4%|▍         | 508/11788 [04:23<1:32:42,  2.03it/s]
  4%|▍         | 509/11788 [04:23<1:32:43,  2.03it/s]
  4%|▍         | 510/11788 [04:24<1:32:43,  2.03it/s]
  4%|▍         | 511/11788 [04:24<1:32:35,  2.03it/s]
  4%|▍         | 512/11788 [04:25<1:32:40,  2.03it/s]
  4%|▍         | 513/11788 [04:25<1:32:37,  2.03it/s]
  4%|▍         | 514/11788 [04:26<1:32:43,  2.03it/s]
  4%|▍         | 515/11788 [04:26<1:32:41,  2.03it/s]
  4%|▍         | 516/11788 [04:27<1:32:41,  2.03it/s]
  4%|▍         | 517/11788 [04:27<1:32:40,  2.03it/s]
  4%|▍         | 518/11788 [04:28<1:32:33,  2.03it/s]
  4%|▍         | 519/11788 [04:28<1:32:38,  2.03it/s]
  4%|▍         | 520/11788 [04:29<1:32:30,  2.03it/s]
  4%|▍         | 521/11788 [04:29<1:32:30,  2.03it/s]
  4%|▍         | 522/11788 [04:30<1:32:30,  2.03it/s]
  4%|▍         | 523/11788 [04:30<1:32:22,  2.03it/s]
  4%|▍         | 524/11788 [04:31<1:32:24,  2.03it/s]
  4%|▍         | 525/11788 [04:31<1:32:23,  2.03it/s]{'loss': 4.9114, 'grad_norm': 0.6238113045692444, 'learning_rate': 0.0004452926208651399, 'epoch': 0.62}
                                                     

  4%|▍         | 525/11788 [04:31<1:32:23,  2.03it/s]
  4%|▍         | 526/11788 [04:32<1:32:33,  2.03it/s]
  4%|▍         | 527/11788 [04:32<1:32:33,  2.03it/s]
  4%|▍         | 528/11788 [04:33<1:32:26,  2.03it/s]
  4%|▍         | 529/11788 [04:33<1:32:34,  2.03it/s]
  4%|▍         | 530/11788 [04:34<1:32:26,  2.03it/s]
  5%|▍         | 531/11788 [04:34<1:32:29,  2.03it/s]
  5%|▍         | 532/11788 [04:35<1:32:27,  2.03it/s]
  5%|▍         | 533/11788 [04:35<1:32:29,  2.03it/s]
  5%|▍         | 534/11788 [04:36<1:32:29,  2.03it/s]
  5%|▍         | 535/11788 [04:36<1:32:21,  2.03it/s]
  5%|▍         | 536/11788 [04:37<1:32:28,  2.03it/s]
  5%|▍         | 537/11788 [04:37<1:32:26,  2.03it/s]
  5%|▍         | 538/11788 [04:38<1:32:29,  2.03it/s]
  5%|▍         | 539/11788 [04:38<1:32:25,  2.03it/s]
  5%|▍         | 540/11788 [04:39<1:32:21,  2.03it/s]
  5%|▍         | 541/11788 [04:39<1:32:23,  2.03it/s]
  5%|▍         | 542/11788 [04:40<1:32:18,  2.03it/s]
  5%|▍         | 543/11788 [04:40<1:32:20,  2.03it/s]
  5%|▍         | 544/11788 [04:41<1:32:18,  2.03it/s]
  5%|▍         | 545/11788 [04:41<1:32:11,  2.03it/s]
  5%|▍         | 546/11788 [04:42<1:32:22,  2.03it/s]
  5%|▍         | 547/11788 [04:42<1:32:20,  2.03it/s]
  5%|▍         | 548/11788 [04:43<1:32:25,  2.03it/s]
  5%|▍         | 549/11788 [04:43<1:32:21,  2.03it/s]
  5%|▍         | 550/11788 [04:44<1:32:24,  2.03it/s]{'loss': 4.8494, 'grad_norm': 0.5650936365127563, 'learning_rate': 0.00046649703138252756, 'epoch': 0.65}

                                                     

  5%|▍         | 550/11788 [04:44<1:32:24,  2.03it/s]
  5%|▍         | 551/11788 [04:44<1:32:26,  2.03it/s]
  5%|▍         | 552/11788 [04:45<1:32:24,  2.03it/s]
  5%|▍         | 553/11788 [04:45<1:32:21,  2.03it/s]
  5%|▍         | 554/11788 [04:46<1:32:17,  2.03it/s]
  5%|▍         | 555/11788 [04:46<1:32:22,  2.03it/s]
  5%|▍         | 556/11788 [04:46<1:32:15,  2.03it/s]
  5%|▍         | 557/11788 [04:47<1:32:14,  2.03it/s]
  5%|▍         | 558/11788 [04:47<1:32:19,  2.03it/s]
  5%|▍         | 559/11788 [04:48<1:32:15,  2.03it/s]
  5%|▍         | 560/11788 [04:48<1:32:14,  2.03it/s]
  5%|▍         | 561/11788 [04:49<1:32:16,  2.03it/s]
  5%|▍         | 562/11788 [04:49<1:32:14,  2.03it/s]
  5%|▍         | 563/11788 [04:50<1:32:19,  2.03it/s]
  5%|▍         | 564/11788 [04:50<1:32:15,  2.03it/s]
  5%|▍         | 565/11788 [04:51<1:32:18,  2.03it/s]
  5%|▍         | 566/11788 [04:51<1:32:18,  2.03it/s]
  5%|▍         | 567/11788 [04:52<1:32:12,  2.03it/s]
  5%|▍         | 568/11788 [04:52<1:32:16,  2.03it/s]
  5%|▍         | 569/11788 [04:53<1:32:15,  2.03it/s]
  5%|▍         | 570/11788 [04:53<1:32:17,  2.03it/s]
  5%|▍         | 571/11788 [04:54<1:32:21,  2.02it/s]
  5%|▍         | 572/11788 [04:54<1:32:17,  2.03it/s]
  5%|▍         | 573/11788 [04:55<1:32:13,  2.03it/s]
  5%|▍         | 574/11788 [04:55<1:32:07,  2.03it/s]
  5%|▍         | 575/11788 [04:56<1:32:04,  2.03it/s]
                                                     
{'loss': 4.784, 'grad_norm': 0.7106149792671204, 'learning_rate': 0.00048770144189991515, 'epoch': 0.68}

  5%|▍         | 575/11788 [04:56<1:32:04,  2.03it/s]
  5%|▍         | 576/11788 [04:56<1:32:11,  2.03it/s]
  5%|▍         | 577/11788 [04:57<1:32:11,  2.03it/s]
  5%|▍         | 578/11788 [04:57<1:32:11,  2.03it/s]
  5%|▍         | 579/11788 [04:58<1:32:09,  2.03it/s]
  5%|▍         | 580/11788 [04:58<1:32:07,  2.03it/s]
  5%|▍         | 581/11788 [04:59<1:32:07,  2.03it/s]
  5%|▍         | 582/11788 [04:59<1:32:05,  2.03it/s]
  5%|▍         | 583/11788 [05:00<1:32:07,  2.03it/s]
  5%|▍         | 584/11788 [05:00<1:32:07,  2.03it/s]
  5%|▍         | 585/11788 [05:01<1:32:07,  2.03it/s]
  5%|▍         | 586/11788 [05:01<1:32:07,  2.03it/s]
  5%|▍         | 587/11788 [05:02<1:32:10,  2.03it/s]
  5%|▍         | 588/11788 [05:02<1:32:07,  2.03it/s]
  5%|▍         | 589/11788 [05:03<1:32:09,  2.03it/s]
  5%|β–Œ         | 590/11788 [05:03<1:32:12,  2.02it/s]
  5%|β–Œ         | 591/11788 [05:04<1:32:09,  2.02it/s]
  5%|β–Œ         | 592/11788 [05:04<1:32:08,  2.02it/s]
  5%|β–Œ         | 593/11788 [05:05<1:32:12,  2.02it/s]
  5%|β–Œ         | 594/11788 [05:05<1:32:12,  2.02it/s]
  5%|β–Œ         | 595/11788 [05:06<1:32:11,  2.02it/s]
  5%|β–Œ         | 596/11788 [05:06<1:32:14,  2.02it/s]
  5%|β–Œ         | 597/11788 [05:07<1:32:13,  2.02it/s]
  5%|β–Œ         | 598/11788 [05:07<1:32:07,  2.02it/s]
  5%|β–Œ         | 599/11788 [05:08<1:32:06,  2.02it/s]
  5%|β–Œ         | 600/11788 [05:08<1:31:58,  2.03it/s]{'loss': 4.7204, 'grad_norm': 0.5446744561195374, 'learning_rate': 0.0005089058524173028, 'epoch': 0.71}

                                                     

  5%|β–Œ         | 600/11788 [05:08<1:31:58,  2.03it/s]
  5%|β–Œ         | 601/11788 [05:09<1:32:12,  2.02it/s]
  5%|β–Œ         | 602/11788 [05:09<1:32:02,  2.03it/s]
  5%|β–Œ         | 603/11788 [05:10<1:32:00,  2.03it/s]
  5%|β–Œ         | 604/11788 [05:10<1:31:58,  2.03it/s]
  5%|β–Œ         | 605/11788 [05:11<1:31:55,  2.03it/s]
  5%|β–Œ         | 606/11788 [05:11<1:31:56,  2.03it/s]
  5%|β–Œ         | 607/11788 [05:12<1:31:56,  2.03it/s]
  5%|β–Œ         | 608/11788 [05:12<1:31:55,  2.03it/s]
  5%|β–Œ         | 609/11788 [05:13<1:31:54,  2.03it/s]
  5%|β–Œ         | 610/11788 [05:13<1:31:53,  2.03it/s]
  5%|β–Œ         | 611/11788 [05:14<1:31:51,  2.03it/s]
  5%|β–Œ         | 612/11788 [05:14<1:31:48,  2.03it/s]
  5%|β–Œ         | 613/11788 [05:15<1:31:48,  2.03it/s]
  5%|β–Œ         | 614/11788 [05:15<1:31:40,  2.03it/s]
  5%|β–Œ         | 615/11788 [05:16<1:39:29,  1.87it/s]
  5%|β–Œ         | 616/11788 [05:16<1:44:59,  1.77it/s]
  5%|β–Œ         | 617/11788 [05:17<1:41:01,  1.84it/s]
  5%|β–Œ         | 618/11788 [05:17<1:38:18,  1.89it/s]
  5%|β–Œ         | 619/11788 [05:18<1:36:17,  1.93it/s]
  5%|β–Œ         | 620/11788 [05:18<1:34:59,  1.96it/s]
  5%|β–Œ         | 621/11788 [05:19<1:34:02,  1.98it/s]
  5%|β–Œ         | 622/11788 [05:19<1:33:28,  1.99it/s]
  5%|β–Œ         | 623/11788 [05:20<1:32:53,  2.00it/s]
  5%|β–Œ         | 624/11788 [05:20<1:32:32,  2.01it/s]
  5%|β–Œ         | 625/11788 [05:21<1:32:19,  2.02it/s]{'loss': 4.6719, 'grad_norm': 0.6386901140213013, 'learning_rate': 0.0005301102629346905, 'epoch': 0.74}

                                                     

  5%|β–Œ         | 625/11788 [05:21<1:32:19,  2.02it/s]
  5%|β–Œ         | 626/11788 [05:21<1:32:13,  2.02it/s]
  5%|β–Œ         | 627/11788 [05:22<1:32:05,  2.02it/s]
  5%|β–Œ         | 628/11788 [05:22<1:31:56,  2.02it/s]
  5%|β–Œ         | 629/11788 [05:23<1:31:54,  2.02it/s]
  5%|β–Œ         | 630/11788 [05:23<1:31:44,  2.03it/s]
  5%|β–Œ         | 631/11788 [05:24<1:31:45,  2.03it/s]
  5%|β–Œ         | 632/11788 [05:24<1:31:46,  2.03it/s]
  5%|β–Œ         | 633/11788 [05:25<1:31:38,  2.03it/s]
  5%|β–Œ         | 634/11788 [05:25<1:31:44,  2.03it/s]
  5%|β–Œ         | 635/11788 [05:26<1:31:36,  2.03it/s]
  5%|β–Œ         | 636/11788 [05:26<1:31:41,  2.03it/s]
  5%|β–Œ         | 637/11788 [05:27<1:31:41,  2.03it/s]
  5%|β–Œ         | 638/11788 [05:27<1:31:41,  2.03it/s]
  5%|β–Œ         | 639/11788 [05:28<1:31:38,  2.03it/s]
  5%|β–Œ         | 640/11788 [05:28<1:31:37,  2.03it/s]
  5%|β–Œ         | 641/11788 [05:29<1:31:34,  2.03it/s]
  5%|β–Œ         | 642/11788 [05:29<1:31:34,  2.03it/s]
  5%|β–Œ         | 643/11788 [05:30<1:31:43,  2.02it/s]
  5%|β–Œ         | 644/11788 [05:30<1:31:39,  2.03it/s]
  5%|β–Œ         | 645/11788 [05:31<1:31:35,  2.03it/s]
  5%|β–Œ         | 646/11788 [05:31<1:31:37,  2.03it/s]
  5%|β–Œ         | 647/11788 [05:32<1:31:36,  2.03it/s]
  5%|β–Œ         | 648/11788 [05:32<1:31:37,  2.03it/s]
  6%|β–Œ         | 649/11788 [05:33<1:31:32,  2.03it/s]
  6%|β–Œ         | 650/11788 [05:33<1:31:34,  2.03it/s]{'loss': 4.6171, 'grad_norm': 0.6312897205352783, 'learning_rate': 0.0005513146734520781, 'epoch': 0.77}

                                                     

  6%|β–Œ         | 650/11788 [05:33<1:31:34,  2.03it/s]
  6%|β–Œ         | 651/11788 [05:34<1:31:33,  2.03it/s]
  6%|β–Œ         | 652/11788 [05:34<1:31:35,  2.03it/s]
  6%|β–Œ         | 653/11788 [05:35<1:31:32,  2.03it/s]
  6%|β–Œ         | 654/11788 [05:35<1:31:36,  2.03it/s]
  6%|β–Œ         | 655/11788 [05:36<1:31:29,  2.03it/s]
  6%|β–Œ         | 656/11788 [05:36<1:31:27,  2.03it/s]
  6%|β–Œ         | 657/11788 [05:37<1:31:28,  2.03it/s]
  6%|β–Œ         | 658/11788 [05:37<1:31:26,  2.03it/s]
  6%|β–Œ         | 659/11788 [05:38<1:31:29,  2.03it/s]
  6%|β–Œ         | 660/11788 [05:38<1:31:26,  2.03it/s]
  6%|β–Œ         | 661/11788 [05:39<1:31:29,  2.03it/s]
  6%|β–Œ         | 662/11788 [05:39<1:31:27,  2.03it/s]
  6%|β–Œ         | 663/11788 [05:40<1:31:33,  2.03it/s]
  6%|β–Œ         | 664/11788 [05:40<1:31:30,  2.03it/s]
  6%|β–Œ         | 665/11788 [05:41<1:31:29,  2.03it/s]
  6%|β–Œ         | 666/11788 [05:41<1:31:28,  2.03it/s]
  6%|β–Œ         | 667/11788 [05:42<1:31:22,  2.03it/s]
  6%|β–Œ         | 668/11788 [05:42<1:31:24,  2.03it/s]
  6%|β–Œ         | 669/11788 [05:43<1:31:23,  2.03it/s]
  6%|β–Œ         | 670/11788 [05:43<1:31:27,  2.03it/s]
  6%|β–Œ         | 671/11788 [05:44<1:31:20,  2.03it/s]
  6%|β–Œ         | 672/11788 [05:44<1:31:19,  2.03it/s]
  6%|β–Œ         | 673/11788 [05:44<1:31:21,  2.03it/s]
  6%|β–Œ         | 674/11788 [05:45<1:31:13,  2.03it/s]
  6%|β–Œ         | 675/11788 [05:45<1:31:20,  2.03it/s]{'loss': 4.5837, 'grad_norm': 0.7919646501541138, 'learning_rate': 0.0005725190839694656, 'epoch': 0.8}

                                                     

  6%|β–Œ         | 675/11788 [05:45<1:31:20,  2.03it/s]
  6%|β–Œ         | 676/11788 [05:46<1:31:25,  2.03it/s]
  6%|β–Œ         | 677/11788 [05:46<1:31:29,  2.02it/s]
  6%|β–Œ         | 678/11788 [05:47<1:31:24,  2.03it/s]
  6%|β–Œ         | 679/11788 [05:47<1:31:25,  2.03it/s]
  6%|β–Œ         | 680/11788 [05:48<1:31:22,  2.03it/s]
  6%|β–Œ         | 681/11788 [05:48<1:31:14,  2.03it/s]
  6%|β–Œ         | 682/11788 [05:49<1:31:16,  2.03it/s]
  6%|β–Œ         | 683/11788 [05:49<1:31:13,  2.03it/s]
  6%|β–Œ         | 684/11788 [05:50<1:31:16,  2.03it/s]
  6%|β–Œ         | 685/11788 [05:50<1:31:13,  2.03it/s]
  6%|β–Œ         | 686/11788 [05:51<1:31:11,  2.03it/s]
  6%|β–Œ         | 687/11788 [05:51<1:31:16,  2.03it/s]
  6%|β–Œ         | 688/11788 [05:52<1:31:11,  2.03it/s]
  6%|β–Œ         | 689/11788 [05:52<1:31:13,  2.03it/s]
  6%|β–Œ         | 690/11788 [05:53<1:31:10,  2.03it/s]
  6%|β–Œ         | 691/11788 [05:53<1:31:07,  2.03it/s]
  6%|β–Œ         | 692/11788 [05:54<1:31:10,  2.03it/s]
  6%|β–Œ         | 693/11788 [05:54<1:31:05,  2.03it/s]
  6%|β–Œ         | 694/11788 [05:55<1:31:08,  2.03it/s]
  6%|β–Œ         | 695/11788 [05:55<1:31:05,  2.03it/s]
  6%|β–Œ         | 696/11788 [05:56<1:31:08,  2.03it/s]
  6%|β–Œ         | 697/11788 [05:56<1:31:15,  2.03it/s]
  6%|β–Œ         | 698/11788 [05:57<1:31:08,  2.03it/s]
  6%|β–Œ         | 699/11788 [05:57<1:31:07,  2.03it/s]
  6%|β–Œ         | 700/11788 [05:58<1:31:05,  2.03it/s]{'loss': 4.5283, 'grad_norm': 0.6023926734924316, 'learning_rate': 0.0005937234944868533, 'epoch': 0.83}
                                                     

  6%|β–Œ         | 700/11788 [05:58<1:31:05,  2.03it/s]
  6%|β–Œ         | 701/11788 [05:58<1:31:13,  2.03it/s]
  6%|β–Œ         | 702/11788 [05:59<1:31:13,  2.03it/s]
  6%|β–Œ         | 703/11788 [05:59<1:31:12,  2.03it/s]
  6%|β–Œ         | 704/11788 [06:00<1:31:10,  2.03it/s]
  6%|β–Œ         | 705/11788 [06:00<1:31:12,  2.03it/s]
  6%|β–Œ         | 706/11788 [06:01<1:31:08,  2.03it/s]
  6%|β–Œ         | 707/11788 [06:01<1:31:08,  2.03it/s]
  6%|β–Œ         | 708/11788 [06:02<1:31:11,  2.03it/s]
  6%|β–Œ         | 709/11788 [06:02<1:31:04,  2.03it/s]
  6%|β–Œ         | 710/11788 [06:03<1:31:12,  2.02it/s]
  6%|β–Œ         | 711/11788 [06:03<1:31:04,  2.03it/s]
  6%|β–Œ         | 712/11788 [06:04<1:31:06,  2.03it/s]
  6%|β–Œ         | 713/11788 [06:04<1:30:59,  2.03it/s]
  6%|β–Œ         | 714/11788 [06:05<1:31:02,  2.03it/s]
  6%|β–Œ         | 715/11788 [06:05<1:30:59,  2.03it/s]
  6%|β–Œ         | 716/11788 [06:06<1:30:53,  2.03it/s]
  6%|β–Œ         | 717/11788 [06:06<1:30:57,  2.03it/s]
  6%|β–Œ         | 718/11788 [06:07<1:31:01,  2.03it/s]
  6%|β–Œ         | 719/11788 [06:07<1:31:02,  2.03it/s]
  6%|β–Œ         | 720/11788 [06:08<1:30:54,  2.03it/s]
  6%|β–Œ         | 721/11788 [06:08<1:30:58,  2.03it/s]
  6%|β–Œ         | 722/11788 [06:09<1:30:58,  2.03it/s]
  6%|β–Œ         | 723/11788 [06:09<1:30:52,  2.03it/s]
  6%|β–Œ         | 724/11788 [06:10<1:30:56,  2.03it/s]
  6%|β–Œ         | 725/11788 [06:10<1:30:50,  2.03it/s]{'loss': 4.4724, 'grad_norm': 0.6503545641899109, 'learning_rate': 0.0006149279050042408, 'epoch': 0.86}
                                                     

  6%|β–Œ         | 725/11788 [06:10<1:30:50,  2.03it/s]
  6%|β–Œ         | 726/11788 [06:11<1:31:03,  2.02it/s]
  6%|β–Œ         | 727/11788 [06:11<1:30:57,  2.03it/s]
  6%|β–Œ         | 728/11788 [06:12<1:30:59,  2.03it/s]
  6%|β–Œ         | 729/11788 [06:12<1:31:04,  2.02it/s]
  6%|β–Œ         | 730/11788 [06:13<1:31:03,  2.02it/s]
  6%|β–Œ         | 731/11788 [06:13<1:30:57,  2.03it/s]
  6%|β–Œ         | 732/11788 [06:14<1:30:57,  2.03it/s]
  6%|β–Œ         | 733/11788 [06:14<1:30:56,  2.03it/s]
  6%|β–Œ         | 734/11788 [06:15<1:30:54,  2.03it/s]
  6%|β–Œ         | 735/11788 [06:15<1:30:51,  2.03it/s]
  6%|β–Œ         | 736/11788 [06:16<1:30:47,  2.03it/s]
  6%|β–‹         | 737/11788 [06:16<1:30:47,  2.03it/s]
  6%|β–‹         | 738/11788 [06:17<1:30:49,  2.03it/s]
  6%|β–‹         | 739/11788 [06:17<1:30:50,  2.03it/s]
  6%|β–‹         | 740/11788 [06:18<1:30:47,  2.03it/s]
  6%|β–‹         | 741/11788 [06:18<1:30:47,  2.03it/s]
  6%|β–‹         | 742/11788 [06:19<1:30:43,  2.03it/s]
  6%|β–‹         | 743/11788 [06:19<1:30:39,  2.03it/s]
  6%|β–‹         | 744/11788 [06:20<1:30:49,  2.03it/s]
  6%|β–‹         | 745/11788 [06:20<1:30:44,  2.03it/s]
  6%|β–‹         | 746/11788 [06:21<1:30:48,  2.03it/s]
  6%|β–‹         | 747/11788 [06:21<1:30:45,  2.03it/s]
  6%|β–‹         | 748/11788 [06:21<1:30:46,  2.03it/s]
  6%|β–‹         | 749/11788 [06:22<1:30:47,  2.03it/s]
  6%|β–‹         | 750/11788 [06:22<1:30:49,  2.03it/s]{'loss': 4.4458, 'grad_norm': 0.5474122762680054, 'learning_rate': 0.0006361323155216285, 'epoch': 0.89}
                                                     

  6%|β–‹         | 750/11788 [06:22<1:30:49,  2.03it/s]
  6%|β–‹         | 751/11788 [06:23<1:30:59,  2.02it/s]
  6%|β–‹         | 752/11788 [06:23<1:30:55,  2.02it/s]
  6%|β–‹         | 753/11788 [06:24<1:30:53,  2.02it/s]
  6%|β–‹         | 754/11788 [06:24<1:30:45,  2.03it/s]
  6%|β–‹         | 755/11788 [06:25<1:30:44,  2.03it/s]
  6%|β–‹         | 756/11788 [06:25<1:30:43,  2.03it/s]
  6%|β–‹         | 757/11788 [06:26<1:30:39,  2.03it/s]
  6%|β–‹         | 758/11788 [06:26<1:30:40,  2.03it/s]
  6%|β–‹         | 759/11788 [06:27<1:30:42,  2.03it/s]
  6%|β–‹         | 760/11788 [06:27<1:30:41,  2.03it/s]
  6%|β–‹         | 761/11788 [06:28<1:30:48,  2.02it/s]
  6%|β–‹         | 762/11788 [06:28<1:30:48,  2.02it/s]
  6%|β–‹         | 763/11788 [06:29<1:30:42,  2.03it/s]
  6%|β–‹         | 764/11788 [06:29<1:30:43,  2.03it/s]
  6%|β–‹         | 765/11788 [06:30<1:30:36,  2.03it/s]
  6%|β–‹         | 766/11788 [06:30<1:30:38,  2.03it/s]
  7%|β–‹         | 767/11788 [06:31<1:30:37,  2.03it/s]
  7%|β–‹         | 768/11788 [06:31<1:30:38,  2.03it/s]
  7%|β–‹         | 769/11788 [06:32<1:30:40,  2.03it/s]
  7%|β–‹         | 770/11788 [06:32<1:30:35,  2.03it/s]
  7%|β–‹         | 771/11788 [06:33<1:30:37,  2.03it/s]
  7%|β–‹         | 772/11788 [06:33<1:30:34,  2.03it/s]
  7%|β–‹         | 773/11788 [06:34<1:30:38,  2.03it/s]
  7%|β–‹         | 774/11788 [06:34<1:30:34,  2.03it/s]
  7%|β–‹         | 775/11788 [06:35<1:30:36,  2.03it/s]{'loss': 4.4025, 'grad_norm': 0.6143103241920471, 'learning_rate': 0.0006573367260390161, 'epoch': 0.92}

                                                     

  7%|β–‹         | 775/11788 [06:35<1:30:36,  2.03it/s]
  7%|β–‹         | 776/11788 [06:35<1:30:33,  2.03it/s]
  7%|β–‹         | 777/11788 [06:36<1:30:31,  2.03it/s]
  7%|β–‹         | 778/11788 [06:36<1:30:31,  2.03it/s]
  7%|β–‹         | 779/11788 [06:37<1:30:26,  2.03it/s]
  7%|β–‹         | 780/11788 [06:37<1:30:30,  2.03it/s]
  7%|β–‹         | 781/11788 [06:38<1:30:23,  2.03it/s]
  7%|β–‹         | 782/11788 [06:38<1:30:34,  2.03it/s]
  7%|β–‹         | 783/11788 [06:39<1:30:32,  2.03it/s]
  7%|β–‹         | 784/11788 [06:39<1:30:33,  2.03it/s]
  7%|β–‹         | 785/11788 [06:40<1:30:29,  2.03it/s]
  7%|β–‹         | 786/11788 [06:40<1:30:23,  2.03it/s]
  7%|β–‹         | 787/11788 [06:41<1:30:29,  2.03it/s]
  7%|β–‹         | 788/11788 [06:41<1:30:24,  2.03it/s]
  7%|β–‹         | 789/11788 [06:42<1:30:25,  2.03it/s]
  7%|β–‹         | 790/11788 [06:42<1:30:18,  2.03it/s]
  7%|β–‹         | 791/11788 [06:43<1:30:17,  2.03it/s]
  7%|β–‹         | 792/11788 [06:43<1:30:17,  2.03it/s]
  7%|β–‹         | 793/11788 [06:44<1:30:17,  2.03it/s]
  7%|β–‹         | 794/11788 [06:44<1:30:20,  2.03it/s]
  7%|β–‹         | 795/11788 [06:45<1:30:15,  2.03it/s]
  7%|β–‹         | 796/11788 [06:45<1:30:15,  2.03it/s]
  7%|β–‹         | 797/11788 [06:46<1:30:14,  2.03it/s]
  7%|β–‹         | 798/11788 [06:46<1:30:12,  2.03it/s]
  7%|β–‹         | 799/11788 [06:47<1:30:17,  2.03it/s]
  7%|β–‹         | 800/11788 [06:47<1:30:12,  2.03it/s]{'loss': 4.369, 'grad_norm': 0.45020633935928345, 'learning_rate': 0.0006785411365564037, 'epoch': 0.95}

                                                     

  7%|β–‹         | 800/11788 [06:47<1:30:12,  2.03it/s]
  7%|β–‹         | 801/11788 [06:48<1:30:19,  2.03it/s]
  7%|β–‹         | 802/11788 [06:48<1:30:14,  2.03it/s]
  7%|β–‹         | 803/11788 [06:49<1:30:17,  2.03it/s]
  7%|β–‹         | 804/11788 [06:49<1:30:20,  2.03it/s]
  7%|β–‹         | 805/11788 [06:50<1:30:15,  2.03it/s]
  7%|β–‹         | 806/11788 [06:50<1:30:16,  2.03it/s]
  7%|β–‹         | 807/11788 [06:51<1:30:11,  2.03it/s]
  7%|β–‹         | 808/11788 [06:51<1:30:17,  2.03it/s]
  7%|β–‹         | 809/11788 [06:52<1:30:09,  2.03it/s]
  7%|β–‹         | 810/11788 [06:52<1:30:08,  2.03it/s]
  7%|β–‹         | 811/11788 [06:53<1:30:09,  2.03it/s]
  7%|β–‹         | 812/11788 [06:53<1:30:02,  2.03it/s]
  7%|β–‹         | 813/11788 [06:54<1:30:03,  2.03it/s]
  7%|β–‹         | 814/11788 [06:54<1:30:05,  2.03it/s]
  7%|β–‹         | 815/11788 [06:55<1:30:05,  2.03it/s]
  7%|β–‹         | 816/11788 [06:55<1:30:10,  2.03it/s]
  7%|β–‹         | 817/11788 [06:56<1:30:10,  2.03it/s]
  7%|β–‹         | 818/11788 [06:56<1:30:13,  2.03it/s]
  7%|β–‹         | 819/11788 [06:57<1:30:05,  2.03it/s]
  7%|β–‹         | 820/11788 [06:57<1:30:07,  2.03it/s]
  7%|β–‹         | 821/11788 [06:57<1:30:02,  2.03it/s]
  7%|β–‹         | 822/11788 [06:58<1:30:01,  2.03it/s]
  7%|β–‹         | 823/11788 [06:58<1:30:04,  2.03it/s]
  7%|β–‹         | 824/11788 [06:59<1:30:04,  2.03it/s]
  7%|β–‹         | 825/11788 [06:59<1:30:06,  2.03it/s]{'loss': 4.3353, 'grad_norm': 0.46021631360054016, 'learning_rate': 0.0006997455470737913, 'epoch': 0.98}
                                                     

  7%|β–‹         | 825/11788 [06:59<1:30:06,  2.03it/s]
  7%|β–‹         | 826/11788 [07:00<1:30:08,  2.03it/s]
  7%|β–‹         | 827/11788 [07:00<1:30:12,  2.03it/s]
  7%|β–‹         | 828/11788 [07:01<1:30:10,  2.03it/s]
  7%|β–‹         | 829/11788 [07:01<1:30:03,  2.03it/s]
  7%|β–‹         | 830/11788 [07:02<1:30:06,  2.03it/s]
  7%|β–‹         | 831/11788 [07:02<1:29:59,  2.03it/s]
  7%|β–‹         | 832/11788 [07:03<1:30:02,  2.03it/s]
  7%|β–‹         | 833/11788 [07:03<1:30:01,  2.03it/s]
  7%|β–‹         | 834/11788 [07:04<1:30:01,  2.03it/s]
  7%|β–‹         | 835/11788 [07:04<1:30:02,  2.03it/s]
  7%|β–‹         | 836/11788 [07:05<1:30:03,  2.03it/s]
  7%|β–‹         | 837/11788 [07:05<1:30:09,  2.02it/s]
  7%|β–‹         | 838/11788 [07:06<1:30:07,  2.02it/s]
  7%|β–‹         | 839/11788 [07:06<1:30:14,  2.02it/s]
  7%|β–‹         | 840/11788 [07:07<1:30:13,  2.02it/s]
  7%|β–‹         | 841/11788 [07:07<1:30:15,  2.02it/s]
  7%|β–‹         | 842/11788 [07:08<1:29:46,  2.03it/s]
  7%|β–‹         | 843/11788 [07:20<12:01:12,  3.95s/it]
  7%|β–‹         | 844/11788 [07:20<8:52:19,  2.92s/it] 
  7%|β–‹         | 845/11788 [07:21<6:39:37,  2.19s/it]
  7%|β–‹         | 846/11788 [07:21<5:06:43,  1.68s/it]
  7%|β–‹         | 847/11788 [07:22<4:01:43,  1.33s/it]
  7%|β–‹         | 848/11788 [07:22<3:16:30,  1.08s/it]
  7%|β–‹         | 849/11788 [07:23<2:45:20,  1.10it/s]
  7%|β–‹         | 850/11788 [07:23<2:22:58,  1.28it/s]
                                                     {'loss': 4.2742, 'grad_norm': 0.6338014006614685, 'learning_rate': 0.000720949957591179, 'epoch': 1.01}

  7%|β–‹         | 850/11788 [07:23<2:22:58,  1.28it/s]
  7%|β–‹         | 851/11788 [07:24<2:07:20,  1.43it/s]
  7%|β–‹         | 852/11788 [07:24<1:56:07,  1.57it/s]
  7%|β–‹         | 853/11788 [07:25<1:48:15,  1.68it/s]
  7%|β–‹         | 854/11788 [07:25<1:42:54,  1.77it/s]
  7%|β–‹         | 855/11788 [07:26<1:38:57,  1.84it/s]
  7%|β–‹         | 856/11788 [07:26<1:36:14,  1.89it/s]
  7%|β–‹         | 857/11788 [07:27<1:34:22,  1.93it/s]
  7%|β–‹         | 858/11788 [07:27<1:32:56,  1.96it/s]
  7%|β–‹         | 859/11788 [07:28<1:31:59,  1.98it/s]
  7%|β–‹         | 860/11788 [07:28<1:31:17,  2.00it/s]
  7%|β–‹         | 861/11788 [07:29<1:30:47,  2.01it/s]
  7%|β–‹         | 862/11788 [07:29<1:30:40,  2.01it/s]
  7%|β–‹         | 863/11788 [07:30<1:30:19,  2.02it/s]
  7%|β–‹         | 864/11788 [07:30<1:30:07,  2.02it/s]
  7%|β–‹         | 865/11788 [07:31<1:30:08,  2.02it/s]
  7%|β–‹         | 866/11788 [07:31<1:30:00,  2.02it/s]
  7%|β–‹         | 867/11788 [07:32<1:29:57,  2.02it/s]
  7%|β–‹         | 868/11788 [07:32<1:29:47,  2.03it/s]