Tristan committed on
Commit
e007fe4
•
1 Parent(s): 0a6af0c

Training in progress, epoch 1

Browse files
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "EleutherAI/pythia-70m",
3
+ "architectures": [
4
+ "GPTNeoXForCausalLM"
5
+ ],
6
+ "attention_bias": true,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": 0.1,
10
+ "eos_token_id": 0,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout": 0.0,
13
+ "hidden_size": 512,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 2048,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 2048,
18
+ "model_type": "gpt_neox",
19
+ "num_attention_heads": 8,
20
+ "num_hidden_layers": 6,
21
+ "rope_scaling": null,
22
+ "rotary_emb_base": 10000,
23
+ "rotary_pct": 0.25,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.39.3",
27
+ "use_cache": true,
28
+ "use_parallel_residual": true,
29
+ "vocab_size": 50304
30
+ }
eval_job_output.txt ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ slurm submission log: 2024-05-11 22:52:02.402564
2
+ created following sbatch script:
3
+
4
+ ###############################
5
+
6
+ #!/bin/bash
7
+
8
+ #SBATCH --account=nlp
9
+ #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:
11
+ #SBATCH --gres=gpu:1
12
+ #SBATCH --job-name=tthrush-job-4888498
13
+ #SBATCH --mem=60G
14
+ #SBATCH --nodelist=sphinx2
15
+ #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/eval_job_output.txt
17
+ #SBATCH --partition=sphinx
18
+ #SBATCH --time=14-0
19
+
20
+ # activate your desired anaconda environment
21
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
22
+
23
+ # cd to working directory
24
+ cd .
25
+
26
+ # launch commands
27
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/perf'
28
+
29
+ ###############################
30
+
31
+ submission to slurm complete!
32
+
33
+
34
+ ###############################
35
+ slurm submission output
36
+
37
+
38
+
39
+ sbatch: error: Batch job submission failed: Job dependency problem
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-11 22:53:20.065335
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7599822
53
+ #SBATCH --gres=gpu:1
54
+ #SBATCH --job-name=tthrush-job-2562954
55
+ #SBATCH --mem=60G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/eval_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/perf'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7599823
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ slurm submission log: 2024-05-11 23:09:48.287855
86
+ created following sbatch script:
87
+
88
+ ###############################
89
+
90
+ #!/bin/bash
91
+
92
+ #SBATCH --account=nlp
93
+ #SBATCH --cpus-per-task=16
94
+ #SBATCH --dependency=afterok:7599868
95
+ #SBATCH --gres=gpu:1
96
+ #SBATCH --job-name=tthrush-job-4073620
97
+ #SBATCH --mem=60G
98
+ #SBATCH --nodelist=sphinx2
99
+ #SBATCH --open-mode=append
100
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/eval_job_output.txt
101
+ #SBATCH --partition=sphinx
102
+ #SBATCH --time=14-0
103
+
104
+ # activate your desired anaconda environment
105
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
+
107
+ # cd to working directory
108
+ cd .
109
+
110
+ # launch commands
111
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/perf'
112
+
113
+ ###############################
114
+
115
+ submission to slurm complete!
116
+
117
+
118
+ ###############################
119
+ slurm submission output
120
+
121
+ Submitted batch job 7599869
122
+
123
+
124
+
125
+ ###############################
126
+
logs/events.out.tfevents.1715496757.sphinx2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b95287207e6c57dad1065425823fc192b62fbafd42362af2be6f885df08a25
3
+ size 10957
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c793f8be353dd40918cbae1d61766c40cc450e1b728e05f81524d115b6117add
3
+ size 281715176
train_job_output.txt ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/10682 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
 
 
1
  0%| | 1/10682 [00:05<15:57:49, 5.38s/it]
2
  0%| | 2/10682 [00:07<10:01:48, 3.38s/it]
3
  0%| | 3/10682 [00:08<7:22:52, 2.49s/it]
4
  0%| | 4/10682 [00:10<5:58:45, 2.02s/it]
5
  0%| | 5/10682 [00:11<5:00:13, 1.69s/it]
6
  0%| | 6/10682 [00:12<4:15:17, 1.43s/it]
7
  0%| | 7/10682 [00:13<3:46:05, 1.27s/it]
8
  0%| | 8/10682 [00:13<3:23:48, 1.15s/it]
9
  0%| | 9/10682 [00:14<3:05:00, 1.04s/it]
10
  0%| | 10/10682 [00:15<2:51:07, 1.04it/s]
11
  0%| | 11/10682 [00:16<2:37:51, 1.13it/s]
12
  0%| | 12/10682 [00:16<2:25:52, 1.22it/s]
13
  0%| | 13/10682 [00:17<2:17:06, 1.30it/s]
14
  0%| | 14/10682 [00:18<2:11:06, 1.36it/s]
15
  0%| | 15/10682 [00:18<2:05:30, 1.42it/s]
16
  0%| | 16/10682 [00:19<1:59:37, 1.49it/s]
17
  0%| | 17/10682 [00:20<1:56:34, 1.52it/s]
18
  0%| | 18/10682 [00:20<1:52:32, 1.58it/s]
19
  0%| | 19/10682 [00:21<1:49:53, 1.62it/s]
20
  0%| | 20/10682 [00:21<1:48:21, 1.64it/s]
21
  0%| | 21/10682 [00:22<1:46:12, 1.67it/s]
22
  0%| | 22/10682 [00:22<1:44:20, 1.70it/s]
23
  0%| | 23/10682 [00:23<1:41:52, 1.74it/s]
24
  0%| | 24/10682 [00:24<1:40:45, 1.76it/s]
25
  0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
26
 
 
27
  0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
28
  0%| | 26/10682 [00:25<1:39:10, 1.79it/s]
29
  0%| | 27/10682 [00:25<1:37:34, 1.82it/s]
30
  0%| | 28/10682 [00:26<1:36:48, 1.83it/s]
31
  0%| | 29/10682 [00:26<1:36:00, 1.85it/s]
32
  0%| | 30/10682 [00:27<1:35:12, 1.86it/s]
33
  0%| | 31/10682 [00:27<1:34:57, 1.87it/s]
34
  0%| | 32/10682 [00:28<1:34:15, 1.88it/s]
35
  0%| | 33/10682 [00:28<1:34:03, 1.89it/s]
36
  0%| | 34/10682 [00:29<1:33:35, 1.90it/s]
37
  0%| | 35/10682 [00:29<1:32:53, 1.91it/s]
38
  0%| | 36/10682 [00:30<1:32:07, 1.93it/s]
39
  0%| | 37/10682 [00:30<1:31:43, 1.93it/s]
40
  0%| | 38/10682 [00:31<1:31:06, 1.95it/s]
41
  0%| | 39/10682 [00:31<1:30:51, 1.95it/s]
42
  0%| | 40/10682 [00:32<1:30:16, 1.96it/s]
43
  0%| | 41/10682 [00:32<1:30:12, 1.97it/s]
44
  0%| | 42/10682 [00:33<1:31:55, 1.93it/s]
45
  0%| | 43/10682 [00:34<1:31:57, 1.93it/s]
46
  0%| | 44/10682 [00:34<1:32:51, 1.91it/s]
47
  0%| | 45/10682 [00:35<1:34:55, 1.87it/s]
48
  0%| | 46/10682 [00:35<1:34:07, 1.88it/s]
49
  0%| | 47/10682 [00:36<1:34:23, 1.88it/s]
50
  0%| | 48/10682 [00:36<1:34:34, 1.87it/s]
51
  0%| | 49/10682 [00:37<1:34:01, 1.88it/s]
52
  0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
53
 
 
54
  0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
55
  0%| | 51/10682 [00:38<1:32:21, 1.92it/s]
56
  0%| | 52/10682 [00:38<1:31:09, 1.94it/s]
57
  0%| | 53/10682 [00:39<1:30:15, 1.96it/s]
58
  1%| | 54/10682 [00:39<1:29:35, 1.98it/s]
59
  1%| | 55/10682 [00:40<1:29:07, 1.99it/s]
60
  1%| | 56/10682 [00:40<1:29:19, 1.98it/s]
61
  1%| | 57/10682 [00:41<1:29:05, 1.99it/s]
62
  1%| | 58/10682 [00:41<1:29:05, 1.99it/s]
63
  1%| | 59/10682 [00:42<1:28:56, 1.99it/s]
64
  1%| | 60/10682 [00:42<1:28:40, 2.00it/s]
65
  1%| | 61/10682 [00:43<1:28:37, 2.00it/s]
66
  1%| | 62/10682 [00:43<1:28:44, 1.99it/s]
67
  1%| | 63/10682 [00:44<1:28:26, 2.00it/s]
68
  1%| | 64/10682 [00:44<1:28:10, 2.01it/s]
69
  1%| | 65/10682 [00:45<1:27:48, 2.02it/s]
70
  1%| | 66/10682 [00:45<1:27:39, 2.02it/s]
71
  1%| | 67/10682 [00:46<1:27:42, 2.02it/s]
72
  1%| | 68/10682 [00:46<1:27:37, 2.02it/s]
73
  1%| | 69/10682 [00:47<1:27:36, 2.02it/s]
74
  1%| | 70/10682 [00:47<1:27:33, 2.02it/s]
75
  1%| | 71/10682 [00:48<1:27:35, 2.02it/s]
76
  1%| | 72/10682 [00:48<1:27:30, 2.02it/s]
77
  1%| | 73/10682 [00:49<1:27:31, 2.02it/s]
78
  1%| | 74/10682 [00:49<1:27:25, 2.02it/s]
79
  1%| | 75/10682 [00:50<1:27:17, 2.03it/s]{'loss': 9.2238, 'grad_norm': 1.1420856714248657, 'learning_rate': 7.015902712815715e-05, 'epoch': 0.1}
 
80
 
81
  1%| | 75/10682 [00:50<1:27:17, 2.03it/s]
82
  1%| | 76/10682 [00:50<1:27:25, 2.02it/s]
83
  1%| | 77/10682 [00:51<1:27:27, 2.02it/s]
84
  1%| | 78/10682 [00:51<1:27:22, 2.02it/s]
85
  1%| | 79/10682 [00:52<1:27:13, 2.03it/s]
86
  1%| | 80/10682 [00:52<1:27:13, 2.03it/s]
87
  1%| | 81/10682 [00:53<1:27:19, 2.02it/s]
88
  1%| | 82/10682 [00:53<1:27:19, 2.02it/s]
89
  1%| | 83/10682 [00:54<1:27:39, 2.02it/s]
90
  1%| | 84/10682 [00:54<1:27:38, 2.02it/s]
91
  1%| | 85/10682 [00:55<1:27:39, 2.01it/s]
92
  1%| | 86/10682 [00:55<1:27:50, 2.01it/s]
93
  1%| | 87/10682 [00:56<1:27:38, 2.02it/s]
94
  1%| | 88/10682 [00:56<1:27:22, 2.02it/s]
95
  1%| | 89/10682 [00:57<1:27:23, 2.02it/s]
96
  1%| | 90/10682 [00:57<1:27:17, 2.02it/s]
97
  1%| | 91/10682 [00:58<1:27:10, 2.02it/s]
98
  1%| | 92/10682 [00:58<1:27:09, 2.03it/s]
99
  1%| | 93/10682 [00:59<1:27:03, 2.03it/s]
100
  1%| | 94/10682 [00:59<1:26:59, 2.03it/s]
101
  1%| | 95/10682 [01:00<1:26:56, 2.03it/s]
102
  1%| | 96/10682 [01:00<1:26:53, 2.03it/s]
103
  1%| | 97/10682 [01:01<1:26:58, 2.03it/s]
104
  1%| | 98/10682 [01:01<1:26:53, 2.03it/s]
105
  1%| | 99/10682 [01:02<1:26:56, 2.03it/s]
106
  1%| | 100/10682 [01:02<1:26:55, 2.03it/s]{'loss': 8.428, 'grad_norm': 0.7997293472290039, 'learning_rate': 9.354536950420954e-05, 'epoch': 0.13}
 
107
 
108
  1%| | 100/10682 [01:02<1:26:55, 2.03it/s]
109
  1%| | 101/10682 [01:03<1:27:08, 2.02it/s]
110
  1%| | 102/10682 [01:03<1:27:00, 2.03it/s]
111
  1%| | 103/10682 [01:04<1:26:59, 2.03it/s]
112
  1%| | 104/10682 [01:04<1:26:57, 2.03it/s]
113
  1%| | 105/10682 [01:04<1:26:51, 2.03it/s]
114
  1%| | 106/10682 [01:05<1:26:49, 2.03it/s]
115
  1%| | 107/10682 [01:05<1:26:50, 2.03it/s]
116
  1%| | 108/10682 [01:06<1:26:51, 2.03it/s]
117
  1%| | 109/10682 [01:06<1:26:47, 2.03it/s]
118
  1%| | 110/10682 [01:07<1:26:40, 2.03it/s]
119
  1%| | 111/10682 [01:07<1:26:43, 2.03it/s]
120
  1%| | 112/10682 [01:08<1:26:42, 2.03it/s]
121
  1%| | 113/10682 [01:08<1:26:45, 2.03it/s]
122
  1%| | 114/10682 [01:09<1:26:41, 2.03it/s]
123
  1%| | 115/10682 [01:09<1:26:39, 2.03it/s]
124
  1%| | 116/10682 [01:10<1:26:40, 2.03it/s]
125
  1%| | 117/10682 [01:10<1:26:41, 2.03it/s]
126
  1%| | 118/10682 [01:11<1:26:46, 2.03it/s]
127
  1%| | 119/10682 [01:11<1:26:41, 2.03it/s]
128
  1%| | 120/10682 [01:12<1:26:34, 2.03it/s]
129
  1%| | 121/10682 [01:12<1:26:37, 2.03it/s]
130
  1%| | 122/10682 [01:13<1:26:44, 2.03it/s]
131
  1%| | 123/10682 [01:13<1:26:45, 2.03it/s]
132
  1%| | 124/10682 [01:14<1:26:44, 2.03it/s]
133
  1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
134
 
 
135
  1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
136
  1%| | 126/10682 [01:15<1:26:50, 2.03it/s]
137
  1%| | 127/10682 [01:15<1:26:45, 2.03it/s]
138
  1%| | 128/10682 [01:16<1:26:35, 2.03it/s]
139
  1%| | 129/10682 [01:16<1:26:35, 2.03it/s]
140
  1%| | 130/10682 [01:17<1:26:35, 2.03it/s]
141
  1%| | 131/10682 [01:17<1:26:31, 2.03it/s]
142
  1%| | 132/10682 [01:18<1:26:32, 2.03it/s]
143
  1%| | 133/10682 [01:18<1:26:33, 2.03it/s]
144
  1%|▏ | 134/10682 [01:19<1:26:27, 2.03it/s]
145
  1%|▏ | 135/10682 [01:19<1:26:29, 2.03it/s]
146
  1%|▏ | 136/10682 [01:20<1:26:31, 2.03it/s]
147
  1%|▏ | 137/10682 [01:20<1:26:31, 2.03it/s]
148
  1%|▏ | 138/10682 [01:21<1:26:30, 2.03it/s]
149
  1%|▏ | 139/10682 [01:21<1:26:30, 2.03it/s]
150
  1%|▏ | 140/10682 [01:22<1:26:29, 2.03it/s]
151
  1%|▏ | 141/10682 [01:22<1:26:35, 2.03it/s]
152
  1%|▏ | 142/10682 [01:23<1:26:28, 2.03it/s]
153
  1%|▏ | 143/10682 [01:23<1:26:27, 2.03it/s]
154
  1%|▏ | 144/10682 [01:24<1:26:27, 2.03it/s]
155
  1%|▏ | 145/10682 [01:24<1:26:23, 2.03it/s]
156
  1%|▏ | 146/10682 [01:25<1:26:32, 2.03it/s]
157
  1%|▏ | 147/10682 [01:25<1:26:25, 2.03it/s]
158
  1%|▏ | 148/10682 [01:26<1:26:20, 2.03it/s]
159
  1%|▏ | 149/10682 [01:26<1:26:23, 2.03it/s]
160
  1%|▏ | 150/10682 [01:27<1:26:20, 2.03it/s]
161
  {'loss': 7.289, 'grad_norm': 0.367520809173584, 'learning_rate': 0.0001403180542563143, 'epoch': 0.2}
 
162
  1%|▏ | 150/10682 [01:27<1:26:20, 2.03it/s]
163
  1%|▏ | 151/10682 [01:27<1:26:26, 2.03it/s]
164
  1%|▏ | 152/10682 [01:28<1:26:25, 2.03it/s]
165
  1%|▏ | 153/10682 [01:28<1:26:23, 2.03it/s]
166
  1%|▏ | 154/10682 [01:29<1:26:26, 2.03it/s]
167
  1%|▏ | 155/10682 [01:29<1:26:23, 2.03it/s]
168
  1%|▏ | 156/10682 [01:30<1:26:20, 2.03it/s]
169
  1%|▏ | 157/10682 [01:30<1:26:22, 2.03it/s]
170
  1%|▏ | 158/10682 [01:31<1:26:23, 2.03it/s]
171
  1%|▏ | 159/10682 [01:31<1:26:25, 2.03it/s]
172
  1%|▏ | 160/10682 [01:32<1:26:15, 2.03it/s]
173
  2%|▏ | 161/10682 [01:32<1:26:14, 2.03it/s]
174
  2%|▏ | 162/10682 [01:33<1:26:18, 2.03it/s]
175
  2%|▏ | 163/10682 [01:33<1:26:12, 2.03it/s]
176
  2%|▏ | 164/10682 [01:34<1:26:10, 2.03it/s]
177
  2%|▏ | 165/10682 [01:34<1:26:10, 2.03it/s]
178
  2%|▏ | 166/10682 [01:35<1:26:08, 2.03it/s]
179
  2%|▏ | 167/10682 [01:35<1:26:13, 2.03it/s]
180
  2%|▏ | 168/10682 [01:36<1:26:12, 2.03it/s]
181
  2%|▏ | 169/10682 [01:36<1:26:15, 2.03it/s]
182
  2%|▏ | 170/10682 [01:36<1:26:14, 2.03it/s]
183
  2%|▏ | 171/10682 [01:37<1:26:08, 2.03it/s]
184
  2%|▏ | 172/10682 [01:37<1:26:18, 2.03it/s]
185
  2%|▏ | 173/10682 [01:38<1:26:12, 2.03it/s]
186
  2%|▏ | 174/10682 [01:38<1:26:13, 2.03it/s]
187
  2%|▏ | 175/10682 [01:39<1:26:11, 2.03it/s]{'loss': 6.8807, 'grad_norm': 0.33734890818595886, 'learning_rate': 0.00016370439663236668, 'epoch': 0.23}
188
 
 
189
  2%|▏ | 175/10682 [01:39<1:26:11, 2.03it/s]
190
  2%|▏ | 176/10682 [01:39<1:26:12, 2.03it/s]
191
  2%|▏ | 177/10682 [01:40<1:26:12, 2.03it/s]
192
  2%|▏ | 178/10682 [01:40<1:26:09, 2.03it/s]
193
  2%|▏ | 179/10682 [01:41<1:26:12, 2.03it/s]
194
  2%|▏ | 180/10682 [01:41<1:26:13, 2.03it/s]
195
  2%|▏ | 181/10682 [01:42<1:26:08, 2.03it/s]
196
  2%|▏ | 182/10682 [01:42<1:26:14, 2.03it/s]
197
  2%|▏ | 183/10682 [01:43<1:26:10, 2.03it/s]
198
  2%|▏ | 184/10682 [01:43<1:26:09, 2.03it/s]
199
  2%|▏ | 185/10682 [01:44<1:26:08, 2.03it/s]
200
  2%|▏ | 186/10682 [01:44<1:26:02, 2.03it/s]
201
  2%|▏ | 187/10682 [01:45<1:26:04, 2.03it/s]
202
  2%|▏ | 188/10682 [01:45<1:26:03, 2.03it/s]
203
  2%|▏ | 189/10682 [01:46<1:25:58, 2.03it/s]
204
  2%|▏ | 190/10682 [01:46<1:25:58, 2.03it/s]
205
  2%|▏ | 191/10682 [01:47<1:25:58, 2.03it/s]
206
  2%|▏ | 192/10682 [01:47<1:26:01, 2.03it/s]
207
  2%|▏ | 193/10682 [01:48<1:26:01, 2.03it/s]
208
  2%|▏ | 194/10682 [01:48<1:26:01, 2.03it/s]
209
  2%|▏ | 195/10682 [01:49<1:25:58, 2.03it/s]
210
  2%|▏ | 196/10682 [01:49<1:26:02, 2.03it/s]
211
  2%|▏ | 197/10682 [01:50<1:26:00, 2.03it/s]
212
  2%|▏ | 198/10682 [01:50<1:26:04, 2.03it/s]
213
  2%|▏ | 199/10682 [01:51<1:26:00, 2.03it/s]
214
  2%|▏ | 200/10682 [01:51<1:25:57, 2.03it/s]{'loss': 6.5556, 'grad_norm': 0.46203550696372986, 'learning_rate': 0.00018709073900841907, 'epoch': 0.26}
 
215
 
216
  2%|▏ | 200/10682 [01:51<1:25:57, 2.03it/s]
217
  2%|▏ | 201/10682 [01:52<1:26:04, 2.03it/s]
218
  2%|▏ | 202/10682 [01:52<1:26:00, 2.03it/s]
219
  2%|▏ | 203/10682 [01:53<1:26:03, 2.03it/s]
220
  2%|▏ | 204/10682 [01:53<1:26:00, 2.03it/s]
221
  2%|▏ | 205/10682 [01:54<1:25:56, 2.03it/s]
222
  2%|▏ | 206/10682 [01:54<1:25:57, 2.03it/s]
223
  2%|▏ | 207/10682 [01:55<1:25:52, 2.03it/s]
224
  2%|▏ | 208/10682 [01:55<1:25:51, 2.03it/s]
225
  2%|▏ | 209/10682 [01:56<1:25:54, 2.03it/s]
226
  2%|▏ | 210/10682 [01:56<1:25:49, 2.03it/s]
227
  2%|▏ | 211/10682 [01:57<1:25:45, 2.04it/s]
228
  2%|▏ | 212/10682 [01:57<1:25:49, 2.03it/s]
229
  2%|▏ | 213/10682 [01:58<1:25:44, 2.04it/s]
230
  2%|▏ | 214/10682 [01:58<1:25:40, 2.04it/s]
231
  2%|▏ | 215/10682 [01:59<1:25:44, 2.03it/s]
232
  2%|▏ | 216/10682 [01:59<1:25:41, 2.04it/s]
233
  2%|▏ | 217/10682 [02:00<1:25:42, 2.03it/s]
234
  2%|▏ | 218/10682 [02:00<1:25:45, 2.03it/s]
235
  2%|▏ | 219/10682 [02:01<1:25:42, 2.03it/s]
236
  2%|▏ | 220/10682 [02:01<1:25:40, 2.04it/s]
237
  2%|▏ | 221/10682 [02:02<1:25:44, 2.03it/s]
238
  2%|▏ | 222/10682 [02:02<1:25:42, 2.03it/s]
239
  2%|▏ | 223/10682 [02:03<1:25:39, 2.04it/s]
240
  2%|▏ | 224/10682 [02:03<1:25:41, 2.03it/s]
241
  2%|▏ | 225/10682 [02:04<1:25:38, 2.04it/s]{'loss': 6.2908, 'grad_norm': 0.7612385153770447, 'learning_rate': 0.00021047708138447147, 'epoch': 0.29}
 
242
 
243
  2%|▏ | 225/10682 [02:04<1:25:38, 2.04it/s]
244
  2%|▏ | 226/10682 [02:04<1:25:45, 2.03it/s]
245
  2%|▏ | 227/10682 [02:05<1:25:46, 2.03it/s]
246
  2%|▏ | 228/10682 [02:05<1:25:42, 2.03it/s]
247
  2%|▏ | 229/10682 [02:06<1:25:41, 2.03it/s]
248
  2%|▏ | 230/10682 [02:06<1:25:43, 2.03it/s]
249
  2%|▏ | 231/10682 [02:07<1:25:37, 2.03it/s]
250
  2%|▏ | 232/10682 [02:07<1:25:41, 2.03it/s]
251
  2%|▏ | 233/10682 [02:07<1:25:39, 2.03it/s]
252
  2%|▏ | 234/10682 [02:08<1:25:33, 2.04it/s]
253
  2%|▏ | 235/10682 [02:08<1:25:38, 2.03it/s]
254
  2%|▏ | 236/10682 [02:09<1:25:37, 2.03it/s]
255
  2%|▏ | 237/10682 [02:09<1:25:35, 2.03it/s]
256
  2%|▏ | 238/10682 [02:10<1:25:40, 2.03it/s]
257
  2%|▏ | 239/10682 [02:10<1:25:36, 2.03it/s]
258
  2%|▏ | 240/10682 [02:11<1:25:38, 2.03it/s]
259
  2%|▏ | 241/10682 [02:11<1:25:39, 2.03it/s]
260
  2%|▏ | 242/10682 [02:12<1:25:38, 2.03it/s]
261
  2%|▏ | 243/10682 [02:12<1:25:43, 2.03it/s]
262
  2%|▏ | 244/10682 [02:13<1:25:35, 2.03it/s]
263
  2%|▏ | 245/10682 [02:13<1:25:33, 2.03it/s]
264
  2%|▏ | 246/10682 [02:14<1:25:38, 2.03it/s]
265
  2%|▏ | 247/10682 [02:14<1:25:35, 2.03it/s]
266
  2%|▏ | 248/10682 [02:15<1:25:37, 2.03it/s]
267
  2%|▏ | 249/10682 [02:15<1:25:34, 2.03it/s]
268
  2%|▏ | 250/10682 [02:16<1:25:30, 2.03it/s]{'loss': 6.0883, 'grad_norm': 0.3854532241821289, 'learning_rate': 0.00023386342376052386, 'epoch': 0.33}
269
 
 
270
  2%|▏ | 250/10682 [02:16<1:25:30, 2.03it/s]
271
  2%|▏ | 251/10682 [02:16<1:25:42, 2.03it/s]
272
  2%|▏ | 252/10682 [02:17<1:25:35, 2.03it/s]
273
  2%|▏ | 253/10682 [02:17<1:25:34, 2.03it/s]
274
  2%|▏ | 254/10682 [02:18<1:25:33, 2.03it/s]
275
  2%|▏ | 255/10682 [02:18<1:25:34, 2.03it/s]
276
  2%|▏ | 256/10682 [02:19<1:25:33, 2.03it/s]
277
  2%|▏ | 257/10682 [02:19<1:25:31, 2.03it/s]
278
  2%|▏ | 258/10682 [02:20<1:25:29, 2.03it/s]
279
  2%|▏ | 259/10682 [02:20<1:25:25, 2.03it/s]
280
  2%|▏ | 260/10682 [02:21<1:25:28, 2.03it/s]
281
  2%|▏ | 261/10682 [02:21<1:25:25, 2.03it/s]
282
  2%|▏ | 262/10682 [02:22<1:25:23, 2.03it/s]
283
  2%|▏ | 263/10682 [02:22<1:25:27, 2.03it/s]
284
  2%|▏ | 264/10682 [02:23<1:25:22, 2.03it/s]
285
  2%|▏ | 265/10682 [02:23<1:25:19, 2.03it/s]
286
  2%|▏ | 266/10682 [02:24<1:25:23, 2.03it/s]
287
  2%|▏ | 267/10682 [02:24<1:25:23, 2.03it/s]
288
  3%|β–Ž | 268/10682 [02:25<1:25:23, 2.03it/s]
289
  3%|β–Ž | 269/10682 [02:25<1:25:27, 2.03it/s]
290
  3%|β–Ž | 270/10682 [02:26<1:25:27, 2.03it/s]
291
  3%|β–Ž | 271/10682 [02:26<1:25:24, 2.03it/s]
292
  3%|β–Ž | 272/10682 [02:27<1:25:22, 2.03it/s]
293
  3%|β–Ž | 273/10682 [02:27<1:25:20, 2.03it/s]
294
  3%|β–Ž | 274/10682 [02:28<1:25:21, 2.03it/s]
295
  3%|β–Ž | 275/10682 [02:28<1:25:21, 2.03it/s]
296
  {'loss': 5.9181, 'grad_norm': 0.7595835328102112, 'learning_rate': 0.00025724976613657625, 'epoch': 0.36}
 
297
  3%|β–Ž | 275/10682 [02:28<1:25:21, 2.03it/s]
298
  3%|β–Ž | 276/10682 [02:29<1:25:26, 2.03it/s]
299
  3%|β–Ž | 277/10682 [02:29<1:25:26, 2.03it/s]
300
  3%|β–Ž | 278/10682 [02:30<1:25:24, 2.03it/s]
301
  3%|β–Ž | 279/10682 [02:30<1:25:23, 2.03it/s]
302
  3%|β–Ž | 280/10682 [02:31<1:25:23, 2.03it/s]
303
  3%|β–Ž | 281/10682 [02:31<1:25:21, 2.03it/s]
304
  3%|β–Ž | 282/10682 [02:32<1:25:25, 2.03it/s]
305
  3%|β–Ž | 283/10682 [02:32<1:25:22, 2.03it/s]
306
  3%|β–Ž | 284/10682 [02:33<1:25:20, 2.03it/s]
307
  3%|β–Ž | 285/10682 [02:33<1:25:21, 2.03it/s]
308
  3%|β–Ž | 286/10682 [02:34<1:25:24, 2.03it/s]
309
  3%|β–Ž | 287/10682 [02:34<1:25:27, 2.03it/s]
310
  3%|β–Ž | 288/10682 [02:35<1:25:22, 2.03it/s]
311
  3%|β–Ž | 289/10682 [02:35<1:25:16, 2.03it/s]
312
  3%|β–Ž | 290/10682 [02:36<1:25:18, 2.03it/s]
313
  3%|β–Ž | 291/10682 [02:36<1:25:15, 2.03it/s]
314
  3%|β–Ž | 292/10682 [02:37<1:25:08, 2.03it/s]
315
  3%|β–Ž | 293/10682 [02:37<1:25:12, 2.03it/s]
316
  3%|β–Ž | 294/10682 [02:38<1:25:09, 2.03it/s]
317
  3%|β–Ž | 295/10682 [02:38<1:25:07, 2.03it/s]
318
  3%|β–Ž | 296/10682 [02:39<1:25:10, 2.03it/s]
319
  3%|β–Ž | 297/10682 [02:39<1:25:02, 2.04it/s]
320
  3%|β–Ž | 298/10682 [02:39<1:25:05, 2.03it/s]
321
  3%|β–Ž | 299/10682 [02:40<1:25:08, 2.03it/s]
322
  3%|β–Ž | 300/10682 [02:40<1:25:04, 2.03it/s]{'loss': 5.7819, 'grad_norm': 0.6112937927246094, 'learning_rate': 0.0002806361085126286, 'epoch': 0.39}
 
323
 
324
  3%|β–Ž | 300/10682 [02:40<1:25:04, 2.03it/s]
325
  3%|β–Ž | 301/10682 [02:41<1:25:24, 2.03it/s]
326
  3%|β–Ž | 302/10682 [02:41<1:25:14, 2.03it/s]
327
  3%|β–Ž | 303/10682 [02:42<1:25:13, 2.03it/s]
328
  3%|β–Ž | 304/10682 [02:42<1:25:08, 2.03it/s]
329
  3%|β–Ž | 305/10682 [02:43<1:25:08, 2.03it/s]
330
  3%|β–Ž | 306/10682 [02:43<1:25:08, 2.03it/s]
331
  3%|β–Ž | 307/10682 [02:44<1:25:02, 2.03it/s]
332
  3%|β–Ž | 308/10682 [02:44<1:25:02, 2.03it/s]
333
  3%|β–Ž | 309/10682 [02:45<1:25:04, 2.03it/s]
334
  3%|β–Ž | 310/10682 [02:45<1:25:05, 2.03it/s]
335
  3%|β–Ž | 311/10682 [02:46<1:25:07, 2.03it/s]
336
  3%|β–Ž | 312/10682 [02:46<1:25:04, 2.03it/s]
337
  3%|β–Ž | 313/10682 [02:47<1:25:04, 2.03it/s]
338
  3%|β–Ž | 314/10682 [02:47<1:25:01, 2.03it/s]
339
  3%|β–Ž | 315/10682 [02:48<1:24:58, 2.03it/s]
340
  3%|β–Ž | 316/10682 [02:48<1:24:57, 2.03it/s]
341
  3%|β–Ž | 317/10682 [02:49<1:24:59, 2.03it/s]
342
  3%|β–Ž | 318/10682 [02:49<1:24:58, 2.03it/s]
343
  3%|β–Ž | 319/10682 [02:50<1:25:00, 2.03it/s]
344
  3%|β–Ž | 320/10682 [02:50<1:24:55, 2.03it/s]
345
  3%|β–Ž | 321/10682 [02:51<1:25:00, 2.03it/s]
346
  3%|β–Ž | 322/10682 [02:51<1:25:05, 2.03it/s]
347
  3%|β–Ž | 323/10682 [02:52<1:24:59, 2.03it/s]
348
  3%|β–Ž | 324/10682 [02:52<1:24:58, 2.03it/s]
349
  3%|β–Ž | 325/10682 [02:53<1:24:55, 2.03it/s]
350
  {'loss': 5.6576, 'grad_norm': 1.0010818243026733, 'learning_rate': 0.00030402245088868103, 'epoch': 0.43}
 
351
  3%|β–Ž | 325/10682 [02:53<1:24:55, 2.03it/s]
352
  3%|β–Ž | 326/10682 [02:53<1:25:01, 2.03it/s]
353
  3%|β–Ž | 327/10682 [02:54<1:25:04, 2.03it/s]
354
  3%|β–Ž | 328/10682 [02:54<1:25:01, 2.03it/s]
355
  3%|β–Ž | 329/10682 [02:55<1:25:04, 2.03it/s]
356
  3%|β–Ž | 330/10682 [02:55<1:25:00, 2.03it/s]
357
  3%|β–Ž | 331/10682 [02:56<1:24:56, 2.03it/s]
358
  3%|β–Ž | 332/10682 [02:56<1:24:56, 2.03it/s]
359
  3%|β–Ž | 333/10682 [02:57<1:24:52, 2.03it/s]
360
  3%|β–Ž | 334/10682 [02:57<1:24:51, 2.03it/s]
361
  3%|β–Ž | 335/10682 [02:58<1:24:51, 2.03it/s]
362
  3%|β–Ž | 336/10682 [02:58<1:24:48, 2.03it/s]
363
  3%|β–Ž | 337/10682 [02:59<1:24:50, 2.03it/s]
364
  3%|β–Ž | 338/10682 [02:59<1:24:49, 2.03it/s]
365
  3%|β–Ž | 339/10682 [03:00<1:24:46, 2.03it/s]
366
  3%|β–Ž | 340/10682 [03:00<1:24:53, 2.03it/s]
367
  3%|β–Ž | 341/10682 [03:01<1:24:48, 2.03it/s]
368
  3%|β–Ž | 342/10682 [03:01<1:24:45, 2.03it/s]
369
  3%|β–Ž | 343/10682 [03:02<1:24:49, 2.03it/s]
370
  3%|β–Ž | 344/10682 [03:02<1:24:46, 2.03it/s]
371
  3%|β–Ž | 345/10682 [03:03<1:24:46, 2.03it/s]
372
  3%|β–Ž | 346/10682 [03:03<1:24:44, 2.03it/s]
373
  3%|β–Ž | 347/10682 [03:04<1:24:40, 2.03it/s]
374
  3%|β–Ž | 348/10682 [03:04<1:24:45, 2.03it/s]
375
  3%|β–Ž | 349/10682 [03:05<1:24:42, 2.03it/s]
376
  3%|β–Ž | 350/10682 [03:05<1:24:40, 2.03it/s]{'loss': 5.5561, 'grad_norm': 0.5823507308959961, 'learning_rate': 0.00032740879326473337, 'epoch': 0.46}
377
 
 
378
  3%|β–Ž | 350/10682 [03:05<1:24:40, 2.03it/s]
379
  3%|β–Ž | 351/10682 [03:06<1:24:50, 2.03it/s]
380
  3%|β–Ž | 352/10682 [03:06<1:24:41, 2.03it/s]
381
  3%|β–Ž | 353/10682 [03:07<1:24:44, 2.03it/s]
382
  3%|β–Ž | 354/10682 [03:07<1:24:42, 2.03it/s]
383
  3%|β–Ž | 355/10682 [03:08<1:24:36, 2.03it/s]
384
  3%|β–Ž | 356/10682 [03:08<1:24:42, 2.03it/s]
385
  3%|β–Ž | 357/10682 [03:09<1:24:39, 2.03it/s]
386
  3%|β–Ž | 358/10682 [03:09<1:24:36, 2.03it/s]
387
  3%|β–Ž | 359/10682 [03:10<1:24:39, 2.03it/s]
388
  3%|β–Ž | 360/10682 [03:10<1:24:39, 2.03it/s]
389
  3%|β–Ž | 361/10682 [03:10<1:24:40, 2.03it/s]
390
  3%|β–Ž | 362/10682 [03:11<1:24:39, 2.03it/s]
391
  3%|β–Ž | 363/10682 [03:11<1:24:35, 2.03it/s]
392
  3%|β–Ž | 364/10682 [03:12<1:24:36, 2.03it/s]
393
  3%|β–Ž | 365/10682 [03:12<1:24:38, 2.03it/s]
394
  3%|β–Ž | 366/10682 [03:13<1:24:36, 2.03it/s]
395
  3%|β–Ž | 367/10682 [03:13<1:24:37, 2.03it/s]
396
  3%|β–Ž | 368/10682 [03:14<1:24:38, 2.03it/s]
397
  3%|β–Ž | 369/10682 [03:14<1:24:37, 2.03it/s]
398
  3%|β–Ž | 370/10682 [03:15<1:24:39, 2.03it/s]
399
  3%|β–Ž | 371/10682 [03:15<1:24:34, 2.03it/s]
400
  3%|β–Ž | 372/10682 [03:16<1:24:37, 2.03it/s]
401
  3%|β–Ž | 373/10682 [03:16<1:24:37, 2.03it/s]
402
  4%|β–Ž | 374/10682 [03:17<1:24:36, 2.03it/s]
403
  4%|β–Ž | 375/10682 [03:17<1:24:36, 2.03it/s]
404
 
 
405
  4%|β–Ž | 375/10682 [03:17<1:24:36, 2.03it/s]
406
  4%|β–Ž | 376/10682 [03:18<1:25:00, 2.02it/s]
407
  4%|β–Ž | 377/10682 [03:18<1:24:53, 2.02it/s]
408
  4%|β–Ž | 378/10682 [03:19<1:24:43, 2.03it/s]
409
  4%|β–Ž | 379/10682 [03:19<1:24:39, 2.03it/s]
410
  4%|β–Ž | 380/10682 [03:20<1:24:37, 2.03it/s]
411
  4%|β–Ž | 381/10682 [03:20<1:24:30, 2.03it/s]
412
  4%|β–Ž | 382/10682 [03:21<1:24:32, 2.03it/s]
413
  4%|β–Ž | 383/10682 [03:21<1:24:28, 2.03it/s]
414
  4%|β–Ž | 384/10682 [03:22<1:24:27, 2.03it/s]
415
  4%|β–Ž | 385/10682 [03:22<1:24:28, 2.03it/s]
416
  4%|β–Ž | 386/10682 [03:23<1:24:22, 2.03it/s]
417
  4%|β–Ž | 387/10682 [03:23<1:24:24, 2.03it/s]
418
  4%|β–Ž | 388/10682 [03:24<1:24:26, 2.03it/s]
419
  4%|β–Ž | 389/10682 [03:24<1:24:25, 2.03it/s]
420
  4%|β–Ž | 390/10682 [03:25<1:24:25, 2.03it/s]
421
  4%|β–Ž | 391/10682 [03:25<1:24:25, 2.03it/s]
422
  4%|β–Ž | 392/10682 [03:26<1:24:20, 2.03it/s]
423
  4%|β–Ž | 393/10682 [03:26<1:24:23, 2.03it/s]
424
  4%|β–Ž | 394/10682 [03:27<1:24:21, 2.03it/s]
425
  4%|β–Ž | 395/10682 [03:27<1:24:21, 2.03it/s]
426
  4%|β–Ž | 396/10682 [03:28<1:24:23, 2.03it/s]
427
  4%|β–Ž | 397/10682 [03:28<1:24:28, 2.03it/s]
428
  4%|β–Ž | 398/10682 [03:29<1:24:28, 2.03it/s]
429
  4%|β–Ž | 399/10682 [03:29<1:24:28, 2.03it/s]
430
  4%|β–Ž | 400/10682 [03:30<1:24:31, 2.03it/s]{'loss': 5.375, 'grad_norm': 0.6424997448921204, 'learning_rate': 0.00037418147801683815, 'epoch': 0.52}
 
431
 
432
  4%|β–Ž | 400/10682 [03:30<1:24:31, 2.03it/s]
433
  4%|▍ | 401/10682 [03:30<1:24:46, 2.02it/s]
434
  4%|▍ | 402/10682 [03:31<1:24:39, 2.02it/s]
435
  4%|▍ | 403/10682 [03:31<1:24:35, 2.03it/s]
436
  4%|▍ | 404/10682 [03:32<1:24:31, 2.03it/s]
437
  4%|▍ | 405/10682 [03:32<1:24:23, 2.03it/s]
438
  4%|▍ | 406/10682 [03:33<1:24:22, 2.03it/s]
439
  4%|▍ | 407/10682 [03:33<1:24:23, 2.03it/s]
440
  4%|▍ | 408/10682 [03:34<1:24:18, 2.03it/s]
441
  4%|▍ | 409/10682 [03:34<1:24:17, 2.03it/s]
442
  4%|▍ | 410/10682 [03:35<1:24:17, 2.03it/s]
443
  4%|▍ | 411/10682 [03:35<1:24:12, 2.03it/s]
444
  4%|▍ | 412/10682 [03:36<1:24:08, 2.03it/s]
445
  4%|▍ | 413/10682 [03:36<1:24:10, 2.03it/s]
446
  4%|▍ | 414/10682 [03:37<1:24:06, 2.03it/s]
447
  4%|▍ | 415/10682 [03:37<1:24:09, 2.03it/s]
448
  4%|▍ | 416/10682 [03:38<1:24:10, 2.03it/s]
449
  4%|▍ | 417/10682 [03:38<1:24:06, 2.03it/s]
450
  4%|▍ | 418/10682 [03:39<1:24:06, 2.03it/s]
451
  4%|▍ | 419/10682 [03:39<1:24:08, 2.03it/s]
452
  4%|▍ | 420/10682 [03:40<1:24:08, 2.03it/s]
453
  4%|▍ | 421/10682 [03:40<1:24:09, 2.03it/s]
454
  4%|▍ | 422/10682 [03:41<1:24:08, 2.03it/s]
455
  4%|▍ | 423/10682 [03:41<1:24:06, 2.03it/s]
456
  4%|▍ | 424/10682 [03:42<1:24:06, 2.03it/s]
457
  4%|▍ | 425/10682 [03:42<1:24:03, 2.03it/s]{'loss': 5.2944, 'grad_norm': 0.4700624942779541, 'learning_rate': 0.0003975678203928906, 'epoch': 0.56}
 
458
 
459
  4%|▍ | 425/10682 [03:42<1:24:03, 2.03it/s]
460
  4%|▍ | 426/10682 [03:43<1:24:12, 2.03it/s]
461
  4%|▍ | 427/10682 [03:43<1:24:08, 2.03it/s]
462
  4%|▍ | 428/10682 [03:43<1:24:06, 2.03it/s]
463
  4%|▍ | 429/10682 [03:44<1:24:09, 2.03it/s]
464
  4%|▍ | 430/10682 [03:44<1:24:07, 2.03it/s]
465
  4%|▍ | 431/10682 [03:45<1:24:08, 2.03it/s]
466
  4%|▍ | 432/10682 [03:45<1:24:04, 2.03it/s]
467
  4%|▍ | 433/10682 [03:46<1:23:59, 2.03it/s]
468
  4%|▍ | 434/10682 [03:46<1:24:01, 2.03it/s]
469
  4%|▍ | 435/10682 [03:47<1:23:59, 2.03it/s]
470
  4%|▍ | 436/10682 [03:47<1:24:02, 2.03it/s]
471
  4%|▍ | 437/10682 [03:48<1:24:03, 2.03it/s]
472
  4%|▍ | 438/10682 [03:48<1:23:58, 2.03it/s]
473
  4%|▍ | 439/10682 [03:49<1:24:02, 2.03it/s]
474
  4%|▍ | 440/10682 [03:49<1:23:57, 2.03it/s]
475
  4%|▍ | 441/10682 [03:50<1:23:53, 2.03it/s]
476
  4%|▍ | 442/10682 [03:50<1:23:57, 2.03it/s]
477
  4%|▍ | 443/10682 [03:51<1:23:55, 2.03it/s]
478
  4%|▍ | 444/10682 [03:51<1:23:51, 2.03it/s]
479
  4%|▍ | 445/10682 [03:52<1:23:58, 2.03it/s]
480
  4%|▍ | 446/10682 [03:52<1:23:52, 2.03it/s]
481
  4%|▍ | 447/10682 [03:53<1:23:56, 2.03it/s]
482
  4%|▍ | 448/10682 [03:53<1:23:55, 2.03it/s]
483
  4%|▍ | 449/10682 [03:54<1:23:52, 2.03it/s]
484
  4%|▍ | 450/10682 [03:54<1:23:56, 2.03it/s]
485
  {'loss': 5.2223, 'grad_norm': 0.4889560043811798, 'learning_rate': 0.00042095416276894293, 'epoch': 0.59}
 
486
  4%|▍ | 450/10682 [03:54<1:23:56, 2.03it/s]
487
  4%|▍ | 451/10682 [03:55<1:23:56, 2.03it/s]
488
  4%|▍ | 452/10682 [03:55<1:23:55, 2.03it/s]
489
  4%|▍ | 453/10682 [03:56<1:23:53, 2.03it/s]
490
  4%|▍ | 454/10682 [03:56<1:23:48, 2.03it/s]
491
  4%|▍ | 455/10682 [03:57<1:23:58, 2.03it/s]
492
  4%|▍ | 456/10682 [03:57<1:23:51, 2.03it/s]
493
  4%|▍ | 457/10682 [03:58<1:23:51, 2.03it/s]
494
  4%|▍ | 458/10682 [03:58<1:23:50, 2.03it/s]
495
  4%|▍ | 459/10682 [03:59<1:23:47, 2.03it/s]
496
  4%|▍ | 460/10682 [03:59<1:23:45, 2.03it/s]
497
  4%|▍ | 461/10682 [04:00<1:23:48, 2.03it/s]
498
  4%|▍ | 462/10682 [04:00<1:23:44, 2.03it/s]
499
  4%|▍ | 463/10682 [04:01<1:23:48, 2.03it/s]
500
  4%|▍ | 464/10682 [04:01<1:23:47, 2.03it/s]
501
  4%|▍ | 465/10682 [04:02<1:23:48, 2.03it/s]
502
  4%|▍ | 466/10682 [04:02<1:23:50, 2.03it/s]
503
  4%|▍ | 467/10682 [04:03<1:23:53, 2.03it/s]
504
  4%|▍ | 468/10682 [04:03<1:23:52, 2.03it/s]
505
  4%|▍ | 469/10682 [04:04<1:23:51, 2.03it/s]
506
  4%|▍ | 470/10682 [04:04<1:23:49, 2.03it/s]
507
  4%|▍ | 471/10682 [04:05<1:23:48, 2.03it/s]
508
  4%|▍ | 472/10682 [04:05<1:23:45, 2.03it/s]
509
  4%|▍ | 473/10682 [04:06<1:23:50, 2.03it/s]
510
  4%|▍ | 474/10682 [04:06<1:23:45, 2.03it/s]
511
  4%|▍ | 475/10682 [04:07<1:23:40, 2.03it/s]{'loss': 5.1492, 'grad_norm': 0.5106998682022095, 'learning_rate': 0.0004443405051449954, 'epoch': 0.62}
 
512
 
513
  4%|▍ | 475/10682 [04:07<1:23:40, 2.03it/s]
514
  4%|▍ | 476/10682 [04:07<1:23:50, 2.03it/s]
515
  4%|▍ | 477/10682 [04:08<1:23:45, 2.03it/s]
516
  4%|▍ | 478/10682 [04:08<1:23:44, 2.03it/s]
517
  4%|▍ | 479/10682 [04:09<1:23:43, 2.03it/s]
518
  4%|▍ | 480/10682 [04:09<1:23:37, 2.03it/s]
519
  5%|▍ | 481/10682 [04:10<1:23:37, 2.03it/s]
520
  5%|▍ | 482/10682 [04:10<1:23:40, 2.03it/s]
521
  5%|▍ | 483/10682 [04:11<1:23:35, 2.03it/s]
522
  5%|▍ | 484/10682 [04:11<1:23:36, 2.03it/s]
523
  5%|▍ | 485/10682 [04:12<1:23:39, 2.03it/s]
524
  5%|▍ | 486/10682 [04:12<1:23:37, 2.03it/s]
525
  5%|▍ | 487/10682 [04:13<1:23:40, 2.03it/s]
526
  5%|▍ | 488/10682 [04:13<1:23:36, 2.03it/s]
527
  5%|▍ | 489/10682 [04:14<1:23:36, 2.03it/s]
528
  5%|▍ | 490/10682 [04:14<1:23:43, 2.03it/s]
529
  5%|▍ | 491/10682 [04:14<1:23:37, 2.03it/s]
530
  5%|▍ | 492/10682 [04:15<1:23:40, 2.03it/s]
531
  5%|▍ | 493/10682 [04:15<1:23:37, 2.03it/s]
532
  5%|▍ | 494/10682 [04:16<1:23:39, 2.03it/s]
533
  5%|▍ | 495/10682 [04:16<1:23:40, 2.03it/s]
534
  5%|▍ | 496/10682 [04:17<1:23:36, 2.03it/s]
535
  5%|▍ | 497/10682 [04:17<1:23:34, 2.03it/s]
536
  5%|▍ | 498/10682 [04:18<1:23:35, 2.03it/s]
537
  5%|▍ | 499/10682 [04:18<1:23:34, 2.03it/s]
538
  5%|▍ | 500/10682 [04:19<1:23:36, 2.03it/s]
539
  {'loss': 5.0961, 'grad_norm': 0.5852717161178589, 'learning_rate': 0.0004677268475210477, 'epoch': 0.66}
 
540
  5%|▍ | 500/10682 [04:19<1:23:36, 2.03it/s]
541
  5%|▍ | 501/10682 [04:19<1:23:38, 2.03it/s]
542
  5%|▍ | 502/10682 [04:20<1:23:35, 2.03it/s]
543
  5%|▍ | 503/10682 [04:20<1:23:33, 2.03it/s]
544
  5%|▍ | 504/10682 [04:21<1:23:27, 2.03it/s]
545
  5%|▍ | 505/10682 [04:21<1:23:25, 2.03it/s]
546
  5%|▍ | 506/10682 [04:22<1:23:30, 2.03it/s]
547
  5%|▍ | 507/10682 [04:22<1:23:32, 2.03it/s]
548
  5%|▍ | 508/10682 [04:23<1:23:34, 2.03it/s]
549
  5%|▍ | 509/10682 [04:23<1:23:31, 2.03it/s]
550
  5%|▍ | 510/10682 [04:24<1:23:28, 2.03it/s]
551
  5%|▍ | 511/10682 [04:24<1:23:30, 2.03it/s]
552
  5%|▍ | 512/10682 [04:25<1:23:30, 2.03it/s]
553
  5%|▍ | 513/10682 [04:25<1:23:31, 2.03it/s]
554
  5%|▍ | 514/10682 [04:26<1:23:28, 2.03it/s]
555
  5%|▍ | 515/10682 [04:26<1:23:25, 2.03it/s]
556
  5%|▍ | 516/10682 [04:27<1:23:24, 2.03it/s]
557
  5%|▍ | 517/10682 [04:27<1:23:23, 2.03it/s]
558
  5%|▍ | 518/10682 [04:28<1:23:19, 2.03it/s]
559
  5%|▍ | 519/10682 [04:28<1:23:23, 2.03it/s]
560
  5%|▍ | 520/10682 [04:29<1:23:21, 2.03it/s]
561
  5%|▍ | 521/10682 [04:29<1:23:23, 2.03it/s]
562
  5%|▍ | 522/10682 [04:30<1:23:26, 2.03it/s]
563
  5%|▍ | 523/10682 [04:30<1:23:26, 2.03it/s]
564
  5%|▍ | 524/10682 [04:31<1:23:26, 2.03it/s]
565
  5%|▍ | 525/10682 [04:31<1:23:17, 2.03it/s]{'loss': 5.0379, 'grad_norm': 0.4721851348876953, 'learning_rate': 0.0004911131898971, 'epoch': 0.69}
 
566
 
567
  5%|▍ | 525/10682 [04:31<1:23:17, 2.03it/s]
568
  5%|▍ | 526/10682 [04:32<1:23:25, 2.03it/s]
569
  5%|▍ | 527/10682 [04:32<1:23:22, 2.03it/s]
570
  5%|▍ | 528/10682 [04:33<1:23:13, 2.03it/s]
571
  5%|▍ | 529/10682 [04:33<1:23:15, 2.03it/s]
572
  5%|▍ | 530/10682 [04:34<1:23:11, 2.03it/s]
573
  5%|▍ | 531/10682 [04:34<1:23:05, 2.04it/s]
574
  5%|▍ | 532/10682 [04:35<1:23:09, 2.03it/s]
575
  5%|▍ | 533/10682 [04:35<1:23:10, 2.03it/s]
576
  5%|▍ | 534/10682 [04:36<1:23:07, 2.03it/s]
577
  5%|β–Œ | 535/10682 [04:36<1:23:09, 2.03it/s]
578
  5%|β–Œ | 536/10682 [04:37<1:23:13, 2.03it/s]
579
  5%|β–Œ | 537/10682 [04:37<1:23:13, 2.03it/s]
580
  5%|β–Œ | 538/10682 [04:38<1:23:09, 2.03it/s]
581
  5%|β–Œ | 539/10682 [04:38<1:23:12, 2.03it/s]
582
  5%|β–Œ | 540/10682 [04:39<1:23:08, 2.03it/s]
583
  5%|β–Œ | 541/10682 [04:39<1:23:06, 2.03it/s]
584
  5%|β–Œ | 542/10682 [04:40<1:23:10, 2.03it/s]
585
  5%|β–Œ | 543/10682 [04:40<1:23:03, 2.03it/s]
586
  5%|β–Œ | 544/10682 [04:41<1:22:57, 2.04it/s]
587
  5%|β–Œ | 545/10682 [04:41<1:22:57, 2.04it/s]
588
  5%|β–Œ | 546/10682 [04:42<1:23:04, 2.03it/s]
589
  5%|β–Œ | 547/10682 [04:42<1:23:01, 2.03it/s]
590
  5%|β–Œ | 548/10682 [04:43<1:23:04, 2.03it/s]
591
  5%|β–Œ | 549/10682 [04:43<1:23:11, 2.03it/s]
592
  5%|β–Œ | 550/10682 [04:44<1:23:07, 2.03it/s]
593
  {'loss': 4.9823, 'grad_norm': 0.5419530272483826, 'learning_rate': 0.0005144995322731525, 'epoch': 0.72}
 
594
  5%|β–Œ | 550/10682 [04:44<1:23:07, 2.03it/s]
595
  5%|β–Œ | 551/10682 [04:44<1:23:14, 2.03it/s]
596
  5%|β–Œ | 552/10682 [04:45<1:23:08, 2.03it/s]
597
  5%|β–Œ | 553/10682 [04:45<1:23:05, 2.03it/s]
598
  5%|β–Œ | 554/10682 [04:46<1:23:07, 2.03it/s]
599
  5%|β–Œ | 555/10682 [04:46<1:23:04, 2.03it/s]
600
  5%|β–Œ | 556/10682 [04:46<1:23:02, 2.03it/s]
601
  5%|β–Œ | 557/10682 [04:47<1:23:02, 2.03it/s]
602
  5%|β–Œ | 558/10682 [04:47<1:22:53, 2.04it/s]
603
  5%|β–Œ | 559/10682 [04:48<1:22:56, 2.03it/s]
604
  5%|β–Œ | 560/10682 [04:48<1:22:58, 2.03it/s]
605
  5%|β–Œ | 561/10682 [04:49<1:22:57, 2.03it/s]
606
  5%|β–Œ | 562/10682 [04:49<1:23:01, 2.03it/s]
607
  5%|β–Œ | 563/10682 [04:50<1:22:55, 2.03it/s]
608
  5%|β–Œ | 564/10682 [04:50<1:22:50, 2.04it/s]
609
  5%|β–Œ | 565/10682 [04:51<1:22:55, 2.03it/s]
610
  5%|β–Œ | 566/10682 [04:51<1:22:54, 2.03it/s]
611
  5%|β–Œ | 567/10682 [04:52<1:22:52, 2.03it/s]
612
  5%|β–Œ | 568/10682 [04:52<1:22:55, 2.03it/s]
613
  5%|β–Œ | 569/10682 [04:53<1:22:49, 2.03it/s]
614
  5%|β–Œ | 570/10682 [04:53<1:22:49, 2.03it/s]
615
  5%|β–Œ | 571/10682 [04:54<1:22:56, 2.03it/s]
616
  5%|β–Œ | 572/10682 [04:54<1:22:55, 2.03it/s]
617
  5%|β–Œ | 573/10682 [04:55<1:22:58, 2.03it/s]
618
  5%|β–Œ | 574/10682 [04:55<1:22:51, 2.03it/s]
619
  5%|β–Œ | 575/10682 [04:56<1:22:54, 2.03it/s]
620
  {'loss': 4.9327, 'grad_norm': 0.5166158080101013, 'learning_rate': 0.0005378858746492049, 'epoch': 0.75}
 
621
  5%|β–Œ | 575/10682 [04:56<1:22:54, 2.03it/s]
622
  5%|β–Œ | 576/10682 [04:56<1:23:00, 2.03it/s]
623
  5%|β–Œ | 577/10682 [04:57<1:22:52, 2.03it/s]
624
  5%|β–Œ | 578/10682 [04:57<1:22:54, 2.03it/s]
625
  5%|β–Œ | 579/10682 [04:58<1:22:50, 2.03it/s]
626
  5%|β–Œ | 580/10682 [04:58<1:22:46, 2.03it/s]
627
  5%|β–Œ | 581/10682 [04:59<1:22:49, 2.03it/s]
628
  5%|β–Œ | 582/10682 [04:59<1:22:44, 2.03it/s]
629
  5%|β–Œ | 583/10682 [05:00<1:22:42, 2.04it/s]
630
  5%|β–Œ | 584/10682 [05:00<1:22:46, 2.03it/s]
631
  5%|β–Œ | 585/10682 [05:01<1:22:42, 2.03it/s]
632
  5%|β–Œ | 586/10682 [05:01<1:22:45, 2.03it/s]
633
  5%|β–Œ | 587/10682 [05:02<1:22:45, 2.03it/s]
634
  6%|β–Œ | 588/10682 [05:02<1:22:42, 2.03it/s]
635
  6%|β–Œ | 589/10682 [05:03<1:22:43, 2.03it/s]
636
  6%|β–Œ | 590/10682 [05:03<1:22:42, 2.03it/s]
637
  6%|β–Œ | 591/10682 [05:04<1:22:41, 2.03it/s]
638
  6%|β–Œ | 592/10682 [05:04<1:22:46, 2.03it/s]
639
  6%|β–Œ | 593/10682 [05:05<1:22:41, 2.03it/s]
640
  6%|β–Œ | 594/10682 [05:05<1:22:42, 2.03it/s]
641
  6%|β–Œ | 595/10682 [05:06<1:22:45, 2.03it/s]
642
  6%|β–Œ | 596/10682 [05:06<1:22:45, 2.03it/s]
643
  6%|β–Œ | 597/10682 [05:07<1:22:45, 2.03it/s]
644
  6%|β–Œ | 598/10682 [05:07<1:22:40, 2.03it/s]
645
  6%|β–Œ | 599/10682 [05:08<1:22:39, 2.03it/s]
646
  6%|β–Œ | 600/10682 [05:08<1:22:42, 2.03it/s]
647
  {'loss': 4.8904, 'grad_norm': 0.47772836685180664, 'learning_rate': 0.0005612722170252572, 'epoch': 0.79}
 
648
  6%|β–Œ | 600/10682 [05:08<1:22:42, 2.03it/s]
649
  6%|β–Œ | 601/10682 [05:09<1:22:56, 2.03it/s]
650
  6%|β–Œ | 602/10682 [05:09<1:22:56, 2.03it/s]
651
  6%|β–Œ | 603/10682 [05:10<1:22:49, 2.03it/s]
652
  6%|β–Œ | 604/10682 [05:10<1:22:51, 2.03it/s]
653
  6%|β–Œ | 605/10682 [05:11<1:22:47, 2.03it/s]
654
  6%|β–Œ | 606/10682 [05:11<1:22:42, 2.03it/s]
655
  6%|β–Œ | 607/10682 [05:12<1:22:46, 2.03it/s]
656
  6%|β–Œ | 608/10682 [05:12<1:22:41, 2.03it/s]
657
  6%|β–Œ | 609/10682 [05:13<1:29:38, 1.87it/s]
658
  6%|β–Œ | 610/10682 [05:13<1:27:31, 1.92it/s]
659
  6%|β–Œ | 611/10682 [05:14<1:25:59, 1.95it/s]
660
  6%|β–Œ | 612/10682 [05:14<1:24:58, 1.98it/s]
661
  6%|β–Œ | 613/10682 [05:15<1:24:20, 1.99it/s]
662
  6%|β–Œ | 614/10682 [05:15<1:23:43, 2.00it/s]
663
  6%|β–Œ | 615/10682 [05:16<1:23:26, 2.01it/s]
664
  6%|β–Œ | 616/10682 [05:16<1:30:13, 1.86it/s]
665
  6%|β–Œ | 617/10682 [05:17<1:27:49, 1.91it/s]
666
  6%|β–Œ | 618/10682 [05:17<1:26:16, 1.94it/s]
667
  6%|β–Œ | 619/10682 [05:18<1:25:07, 1.97it/s]
668
  6%|β–Œ | 620/10682 [05:18<1:24:15, 1.99it/s]
669
  6%|β–Œ | 621/10682 [05:19<1:23:44, 2.00it/s]
670
  6%|β–Œ | 622/10682 [05:19<1:23:18, 2.01it/s]
671
  6%|β–Œ | 623/10682 [05:20<1:23:01, 2.02it/s]
672
  6%|β–Œ | 624/10682 [05:20<1:22:56, 2.02it/s]
673
  6%|β–Œ | 625/10682 [05:21<1:22:46, 2.03it/s]
674
  {'loss': 4.84, 'grad_norm': 0.46007564663887024, 'learning_rate': 0.0005846585594013096, 'epoch': 0.82}
 
675
  6%|β–Œ | 625/10682 [05:21<1:22:46, 2.03it/s]
676
  6%|β–Œ | 626/10682 [05:21<1:22:47, 2.02it/s]
677
  6%|β–Œ | 627/10682 [05:22<1:22:42, 2.03it/s]
678
  6%|β–Œ | 628/10682 [05:22<1:22:37, 2.03it/s]
679
  6%|β–Œ | 629/10682 [05:23<1:22:35, 2.03it/s]
680
  6%|β–Œ | 630/10682 [05:23<1:22:34, 2.03it/s]
681
  6%|β–Œ | 631/10682 [05:24<1:22:36, 2.03it/s]
682
  6%|β–Œ | 632/10682 [05:24<1:22:34, 2.03it/s]
683
  6%|β–Œ | 633/10682 [05:25<1:22:31, 2.03it/s]
684
  6%|β–Œ | 634/10682 [05:25<1:22:29, 2.03it/s]
685
  6%|β–Œ | 635/10682 [05:26<1:22:27, 2.03it/s]
686
  6%|β–Œ | 636/10682 [05:26<1:22:26, 2.03it/s]
687
  6%|β–Œ | 637/10682 [05:27<1:22:25, 2.03it/s]
688
  6%|β–Œ | 638/10682 [05:27<1:22:25, 2.03it/s]
689
  6%|β–Œ | 639/10682 [05:28<1:22:22, 2.03it/s]
690
  6%|β–Œ | 640/10682 [05:28<1:22:25, 2.03it/s]
691
  6%|β–Œ | 641/10682 [05:29<1:22:22, 2.03it/s]
692
  6%|β–Œ | 642/10682 [05:29<1:22:34, 2.03it/s]
693
  6%|β–Œ | 643/10682 [05:30<1:22:29, 2.03it/s]
694
  6%|β–Œ | 644/10682 [05:30<1:22:26, 2.03it/s]
695
  6%|β–Œ | 645/10682 [05:31<1:22:23, 2.03it/s]
696
  6%|β–Œ | 646/10682 [05:31<1:22:21, 2.03it/s]
697
  6%|β–Œ | 647/10682 [05:32<1:22:23, 2.03it/s]
698
  6%|β–Œ | 648/10682 [05:32<1:22:19, 2.03it/s]
699
  6%|β–Œ | 649/10682 [05:33<1:22:13, 2.03it/s]
700
  6%|β–Œ | 650/10682 [05:33<1:22:13, 2.03it/s]{'loss': 4.8005, 'grad_norm': 0.5678160190582275, 'learning_rate': 0.0006080449017773621, 'epoch': 0.85}
 
701
 
702
  6%|β–Œ | 650/10682 [05:33<1:22:13, 2.03it/s]
703
  6%|β–Œ | 651/10682 [05:34<1:22:16, 2.03it/s]
704
  6%|β–Œ | 652/10682 [05:34<1:22:10, 2.03it/s]
705
  6%|β–Œ | 653/10682 [05:35<1:22:16, 2.03it/s]
706
  6%|β–Œ | 654/10682 [05:35<1:22:15, 2.03it/s]
707
  6%|β–Œ | 655/10682 [05:35<1:22:15, 2.03it/s]
708
  6%|β–Œ | 656/10682 [05:36<1:22:15, 2.03it/s]
709
  6%|β–Œ | 657/10682 [05:36<1:22:10, 2.03it/s]
710
  6%|β–Œ | 658/10682 [05:37<1:22:12, 2.03it/s]
711
  6%|β–Œ | 659/10682 [05:37<1:22:11, 2.03it/s]
712
  6%|β–Œ | 660/10682 [05:38<1:22:09, 2.03it/s]
713
  6%|β–Œ | 661/10682 [05:38<1:22:13, 2.03it/s]
714
  6%|β–Œ | 662/10682 [05:39<1:22:08, 2.03it/s]
715
  6%|β–Œ | 663/10682 [05:39<1:22:09, 2.03it/s]
716
  6%|β–Œ | 664/10682 [05:40<1:22:09, 2.03it/s]
717
  6%|β–Œ | 665/10682 [05:40<1:22:09, 2.03it/s]
718
  6%|β–Œ | 666/10682 [05:41<1:22:07, 2.03it/s]
719
  6%|β–Œ | 667/10682 [05:41<1:22:09, 2.03it/s]
720
  6%|β–‹ | 668/10682 [05:42<1:22:09, 2.03it/s]
721
  6%|β–‹ | 669/10682 [05:42<1:22:11, 2.03it/s]
722
  6%|β–‹ | 670/10682 [05:43<1:22:10, 2.03it/s]
723
  6%|β–‹ | 671/10682 [05:43<1:22:05, 2.03it/s]
724
  6%|β–‹ | 672/10682 [05:44<1:22:01, 2.03it/s]
725
  6%|β–‹ | 673/10682 [05:44<1:22:07, 2.03it/s]
726
  6%|β–‹ | 674/10682 [05:45<1:22:03, 2.03it/s]
727
  6%|β–‹ | 675/10682 [05:45<1:22:01, 2.03it/s]
728
  {'loss': 4.7671, 'grad_norm': 0.4880385100841522, 'learning_rate': 0.0006314312441534145, 'epoch': 0.88}
 
729
  6%|β–‹ | 675/10682 [05:45<1:22:01, 2.03it/s]
730
  6%|β–‹ | 676/10682 [05:46<1:22:10, 2.03it/s]
731
  6%|β–‹ | 677/10682 [05:46<1:22:05, 2.03it/s]
732
  6%|β–‹ | 678/10682 [05:47<1:22:01, 2.03it/s]
733
  6%|β–‹ | 679/10682 [05:47<1:22:00, 2.03it/s]
734
  6%|β–‹ | 680/10682 [05:48<1:22:00, 2.03it/s]
735
  6%|β–‹ | 681/10682 [05:48<1:21:58, 2.03it/s]
736
  6%|β–‹ | 682/10682 [05:49<1:22:00, 2.03it/s]
737
  6%|β–‹ | 683/10682 [05:49<1:22:00, 2.03it/s]
738
  6%|β–‹ | 684/10682 [05:50<1:21:53, 2.03it/s]
739
  6%|β–‹ | 685/10682 [05:50<1:21:53, 2.03it/s]
740
  6%|β–‹ | 686/10682 [05:51<1:21:58, 2.03it/s]
741
  6%|β–‹ | 687/10682 [05:51<1:21:59, 2.03it/s]
742
  6%|β–‹ | 688/10682 [05:52<1:21:58, 2.03it/s]
743
  6%|β–‹ | 689/10682 [05:52<1:21:58, 2.03it/s]
744
  6%|β–‹ | 690/10682 [05:53<1:21:54, 2.03it/s]
745
  6%|β–‹ | 691/10682 [05:53<1:21:55, 2.03it/s]
746
  6%|β–‹ | 692/10682 [05:54<1:21:55, 2.03it/s]
747
  6%|β–‹ | 693/10682 [05:54<1:21:52, 2.03it/s]
748
  6%|β–‹ | 694/10682 [05:55<1:21:53, 2.03it/s]
749
  7%|β–‹ | 695/10682 [05:55<1:21:52, 2.03it/s]
750
  7%|β–‹ | 696/10682 [05:56<1:21:49, 2.03it/s]
751
  7%|β–‹ | 697/10682 [05:56<1:21:49, 2.03it/s]
752
  7%|β–‹ | 698/10682 [05:57<1:21:50, 2.03it/s]
753
  7%|β–‹ | 699/10682 [05:57<1:21:52, 2.03it/s]
754
  7%|β–‹ | 700/10682 [05:58<1:21:56, 2.03it/s]{'loss': 4.7325, 'grad_norm': 0.42659100890159607, 'learning_rate': 0.0006548175865294667, 'epoch': 0.92}
 
755
 
756
  7%|β–‹ | 700/10682 [05:58<1:21:56, 2.03it/s]
757
  7%|β–‹ | 701/10682 [05:58<1:22:04, 2.03it/s]
758
  7%|β–‹ | 702/10682 [05:59<1:22:01, 2.03it/s]
759
  7%|β–‹ | 703/10682 [05:59<1:21:58, 2.03it/s]
760
  7%|β–‹ | 704/10682 [06:00<1:21:55, 2.03it/s]
761
  7%|β–‹ | 705/10682 [06:00<1:21:51, 2.03it/s]
762
  7%|β–‹ | 706/10682 [06:01<1:21:51, 2.03it/s]
763
  7%|β–‹ | 707/10682 [06:01<1:21:50, 2.03it/s]
764
  7%|β–‹ | 708/10682 [06:02<1:21:47, 2.03it/s]
765
  7%|β–‹ | 709/10682 [06:02<1:21:49, 2.03it/s]
766
  7%|β–‹ | 710/10682 [06:03<1:21:50, 2.03it/s]
767
  7%|β–‹ | 711/10682 [06:03<1:21:46, 2.03it/s]
768
  7%|β–‹ | 712/10682 [06:04<1:21:52, 2.03it/s]
769
  7%|β–‹ | 713/10682 [06:04<1:21:42, 2.03it/s]
770
  7%|β–‹ | 714/10682 [06:05<1:21:42, 2.03it/s]
771
  7%|β–‹ | 715/10682 [06:05<1:21:42, 2.03it/s]
772
  7%|β–‹ | 716/10682 [06:06<1:21:38, 2.03it/s]
773
  7%|β–‹ | 717/10682 [06:06<1:21:44, 2.03it/s]
774
  7%|β–‹ | 718/10682 [06:06<1:21:44, 2.03it/s]
775
  7%|β–‹ | 719/10682 [06:07<1:21:40, 2.03it/s]
776
  7%|β–‹ | 720/10682 [06:07<1:21:42, 2.03it/s]
777
  7%|β–‹ | 721/10682 [06:08<1:21:39, 2.03it/s]
778
  7%|β–‹ | 722/10682 [06:08<1:21:44, 2.03it/s]
779
  7%|β–‹ | 723/10682 [06:09<1:21:43, 2.03it/s]
780
  7%|β–‹ | 724/10682 [06:09<1:21:41, 2.03it/s]
781
  7%|β–‹ | 725/10682 [06:10<1:21:41, 2.03it/s]{'loss': 4.7051, 'grad_norm': 0.42874085903167725, 'learning_rate': 0.0006782039289055192, 'epoch': 0.95}
782
 
 
783
  7%|β–‹ | 725/10682 [06:10<1:21:41, 2.03it/s]
784
  7%|β–‹ | 726/10682 [06:10<1:21:43, 2.03it/s]
785
  7%|β–‹ | 727/10682 [06:11<1:21:45, 2.03it/s]
786
  7%|β–‹ | 728/10682 [06:11<1:21:43, 2.03it/s]
787
  7%|β–‹ | 729/10682 [06:12<1:21:43, 2.03it/s]
788
  7%|β–‹ | 730/10682 [06:12<1:21:43, 2.03it/s]
789
  7%|β–‹ | 731/10682 [06:13<1:21:38, 2.03it/s]
790
  7%|β–‹ | 732/10682 [06:13<1:21:37, 2.03it/s]
791
  7%|β–‹ | 733/10682 [06:14<1:21:36, 2.03it/s]
792
  7%|β–‹ | 734/10682 [06:14<1:21:36, 2.03it/s]
793
  7%|β–‹ | 735/10682 [06:15<1:21:38, 2.03it/s]
794
  7%|β–‹ | 736/10682 [06:15<1:21:37, 2.03it/s]
795
  7%|β–‹ | 737/10682 [06:16<1:21:37, 2.03it/s]
796
  7%|β–‹ | 738/10682 [06:16<1:21:36, 2.03it/s]
797
  7%|β–‹ | 739/10682 [06:17<1:21:32, 2.03it/s]
798
  7%|β–‹ | 740/10682 [06:17<1:21:32, 2.03it/s]
799
  7%|β–‹ | 741/10682 [06:18<1:21:29, 2.03it/s]
800
  7%|β–‹ | 742/10682 [06:18<1:21:31, 2.03it/s]
801
  7%|β–‹ | 743/10682 [06:19<1:21:30, 2.03it/s]
802
  7%|β–‹ | 744/10682 [06:19<1:21:32, 2.03it/s]
803
  7%|β–‹ | 745/10682 [06:20<1:21:28, 2.03it/s]
804
  7%|β–‹ | 746/10682 [06:20<1:21:29, 2.03it/s]
805
  7%|β–‹ | 747/10682 [06:21<1:21:29, 2.03it/s]
806
  7%|β–‹ | 748/10682 [06:21<1:21:34, 2.03it/s]
807
  7%|β–‹ | 749/10682 [06:22<1:21:30, 2.03it/s]
808
  7%|β–‹ | 750/10682 [06:22<1:21:26, 2.03it/s]
809
  {'loss': 4.6758, 'grad_norm': 0.3773520588874817, 'learning_rate': 0.0007015902712815716, 'epoch': 0.98}
 
810
  7%|β–‹ | 750/10682 [06:22<1:21:26, 2.03it/s]
811
  7%|β–‹ | 751/10682 [06:23<1:21:31, 2.03it/s]
812
  7%|β–‹ | 752/10682 [06:23<1:21:26, 2.03it/s]
813
  7%|β–‹ | 753/10682 [06:24<1:21:25, 2.03it/s]
814
  7%|β–‹ | 754/10682 [06:24<1:21:27, 2.03it/s]
815
  7%|β–‹ | 755/10682 [06:25<1:21:21, 2.03it/s]
816
  7%|β–‹ | 756/10682 [06:25<1:21:19, 2.03it/s]
817
  7%|β–‹ | 757/10682 [06:26<1:21:21, 2.03it/s]
818
  7%|β–‹ | 758/10682 [06:26<1:21:23, 2.03it/s]
819
  7%|β–‹ | 759/10682 [06:27<1:21:27, 2.03it/s]
820
  7%|β–‹ | 760/10682 [06:27<1:21:26, 2.03it/s]
821
  7%|β–‹ | 761/10682 [06:28<1:21:26, 2.03it/s]
822
  7%|β–‹ | 762/10682 [06:28<1:21:29, 2.03it/s]
823
  7%|β–‹ | 763/10682 [06:29<1:20:48, 2.05it/s]
824
  7%|β–‹ | 764/10682 [06:41<10:54:45, 3.96s/it]
825
  7%|β–‹ | 765/10682 [06:41<8:02:42, 2.92s/it]
826
  7%|β–‹ | 766/10682 [06:42<6:02:29, 2.19s/it]
827
  7%|β–‹ | 767/10682 [06:42<4:38:05, 1.68s/it]
828
  7%|β–‹ | 768/10682 [06:43<3:39:03, 1.33s/it]
829
  7%|β–‹ | 769/10682 [06:43<2:57:38, 1.08s/it]
830
  7%|β–‹ | 770/10682 [06:44<2:28:48, 1.11it/s]
831
  7%|β–‹ | 771/10682 [06:44<2:09:50, 1.27it/s]
832
  7%|β–‹ | 772/10682 [06:45<1:55:25, 1.43it/s]
833
  7%|β–‹ | 773/10682 [06:45<1:45:07, 1.57it/s]
834
  7%|β–‹ | 774/10682 [06:46<1:38:06, 1.68it/s]
835
  7%|β–‹ | 775/10682 [06:46<1:33:05, 1.77it/s]{'loss': 4.6275, 'grad_norm': 0.46725699305534363, 'learning_rate': 0.0007249766136576241, 'epoch': 1.02}
836
 
 
837
  7%|β–‹ | 775/10682 [06:46<1:33:05, 1.77it/s]
838
  7%|β–‹ | 776/10682 [06:47<1:29:45, 1.84it/s]
839
  7%|β–‹ | 777/10682 [06:47<1:27:14, 1.89it/s]
840
  7%|β–‹ | 778/10682 [06:48<1:25:30, 1.93it/s]
841
  7%|β–‹ | 779/10682 [06:48<1:24:15, 1.96it/s]
842
  7%|β–‹ | 780/10682 [06:49<1:23:16, 1.98it/s]
843
  7%|β–‹ | 781/10682 [06:49<1:22:32, 2.00it/s]
844
  7%|β–‹ | 782/10682 [06:50<1:22:18, 2.00it/s]
845
  7%|β–‹ | 783/10682 [06:50<1:21:57, 2.01it/s]
846
  7%|β–‹ | 784/10682 [06:51<1:21:51, 2.02it/s]
847
  7%|β–‹ | 785/10682 [06:51<1:21:38, 2.02it/s]
 
1
+ slurm submission log: 2024-05-11 22:52:02.103456
2
+ created following sbatch script:
3
+
4
+ ###############################
5
+
6
+ #!/bin/bash
7
+
8
+ #SBATCH --account=nlp
9
+ #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:
11
+ #SBATCH --gres=gpu:2
12
+ #SBATCH --job-name=tthrush-job-2343873
13
+ #SBATCH --mem=400G
14
+ #SBATCH --nodelist=sphinx2
15
+ #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/train_job_output.txt
17
+ #SBATCH --partition=sphinx
18
+ #SBATCH --time=14-0
19
+
20
+ # activate your desired anaconda environment
21
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
22
+
23
+ # cd to working directory
24
+ cd .
25
+
26
+ # launch commands
27
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
28
+
29
+ ###############################
30
+
31
+ submission to slurm complete!
32
+
33
+
34
+ ###############################
35
+ slurm submission output
36
+
37
+
38
+
39
+ sbatch: error: Batch job submission failed: Job dependency problem
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-11 22:53:19.792950
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7599821
53
+ #SBATCH --gres=gpu:2
54
+ #SBATCH --job-name=tthrush-job-4621093
55
+ #SBATCH --mem=400G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/train_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7599822
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ slurm submission log: 2024-05-11 23:09:47.984388
86
+ created following sbatch script:
87
+
88
+ ###############################
89
+
90
+ #!/bin/bash
91
+
92
+ #SBATCH --account=nlp
93
+ #SBATCH --cpus-per-task=16
94
+ #SBATCH --dependency=afterok:7599867
95
+ #SBATCH --gres=gpu:2
96
+ #SBATCH --job-name=tthrush-job-4866328
97
+ #SBATCH --mem=400G
98
+ #SBATCH --nodelist=sphinx2
99
+ #SBATCH --open-mode=append
100
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default/train_job_output.txt
101
+ #SBATCH --partition=sphinx
102
+ #SBATCH --time=14-0
103
+
104
+ # activate your desired anaconda environment
105
+ . /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
+
107
+ # cd to working directory
108
+ cd .
109
+
110
+ # launch commands
111
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2'
112
+
113
+ ###############################
114
+
115
+ submission to slurm complete!
116
+
117
+
118
+ ###############################
119
+ slurm submission output
120
+
121
+ Submitted batch job 7599868
122
+
123
+
124
+
125
+ ###############################
126
+
127
+ ###############################
128
+ start time: 2024-05-11 23:52:22.564861
129
+ machine: sphinx2
130
+ conda env: pretraining-coreset-selection
131
+ ###############################
132
+ running following processes
133
+
134
+ torchrun --master_port 29499 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default --output_hub_id pythia-70m_default --model_id EleutherAI/pythia-70m --num_train_epochs 14 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2
135
+
136
+
137
+ ###############################
138
+ command outputs:
139
+
140
+
141
+ [2024-05-11 23:52:24,535] torch.distributed.run: [WARNING]
142
+ [2024-05-11 23:52:24,535] torch.distributed.run: [WARNING] *****************************************
143
+ [2024-05-11 23:52:24,535] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
144
+ [2024-05-11 23:52:24,535] torch.distributed.run: [WARNING] *****************************************
145
+ 05/11/2024 23:52:30 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default', output_hub_id='pythia-70m_default', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
146
+ 05/11/2024 23:52:30 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_5/default', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_default', output_hub_id='pythia-70m_default', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
147
+
148
  0%| | 0/10682 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
149
+ [rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
150
+
151
  0%| | 1/10682 [00:05<15:57:49, 5.38s/it]
152
  0%| | 2/10682 [00:07<10:01:48, 3.38s/it]
153
  0%| | 3/10682 [00:08<7:22:52, 2.49s/it]
154
  0%| | 4/10682 [00:10<5:58:45, 2.02s/it]
155
  0%| | 5/10682 [00:11<5:00:13, 1.69s/it]
156
  0%| | 6/10682 [00:12<4:15:17, 1.43s/it]
157
  0%| | 7/10682 [00:13<3:46:05, 1.27s/it]
158
  0%| | 8/10682 [00:13<3:23:48, 1.15s/it]
159
  0%| | 9/10682 [00:14<3:05:00, 1.04s/it]
160
  0%| | 10/10682 [00:15<2:51:07, 1.04it/s]
161
  0%| | 11/10682 [00:16<2:37:51, 1.13it/s]
162
  0%| | 12/10682 [00:16<2:25:52, 1.22it/s]
163
  0%| | 13/10682 [00:17<2:17:06, 1.30it/s]
164
  0%| | 14/10682 [00:18<2:11:06, 1.36it/s]
165
  0%| | 15/10682 [00:18<2:05:30, 1.42it/s]
166
  0%| | 16/10682 [00:19<1:59:37, 1.49it/s]
167
  0%| | 17/10682 [00:20<1:56:34, 1.52it/s]
168
  0%| | 18/10682 [00:20<1:52:32, 1.58it/s]
169
  0%| | 19/10682 [00:21<1:49:53, 1.62it/s]
170
  0%| | 20/10682 [00:21<1:48:21, 1.64it/s]
171
  0%| | 21/10682 [00:22<1:46:12, 1.67it/s]
172
  0%| | 22/10682 [00:22<1:44:20, 1.70it/s]
173
  0%| | 23/10682 [00:23<1:41:52, 1.74it/s]
174
  0%| | 24/10682 [00:24<1:40:45, 1.76it/s]
175
  0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
176
 
177
+
178
  0%| | 25/10682 [00:24<1:40:14, 1.77it/s]
179
  0%| | 26/10682 [00:25<1:39:10, 1.79it/s]
180
  0%| | 27/10682 [00:25<1:37:34, 1.82it/s]
181
  0%| | 28/10682 [00:26<1:36:48, 1.83it/s]
182
  0%| | 29/10682 [00:26<1:36:00, 1.85it/s]
183
  0%| | 30/10682 [00:27<1:35:12, 1.86it/s]
184
  0%| | 31/10682 [00:27<1:34:57, 1.87it/s]
185
  0%| | 32/10682 [00:28<1:34:15, 1.88it/s]
186
  0%| | 33/10682 [00:28<1:34:03, 1.89it/s]
187
  0%| | 34/10682 [00:29<1:33:35, 1.90it/s]
188
  0%| | 35/10682 [00:29<1:32:53, 1.91it/s]
189
  0%| | 36/10682 [00:30<1:32:07, 1.93it/s]
190
  0%| | 37/10682 [00:30<1:31:43, 1.93it/s]
191
  0%| | 38/10682 [00:31<1:31:06, 1.95it/s]
192
  0%| | 39/10682 [00:31<1:30:51, 1.95it/s]
193
  0%| | 40/10682 [00:32<1:30:16, 1.96it/s]
194
  0%| | 41/10682 [00:32<1:30:12, 1.97it/s]
195
  0%| | 42/10682 [00:33<1:31:55, 1.93it/s]
196
  0%| | 43/10682 [00:34<1:31:57, 1.93it/s]
197
  0%| | 44/10682 [00:34<1:32:51, 1.91it/s]
198
  0%| | 45/10682 [00:35<1:34:55, 1.87it/s]
199
  0%| | 46/10682 [00:35<1:34:07, 1.88it/s]
200
  0%| | 47/10682 [00:36<1:34:23, 1.88it/s]
201
  0%| | 48/10682 [00:36<1:34:34, 1.87it/s]
202
  0%| | 49/10682 [00:37<1:34:01, 1.88it/s]
203
  0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
204
 
205
+
206
  0%| | 50/10682 [00:37<1:33:22, 1.90it/s]
207
  0%| | 51/10682 [00:38<1:32:21, 1.92it/s]
208
  0%| | 52/10682 [00:38<1:31:09, 1.94it/s]
209
  0%| | 53/10682 [00:39<1:30:15, 1.96it/s]
210
  1%| | 54/10682 [00:39<1:29:35, 1.98it/s]
211
  1%| | 55/10682 [00:40<1:29:07, 1.99it/s]
212
  1%| | 56/10682 [00:40<1:29:19, 1.98it/s]
213
  1%| | 57/10682 [00:41<1:29:05, 1.99it/s]
214
  1%| | 58/10682 [00:41<1:29:05, 1.99it/s]
215
  1%| | 59/10682 [00:42<1:28:56, 1.99it/s]
216
  1%| | 60/10682 [00:42<1:28:40, 2.00it/s]
217
  1%| | 61/10682 [00:43<1:28:37, 2.00it/s]
218
  1%| | 62/10682 [00:43<1:28:44, 1.99it/s]
219
  1%| | 63/10682 [00:44<1:28:26, 2.00it/s]
220
  1%| | 64/10682 [00:44<1:28:10, 2.01it/s]
221
  1%| | 65/10682 [00:45<1:27:48, 2.02it/s]
222
  1%| | 66/10682 [00:45<1:27:39, 2.02it/s]
223
  1%| | 67/10682 [00:46<1:27:42, 2.02it/s]
224
  1%| | 68/10682 [00:46<1:27:37, 2.02it/s]
225
  1%| | 69/10682 [00:47<1:27:36, 2.02it/s]
226
  1%| | 70/10682 [00:47<1:27:33, 2.02it/s]
227
  1%| | 71/10682 [00:48<1:27:35, 2.02it/s]
228
  1%| | 72/10682 [00:48<1:27:30, 2.02it/s]
229
  1%| | 73/10682 [00:49<1:27:31, 2.02it/s]
230
  1%| | 74/10682 [00:49<1:27:25, 2.02it/s]
231
  1%| | 75/10682 [00:50<1:27:17, 2.03it/s]{'loss': 9.2238, 'grad_norm': 1.1420856714248657, 'learning_rate': 7.015902712815715e-05, 'epoch': 0.1}
232
+
233
 
234
  1%| | 75/10682 [00:50<1:27:17, 2.03it/s]
235
  1%| | 76/10682 [00:50<1:27:25, 2.02it/s]
236
  1%| | 77/10682 [00:51<1:27:27, 2.02it/s]
237
  1%| | 78/10682 [00:51<1:27:22, 2.02it/s]
238
  1%| | 79/10682 [00:52<1:27:13, 2.03it/s]
239
  1%| | 80/10682 [00:52<1:27:13, 2.03it/s]
240
  1%| | 81/10682 [00:53<1:27:19, 2.02it/s]
241
  1%| | 82/10682 [00:53<1:27:19, 2.02it/s]
242
  1%| | 83/10682 [00:54<1:27:39, 2.02it/s]
243
  1%| | 84/10682 [00:54<1:27:38, 2.02it/s]
244
  1%| | 85/10682 [00:55<1:27:39, 2.01it/s]
245
  1%| | 86/10682 [00:55<1:27:50, 2.01it/s]
246
  1%| | 87/10682 [00:56<1:27:38, 2.02it/s]
247
  1%| | 88/10682 [00:56<1:27:22, 2.02it/s]
248
  1%| | 89/10682 [00:57<1:27:23, 2.02it/s]
249
  1%| | 90/10682 [00:57<1:27:17, 2.02it/s]
250
  1%| | 91/10682 [00:58<1:27:10, 2.02it/s]
251
  1%| | 92/10682 [00:58<1:27:09, 2.03it/s]
252
  1%| | 93/10682 [00:59<1:27:03, 2.03it/s]
253
  1%| | 94/10682 [00:59<1:26:59, 2.03it/s]
254
  1%| | 95/10682 [01:00<1:26:56, 2.03it/s]
255
  1%| | 96/10682 [01:00<1:26:53, 2.03it/s]
256
  1%| | 97/10682 [01:01<1:26:58, 2.03it/s]
257
  1%| | 98/10682 [01:01<1:26:53, 2.03it/s]
258
  1%| | 99/10682 [01:02<1:26:56, 2.03it/s]
259
  1%| | 100/10682 [01:02<1:26:55, 2.03it/s]{'loss': 8.428, 'grad_norm': 0.7997293472290039, 'learning_rate': 9.354536950420954e-05, 'epoch': 0.13}
260
+
261
 
262
  1%| | 100/10682 [01:02<1:26:55, 2.03it/s]
263
  1%| | 101/10682 [01:03<1:27:08, 2.02it/s]
264
  1%| | 102/10682 [01:03<1:27:00, 2.03it/s]
265
  1%| | 103/10682 [01:04<1:26:59, 2.03it/s]
266
  1%| | 104/10682 [01:04<1:26:57, 2.03it/s]
267
  1%| | 105/10682 [01:04<1:26:51, 2.03it/s]
268
  1%| | 106/10682 [01:05<1:26:49, 2.03it/s]
269
  1%| | 107/10682 [01:05<1:26:50, 2.03it/s]
270
  1%| | 108/10682 [01:06<1:26:51, 2.03it/s]
271
  1%| | 109/10682 [01:06<1:26:47, 2.03it/s]
272
  1%| | 110/10682 [01:07<1:26:40, 2.03it/s]
273
  1%| | 111/10682 [01:07<1:26:43, 2.03it/s]
274
  1%| | 112/10682 [01:08<1:26:42, 2.03it/s]
275
  1%| | 113/10682 [01:08<1:26:45, 2.03it/s]
276
  1%| | 114/10682 [01:09<1:26:41, 2.03it/s]
277
  1%| | 115/10682 [01:09<1:26:39, 2.03it/s]
278
  1%| | 116/10682 [01:10<1:26:40, 2.03it/s]
279
  1%| | 117/10682 [01:10<1:26:41, 2.03it/s]
280
  1%| | 118/10682 [01:11<1:26:46, 2.03it/s]
281
  1%| | 119/10682 [01:11<1:26:41, 2.03it/s]
282
  1%| | 120/10682 [01:12<1:26:34, 2.03it/s]
283
  1%| | 121/10682 [01:12<1:26:37, 2.03it/s]
284
  1%| | 122/10682 [01:13<1:26:44, 2.03it/s]
285
  1%| | 123/10682 [01:13<1:26:45, 2.03it/s]
286
  1%| | 124/10682 [01:14<1:26:44, 2.03it/s]
287
  1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
288
 
289
+
290
  1%| | 125/10682 [01:14<1:26:43, 2.03it/s]
291
  1%| | 126/10682 [01:15<1:26:50, 2.03it/s]
292
  1%| | 127/10682 [01:15<1:26:45, 2.03it/s]
293
  1%| | 128/10682 [01:16<1:26:35, 2.03it/s]
294
  1%| | 129/10682 [01:16<1:26:35, 2.03it/s]
295
  1%| | 130/10682 [01:17<1:26:35, 2.03it/s]
296
  1%| | 131/10682 [01:17<1:26:31, 2.03it/s]
297
  1%| | 132/10682 [01:18<1:26:32, 2.03it/s]
298
  1%| | 133/10682 [01:18<1:26:33, 2.03it/s]
299
  1%|▏ | 134/10682 [01:19<1:26:27, 2.03it/s]
300
  1%|▏ | 135/10682 [01:19<1:26:29, 2.03it/s]
301
  1%|▏ | 136/10682 [01:20<1:26:31, 2.03it/s]
302
  1%|▏ | 137/10682 [01:20<1:26:31, 2.03it/s]
303
  1%|▏ | 138/10682 [01:21<1:26:30, 2.03it/s]
304
  1%|▏ | 139/10682 [01:21<1:26:30, 2.03it/s]
305
  1%|▏ | 140/10682 [01:22<1:26:29, 2.03it/s]
306
  1%|▏ | 141/10682 [01:22<1:26:35, 2.03it/s]
307
  1%|▏ | 142/10682 [01:23<1:26:28, 2.03it/s]
308
  1%|▏ | 143/10682 [01:23<1:26:27, 2.03it/s]
309
  1%|▏ | 144/10682 [01:24<1:26:27, 2.03it/s]
310
  1%|▏ | 145/10682 [01:24<1:26:23, 2.03it/s]
311
  1%|▏ | 146/10682 [01:25<1:26:32, 2.03it/s]
312
  1%|▏ | 147/10682 [01:25<1:26:25, 2.03it/s]
313
  1%|▏ | 148/10682 [01:26<1:26:20, 2.03it/s]
314
  1%|▏ | 149/10682 [01:26<1:26:23, 2.03it/s]
315
  1%|▏ | 150/10682 [01:27<1:26:20, 2.03it/s]
316
  {'loss': 7.289, 'grad_norm': 0.367520809173584, 'learning_rate': 0.0001403180542563143, 'epoch': 0.2}
317
+
318
  1%|▏ | 150/10682 [01:27<1:26:20, 2.03it/s]
319
  1%|▏ | 151/10682 [01:27<1:26:26, 2.03it/s]
320
  1%|▏ | 152/10682 [01:28<1:26:25, 2.03it/s]
321
  1%|▏ | 153/10682 [01:28<1:26:23, 2.03it/s]
322
  1%|▏ | 154/10682 [01:29<1:26:26, 2.03it/s]
323
  1%|▏ | 155/10682 [01:29<1:26:23, 2.03it/s]
324
  1%|▏ | 156/10682 [01:30<1:26:20, 2.03it/s]
325
  1%|▏ | 157/10682 [01:30<1:26:22, 2.03it/s]
326
  1%|▏ | 158/10682 [01:31<1:26:23, 2.03it/s]
327
  1%|▏ | 159/10682 [01:31<1:26:25, 2.03it/s]
328
  1%|▏ | 160/10682 [01:32<1:26:15, 2.03it/s]
329
  2%|▏ | 161/10682 [01:32<1:26:14, 2.03it/s]
330
  2%|▏ | 162/10682 [01:33<1:26:18, 2.03it/s]
331
  2%|▏ | 163/10682 [01:33<1:26:12, 2.03it/s]
332
  2%|▏ | 164/10682 [01:34<1:26:10, 2.03it/s]
333
  2%|▏ | 165/10682 [01:34<1:26:10, 2.03it/s]
334
  2%|▏ | 166/10682 [01:35<1:26:08, 2.03it/s]
335
  2%|▏ | 167/10682 [01:35<1:26:13, 2.03it/s]
336
  2%|▏ | 168/10682 [01:36<1:26:12, 2.03it/s]
337
  2%|▏ | 169/10682 [01:36<1:26:15, 2.03it/s]
338
  2%|▏ | 170/10682 [01:36<1:26:14, 2.03it/s]
339
  2%|▏ | 171/10682 [01:37<1:26:08, 2.03it/s]
340
  2%|▏ | 172/10682 [01:37<1:26:18, 2.03it/s]
341
  2%|▏ | 173/10682 [01:38<1:26:12, 2.03it/s]
342
  2%|▏ | 174/10682 [01:38<1:26:13, 2.03it/s]
343
  2%|▏ | 175/10682 [01:39<1:26:11, 2.03it/s]{'loss': 6.8807, 'grad_norm': 0.33734890818595886, 'learning_rate': 0.00016370439663236668, 'epoch': 0.23}
344
 
345
+
346
  2%|▏ | 175/10682 [01:39<1:26:11, 2.03it/s]
347
  2%|▏ | 176/10682 [01:39<1:26:12, 2.03it/s]
348
  2%|▏ | 177/10682 [01:40<1:26:12, 2.03it/s]
349
  2%|▏ | 178/10682 [01:40<1:26:09, 2.03it/s]
350
  2%|▏ | 179/10682 [01:41<1:26:12, 2.03it/s]
351
  2%|▏ | 180/10682 [01:41<1:26:13, 2.03it/s]
352
  2%|▏ | 181/10682 [01:42<1:26:08, 2.03it/s]
353
  2%|▏ | 182/10682 [01:42<1:26:14, 2.03it/s]
354
  2%|▏ | 183/10682 [01:43<1:26:10, 2.03it/s]
355
  2%|▏ | 184/10682 [01:43<1:26:09, 2.03it/s]
356
  2%|▏ | 185/10682 [01:44<1:26:08, 2.03it/s]
357
  2%|▏ | 186/10682 [01:44<1:26:02, 2.03it/s]
358
  2%|▏ | 187/10682 [01:45<1:26:04, 2.03it/s]
359
  2%|▏ | 188/10682 [01:45<1:26:03, 2.03it/s]
360
  2%|▏ | 189/10682 [01:46<1:25:58, 2.03it/s]
361
  2%|▏ | 190/10682 [01:46<1:25:58, 2.03it/s]
362
  2%|▏ | 191/10682 [01:47<1:25:58, 2.03it/s]
363
  2%|▏ | 192/10682 [01:47<1:26:01, 2.03it/s]
364
  2%|▏ | 193/10682 [01:48<1:26:01, 2.03it/s]
365
  2%|▏ | 194/10682 [01:48<1:26:01, 2.03it/s]
366
  2%|▏ | 195/10682 [01:49<1:25:58, 2.03it/s]
367
  2%|▏ | 196/10682 [01:49<1:26:02, 2.03it/s]
368
  2%|▏ | 197/10682 [01:50<1:26:00, 2.03it/s]
369
  2%|▏ | 198/10682 [01:50<1:26:04, 2.03it/s]
370
  2%|▏ | 199/10682 [01:51<1:26:00, 2.03it/s]
371
  2%|▏ | 200/10682 [01:51<1:25:57, 2.03it/s]{'loss': 6.5556, 'grad_norm': 0.46203550696372986, 'learning_rate': 0.00018709073900841907, 'epoch': 0.26}
372
+
373
 
374
  2%|▏ | 200/10682 [01:51<1:25:57, 2.03it/s]
375
  2%|▏ | 201/10682 [01:52<1:26:04, 2.03it/s]
376
  2%|▏ | 202/10682 [01:52<1:26:00, 2.03it/s]
377
  2%|▏ | 203/10682 [01:53<1:26:03, 2.03it/s]
378
  2%|▏ | 204/10682 [01:53<1:26:00, 2.03it/s]
379
  2%|▏ | 205/10682 [01:54<1:25:56, 2.03it/s]
380
  2%|▏ | 206/10682 [01:54<1:25:57, 2.03it/s]
381
  2%|▏ | 207/10682 [01:55<1:25:52, 2.03it/s]
382
  2%|▏ | 208/10682 [01:55<1:25:51, 2.03it/s]
383
  2%|▏ | 209/10682 [01:56<1:25:54, 2.03it/s]
384
  2%|▏ | 210/10682 [01:56<1:25:49, 2.03it/s]
385
  2%|▏ | 211/10682 [01:57<1:25:45, 2.04it/s]
386
  2%|▏ | 212/10682 [01:57<1:25:49, 2.03it/s]
387
  2%|▏ | 213/10682 [01:58<1:25:44, 2.04it/s]
388
  2%|▏ | 214/10682 [01:58<1:25:40, 2.04it/s]
389
  2%|▏ | 215/10682 [01:59<1:25:44, 2.03it/s]
390
  2%|▏ | 216/10682 [01:59<1:25:41, 2.04it/s]
391
  2%|▏ | 217/10682 [02:00<1:25:42, 2.03it/s]
392
  2%|▏ | 218/10682 [02:00<1:25:45, 2.03it/s]
393
  2%|▏ | 219/10682 [02:01<1:25:42, 2.03it/s]
394
  2%|▏ | 220/10682 [02:01<1:25:40, 2.04it/s]
395
  2%|▏ | 221/10682 [02:02<1:25:44, 2.03it/s]
396
  2%|▏ | 222/10682 [02:02<1:25:42, 2.03it/s]
397
  2%|▏ | 223/10682 [02:03<1:25:39, 2.04it/s]
398
  2%|▏ | 224/10682 [02:03<1:25:41, 2.03it/s]
399
  2%|▏ | 225/10682 [02:04<1:25:38, 2.04it/s]{'loss': 6.2908, 'grad_norm': 0.7612385153770447, 'learning_rate': 0.00021047708138447147, 'epoch': 0.29}
400
+
401
 
402
  2%|▏ | 225/10682 [02:04<1:25:38, 2.04it/s]
403
  2%|▏ | 226/10682 [02:04<1:25:45, 2.03it/s]
404
  2%|▏ | 227/10682 [02:05<1:25:46, 2.03it/s]
405
  2%|▏ | 228/10682 [02:05<1:25:42, 2.03it/s]
406
  2%|▏ | 229/10682 [02:06<1:25:41, 2.03it/s]
407
  2%|▏ | 230/10682 [02:06<1:25:43, 2.03it/s]
408
  2%|▏ | 231/10682 [02:07<1:25:37, 2.03it/s]
409
  2%|▏ | 232/10682 [02:07<1:25:41, 2.03it/s]
410
  2%|▏ | 233/10682 [02:07<1:25:39, 2.03it/s]
411
  2%|▏ | 234/10682 [02:08<1:25:33, 2.04it/s]
412
  2%|▏ | 235/10682 [02:08<1:25:38, 2.03it/s]
413
  2%|▏ | 236/10682 [02:09<1:25:37, 2.03it/s]
414
  2%|▏ | 237/10682 [02:09<1:25:35, 2.03it/s]
415
  2%|▏ | 238/10682 [02:10<1:25:40, 2.03it/s]
416
  2%|▏ | 239/10682 [02:10<1:25:36, 2.03it/s]
417
  2%|▏ | 240/10682 [02:11<1:25:38, 2.03it/s]
418
  2%|▏ | 241/10682 [02:11<1:25:39, 2.03it/s]
419
  2%|▏ | 242/10682 [02:12<1:25:38, 2.03it/s]
420
  2%|▏ | 243/10682 [02:12<1:25:43, 2.03it/s]
421
  2%|▏ | 244/10682 [02:13<1:25:35, 2.03it/s]
422
  2%|▏ | 245/10682 [02:13<1:25:33, 2.03it/s]
423
  2%|▏ | 246/10682 [02:14<1:25:38, 2.03it/s]
424
  2%|▏ | 247/10682 [02:14<1:25:35, 2.03it/s]
425
  2%|▏ | 248/10682 [02:15<1:25:37, 2.03it/s]
426
  2%|▏ | 249/10682 [02:15<1:25:34, 2.03it/s]
427
  2%|▏ | 250/10682 [02:16<1:25:30, 2.03it/s]{'loss': 6.0883, 'grad_norm': 0.3854532241821289, 'learning_rate': 0.00023386342376052386, 'epoch': 0.33}
428
 
429
+
430
  2%|▏ | 250/10682 [02:16<1:25:30, 2.03it/s]
431
  2%|▏ | 251/10682 [02:16<1:25:42, 2.03it/s]
432
  2%|▏ | 252/10682 [02:17<1:25:35, 2.03it/s]
433
  2%|▏ | 253/10682 [02:17<1:25:34, 2.03it/s]
434
  2%|▏ | 254/10682 [02:18<1:25:33, 2.03it/s]
435
  2%|▏ | 255/10682 [02:18<1:25:34, 2.03it/s]
436
  2%|▏ | 256/10682 [02:19<1:25:33, 2.03it/s]
437
  2%|▏ | 257/10682 [02:19<1:25:31, 2.03it/s]
438
  2%|▏ | 258/10682 [02:20<1:25:29, 2.03it/s]
439
  2%|▏ | 259/10682 [02:20<1:25:25, 2.03it/s]
440
  2%|▏ | 260/10682 [02:21<1:25:28, 2.03it/s]
441
  2%|▏ | 261/10682 [02:21<1:25:25, 2.03it/s]
442
  2%|▏ | 262/10682 [02:22<1:25:23, 2.03it/s]
443
  2%|▏ | 263/10682 [02:22<1:25:27, 2.03it/s]
444
  2%|▏ | 264/10682 [02:23<1:25:22, 2.03it/s]
445
  2%|▏ | 265/10682 [02:23<1:25:19, 2.03it/s]
446
  2%|▏ | 266/10682 [02:24<1:25:23, 2.03it/s]
447
  2%|▏ | 267/10682 [02:24<1:25:23, 2.03it/s]
448
  3%|β–Ž | 268/10682 [02:25<1:25:23, 2.03it/s]
449
  3%|β–Ž | 269/10682 [02:25<1:25:27, 2.03it/s]
450
  3%|β–Ž | 270/10682 [02:26<1:25:27, 2.03it/s]
451
  3%|β–Ž | 271/10682 [02:26<1:25:24, 2.03it/s]
452
  3%|β–Ž | 272/10682 [02:27<1:25:22, 2.03it/s]
453
  3%|β–Ž | 273/10682 [02:27<1:25:20, 2.03it/s]
454
  3%|β–Ž | 274/10682 [02:28<1:25:21, 2.03it/s]
455
  3%|β–Ž | 275/10682 [02:28<1:25:21, 2.03it/s]
456
  {'loss': 5.9181, 'grad_norm': 0.7595835328102112, 'learning_rate': 0.00025724976613657625, 'epoch': 0.36}
457
+
458
  3%|β–Ž | 275/10682 [02:28<1:25:21, 2.03it/s]
459
  3%|β–Ž | 276/10682 [02:29<1:25:26, 2.03it/s]
460
  3%|β–Ž | 277/10682 [02:29<1:25:26, 2.03it/s]
461
  3%|β–Ž | 278/10682 [02:30<1:25:24, 2.03it/s]
462
  3%|β–Ž | 279/10682 [02:30<1:25:23, 2.03it/s]
463
  3%|β–Ž | 280/10682 [02:31<1:25:23, 2.03it/s]
464
  3%|β–Ž | 281/10682 [02:31<1:25:21, 2.03it/s]
465
  3%|β–Ž | 282/10682 [02:32<1:25:25, 2.03it/s]
466
  3%|β–Ž | 283/10682 [02:32<1:25:22, 2.03it/s]
467
  3%|β–Ž | 284/10682 [02:33<1:25:20, 2.03it/s]
468
  3%|β–Ž | 285/10682 [02:33<1:25:21, 2.03it/s]
469
  3%|β–Ž | 286/10682 [02:34<1:25:24, 2.03it/s]
470
  3%|β–Ž | 287/10682 [02:34<1:25:27, 2.03it/s]
471
  3%|β–Ž | 288/10682 [02:35<1:25:22, 2.03it/s]
472
  3%|β–Ž | 289/10682 [02:35<1:25:16, 2.03it/s]
473
  3%|β–Ž | 290/10682 [02:36<1:25:18, 2.03it/s]
474
  3%|β–Ž | 291/10682 [02:36<1:25:15, 2.03it/s]
475
  3%|β–Ž | 292/10682 [02:37<1:25:08, 2.03it/s]
476
  3%|β–Ž | 293/10682 [02:37<1:25:12, 2.03it/s]
477
  3%|β–Ž | 294/10682 [02:38<1:25:09, 2.03it/s]
478
  3%|β–Ž | 295/10682 [02:38<1:25:07, 2.03it/s]
479
  3%|β–Ž | 296/10682 [02:39<1:25:10, 2.03it/s]
480
  3%|β–Ž | 297/10682 [02:39<1:25:02, 2.04it/s]
481
  3%|β–Ž | 298/10682 [02:39<1:25:05, 2.03it/s]
482
  3%|β–Ž | 299/10682 [02:40<1:25:08, 2.03it/s]
483
  3%|β–Ž | 300/10682 [02:40<1:25:04, 2.03it/s]{'loss': 5.7819, 'grad_norm': 0.6112937927246094, 'learning_rate': 0.0002806361085126286, 'epoch': 0.39}
484
+
485
 
486
  3%|β–Ž | 300/10682 [02:40<1:25:04, 2.03it/s]
487
  3%|β–Ž | 301/10682 [02:41<1:25:24, 2.03it/s]
488
  3%|β–Ž | 302/10682 [02:41<1:25:14, 2.03it/s]
489
  3%|β–Ž | 303/10682 [02:42<1:25:13, 2.03it/s]
490
  3%|β–Ž | 304/10682 [02:42<1:25:08, 2.03it/s]
491
  3%|β–Ž | 305/10682 [02:43<1:25:08, 2.03it/s]
492
  3%|β–Ž | 306/10682 [02:43<1:25:08, 2.03it/s]
493
  3%|β–Ž | 307/10682 [02:44<1:25:02, 2.03it/s]
494
  3%|β–Ž | 308/10682 [02:44<1:25:02, 2.03it/s]
495
  3%|β–Ž | 309/10682 [02:45<1:25:04, 2.03it/s]
496
  3%|β–Ž | 310/10682 [02:45<1:25:05, 2.03it/s]
497
  3%|β–Ž | 311/10682 [02:46<1:25:07, 2.03it/s]
498
  3%|β–Ž | 312/10682 [02:46<1:25:04, 2.03it/s]
499
  3%|β–Ž | 313/10682 [02:47<1:25:04, 2.03it/s]
500
  3%|β–Ž | 314/10682 [02:47<1:25:01, 2.03it/s]
501
  3%|β–Ž | 315/10682 [02:48<1:24:58, 2.03it/s]
502
  3%|β–Ž | 316/10682 [02:48<1:24:57, 2.03it/s]
503
  3%|β–Ž | 317/10682 [02:49<1:24:59, 2.03it/s]
504
  3%|β–Ž | 318/10682 [02:49<1:24:58, 2.03it/s]
505
  3%|β–Ž | 319/10682 [02:50<1:25:00, 2.03it/s]
506
  3%|β–Ž | 320/10682 [02:50<1:24:55, 2.03it/s]
507
  3%|β–Ž | 321/10682 [02:51<1:25:00, 2.03it/s]
508
  3%|β–Ž | 322/10682 [02:51<1:25:05, 2.03it/s]
509
  3%|β–Ž | 323/10682 [02:52<1:24:59, 2.03it/s]
510
  3%|β–Ž | 324/10682 [02:52<1:24:58, 2.03it/s]
511
  3%|β–Ž | 325/10682 [02:53<1:24:55, 2.03it/s]
512
  {'loss': 5.6576, 'grad_norm': 1.0010818243026733, 'learning_rate': 0.00030402245088868103, 'epoch': 0.43}
513
+
514
  3%|β–Ž | 325/10682 [02:53<1:24:55, 2.03it/s]
515
  3%|β–Ž | 326/10682 [02:53<1:25:01, 2.03it/s]
516
  3%|β–Ž | 327/10682 [02:54<1:25:04, 2.03it/s]
517
  3%|β–Ž | 328/10682 [02:54<1:25:01, 2.03it/s]
518
  3%|β–Ž | 329/10682 [02:55<1:25:04, 2.03it/s]
519
  3%|β–Ž | 330/10682 [02:55<1:25:00, 2.03it/s]
520
  3%|β–Ž | 331/10682 [02:56<1:24:56, 2.03it/s]
521
  3%|β–Ž | 332/10682 [02:56<1:24:56, 2.03it/s]
522
  3%|β–Ž | 333/10682 [02:57<1:24:52, 2.03it/s]
523
  3%|β–Ž | 334/10682 [02:57<1:24:51, 2.03it/s]
524
  3%|β–Ž | 335/10682 [02:58<1:24:51, 2.03it/s]
525
  3%|β–Ž | 336/10682 [02:58<1:24:48, 2.03it/s]
526
  3%|β–Ž | 337/10682 [02:59<1:24:50, 2.03it/s]
527
  3%|β–Ž | 338/10682 [02:59<1:24:49, 2.03it/s]
528
  3%|β–Ž | 339/10682 [03:00<1:24:46, 2.03it/s]
529
  3%|β–Ž | 340/10682 [03:00<1:24:53, 2.03it/s]
530
  3%|β–Ž | 341/10682 [03:01<1:24:48, 2.03it/s]
531
  3%|β–Ž | 342/10682 [03:01<1:24:45, 2.03it/s]
532
  3%|β–Ž | 343/10682 [03:02<1:24:49, 2.03it/s]
533
  3%|β–Ž | 344/10682 [03:02<1:24:46, 2.03it/s]
534
  3%|β–Ž | 345/10682 [03:03<1:24:46, 2.03it/s]
535
  3%|β–Ž | 346/10682 [03:03<1:24:44, 2.03it/s]
536
  3%|β–Ž | 347/10682 [03:04<1:24:40, 2.03it/s]
537
  3%|β–Ž | 348/10682 [03:04<1:24:45, 2.03it/s]
538
  3%|β–Ž | 349/10682 [03:05<1:24:42, 2.03it/s]
539
  3%|β–Ž | 350/10682 [03:05<1:24:40, 2.03it/s]{'loss': 5.5561, 'grad_norm': 0.5823507308959961, 'learning_rate': 0.00032740879326473337, 'epoch': 0.46}
540
 
541
+
542
  3%|β–Ž | 350/10682 [03:05<1:24:40, 2.03it/s]
543
  3%|β–Ž | 351/10682 [03:06<1:24:50, 2.03it/s]
544
  3%|β–Ž | 352/10682 [03:06<1:24:41, 2.03it/s]
545
  3%|β–Ž | 353/10682 [03:07<1:24:44, 2.03it/s]
546
  3%|β–Ž | 354/10682 [03:07<1:24:42, 2.03it/s]
547
  3%|β–Ž | 355/10682 [03:08<1:24:36, 2.03it/s]
548
  3%|β–Ž | 356/10682 [03:08<1:24:42, 2.03it/s]
549
  3%|β–Ž | 357/10682 [03:09<1:24:39, 2.03it/s]
550
  3%|β–Ž | 358/10682 [03:09<1:24:36, 2.03it/s]
551
  3%|β–Ž | 359/10682 [03:10<1:24:39, 2.03it/s]
552
  3%|β–Ž | 360/10682 [03:10<1:24:39, 2.03it/s]
553
  3%|β–Ž | 361/10682 [03:10<1:24:40, 2.03it/s]
554
  3%|β–Ž | 362/10682 [03:11<1:24:39, 2.03it/s]
555
  3%|β–Ž | 363/10682 [03:11<1:24:35, 2.03it/s]
556
  3%|β–Ž | 364/10682 [03:12<1:24:36, 2.03it/s]
557
  3%|β–Ž | 365/10682 [03:12<1:24:38, 2.03it/s]
558
  3%|β–Ž | 366/10682 [03:13<1:24:36, 2.03it/s]
559
  3%|β–Ž | 367/10682 [03:13<1:24:37, 2.03it/s]
560
  3%|β–Ž | 368/10682 [03:14<1:24:38, 2.03it/s]
561
  3%|β–Ž | 369/10682 [03:14<1:24:37, 2.03it/s]
562
  3%|β–Ž | 370/10682 [03:15<1:24:39, 2.03it/s]
563
  3%|β–Ž | 371/10682 [03:15<1:24:34, 2.03it/s]
564
  3%|β–Ž | 372/10682 [03:16<1:24:37, 2.03it/s]
565
  3%|β–Ž | 373/10682 [03:16<1:24:37, 2.03it/s]
566
  4%|β–Ž | 374/10682 [03:17<1:24:36, 2.03it/s]
567
  4%|β–Ž | 375/10682 [03:17<1:24:36, 2.03it/s]
568
 
569
+
570
  4%|β–Ž | 375/10682 [03:17<1:24:36, 2.03it/s]
571
  4%|β–Ž | 376/10682 [03:18<1:25:00, 2.02it/s]
572
  4%|β–Ž | 377/10682 [03:18<1:24:53, 2.02it/s]
573
  4%|β–Ž | 378/10682 [03:19<1:24:43, 2.03it/s]
574
  4%|β–Ž | 379/10682 [03:19<1:24:39, 2.03it/s]
575
  4%|β–Ž | 380/10682 [03:20<1:24:37, 2.03it/s]
576
  4%|β–Ž | 381/10682 [03:20<1:24:30, 2.03it/s]
577
  4%|β–Ž | 382/10682 [03:21<1:24:32, 2.03it/s]
578
  4%|β–Ž | 383/10682 [03:21<1:24:28, 2.03it/s]
579
  4%|β–Ž | 384/10682 [03:22<1:24:27, 2.03it/s]
580
  4%|β–Ž | 385/10682 [03:22<1:24:28, 2.03it/s]
581
  4%|β–Ž | 386/10682 [03:23<1:24:22, 2.03it/s]
582
  4%|β–Ž | 387/10682 [03:23<1:24:24, 2.03it/s]
583
  4%|β–Ž | 388/10682 [03:24<1:24:26, 2.03it/s]
584
  4%|β–Ž | 389/10682 [03:24<1:24:25, 2.03it/s]
585
  4%|β–Ž | 390/10682 [03:25<1:24:25, 2.03it/s]
586
  4%|β–Ž | 391/10682 [03:25<1:24:25, 2.03it/s]
587
  4%|β–Ž | 392/10682 [03:26<1:24:20, 2.03it/s]
588
  4%|β–Ž | 393/10682 [03:26<1:24:23, 2.03it/s]
589
  4%|β–Ž | 394/10682 [03:27<1:24:21, 2.03it/s]
590
  4%|β–Ž | 395/10682 [03:27<1:24:21, 2.03it/s]
591
  4%|β–Ž | 396/10682 [03:28<1:24:23, 2.03it/s]
592
  4%|β–Ž | 397/10682 [03:28<1:24:28, 2.03it/s]
593
  4%|β–Ž | 398/10682 [03:29<1:24:28, 2.03it/s]
594
  4%|β–Ž | 399/10682 [03:29<1:24:28, 2.03it/s]
595
  4%|β–Ž | 400/10682 [03:30<1:24:31, 2.03it/s]{'loss': 5.375, 'grad_norm': 0.6424997448921204, 'learning_rate': 0.00037418147801683815, 'epoch': 0.52}
596
+
597
 
598
  4%|β–Ž | 400/10682 [03:30<1:24:31, 2.03it/s]
599
  4%|▍ | 401/10682 [03:30<1:24:46, 2.02it/s]
600
  4%|▍ | 402/10682 [03:31<1:24:39, 2.02it/s]
601
  4%|▍ | 403/10682 [03:31<1:24:35, 2.03it/s]
602
  4%|▍ | 404/10682 [03:32<1:24:31, 2.03it/s]
603
  4%|▍ | 405/10682 [03:32<1:24:23, 2.03it/s]
604
  4%|▍ | 406/10682 [03:33<1:24:22, 2.03it/s]
605
  4%|▍ | 407/10682 [03:33<1:24:23, 2.03it/s]
606
  4%|▍ | 408/10682 [03:34<1:24:18, 2.03it/s]
607
  4%|▍ | 409/10682 [03:34<1:24:17, 2.03it/s]
608
  4%|▍ | 410/10682 [03:35<1:24:17, 2.03it/s]
609
  4%|▍ | 411/10682 [03:35<1:24:12, 2.03it/s]
610
  4%|▍ | 412/10682 [03:36<1:24:08, 2.03it/s]
611
  4%|▍ | 413/10682 [03:36<1:24:10, 2.03it/s]
612
  4%|▍ | 414/10682 [03:37<1:24:06, 2.03it/s]
613
  4%|▍ | 415/10682 [03:37<1:24:09, 2.03it/s]
614
  4%|▍ | 416/10682 [03:38<1:24:10, 2.03it/s]
615
  4%|▍ | 417/10682 [03:38<1:24:06, 2.03it/s]
616
  4%|▍ | 418/10682 [03:39<1:24:06, 2.03it/s]
617
  4%|▍ | 419/10682 [03:39<1:24:08, 2.03it/s]
618
  4%|▍ | 420/10682 [03:40<1:24:08, 2.03it/s]
619
  4%|▍ | 421/10682 [03:40<1:24:09, 2.03it/s]
620
  4%|▍ | 422/10682 [03:41<1:24:08, 2.03it/s]
621
  4%|▍ | 423/10682 [03:41<1:24:06, 2.03it/s]
622
  4%|▍ | 424/10682 [03:42<1:24:06, 2.03it/s]
623
  4%|▍ | 425/10682 [03:42<1:24:03, 2.03it/s]{'loss': 5.2944, 'grad_norm': 0.4700624942779541, 'learning_rate': 0.0003975678203928906, 'epoch': 0.56}
624
+
625
 
626
  4%|▍ | 425/10682 [03:42<1:24:03, 2.03it/s]
627
  4%|▍ | 426/10682 [03:43<1:24:12, 2.03it/s]
628
  4%|▍ | 427/10682 [03:43<1:24:08, 2.03it/s]
629
  4%|▍ | 428/10682 [03:43<1:24:06, 2.03it/s]
630
  4%|▍ | 429/10682 [03:44<1:24:09, 2.03it/s]
631
  4%|▍ | 430/10682 [03:44<1:24:07, 2.03it/s]
632
  4%|▍ | 431/10682 [03:45<1:24:08, 2.03it/s]
633
  4%|▍ | 432/10682 [03:45<1:24:04, 2.03it/s]
634
  4%|▍ | 433/10682 [03:46<1:23:59, 2.03it/s]
635
  4%|▍ | 434/10682 [03:46<1:24:01, 2.03it/s]
636
  4%|▍ | 435/10682 [03:47<1:23:59, 2.03it/s]
637
  4%|▍ | 436/10682 [03:47<1:24:02, 2.03it/s]
638
  4%|▍ | 437/10682 [03:48<1:24:03, 2.03it/s]
639
  4%|▍ | 438/10682 [03:48<1:23:58, 2.03it/s]
640
  4%|▍ | 439/10682 [03:49<1:24:02, 2.03it/s]
641
  4%|▍ | 440/10682 [03:49<1:23:57, 2.03it/s]
642
  4%|▍ | 441/10682 [03:50<1:23:53, 2.03it/s]
643
  4%|▍ | 442/10682 [03:50<1:23:57, 2.03it/s]
644
  4%|▍ | 443/10682 [03:51<1:23:55, 2.03it/s]
645
  4%|▍ | 444/10682 [03:51<1:23:51, 2.03it/s]
646
  4%|▍ | 445/10682 [03:52<1:23:58, 2.03it/s]
647
  4%|▍ | 446/10682 [03:52<1:23:52, 2.03it/s]
648
  4%|▍ | 447/10682 [03:53<1:23:56, 2.03it/s]
649
  4%|▍ | 448/10682 [03:53<1:23:55, 2.03it/s]
650
  4%|▍ | 449/10682 [03:54<1:23:52, 2.03it/s]
651
  4%|▍ | 450/10682 [03:54<1:23:56, 2.03it/s]
652
  {'loss': 5.2223, 'grad_norm': 0.4889560043811798, 'learning_rate': 0.00042095416276894293, 'epoch': 0.59}
653
+
654
  4%|▍ | 450/10682 [03:54<1:23:56, 2.03it/s]
655
  4%|▍ | 451/10682 [03:55<1:23:56, 2.03it/s]
656
  4%|▍ | 452/10682 [03:55<1:23:55, 2.03it/s]
657
  4%|▍ | 453/10682 [03:56<1:23:53, 2.03it/s]
658
  4%|▍ | 454/10682 [03:56<1:23:48, 2.03it/s]
659
  4%|▍ | 455/10682 [03:57<1:23:58, 2.03it/s]
660
  4%|▍ | 456/10682 [03:57<1:23:51, 2.03it/s]
661
  4%|▍ | 457/10682 [03:58<1:23:51, 2.03it/s]
662
  4%|▍ | 458/10682 [03:58<1:23:50, 2.03it/s]
663
  4%|▍ | 459/10682 [03:59<1:23:47, 2.03it/s]
664
  4%|▍ | 460/10682 [03:59<1:23:45, 2.03it/s]
665
  4%|▍ | 461/10682 [04:00<1:23:48, 2.03it/s]
666
  4%|▍ | 462/10682 [04:00<1:23:44, 2.03it/s]
667
  4%|▍ | 463/10682 [04:01<1:23:48, 2.03it/s]
668
  4%|▍ | 464/10682 [04:01<1:23:47, 2.03it/s]
669
  4%|▍ | 465/10682 [04:02<1:23:48, 2.03it/s]
670
  4%|▍ | 466/10682 [04:02<1:23:50, 2.03it/s]
671
  4%|▍ | 467/10682 [04:03<1:23:53, 2.03it/s]
672
  4%|▍ | 468/10682 [04:03<1:23:52, 2.03it/s]
673
  4%|▍ | 469/10682 [04:04<1:23:51, 2.03it/s]
674
  4%|▍ | 470/10682 [04:04<1:23:49, 2.03it/s]
675
  4%|▍ | 471/10682 [04:05<1:23:48, 2.03it/s]
676
  4%|▍ | 472/10682 [04:05<1:23:45, 2.03it/s]
677
  4%|▍ | 473/10682 [04:06<1:23:50, 2.03it/s]
678
  4%|▍ | 474/10682 [04:06<1:23:45, 2.03it/s]
679
  4%|▍ | 475/10682 [04:07<1:23:40, 2.03it/s]{'loss': 5.1492, 'grad_norm': 0.5106998682022095, 'learning_rate': 0.0004443405051449954, 'epoch': 0.62}
680
+
681
 
682
  4%|▍ | 475/10682 [04:07<1:23:40, 2.03it/s]
683
  4%|▍ | 476/10682 [04:07<1:23:50, 2.03it/s]
684
  4%|▍ | 477/10682 [04:08<1:23:45, 2.03it/s]
685
  4%|▍ | 478/10682 [04:08<1:23:44, 2.03it/s]
686
  4%|▍ | 479/10682 [04:09<1:23:43, 2.03it/s]
687
  4%|▍ | 480/10682 [04:09<1:23:37, 2.03it/s]
688
  5%|▍ | 481/10682 [04:10<1:23:37, 2.03it/s]
689
  5%|▍ | 482/10682 [04:10<1:23:40, 2.03it/s]
690
  5%|▍ | 483/10682 [04:11<1:23:35, 2.03it/s]
691
  5%|▍ | 484/10682 [04:11<1:23:36, 2.03it/s]
692
  5%|▍ | 485/10682 [04:12<1:23:39, 2.03it/s]
693
  5%|▍ | 486/10682 [04:12<1:23:37, 2.03it/s]
694
  5%|▍ | 487/10682 [04:13<1:23:40, 2.03it/s]
695
  5%|▍ | 488/10682 [04:13<1:23:36, 2.03it/s]
696
  5%|▍ | 489/10682 [04:14<1:23:36, 2.03it/s]
697
  5%|▍ | 490/10682 [04:14<1:23:43, 2.03it/s]
698
  5%|▍ | 491/10682 [04:14<1:23:37, 2.03it/s]
699
  5%|▍ | 492/10682 [04:15<1:23:40, 2.03it/s]
700
  5%|▍ | 493/10682 [04:15<1:23:37, 2.03it/s]
701
  5%|▍ | 494/10682 [04:16<1:23:39, 2.03it/s]
702
  5%|▍ | 495/10682 [04:16<1:23:40, 2.03it/s]
703
  5%|▍ | 496/10682 [04:17<1:23:36, 2.03it/s]
704
  5%|▍ | 497/10682 [04:17<1:23:34, 2.03it/s]
705
  5%|▍ | 498/10682 [04:18<1:23:35, 2.03it/s]
706
  5%|▍ | 499/10682 [04:18<1:23:34, 2.03it/s]
707
  5%|▍ | 500/10682 [04:19<1:23:36, 2.03it/s]
708
  {'loss': 5.0961, 'grad_norm': 0.5852717161178589, 'learning_rate': 0.0004677268475210477, 'epoch': 0.66}
709
+
710
  5%|▍ | 500/10682 [04:19<1:23:36, 2.03it/s]
711
  5%|▍ | 501/10682 [04:19<1:23:38, 2.03it/s]
712
  5%|▍ | 502/10682 [04:20<1:23:35, 2.03it/s]
713
  5%|▍ | 503/10682 [04:20<1:23:33, 2.03it/s]
714
  5%|▍ | 504/10682 [04:21<1:23:27, 2.03it/s]
715
  5%|▍ | 505/10682 [04:21<1:23:25, 2.03it/s]
716
  5%|▍ | 506/10682 [04:22<1:23:30, 2.03it/s]
717
  5%|▍ | 507/10682 [04:22<1:23:32, 2.03it/s]
718
  5%|▍ | 508/10682 [04:23<1:23:34, 2.03it/s]
719
  5%|▍ | 509/10682 [04:23<1:23:31, 2.03it/s]
720
  5%|▍ | 510/10682 [04:24<1:23:28, 2.03it/s]
721
  5%|▍ | 511/10682 [04:24<1:23:30, 2.03it/s]
722
  5%|▍ | 512/10682 [04:25<1:23:30, 2.03it/s]
723
  5%|▍ | 513/10682 [04:25<1:23:31, 2.03it/s]
724
  5%|▍ | 514/10682 [04:26<1:23:28, 2.03it/s]
725
  5%|▍ | 515/10682 [04:26<1:23:25, 2.03it/s]
726
  5%|▍ | 516/10682 [04:27<1:23:24, 2.03it/s]
727
  5%|▍ | 517/10682 [04:27<1:23:23, 2.03it/s]
728
  5%|▍ | 518/10682 [04:28<1:23:19, 2.03it/s]
729
  5%|▍ | 519/10682 [04:28<1:23:23, 2.03it/s]
730
  5%|▍ | 520/10682 [04:29<1:23:21, 2.03it/s]
731
  5%|▍ | 521/10682 [04:29<1:23:23, 2.03it/s]
732
  5%|▍ | 522/10682 [04:30<1:23:26, 2.03it/s]
733
  5%|▍ | 523/10682 [04:30<1:23:26, 2.03it/s]
734
  5%|▍ | 524/10682 [04:31<1:23:26, 2.03it/s]
735
  5%|▍ | 525/10682 [04:31<1:23:17, 2.03it/s]{'loss': 5.0379, 'grad_norm': 0.4721851348876953, 'learning_rate': 0.0004911131898971, 'epoch': 0.69}
736
+
737
 
738
  5%|▍ | 525/10682 [04:31<1:23:17, 2.03it/s]
739
  5%|▍ | 526/10682 [04:32<1:23:25, 2.03it/s]
740
  5%|▍ | 527/10682 [04:32<1:23:22, 2.03it/s]
741
  5%|▍ | 528/10682 [04:33<1:23:13, 2.03it/s]
742
  5%|▍ | 529/10682 [04:33<1:23:15, 2.03it/s]
743
  5%|▍ | 530/10682 [04:34<1:23:11, 2.03it/s]
744
  5%|▍ | 531/10682 [04:34<1:23:05, 2.04it/s]
745
  5%|▍ | 532/10682 [04:35<1:23:09, 2.03it/s]
746
  5%|▍ | 533/10682 [04:35<1:23:10, 2.03it/s]
747
  5%|▍ | 534/10682 [04:36<1:23:07, 2.03it/s]
748
  5%|β–Œ | 535/10682 [04:36<1:23:09, 2.03it/s]
749
  5%|β–Œ | 536/10682 [04:37<1:23:13, 2.03it/s]
750
  5%|β–Œ | 537/10682 [04:37<1:23:13, 2.03it/s]
751
  5%|β–Œ | 538/10682 [04:38<1:23:09, 2.03it/s]
752
  5%|β–Œ | 539/10682 [04:38<1:23:12, 2.03it/s]
753
  5%|β–Œ | 540/10682 [04:39<1:23:08, 2.03it/s]
754
  5%|β–Œ | 541/10682 [04:39<1:23:06, 2.03it/s]
755
  5%|β–Œ | 542/10682 [04:40<1:23:10, 2.03it/s]
756
  5%|β–Œ | 543/10682 [04:40<1:23:03, 2.03it/s]
757
  5%|β–Œ | 544/10682 [04:41<1:22:57, 2.04it/s]
758
  5%|β–Œ | 545/10682 [04:41<1:22:57, 2.04it/s]
759
  5%|β–Œ | 546/10682 [04:42<1:23:04, 2.03it/s]
760
  5%|β–Œ | 547/10682 [04:42<1:23:01, 2.03it/s]
761
  5%|β–Œ | 548/10682 [04:43<1:23:04, 2.03it/s]
762
  5%|β–Œ | 549/10682 [04:43<1:23:11, 2.03it/s]
763
  5%|β–Œ | 550/10682 [04:44<1:23:07, 2.03it/s]
764
  {'loss': 4.9823, 'grad_norm': 0.5419530272483826, 'learning_rate': 0.0005144995322731525, 'epoch': 0.72}
765
+
766
  5%|β–Œ | 550/10682 [04:44<1:23:07, 2.03it/s]
767
  5%|β–Œ | 551/10682 [04:44<1:23:14, 2.03it/s]
768
  5%|β–Œ | 552/10682 [04:45<1:23:08, 2.03it/s]
769
  5%|β–Œ | 553/10682 [04:45<1:23:05, 2.03it/s]
770
  5%|β–Œ | 554/10682 [04:46<1:23:07, 2.03it/s]
771
  5%|β–Œ | 555/10682 [04:46<1:23:04, 2.03it/s]
772
  5%|β–Œ | 556/10682 [04:46<1:23:02, 2.03it/s]
773
  5%|β–Œ | 557/10682 [04:47<1:23:02, 2.03it/s]
774
  5%|β–Œ | 558/10682 [04:47<1:22:53, 2.04it/s]
775
  5%|β–Œ | 559/10682 [04:48<1:22:56, 2.03it/s]
776
  5%|β–Œ | 560/10682 [04:48<1:22:58, 2.03it/s]
777
  5%|β–Œ | 561/10682 [04:49<1:22:57, 2.03it/s]
778
  5%|β–Œ | 562/10682 [04:49<1:23:01, 2.03it/s]
779
  5%|β–Œ | 563/10682 [04:50<1:22:55, 2.03it/s]
780
  5%|β–Œ | 564/10682 [04:50<1:22:50, 2.04it/s]
781
  5%|β–Œ | 565/10682 [04:51<1:22:55, 2.03it/s]
782
  5%|β–Œ | 566/10682 [04:51<1:22:54, 2.03it/s]
783
  5%|β–Œ | 567/10682 [04:52<1:22:52, 2.03it/s]
784
  5%|β–Œ | 568/10682 [04:52<1:22:55, 2.03it/s]
785
  5%|β–Œ | 569/10682 [04:53<1:22:49, 2.03it/s]
786
  5%|β–Œ | 570/10682 [04:53<1:22:49, 2.03it/s]
787
  5%|β–Œ | 571/10682 [04:54<1:22:56, 2.03it/s]
788
  5%|β–Œ | 572/10682 [04:54<1:22:55, 2.03it/s]
789
  5%|β–Œ | 573/10682 [04:55<1:22:58, 2.03it/s]
790
  5%|β–Œ | 574/10682 [04:55<1:22:51, 2.03it/s]
791
  5%|β–Œ | 575/10682 [04:56<1:22:54, 2.03it/s]
792
  {'loss': 4.9327, 'grad_norm': 0.5166158080101013, 'learning_rate': 0.0005378858746492049, 'epoch': 0.75}
793
+
794
  5%|β–Œ | 575/10682 [04:56<1:22:54, 2.03it/s]
795
  5%|β–Œ | 576/10682 [04:56<1:23:00, 2.03it/s]
796
  5%|β–Œ | 577/10682 [04:57<1:22:52, 2.03it/s]
797
  5%|β–Œ | 578/10682 [04:57<1:22:54, 2.03it/s]
798
  5%|β–Œ | 579/10682 [04:58<1:22:50, 2.03it/s]
799
  5%|β–Œ | 580/10682 [04:58<1:22:46, 2.03it/s]
800
  5%|β–Œ | 581/10682 [04:59<1:22:49, 2.03it/s]
801
  5%|β–Œ | 582/10682 [04:59<1:22:44, 2.03it/s]
802
  5%|β–Œ | 583/10682 [05:00<1:22:42, 2.04it/s]
803
  5%|β–Œ | 584/10682 [05:00<1:22:46, 2.03it/s]
804
  5%|β–Œ | 585/10682 [05:01<1:22:42, 2.03it/s]
805
  5%|β–Œ | 586/10682 [05:01<1:22:45, 2.03it/s]
806
  5%|β–Œ | 587/10682 [05:02<1:22:45, 2.03it/s]
807
  6%|β–Œ | 588/10682 [05:02<1:22:42, 2.03it/s]
808
  6%|β–Œ | 589/10682 [05:03<1:22:43, 2.03it/s]
809
  6%|β–Œ | 590/10682 [05:03<1:22:42, 2.03it/s]
810
  6%|β–Œ | 591/10682 [05:04<1:22:41, 2.03it/s]
811
  6%|β–Œ | 592/10682 [05:04<1:22:46, 2.03it/s]
812
  6%|β–Œ | 593/10682 [05:05<1:22:41, 2.03it/s]
813
  6%|β–Œ | 594/10682 [05:05<1:22:42, 2.03it/s]
814
  6%|β–Œ | 595/10682 [05:06<1:22:45, 2.03it/s]
815
  6%|β–Œ | 596/10682 [05:06<1:22:45, 2.03it/s]
816
  6%|β–Œ | 597/10682 [05:07<1:22:45, 2.03it/s]
817
  6%|β–Œ | 598/10682 [05:07<1:22:40, 2.03it/s]
818
  6%|β–Œ | 599/10682 [05:08<1:22:39, 2.03it/s]
819
  6%|β–Œ | 600/10682 [05:08<1:22:42, 2.03it/s]
820
  {'loss': 4.8904, 'grad_norm': 0.47772836685180664, 'learning_rate': 0.0005612722170252572, 'epoch': 0.79}
821
+
822
  6%|β–Œ | 600/10682 [05:08<1:22:42, 2.03it/s]
823
  6%|β–Œ | 601/10682 [05:09<1:22:56, 2.03it/s]
824
  6%|β–Œ | 602/10682 [05:09<1:22:56, 2.03it/s]
825
  6%|β–Œ | 603/10682 [05:10<1:22:49, 2.03it/s]
826
  6%|β–Œ | 604/10682 [05:10<1:22:51, 2.03it/s]
827
  6%|β–Œ | 605/10682 [05:11<1:22:47, 2.03it/s]
828
  6%|β–Œ | 606/10682 [05:11<1:22:42, 2.03it/s]
829
  6%|β–Œ | 607/10682 [05:12<1:22:46, 2.03it/s]
830
  6%|β–Œ | 608/10682 [05:12<1:22:41, 2.03it/s]
831
  6%|β–Œ | 609/10682 [05:13<1:29:38, 1.87it/s]
832
  6%|β–Œ | 610/10682 [05:13<1:27:31, 1.92it/s]
833
  6%|β–Œ | 611/10682 [05:14<1:25:59, 1.95it/s]
834
  6%|β–Œ | 612/10682 [05:14<1:24:58, 1.98it/s]
835
  6%|β–Œ | 613/10682 [05:15<1:24:20, 1.99it/s]
836
  6%|β–Œ | 614/10682 [05:15<1:23:43, 2.00it/s]
837
  6%|β–Œ | 615/10682 [05:16<1:23:26, 2.01it/s]
838
  6%|β–Œ | 616/10682 [05:16<1:30:13, 1.86it/s]
839
  6%|β–Œ | 617/10682 [05:17<1:27:49, 1.91it/s]
840
  6%|β–Œ | 618/10682 [05:17<1:26:16, 1.94it/s]
841
  6%|β–Œ | 619/10682 [05:18<1:25:07, 1.97it/s]
842
  6%|β–Œ | 620/10682 [05:18<1:24:15, 1.99it/s]
843
  6%|β–Œ | 621/10682 [05:19<1:23:44, 2.00it/s]
844
  6%|β–Œ | 622/10682 [05:19<1:23:18, 2.01it/s]
845
  6%|β–Œ | 623/10682 [05:20<1:23:01, 2.02it/s]
846
  6%|β–Œ | 624/10682 [05:20<1:22:56, 2.02it/s]
847
  6%|β–Œ | 625/10682 [05:21<1:22:46, 2.03it/s]
848
  {'loss': 4.84, 'grad_norm': 0.46007564663887024, 'learning_rate': 0.0005846585594013096, 'epoch': 0.82}
849
+
850
  6%|β–Œ | 625/10682 [05:21<1:22:46, 2.03it/s]
851
  6%|β–Œ | 626/10682 [05:21<1:22:47, 2.02it/s]
852
  6%|β–Œ | 627/10682 [05:22<1:22:42, 2.03it/s]
853
  6%|β–Œ | 628/10682 [05:22<1:22:37, 2.03it/s]
854
  6%|β–Œ | 629/10682 [05:23<1:22:35, 2.03it/s]
855
  6%|β–Œ | 630/10682 [05:23<1:22:34, 2.03it/s]
856
  6%|β–Œ | 631/10682 [05:24<1:22:36, 2.03it/s]
857
  6%|β–Œ | 632/10682 [05:24<1:22:34, 2.03it/s]
858
  6%|β–Œ | 633/10682 [05:25<1:22:31, 2.03it/s]
859
  6%|β–Œ | 634/10682 [05:25<1:22:29, 2.03it/s]
860
  6%|β–Œ | 635/10682 [05:26<1:22:27, 2.03it/s]
861
  6%|β–Œ | 636/10682 [05:26<1:22:26, 2.03it/s]
862
  6%|β–Œ | 637/10682 [05:27<1:22:25, 2.03it/s]
863
  6%|β–Œ | 638/10682 [05:27<1:22:25, 2.03it/s]
864
  6%|β–Œ | 639/10682 [05:28<1:22:22, 2.03it/s]
865
  6%|β–Œ | 640/10682 [05:28<1:22:25, 2.03it/s]
866
  6%|β–Œ | 641/10682 [05:29<1:22:22, 2.03it/s]
867
  6%|β–Œ | 642/10682 [05:29<1:22:34, 2.03it/s]
868
  6%|β–Œ | 643/10682 [05:30<1:22:29, 2.03it/s]
869
  6%|β–Œ | 644/10682 [05:30<1:22:26, 2.03it/s]
870
  6%|β–Œ | 645/10682 [05:31<1:22:23, 2.03it/s]
871
  6%|β–Œ | 646/10682 [05:31<1:22:21, 2.03it/s]
872
  6%|β–Œ | 647/10682 [05:32<1:22:23, 2.03it/s]
873
  6%|β–Œ | 648/10682 [05:32<1:22:19, 2.03it/s]
874
  6%|β–Œ | 649/10682 [05:33<1:22:13, 2.03it/s]
875
  6%|β–Œ | 650/10682 [05:33<1:22:13, 2.03it/s]{'loss': 4.8005, 'grad_norm': 0.5678160190582275, 'learning_rate': 0.0006080449017773621, 'epoch': 0.85}
876
+
877
 
878
  6%|β–Œ | 650/10682 [05:33<1:22:13, 2.03it/s]
879
  6%|β–Œ | 651/10682 [05:34<1:22:16, 2.03it/s]
880
  6%|β–Œ | 652/10682 [05:34<1:22:10, 2.03it/s]
881
  6%|β–Œ | 653/10682 [05:35<1:22:16, 2.03it/s]
882
  6%|β–Œ | 654/10682 [05:35<1:22:15, 2.03it/s]
883
  6%|β–Œ | 655/10682 [05:35<1:22:15, 2.03it/s]
884
  6%|β–Œ | 656/10682 [05:36<1:22:15, 2.03it/s]
885
  6%|β–Œ | 657/10682 [05:36<1:22:10, 2.03it/s]
886
  6%|β–Œ | 658/10682 [05:37<1:22:12, 2.03it/s]
887
  6%|β–Œ | 659/10682 [05:37<1:22:11, 2.03it/s]
888
  6%|β–Œ | 660/10682 [05:38<1:22:09, 2.03it/s]
889
  6%|β–Œ | 661/10682 [05:38<1:22:13, 2.03it/s]
890
  6%|β–Œ | 662/10682 [05:39<1:22:08, 2.03it/s]
891
  6%|β–Œ | 663/10682 [05:39<1:22:09, 2.03it/s]
892
  6%|β–Œ | 664/10682 [05:40<1:22:09, 2.03it/s]
893
  6%|β–Œ | 665/10682 [05:40<1:22:09, 2.03it/s]
894
  6%|β–Œ | 666/10682 [05:41<1:22:07, 2.03it/s]
895
  6%|β–Œ | 667/10682 [05:41<1:22:09, 2.03it/s]
896
  6%|β–‹ | 668/10682 [05:42<1:22:09, 2.03it/s]
897
  6%|β–‹ | 669/10682 [05:42<1:22:11, 2.03it/s]
898
  6%|β–‹ | 670/10682 [05:43<1:22:10, 2.03it/s]
899
  6%|β–‹ | 671/10682 [05:43<1:22:05, 2.03it/s]
900
  6%|β–‹ | 672/10682 [05:44<1:22:01, 2.03it/s]
901
  6%|β–‹ | 673/10682 [05:44<1:22:07, 2.03it/s]
902
  6%|β–‹ | 674/10682 [05:45<1:22:03, 2.03it/s]
903
  6%|β–‹ | 675/10682 [05:45<1:22:01, 2.03it/s]
904
  {'loss': 4.7671, 'grad_norm': 0.4880385100841522, 'learning_rate': 0.0006314312441534145, 'epoch': 0.88}
905
+
906
  6%|β–‹ | 675/10682 [05:45<1:22:01, 2.03it/s]
907
  6%|β–‹ | 676/10682 [05:46<1:22:10, 2.03it/s]
908
  6%|β–‹ | 677/10682 [05:46<1:22:05, 2.03it/s]
909
  6%|β–‹ | 678/10682 [05:47<1:22:01, 2.03it/s]
910
  6%|β–‹ | 679/10682 [05:47<1:22:00, 2.03it/s]
911
  6%|β–‹ | 680/10682 [05:48<1:22:00, 2.03it/s]
912
  6%|β–‹ | 681/10682 [05:48<1:21:58, 2.03it/s]
913
  6%|β–‹ | 682/10682 [05:49<1:22:00, 2.03it/s]
914
  6%|β–‹ | 683/10682 [05:49<1:22:00, 2.03it/s]
915
  6%|β–‹ | 684/10682 [05:50<1:21:53, 2.03it/s]
916
  6%|β–‹ | 685/10682 [05:50<1:21:53, 2.03it/s]
917
  6%|β–‹ | 686/10682 [05:51<1:21:58, 2.03it/s]
918
  6%|β–‹ | 687/10682 [05:51<1:21:59, 2.03it/s]
919
  6%|β–‹ | 688/10682 [05:52<1:21:58, 2.03it/s]
920
  6%|β–‹ | 689/10682 [05:52<1:21:58, 2.03it/s]
921
  6%|β–‹ | 690/10682 [05:53<1:21:54, 2.03it/s]
922
  6%|β–‹ | 691/10682 [05:53<1:21:55, 2.03it/s]
923
  6%|β–‹ | 692/10682 [05:54<1:21:55, 2.03it/s]
924
  6%|β–‹ | 693/10682 [05:54<1:21:52, 2.03it/s]
925
  6%|β–‹ | 694/10682 [05:55<1:21:53, 2.03it/s]
926
  7%|β–‹ | 695/10682 [05:55<1:21:52, 2.03it/s]
927
  7%|β–‹ | 696/10682 [05:56<1:21:49, 2.03it/s]
928
  7%|β–‹ | 697/10682 [05:56<1:21:49, 2.03it/s]
929
  7%|β–‹ | 698/10682 [05:57<1:21:50, 2.03it/s]
930
  7%|β–‹ | 699/10682 [05:57<1:21:52, 2.03it/s]
931
  7%|β–‹ | 700/10682 [05:58<1:21:56, 2.03it/s]{'loss': 4.7325, 'grad_norm': 0.42659100890159607, 'learning_rate': 0.0006548175865294667, 'epoch': 0.92}
932
+
933
 
934
  7%|β–‹ | 700/10682 [05:58<1:21:56, 2.03it/s]
935
  7%|β–‹ | 701/10682 [05:58<1:22:04, 2.03it/s]
936
  7%|β–‹ | 702/10682 [05:59<1:22:01, 2.03it/s]
937
  7%|β–‹ | 703/10682 [05:59<1:21:58, 2.03it/s]
938
  7%|β–‹ | 704/10682 [06:00<1:21:55, 2.03it/s]
939
  7%|β–‹ | 705/10682 [06:00<1:21:51, 2.03it/s]
940
  7%|β–‹ | 706/10682 [06:01<1:21:51, 2.03it/s]
941
  7%|β–‹ | 707/10682 [06:01<1:21:50, 2.03it/s]
942
  7%|β–‹ | 708/10682 [06:02<1:21:47, 2.03it/s]
943
  7%|β–‹ | 709/10682 [06:02<1:21:49, 2.03it/s]
944
  7%|β–‹ | 710/10682 [06:03<1:21:50, 2.03it/s]
945
  7%|β–‹ | 711/10682 [06:03<1:21:46, 2.03it/s]
946
  7%|β–‹ | 712/10682 [06:04<1:21:52, 2.03it/s]
947
  7%|β–‹ | 713/10682 [06:04<1:21:42, 2.03it/s]
948
  7%|β–‹ | 714/10682 [06:05<1:21:42, 2.03it/s]
949
  7%|β–‹ | 715/10682 [06:05<1:21:42, 2.03it/s]
950
  7%|β–‹ | 716/10682 [06:06<1:21:38, 2.03it/s]
951
  7%|β–‹ | 717/10682 [06:06<1:21:44, 2.03it/s]
952
  7%|β–‹ | 718/10682 [06:06<1:21:44, 2.03it/s]
953
  7%|β–‹ | 719/10682 [06:07<1:21:40, 2.03it/s]
954
  7%|β–‹ | 720/10682 [06:07<1:21:42, 2.03it/s]
955
  7%|β–‹ | 721/10682 [06:08<1:21:39, 2.03it/s]
956
  7%|β–‹ | 722/10682 [06:08<1:21:44, 2.03it/s]
957
  7%|β–‹ | 723/10682 [06:09<1:21:43, 2.03it/s]
958
  7%|β–‹ | 724/10682 [06:09<1:21:41, 2.03it/s]
959
  7%|β–‹ | 725/10682 [06:10<1:21:41, 2.03it/s]{'loss': 4.7051, 'grad_norm': 0.42874085903167725, 'learning_rate': 0.0006782039289055192, 'epoch': 0.95}
960
 
961
+
962
  7%|β–‹ | 725/10682 [06:10<1:21:41, 2.03it/s]
963
  7%|β–‹ | 726/10682 [06:10<1:21:43, 2.03it/s]
964
  7%|β–‹ | 727/10682 [06:11<1:21:45, 2.03it/s]
965
  7%|β–‹ | 728/10682 [06:11<1:21:43, 2.03it/s]
966
  7%|β–‹ | 729/10682 [06:12<1:21:43, 2.03it/s]
967
  7%|β–‹ | 730/10682 [06:12<1:21:43, 2.03it/s]
968
  7%|β–‹ | 731/10682 [06:13<1:21:38, 2.03it/s]
969
  7%|β–‹ | 732/10682 [06:13<1:21:37, 2.03it/s]
970
  7%|β–‹ | 733/10682 [06:14<1:21:36, 2.03it/s]
971
  7%|β–‹ | 734/10682 [06:14<1:21:36, 2.03it/s]
972
  7%|β–‹ | 735/10682 [06:15<1:21:38, 2.03it/s]
973
  7%|β–‹ | 736/10682 [06:15<1:21:37, 2.03it/s]
974
  7%|β–‹ | 737/10682 [06:16<1:21:37, 2.03it/s]
975
  7%|β–‹ | 738/10682 [06:16<1:21:36, 2.03it/s]
976
  7%|β–‹ | 739/10682 [06:17<1:21:32, 2.03it/s]
977
  7%|β–‹ | 740/10682 [06:17<1:21:32, 2.03it/s]
978
  7%|β–‹ | 741/10682 [06:18<1:21:29, 2.03it/s]
979
  7%|β–‹ | 742/10682 [06:18<1:21:31, 2.03it/s]
980
  7%|β–‹ | 743/10682 [06:19<1:21:30, 2.03it/s]
981
  7%|β–‹ | 744/10682 [06:19<1:21:32, 2.03it/s]
982
  7%|β–‹ | 745/10682 [06:20<1:21:28, 2.03it/s]
983
  7%|β–‹ | 746/10682 [06:20<1:21:29, 2.03it/s]
984
  7%|β–‹ | 747/10682 [06:21<1:21:29, 2.03it/s]
985
  7%|β–‹ | 748/10682 [06:21<1:21:34, 2.03it/s]
986
  7%|β–‹ | 749/10682 [06:22<1:21:30, 2.03it/s]
987
  7%|β–‹ | 750/10682 [06:22<1:21:26, 2.03it/s]
988
  {'loss': 4.6758, 'grad_norm': 0.3773520588874817, 'learning_rate': 0.0007015902712815716, 'epoch': 0.98}
989
+
990
  7%|β–‹ | 750/10682 [06:22<1:21:26, 2.03it/s]
991
  7%|β–‹ | 751/10682 [06:23<1:21:31, 2.03it/s]
992
  7%|β–‹ | 752/10682 [06:23<1:21:26, 2.03it/s]
993
  7%|β–‹ | 753/10682 [06:24<1:21:25, 2.03it/s]
994
  7%|β–‹ | 754/10682 [06:24<1:21:27, 2.03it/s]
995
  7%|β–‹ | 755/10682 [06:25<1:21:21, 2.03it/s]
996
  7%|β–‹ | 756/10682 [06:25<1:21:19, 2.03it/s]
997
  7%|β–‹ | 757/10682 [06:26<1:21:21, 2.03it/s]
998
  7%|β–‹ | 758/10682 [06:26<1:21:23, 2.03it/s]
999
  7%|β–‹ | 759/10682 [06:27<1:21:27, 2.03it/s]
1000
  7%|β–‹ | 760/10682 [06:27<1:21:26, 2.03it/s]
1001
  7%|β–‹ | 761/10682 [06:28<1:21:26, 2.03it/s]
1002
  7%|β–‹ | 762/10682 [06:28<1:21:29, 2.03it/s]
1003
  7%|β–‹ | 763/10682 [06:29<1:20:48, 2.05it/s]
1004
  7%|β–‹ | 764/10682 [06:41<10:54:45, 3.96s/it]
1005
  7%|β–‹ | 765/10682 [06:41<8:02:42, 2.92s/it]
1006
  7%|β–‹ | 766/10682 [06:42<6:02:29, 2.19s/it]
1007
  7%|β–‹ | 767/10682 [06:42<4:38:05, 1.68s/it]
1008
  7%|β–‹ | 768/10682 [06:43<3:39:03, 1.33s/it]
1009
  7%|β–‹ | 769/10682 [06:43<2:57:38, 1.08s/it]
1010
  7%|β–‹ | 770/10682 [06:44<2:28:48, 1.11it/s]
1011
  7%|β–‹ | 771/10682 [06:44<2:09:50, 1.27it/s]
1012
  7%|β–‹ | 772/10682 [06:45<1:55:25, 1.43it/s]
1013
  7%|β–‹ | 773/10682 [06:45<1:45:07, 1.57it/s]
1014
  7%|β–‹ | 774/10682 [06:46<1:38:06, 1.68it/s]
1015
  7%|β–‹ | 775/10682 [06:46<1:33:05, 1.77it/s]{'loss': 4.6275, 'grad_norm': 0.46725699305534363, 'learning_rate': 0.0007249766136576241, 'epoch': 1.02}
1016
 
1017
+
1018
  7%|β–‹ | 775/10682 [06:46<1:33:05, 1.77it/s]
1019
  7%|β–‹ | 776/10682 [06:47<1:29:45, 1.84it/s]
1020
  7%|β–‹ | 777/10682 [06:47<1:27:14, 1.89it/s]
1021
  7%|β–‹ | 778/10682 [06:48<1:25:30, 1.93it/s]
1022
  7%|β–‹ | 779/10682 [06:48<1:24:15, 1.96it/s]
1023
  7%|β–‹ | 780/10682 [06:49<1:23:16, 1.98it/s]
1024
  7%|β–‹ | 781/10682 [06:49<1:22:32, 2.00it/s]
1025
  7%|β–‹ | 782/10682 [06:50<1:22:18, 2.00it/s]
1026
  7%|β–‹ | 783/10682 [06:50<1:21:57, 2.01it/s]
1027
  7%|β–‹ | 784/10682 [06:51<1:21:51, 2.02it/s]
1028
  7%|β–‹ | 785/10682 [06:51<1:21:38, 2.02it/s]
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cf3be7689dc496c68d9042ba7ef79c5166b2cf3d4a5717ebfc52a3a1503f499
3
+ size 5112