Fizzarolli committed
Commit 55c8b65 · 1 Parent(s): b9d48f4

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "EleutherAI/pythia-70m-deduped",
+   "architectures": [
+     "GPTNeoXForSequenceClassification"
+   ],
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.1,
+   "eos_token_id": 0,
+   "finetuning_task": "text-classification",
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.0,
+   "hidden_size": 512,
+   "id2label": {
+     "0": "0",
+     "1": "1"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "label2id": {
+     "0": 0,
+     "1": 1
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "gpt_neox",
+   "num_attention_heads": 8,
+   "num_hidden_layers": 6,
+   "pad_token_id": 1,
+   "problem_type": "single_label_classification",
+   "rope_scaling": null,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 0.25,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.40.0",
+   "use_cache": true,
+   "use_parallel_residual": true,
+   "vocab_size": 50304
+ }
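
The config above describes a 6-layer GPTNeoXForSequenceClassification head fine-tuned from EleutherAI/pythia-70m-deduped for binary single-label classification. A minimal loading sketch follows; the repo id used is a placeholder, not this repository's actual path.

```python
# Sketch: load a GPT-NeoX sequence-classification checkpoint like the one configured above.
# "your-username/pythia-70m-classifier" is a hypothetical repo id; a local checkpoint
# directory containing these files works the same way.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "your-username/pythia-70m-classifier"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id)

inputs = tokenizer("example text to classify", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits          # shape (1, 2), matching id2label
pred = logits.argmax(dim=-1).item()
print(model.config.id2label[pred])           # labels here are just "0" / "1"
```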
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1118e7e407c295adf98a69a551aac9d6816cb611646788c343edd055ab66f75f
+ size 178696672
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3a9a5936ab249c7ab9a75fcd851258c9cf4d44a5160702fae894b37b656d1ef
+ size 89894586
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91879dfe5c74e96eb6cf7f48ae0d302d4303d02663cbf4f239b508d954e630d6
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b44ab142c282ef925b5fc1e59342c63b1ff0d373b002203b20538609ebd3ecc1
+ size 1064
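
The four binary files above (model.safetensors, optimizer.pt, rng_state.pth, scheduler.pt) are committed as Git LFS pointer files: three lines giving the spec version, the blob's sha256 object id, and its size in bytes. A small sketch, assuming the real blob has already been downloaded next to the pointer, of verifying a download against those values:

```python
# Sketch: check a downloaded LFS blob against the oid/size recorded in its pointer file.
# The path is illustrative; the oid and size below are copied from the
# model.safetensors pointer shown above.
import hashlib

def verify_lfs_blob(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    digest = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

ok = verify_lfs_blob(
    "model.safetensors",
    "1118e7e407c295adf98a69a551aac9d6816cb611646788c343edd055ab66f75f",
    178696672,
)
print("model.safetensors OK" if ok else "checksum or size mismatch")
```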
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|padding|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,214 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|padding|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50254": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50255": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50256": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50257": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50258": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50259": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50260": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50261": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50262": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50263": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50264": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50265": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50266": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50267": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50268": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50269": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50270": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50271": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50272": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50273": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50274": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50275": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50276": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<|padding|>",
+   "tokenizer_class": "GPTNeoXTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
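
Together with special_tokens_map.json, this tokenizer config sets <|endoftext|> as BOS/EOS/UNK and a dedicated <|padding|> token (id 1, matching pad_token_id in config.json); model_max_length is a sentinel value meaning no fixed limit. A short sketch, again with a placeholder repo id, of confirming that setup after loading:

```python
# Sketch: load the tokenizer saved above and confirm its special-token setup.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-username/pythia-70m-classifier")  # placeholder id

print(tok.eos_token, tok.eos_token_id)   # expected: <|endoftext|> 0
print(tok.pad_token, tok.pad_token_id)   # expected: <|padding|> 1

# Batched encoding pads with <|padding|> rather than reusing EOS.
batch = tok(["short", "a somewhat longer example"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)
```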
trainer_state.json ADDED
@@ -0,0 +1,2391 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.4821594787465093,
5
+ "eval_steps": 50,
6
+ "global_step": 1500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008273864929155031,
13
+ "grad_norm": 193.22154235839844,
14
+ "learning_rate": 8.998478317951598e-06,
15
+ "loss": 1.2655,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.016547729858310063,
20
+ "grad_norm": 224.26678466796875,
21
+ "learning_rate": 8.993914300924725e-06,
22
+ "loss": 0.8096,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.024821594787465096,
27
+ "grad_norm": 233.3835906982422,
28
+ "learning_rate": 8.986311035578395e-06,
29
+ "loss": 0.7882,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.033095459716620125,
34
+ "grad_norm": 118.54664611816406,
35
+ "learning_rate": 8.97567366402478e-06,
36
+ "loss": 0.642,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.041369324645775155,
41
+ "grad_norm": 156.66001892089844,
42
+ "learning_rate": 8.962009380351584e-06,
43
+ "loss": 0.7024,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.04964318957493019,
48
+ "grad_norm": 272.93414306640625,
49
+ "learning_rate": 8.945327425756662e-06,
50
+ "loss": 0.6812,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.05791705450408522,
55
+ "grad_norm": 197.8726043701172,
56
+ "learning_rate": 8.925639082298161e-06,
57
+ "loss": 0.7415,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.06619091943324025,
62
+ "grad_norm": 114.77101135253906,
63
+ "learning_rate": 8.902957665264444e-06,
64
+ "loss": 0.6585,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.07446478436239529,
69
+ "grad_norm": 104.65145874023438,
70
+ "learning_rate": 8.8772985141689e-06,
71
+ "loss": 0.675,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.08273864929155031,
76
+ "grad_norm": 115.49458312988281,
77
+ "learning_rate": 8.848678982375792e-06,
78
+ "loss": 0.6154,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.08273864929155031,
83
+ "eval_accuracy": 0.6920930232558139,
84
+ "eval_loss": 0.6675890684127808,
85
+ "eval_runtime": 135.1534,
86
+ "eval_samples_per_second": 7.954,
87
+ "eval_steps_per_second": 7.954,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 0.09101251422070535,
92
+ "grad_norm": 236.3074951171875,
93
+ "learning_rate": 8.817118425364132e-06,
94
+ "loss": 0.627,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 0.09928637914986038,
99
+ "grad_norm": 75.98784637451172,
100
+ "learning_rate": 8.78263818763749e-06,
101
+ "loss": 0.5869,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 0.1075602440790154,
106
+ "grad_norm": 216.54205322265625,
107
+ "learning_rate": 8.745261588288655e-06,
108
+ "loss": 0.6362,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 0.11583410900817044,
113
+ "grad_norm": 145.553466796875,
114
+ "learning_rate": 8.705013905228855e-06,
115
+ "loss": 0.7391,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 0.12410797393732548,
120
+ "grad_norm": 62.87096405029297,
121
+ "learning_rate": 8.661922358092222e-06,
122
+ "loss": 0.6121,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 0.1323818388664805,
127
+ "grad_norm": 50.98234176635742,
128
+ "learning_rate": 8.616016089827077e-06,
129
+ "loss": 0.6115,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.14065570379563552,
134
+ "grad_norm": 63.97044372558594,
135
+ "learning_rate": 8.567326146986454e-06,
136
+ "loss": 0.6572,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 0.14892956872479057,
141
+ "grad_norm": 101.70757293701172,
142
+ "learning_rate": 8.51588545873122e-06,
143
+ "loss": 0.8113,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 0.1572034336539456,
148
+ "grad_norm": 174.54574584960938,
149
+ "learning_rate": 8.461728814559992e-06,
150
+ "loss": 0.7307,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 0.16547729858310062,
155
+ "grad_norm": 179.63229370117188,
156
+ "learning_rate": 8.404892840780868e-06,
157
+ "loss": 0.6292,
158
+ "step": 100
159
+ },
160
+ {
161
+ "epoch": 0.16547729858310062,
162
+ "eval_accuracy": 0.6846511627906977,
163
+ "eval_loss": 0.7192373871803284,
164
+ "eval_runtime": 136.6349,
165
+ "eval_samples_per_second": 7.868,
166
+ "eval_steps_per_second": 7.868,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 0.17375116351225567,
171
+ "grad_norm": 177.11233520507812,
172
+ "learning_rate": 8.345415975740966e-06,
173
+ "loss": 0.853,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 0.1820250284414107,
178
+ "grad_norm": 83.83477020263672,
179
+ "learning_rate": 8.283338443830433e-06,
180
+ "loss": 0.667,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 0.1902988933705657,
185
+ "grad_norm": 65.83185577392578,
186
+ "learning_rate": 8.21870222827856e-06,
187
+ "loss": 0.7123,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 0.19857275829972076,
192
+ "grad_norm": 99.53777313232422,
193
+ "learning_rate": 8.151551042760408e-06,
194
+ "loss": 0.6869,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 0.2068466232288758,
199
+ "grad_norm": 55.019622802734375,
200
+ "learning_rate": 8.081930301833101e-06,
201
+ "loss": 0.6729,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 0.2151204881580308,
206
+ "grad_norm": 99.40830993652344,
207
+ "learning_rate": 8.009887090221828e-06,
208
+ "loss": 0.6269,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 0.22339435308718586,
213
+ "grad_norm": 30.939523696899414,
214
+ "learning_rate": 7.935470130976281e-06,
215
+ "loss": 0.5331,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 0.23166821801634088,
220
+ "grad_norm": 195.37283325195312,
221
+ "learning_rate": 7.858729752519121e-06,
222
+ "loss": 0.6496,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 0.2399420829454959,
227
+ "grad_norm": 163.1190185546875,
228
+ "learning_rate": 7.779717854608697e-06,
229
+ "loss": 0.6195,
230
+ "step": 145
231
+ },
232
+ {
233
+ "epoch": 0.24821594787465096,
234
+ "grad_norm": 142.21194458007812,
235
+ "learning_rate": 7.69848787323908e-06,
236
+ "loss": 0.6002,
237
+ "step": 150
238
+ },
239
+ {
240
+ "epoch": 0.24821594787465096,
241
+ "eval_accuracy": 0.6781395348837209,
242
+ "eval_loss": 0.6401144862174988,
243
+ "eval_runtime": 135.4948,
244
+ "eval_samples_per_second": 7.934,
245
+ "eval_steps_per_second": 7.934,
246
+ "step": 150
247
+ },
248
+ {
249
+ "epoch": 0.256489812803806,
250
+ "grad_norm": 33.41708755493164,
251
+ "learning_rate": 7.615094744501132e-06,
252
+ "loss": 0.6123,
253
+ "step": 155
254
+ },
255
+ {
256
+ "epoch": 0.264763677732961,
257
+ "grad_norm": 134.9268798828125,
258
+ "learning_rate": 7.5295948674290595e-06,
259
+ "loss": 0.6182,
260
+ "step": 160
261
+ },
262
+ {
263
+ "epoch": 0.27303754266211605,
264
+ "grad_norm": 161.4491729736328,
265
+ "learning_rate": 7.442046065857564e-06,
266
+ "loss": 0.6316,
267
+ "step": 165
268
+ },
269
+ {
270
+ "epoch": 0.28131140759127105,
271
+ "grad_norm": 107.71475219726562,
272
+ "learning_rate": 7.352507549315408e-06,
273
+ "loss": 0.6307,
274
+ "step": 170
275
+ },
276
+ {
277
+ "epoch": 0.2895852725204261,
278
+ "grad_norm": 51.73357391357422,
279
+ "learning_rate": 7.261039872981816e-06,
280
+ "loss": 0.556,
281
+ "step": 175
282
+ },
283
+ {
284
+ "epoch": 0.29785913744958115,
285
+ "grad_norm": 76.03494262695312,
286
+ "learning_rate": 7.167704896732828e-06,
287
+ "loss": 0.7206,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 0.30613300237873614,
292
+ "grad_norm": 62.3538818359375,
293
+ "learning_rate": 7.0725657433052645e-06,
294
+ "loss": 0.5621,
295
+ "step": 185
296
+ },
297
+ {
298
+ "epoch": 0.3144068673078912,
299
+ "grad_norm": 61.0316047668457,
300
+ "learning_rate": 6.975686755606624e-06,
301
+ "loss": 0.6524,
302
+ "step": 190
303
+ },
304
+ {
305
+ "epoch": 0.32268073223704624,
306
+ "grad_norm": 29.230974197387695,
307
+ "learning_rate": 6.877133453199773e-06,
308
+ "loss": 0.5413,
309
+ "step": 195
310
+ },
311
+ {
312
+ "epoch": 0.33095459716620124,
313
+ "grad_norm": 137.9040069580078,
314
+ "learning_rate": 6.7769724879918564e-06,
315
+ "loss": 0.6446,
316
+ "step": 200
317
+ },
318
+ {
319
+ "epoch": 0.33095459716620124,
320
+ "eval_accuracy": 0.6837209302325581,
321
+ "eval_loss": 0.6216422319412231,
322
+ "eval_runtime": 134.8141,
323
+ "eval_samples_per_second": 7.974,
324
+ "eval_steps_per_second": 7.974,
325
+ "step": 200
326
+ },
327
+ {
328
+ "epoch": 0.3392284620953563,
329
+ "grad_norm": 267.3862609863281,
330
+ "learning_rate": 6.675271599157415e-06,
331
+ "loss": 0.6703,
332
+ "step": 205
333
+ },
334
+ {
335
+ "epoch": 0.34750232702451134,
336
+ "grad_norm": 50.05187225341797,
337
+ "learning_rate": 6.572099567326157e-06,
338
+ "loss": 0.603,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 0.35577619195366633,
343
+ "grad_norm": 102.92433166503906,
344
+ "learning_rate": 6.467526168066408e-06,
345
+ "loss": 0.6523,
346
+ "step": 215
347
+ },
348
+ {
349
+ "epoch": 0.3640500568828214,
350
+ "grad_norm": 114.91250610351562,
351
+ "learning_rate": 6.361622124695677e-06,
352
+ "loss": 0.5757,
353
+ "step": 220
354
+ },
355
+ {
356
+ "epoch": 0.37232392181197643,
357
+ "grad_norm": 48.830989837646484,
358
+ "learning_rate": 6.254459060450252e-06,
359
+ "loss": 0.6294,
360
+ "step": 225
361
+ },
362
+ {
363
+ "epoch": 0.3805977867411314,
364
+ "grad_norm": 36.482215881347656,
365
+ "learning_rate": 6.146109450046187e-06,
366
+ "loss": 0.5982,
367
+ "step": 230
368
+ },
369
+ {
370
+ "epoch": 0.3888716516702865,
371
+ "grad_norm": 38.25822830200195,
372
+ "learning_rate": 6.036646570664412e-06,
373
+ "loss": 0.5573,
374
+ "step": 235
375
+ },
376
+ {
377
+ "epoch": 0.39714551659944153,
378
+ "grad_norm": 73.86414337158203,
379
+ "learning_rate": 5.926144452393163e-06,
380
+ "loss": 0.6372,
381
+ "step": 240
382
+ },
383
+ {
384
+ "epoch": 0.4054193815285965,
385
+ "grad_norm": 34.50651550292969,
386
+ "learning_rate": 5.814677828161186e-06,
387
+ "loss": 0.5968,
388
+ "step": 245
389
+ },
390
+ {
391
+ "epoch": 0.4136932464577516,
392
+ "grad_norm": 109.74346160888672,
393
+ "learning_rate": 5.7023220831956335e-06,
394
+ "loss": 0.673,
395
+ "step": 250
396
+ },
397
+ {
398
+ "epoch": 0.4136932464577516,
399
+ "eval_accuracy": 0.6874418604651162,
400
+ "eval_loss": 0.6246343851089478,
401
+ "eval_runtime": 136.0439,
402
+ "eval_samples_per_second": 7.902,
403
+ "eval_steps_per_second": 7.902,
404
+ "step": 250
405
+ },
406
+ {
407
+ "epoch": 0.4219671113869066,
408
+ "grad_norm": 91.58174133300781,
409
+ "learning_rate": 5.589153204038793e-06,
410
+ "loss": 0.5698,
411
+ "step": 255
412
+ },
413
+ {
414
+ "epoch": 0.4302409763160616,
415
+ "grad_norm": 162.50047302246094,
416
+ "learning_rate": 5.475247727158154e-06,
417
+ "loss": 0.6554,
418
+ "step": 260
419
+ },
420
+ {
421
+ "epoch": 0.43851484124521667,
422
+ "grad_norm": 144.00473022460938,
423
+ "learning_rate": 5.360682687184554e-06,
424
+ "loss": 0.5838,
425
+ "step": 265
426
+ },
427
+ {
428
+ "epoch": 0.4467887061743717,
429
+ "grad_norm": 94.68612670898438,
430
+ "learning_rate": 5.2455355648134174e-06,
431
+ "loss": 0.5319,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 0.4550625711035267,
436
+ "grad_norm": 55.78764724731445,
437
+ "learning_rate": 5.129884234404314e-06,
438
+ "loss": 0.5505,
439
+ "step": 275
440
+ },
441
+ {
442
+ "epoch": 0.46333643603268176,
443
+ "grad_norm": 132.5346221923828,
444
+ "learning_rate": 5.013806911314294e-06,
445
+ "loss": 0.6897,
446
+ "step": 280
447
+ },
448
+ {
449
+ "epoch": 0.4716103009618368,
450
+ "grad_norm": 114.60745239257812,
451
+ "learning_rate": 4.897382099000587e-06,
452
+ "loss": 0.6442,
453
+ "step": 285
454
+ },
455
+ {
456
+ "epoch": 0.4798841658909918,
457
+ "grad_norm": 35.80769348144531,
458
+ "learning_rate": 4.780688535928477e-06,
459
+ "loss": 0.7135,
460
+ "step": 290
461
+ },
462
+ {
463
+ "epoch": 0.48815803082014686,
464
+ "grad_norm": 102.89435577392578,
465
+ "learning_rate": 4.66380514232023e-06,
466
+ "loss": 0.605,
467
+ "step": 295
468
+ },
469
+ {
470
+ "epoch": 0.4964318957493019,
471
+ "grad_norm": 34.77372741699219,
472
+ "learning_rate": 4.546810966781089e-06,
473
+ "loss": 0.711,
474
+ "step": 300
475
+ },
476
+ {
477
+ "epoch": 0.4964318957493019,
478
+ "eval_accuracy": 0.6827906976744186,
479
+ "eval_loss": 0.6517416834831238,
480
+ "eval_runtime": 135.0365,
481
+ "eval_samples_per_second": 7.961,
482
+ "eval_steps_per_second": 7.961,
483
+ "step": 300
484
+ },
485
+ {
486
+ "epoch": 0.504705760678457,
487
+ "grad_norm": 88.30133819580078,
488
+ "learning_rate": 4.429785132838475e-06,
489
+ "loss": 0.5888,
490
+ "step": 305
491
+ },
492
+ {
493
+ "epoch": 0.512979625607612,
494
+ "grad_norm": 29.411426544189453,
495
+ "learning_rate": 4.312806785430477e-06,
496
+ "loss": 0.566,
497
+ "step": 310
498
+ },
499
+ {
500
+ "epoch": 0.521253490536767,
501
+ "grad_norm": 118.55986022949219,
502
+ "learning_rate": 4.195955037379899e-06,
503
+ "loss": 0.5398,
504
+ "step": 315
505
+ },
506
+ {
507
+ "epoch": 0.529527355465922,
508
+ "grad_norm": 57.91450500488281,
509
+ "learning_rate": 4.079308915889999e-06,
510
+ "loss": 0.5738,
511
+ "step": 320
512
+ },
513
+ {
514
+ "epoch": 0.537801220395077,
515
+ "grad_norm": 61.48877716064453,
516
+ "learning_rate": 3.96294730909815e-06,
517
+ "loss": 0.5945,
518
+ "step": 325
519
+ },
520
+ {
521
+ "epoch": 0.5460750853242321,
522
+ "grad_norm": 48.938419342041016,
523
+ "learning_rate": 3.846948912723542e-06,
524
+ "loss": 0.6338,
525
+ "step": 330
526
+ },
527
+ {
528
+ "epoch": 0.5543489502533872,
529
+ "grad_norm": 145.6968231201172,
530
+ "learning_rate": 3.731392176845023e-06,
531
+ "loss": 0.5681,
532
+ "step": 335
533
+ },
534
+ {
535
+ "epoch": 0.5626228151825421,
536
+ "grad_norm": 105.9911117553711,
537
+ "learning_rate": 3.6163552528450616e-06,
538
+ "loss": 0.6693,
539
+ "step": 340
540
+ },
541
+ {
542
+ "epoch": 0.5708966801116971,
543
+ "grad_norm": 140.98574829101562,
544
+ "learning_rate": 3.501915940555728e-06,
545
+ "loss": 0.6621,
546
+ "step": 345
547
+ },
548
+ {
549
+ "epoch": 0.5791705450408522,
550
+ "grad_norm": 92.60147094726562,
551
+ "learning_rate": 3.3881516356424178e-06,
552
+ "loss": 0.6682,
553
+ "step": 350
554
+ },
555
+ {
556
+ "epoch": 0.5791705450408522,
557
+ "eval_accuracy": 0.6809302325581396,
558
+ "eval_loss": 0.6307579278945923,
559
+ "eval_runtime": 136.0513,
560
+ "eval_samples_per_second": 7.901,
561
+ "eval_steps_per_second": 7.901,
562
+ "step": 350
563
+ },
564
+ {
565
+ "epoch": 0.5874444099700072,
566
+ "grad_norm": 121.18115997314453,
567
+ "learning_rate": 3.275139277260925e-06,
568
+ "loss": 0.677,
569
+ "step": 355
570
+ },
571
+ {
572
+ "epoch": 0.5957182748991623,
573
+ "grad_norm": 158.01596069335938,
574
+ "learning_rate": 3.162955296023247e-06,
575
+ "loss": 0.6703,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 0.6039921398283173,
580
+ "grad_norm": 59.304481506347656,
581
+ "learning_rate": 3.0516755623073273e-06,
582
+ "loss": 0.5698,
583
+ "step": 365
584
+ },
585
+ {
586
+ "epoch": 0.6122660047574723,
587
+ "grad_norm": 40.63115310668945,
588
+ "learning_rate": 2.941375334945674e-06,
589
+ "loss": 0.5812,
590
+ "step": 370
591
+ },
592
+ {
593
+ "epoch": 0.6205398696866273,
594
+ "grad_norm": 71.80590057373047,
595
+ "learning_rate": 2.8321292103275813e-06,
596
+ "loss": 0.5766,
597
+ "step": 375
598
+ },
599
+ {
600
+ "epoch": 0.6288137346157824,
601
+ "grad_norm": 22.09684944152832,
602
+ "learning_rate": 2.7240110719493568e-06,
603
+ "loss": 0.5898,
604
+ "step": 380
605
+ },
606
+ {
607
+ "epoch": 0.6370875995449374,
608
+ "grad_norm": 32.82063293457031,
609
+ "learning_rate": 2.617094040446676e-06,
610
+ "loss": 0.6404,
611
+ "step": 385
612
+ },
613
+ {
614
+ "epoch": 0.6453614644740925,
615
+ "grad_norm": 35.220211029052734,
616
+ "learning_rate": 2.511450424142878e-06,
617
+ "loss": 0.649,
618
+ "step": 390
619
+ },
620
+ {
621
+ "epoch": 0.6536353294032475,
622
+ "grad_norm": 22.362548828125,
623
+ "learning_rate": 2.4071516701466072e-06,
624
+ "loss": 0.5896,
625
+ "step": 395
626
+ },
627
+ {
628
+ "epoch": 0.6619091943324025,
629
+ "grad_norm": 159.33010864257812,
630
+ "learning_rate": 2.304268316031922e-06,
631
+ "loss": 0.5437,
632
+ "step": 400
633
+ },
634
+ {
635
+ "epoch": 0.6619091943324025,
636
+ "eval_accuracy": 0.68,
637
+ "eval_loss": 0.6238510608673096,
638
+ "eval_runtime": 137.5204,
639
+ "eval_samples_per_second": 7.817,
640
+ "eval_steps_per_second": 7.817,
641
+ "step": 400
642
+ },
643
+ {
644
+ "epoch": 0.6701830592615575,
645
+ "grad_norm": 15.642170906066895,
646
+ "learning_rate": 2.2028699421335074e-06,
647
+ "loss": 0.5992,
648
+ "step": 405
649
+ },
650
+ {
651
+ "epoch": 0.6784569241907126,
652
+ "grad_norm": 84.97122955322266,
653
+ "learning_rate": 2.1030251244892713e-06,
654
+ "loss": 0.6152,
655
+ "step": 410
656
+ },
657
+ {
658
+ "epoch": 0.6867307891198676,
659
+ "grad_norm": 22.397871017456055,
660
+ "learning_rate": 2.0048013884621617e-06,
661
+ "loss": 0.5848,
662
+ "step": 415
663
+ },
664
+ {
665
+ "epoch": 0.6950046540490227,
666
+ "grad_norm": 102.53570556640625,
667
+ "learning_rate": 1.908265163072554e-06,
668
+ "loss": 0.5884,
669
+ "step": 420
670
+ },
671
+ {
672
+ "epoch": 0.7032785189781777,
673
+ "grad_norm": 175.41653442382812,
674
+ "learning_rate": 1.81348173607209e-06,
675
+ "loss": 0.575,
676
+ "step": 425
677
+ },
678
+ {
679
+ "epoch": 0.7115523839073327,
680
+ "grad_norm": 50.430362701416016,
681
+ "learning_rate": 1.7205152097893694e-06,
682
+ "loss": 0.6514,
683
+ "step": 430
684
+ },
685
+ {
686
+ "epoch": 0.7198262488364877,
687
+ "grad_norm": 121.63434600830078,
688
+ "learning_rate": 1.6294284577773493e-06,
689
+ "loss": 0.5527,
690
+ "step": 435
691
+ },
692
+ {
693
+ "epoch": 0.7281001137656428,
694
+ "grad_norm": 29.108440399169922,
695
+ "learning_rate": 1.540283082291754e-06,
696
+ "loss": 0.6797,
697
+ "step": 440
698
+ },
699
+ {
700
+ "epoch": 0.7363739786947978,
701
+ "grad_norm": 71.40788269042969,
702
+ "learning_rate": 1.4531393726292826e-06,
703
+ "loss": 0.6111,
704
+ "step": 445
705
+ },
706
+ {
707
+ "epoch": 0.7446478436239529,
708
+ "grad_norm": 43.776885986328125,
709
+ "learning_rate": 1.3680562643537693e-06,
710
+ "loss": 0.6352,
711
+ "step": 450
712
+ },
713
+ {
714
+ "epoch": 0.7446478436239529,
715
+ "eval_accuracy": 0.6827906976744186,
716
+ "eval_loss": 0.6232383251190186,
717
+ "eval_runtime": 142.7642,
718
+ "eval_samples_per_second": 7.53,
719
+ "eval_steps_per_second": 7.53,
720
+ "step": 450
721
+ },
722
+ {
723
+ "epoch": 0.7529217085531079,
724
+ "grad_norm": 169.23094177246094,
725
+ "learning_rate": 1.285091299437875e-06,
726
+ "loss": 0.537,
727
+ "step": 455
728
+ },
729
+ {
730
+ "epoch": 0.7611955734822629,
731
+ "grad_norm": 154.0270538330078,
732
+ "learning_rate": 1.2043005873472697e-06,
733
+ "loss": 0.5469,
734
+ "step": 460
735
+ },
736
+ {
737
+ "epoch": 0.7694694384114179,
738
+ "grad_norm": 106.58590698242188,
739
+ "learning_rate": 1.125738767093626e-06,
740
+ "loss": 0.6973,
741
+ "step": 465
742
+ },
743
+ {
744
+ "epoch": 0.777743303340573,
745
+ "grad_norm": 52.2432746887207,
746
+ "learning_rate": 1.049458970282088e-06,
747
+ "loss": 0.6092,
748
+ "step": 470
749
+ },
750
+ {
751
+ "epoch": 0.786017168269728,
752
+ "grad_norm": 27.52301788330078,
753
+ "learning_rate": 9.755127851781945e-07,
754
+ "loss": 0.5584,
755
+ "step": 475
756
+ },
757
+ {
758
+ "epoch": 0.7942910331988831,
759
+ "grad_norm": 113.73265075683594,
760
+ "learning_rate": 9.039502218185748e-07,
761
+ "loss": 0.5841,
762
+ "step": 480
763
+ },
764
+ {
765
+ "epoch": 0.8025648981280381,
766
+ "grad_norm": 85.42076110839844,
767
+ "learning_rate": 8.348196781890096e-07,
768
+ "loss": 0.6151,
769
+ "step": 485
770
+ },
771
+ {
772
+ "epoch": 0.810838763057193,
773
+ "grad_norm": 40.63630294799805,
774
+ "learning_rate": 7.681679074927166e-07,
775
+ "loss": 0.5904,
776
+ "step": 490
777
+ },
778
+ {
779
+ "epoch": 0.8191126279863481,
780
+ "grad_norm": 50.79077911376953,
781
+ "learning_rate": 7.04039986531011e-07,
782
+ "loss": 0.6572,
783
+ "step": 495
784
+ },
785
+ {
786
+ "epoch": 0.8273864929155031,
787
+ "grad_norm": 53.14852523803711,
788
+ "learning_rate": 6.424792852177275e-07,
789
+ "loss": 0.6784,
790
+ "step": 500
791
+ },
792
+ {
793
+ "epoch": 0.8273864929155031,
794
+ "eval_accuracy": 0.6865116279069767,
795
+ "eval_loss": 0.621077835559845,
796
+ "eval_runtime": 142.2156,
797
+ "eval_samples_per_second": 7.559,
798
+ "eval_steps_per_second": 7.559,
799
+ "step": 500
800
+ },
801
+ {
802
+ "epoch": 0.8356603578446582,
803
+ "grad_norm": 240.6154022216797,
804
+ "learning_rate": 7.382571090205965e-06,
805
+ "loss": 0.6728,
806
+ "step": 505
807
+ },
808
+ {
809
+ "epoch": 0.8439342227738132,
810
+ "grad_norm": 38.96548843383789,
811
+ "learning_rate": 7.352507549315408e-06,
812
+ "loss": 0.6614,
813
+ "step": 510
814
+ },
815
+ {
816
+ "epoch": 0.8522080877029683,
817
+ "grad_norm": 20.97259521484375,
818
+ "learning_rate": 7.322229646597724e-06,
819
+ "loss": 0.5843,
820
+ "step": 515
821
+ },
822
+ {
823
+ "epoch": 0.8604819526321232,
824
+ "grad_norm": 108.45309448242188,
825
+ "learning_rate": 7.2917396573936266e-06,
826
+ "loss": 0.6245,
827
+ "step": 520
828
+ },
829
+ {
830
+ "epoch": 0.8687558175612783,
831
+ "grad_norm": 142.10877990722656,
832
+ "learning_rate": 7.261039872981816e-06,
833
+ "loss": 0.5639,
834
+ "step": 525
835
+ },
836
+ {
837
+ "epoch": 0.8770296824904333,
838
+ "grad_norm": 74.65398406982422,
839
+ "learning_rate": 7.2301326004068e-06,
840
+ "loss": 0.5823,
841
+ "step": 530
842
+ },
843
+ {
844
+ "epoch": 0.8853035474195884,
845
+ "grad_norm": 89.85851287841797,
846
+ "learning_rate": 7.199020162305524e-06,
847
+ "loss": 0.5769,
848
+ "step": 535
849
+ },
850
+ {
851
+ "epoch": 0.8935774123487434,
852
+ "grad_norm": 60.187286376953125,
853
+ "learning_rate": 7.167704896732828e-06,
854
+ "loss": 0.6494,
855
+ "step": 540
856
+ },
857
+ {
858
+ "epoch": 0.9018512772778985,
859
+ "grad_norm": 77.58560180664062,
860
+ "learning_rate": 7.136189156985742e-06,
861
+ "loss": 0.6549,
862
+ "step": 545
863
+ },
864
+ {
865
+ "epoch": 0.9101251422070534,
866
+ "grad_norm": 35.27056121826172,
867
+ "learning_rate": 7.10447531142664e-06,
868
+ "loss": 0.6278,
869
+ "step": 550
870
+ },
871
+ {
872
+ "epoch": 0.9101251422070534,
873
+ "eval_accuracy": 0.6827906976744186,
874
+ "eval_loss": 0.6337892413139343,
875
+ "eval_runtime": 139.0169,
876
+ "eval_samples_per_second": 7.733,
877
+ "eval_steps_per_second": 7.733,
878
+ "step": 550
879
+ },
880
+ {
881
+ "epoch": 0.9183990071362085,
882
+ "grad_norm": 44.5506591796875,
883
+ "learning_rate": 7.0725657433052645e-06,
884
+ "loss": 0.6628,
885
+ "step": 555
886
+ },
887
+ {
888
+ "epoch": 0.9266728720653635,
889
+ "grad_norm": 164.92425537109375,
890
+ "learning_rate": 7.040462850579625e-06,
891
+ "loss": 0.5611,
892
+ "step": 560
893
+ },
894
+ {
895
+ "epoch": 0.9349467369945186,
896
+ "grad_norm": 64.9032211303711,
897
+ "learning_rate": 7.0081690457357975e-06,
898
+ "loss": 0.6663,
899
+ "step": 565
900
+ },
901
+ {
902
+ "epoch": 0.9432206019236736,
903
+ "grad_norm": 210.38734436035156,
904
+ "learning_rate": 6.975686755606624e-06,
905
+ "loss": 0.659,
906
+ "step": 570
907
+ },
908
+ {
909
+ "epoch": 0.9514944668528287,
910
+ "grad_norm": 60.48363494873047,
911
+ "learning_rate": 6.943018421189348e-06,
912
+ "loss": 0.597,
913
+ "step": 575
914
+ },
915
+ {
916
+ "epoch": 0.9597683317819836,
917
+ "grad_norm": 115.6025619506836,
918
+ "learning_rate": 6.910166497462174e-06,
919
+ "loss": 0.6677,
920
+ "step": 580
921
+ },
922
+ {
923
+ "epoch": 0.9680421967111387,
924
+ "grad_norm": 36.28033447265625,
925
+ "learning_rate": 6.877133453199773e-06,
926
+ "loss": 0.6969,
927
+ "step": 585
928
+ },
929
+ {
930
+ "epoch": 0.9763160616402937,
931
+ "grad_norm": 26.922504425048828,
932
+ "learning_rate": 6.843921770787765e-06,
933
+ "loss": 0.5802,
934
+ "step": 590
935
+ },
936
+ {
937
+ "epoch": 0.9845899265694488,
938
+ "grad_norm": 44.03938293457031,
939
+ "learning_rate": 6.810533946036172e-06,
940
+ "loss": 0.5862,
941
+ "step": 595
942
+ },
943
+ {
944
+ "epoch": 0.9928637914986038,
945
+ "grad_norm": 33.03104019165039,
946
+ "learning_rate": 6.7769724879918564e-06,
947
+ "loss": 0.6086,
948
+ "step": 600
949
+ },
950
+ {
951
+ "epoch": 0.9928637914986038,
952
+ "eval_accuracy": 0.6874418604651162,
953
+ "eval_loss": 0.6202675104141235,
954
+ "eval_runtime": 139.0223,
955
+ "eval_samples_per_second": 7.733,
956
+ "eval_steps_per_second": 7.733,
957
+ "step": 600
958
+ },
959
+ {
960
+ "epoch": 1.0011376564277588,
961
+ "grad_norm": 87.3049087524414,
962
+ "learning_rate": 6.743239918749973e-06,
963
+ "loss": 0.5948,
964
+ "step": 605
965
+ },
966
+ {
967
+ "epoch": 1.009411521356914,
968
+ "grad_norm": 57.935916900634766,
969
+ "learning_rate": 6.709338773264435e-06,
970
+ "loss": 0.6225,
971
+ "step": 610
972
+ },
973
+ {
974
+ "epoch": 1.0176853862860689,
975
+ "grad_norm": 87.15520477294922,
976
+ "learning_rate": 6.675271599157415e-06,
977
+ "loss": 0.5865,
978
+ "step": 615
979
+ },
980
+ {
981
+ "epoch": 1.025959251215224,
982
+ "grad_norm": 35.6851806640625,
983
+ "learning_rate": 6.6410409565279035e-06,
984
+ "loss": 0.5406,
985
+ "step": 620
986
+ },
987
+ {
988
+ "epoch": 1.034233116144379,
989
+ "grad_norm": 110.91497802734375,
990
+ "learning_rate": 6.60664941775931e-06,
991
+ "loss": 0.6117,
992
+ "step": 625
993
+ },
994
+ {
995
+ "epoch": 1.042506981073534,
996
+ "grad_norm": 196.0850372314453,
997
+ "learning_rate": 6.572099567326157e-06,
998
+ "loss": 0.5984,
999
+ "step": 630
1000
+ },
1001
+ {
1002
+ "epoch": 1.050780846002689,
1003
+ "grad_norm": 61.60392379760742,
1004
+ "learning_rate": 6.53739400159986e-06,
1005
+ "loss": 0.5954,
1006
+ "step": 635
1007
+ },
1008
+ {
1009
+ "epoch": 1.059054710931844,
1010
+ "grad_norm": 81.43512725830078,
1011
+ "learning_rate": 6.502535328653616e-06,
1012
+ "loss": 0.6445,
1013
+ "step": 640
1014
+ },
1015
+ {
1016
+ "epoch": 1.0673285758609992,
1017
+ "grad_norm": 84.67086791992188,
1018
+ "learning_rate": 6.467526168066408e-06,
1019
+ "loss": 0.5913,
1020
+ "step": 645
1021
+ },
1022
+ {
1023
+ "epoch": 1.075602440790154,
1024
+ "grad_norm": 58.613800048828125,
1025
+ "learning_rate": 6.432369150726147e-06,
1026
+ "loss": 0.5708,
1027
+ "step": 650
1028
+ },
1029
+ {
1030
+ "epoch": 1.075602440790154,
1031
+ "eval_accuracy": 0.6911627906976744,
1032
+ "eval_loss": 0.6270654201507568,
1033
+ "eval_runtime": 135.6532,
1034
+ "eval_samples_per_second": 7.925,
1035
+ "eval_steps_per_second": 7.925,
1036
+ "step": 650
1037
+ },
1038
+ {
1039
+ "epoch": 1.083876305719309,
1040
+ "grad_norm": 150.11831665039062,
1041
+ "learning_rate": 6.397066918631964e-06,
1042
+ "loss": 0.6169,
1043
+ "step": 655
1044
+ },
1045
+ {
1046
+ "epoch": 1.0921501706484642,
1047
+ "grad_norm": 40.26316452026367,
1048
+ "learning_rate": 6.361622124695677e-06,
1049
+ "loss": 0.5921,
1050
+ "step": 660
1051
+ },
1052
+ {
1053
+ "epoch": 1.1004240355776191,
1054
+ "grad_norm": 139.96380615234375,
1055
+ "learning_rate": 6.326037432542411e-06,
1056
+ "loss": 0.6532,
1057
+ "step": 665
1058
+ },
1059
+ {
1060
+ "epoch": 1.1086979005067743,
1061
+ "grad_norm": 75.26846313476562,
1062
+ "learning_rate": 6.290315516310446e-06,
1063
+ "loss": 0.671,
1064
+ "step": 670
1065
+ },
1066
+ {
1067
+ "epoch": 1.1169717654359292,
1068
+ "grad_norm": 67.06864166259766,
1069
+ "learning_rate": 6.254459060450252e-06,
1070
+ "loss": 0.6279,
1071
+ "step": 675
1072
+ },
1073
+ {
1074
+ "epoch": 1.1252456303650842,
1075
+ "grad_norm": 31.252126693725586,
1076
+ "learning_rate": 6.21847075952276e-06,
1077
+ "loss": 0.4873,
1078
+ "step": 680
1079
+ },
1080
+ {
1081
+ "epoch": 1.1335194952942393,
1082
+ "grad_norm": 26.07908058166504,
1083
+ "learning_rate": 6.1823533179968615e-06,
1084
+ "loss": 0.5434,
1085
+ "step": 685
1086
+ },
1087
+ {
1088
+ "epoch": 1.1417933602233943,
1089
+ "grad_norm": 188.46253967285156,
1090
+ "learning_rate": 6.146109450046187e-06,
1091
+ "loss": 0.6456,
1092
+ "step": 690
1093
+ },
1094
+ {
1095
+ "epoch": 1.1500672251525494,
1096
+ "grad_norm": 51.78955078125,
1097
+ "learning_rate": 6.1097418793451195e-06,
1098
+ "loss": 0.469,
1099
+ "step": 695
1100
+ },
1101
+ {
1102
+ "epoch": 1.1583410900817044,
1103
+ "grad_norm": 115.07262420654297,
1104
+ "learning_rate": 6.073253338864137e-06,
1105
+ "loss": 0.5732,
1106
+ "step": 700
1107
+ },
1108
+ {
1109
+ "epoch": 1.1583410900817044,
1110
+ "eval_accuracy": 0.6865116279069767,
1111
+ "eval_loss": 0.6430281400680542,
1112
+ "eval_runtime": 135.7675,
1113
+ "eval_samples_per_second": 7.918,
1114
+ "eval_steps_per_second": 7.918,
1115
+ "step": 700
1116
+ },
1117
+ {
1118
+ "epoch": 1.1666149550108595,
1119
+ "grad_norm": 137.6046905517578,
1120
+ "learning_rate": 6.036646570664412e-06,
1121
+ "loss": 0.5884,
1122
+ "step": 705
1123
+ },
1124
+ {
1125
+ "epoch": 1.1748888199400145,
1126
+ "grad_norm": 89.0150146484375,
1127
+ "learning_rate": 5.999924325691765e-06,
1128
+ "loss": 0.583,
1129
+ "step": 710
1130
+ },
1131
+ {
1132
+ "epoch": 1.1831626848691694,
1133
+ "grad_norm": 69.22703552246094,
1134
+ "learning_rate": 5.963089363569928e-06,
1135
+ "loss": 0.5935,
1136
+ "step": 715
1137
+ },
1138
+ {
1139
+ "epoch": 1.1914365497983246,
1140
+ "grad_norm": 143.3881072998047,
1141
+ "learning_rate": 5.926144452393163e-06,
1142
+ "loss": 0.6591,
1143
+ "step": 720
1144
+ },
1145
+ {
1146
+ "epoch": 1.1997104147274795,
1147
+ "grad_norm": 33.42167282104492,
1148
+ "learning_rate": 5.8890923685182464e-06,
1149
+ "loss": 0.5891,
1150
+ "step": 725
1151
+ },
1152
+ {
1153
+ "epoch": 1.2079842796566347,
1154
+ "grad_norm": 79.74076843261719,
1155
+ "learning_rate": 5.851935896355828e-06,
1156
+ "loss": 0.6158,
1157
+ "step": 730
1158
+ },
1159
+ {
1160
+ "epoch": 1.2162581445857896,
1161
+ "grad_norm": 68.50188446044922,
1162
+ "learning_rate": 5.814677828161186e-06,
1163
+ "loss": 0.5299,
1164
+ "step": 735
1165
+ },
1166
+ {
1167
+ "epoch": 1.2245320095149448,
1168
+ "grad_norm": 110.22908782958984,
1169
+ "learning_rate": 5.7773209638243965e-06,
1170
+ "loss": 0.6114,
1171
+ "step": 740
1172
+ },
1173
+ {
1174
+ "epoch": 1.2328058744440997,
1175
+ "grad_norm": 126.0649642944336,
1176
+ "learning_rate": 5.739868110659916e-06,
1177
+ "loss": 0.5822,
1178
+ "step": 745
1179
+ },
1180
+ {
1181
+ "epoch": 1.2410797393732547,
1182
+ "grad_norm": 78.34774017333984,
1183
+ "learning_rate": 5.7023220831956335e-06,
1184
+ "loss": 0.5251,
1185
+ "step": 750
1186
+ },
1187
+ {
1188
+ "epoch": 1.2410797393732547,
1189
+ "eval_accuracy": 0.6865116279069767,
1190
+ "eval_loss": 0.6465556025505066,
1191
+ "eval_runtime": 134.8395,
1192
+ "eval_samples_per_second": 7.972,
1193
+ "eval_steps_per_second": 7.972,
1194
+ "step": 750
1195
+ },
1196
+ {
1197
+ "epoch": 1.2493536043024098,
1198
+ "grad_norm": 44.90232849121094,
1199
+ "learning_rate": 5.664685702961344e-06,
1200
+ "loss": 0.5814,
1201
+ "step": 755
1202
+ },
1203
+ {
1204
+ "epoch": 1.2576274692315648,
1205
+ "grad_norm": 61.099143981933594,
1206
+ "learning_rate": 5.626961798276727e-06,
1207
+ "loss": 0.6314,
1208
+ "step": 760
1209
+ },
1210
+ {
1211
+ "epoch": 1.26590133416072,
1212
+ "grad_norm": 96.56782531738281,
1213
+ "learning_rate": 5.589153204038793e-06,
1214
+ "loss": 0.6495,
1215
+ "step": 765
1216
+ },
1217
+ {
1218
+ "epoch": 1.2741751990898749,
1219
+ "grad_norm": 96.17337799072266,
1220
+ "learning_rate": 5.551262761508857e-06,
1221
+ "loss": 0.5472,
1222
+ "step": 770
1223
+ },
1224
+ {
1225
+ "epoch": 1.2824490640190298,
1226
+ "grad_norm": 134.74688720703125,
1227
+ "learning_rate": 5.513293318099013e-06,
1228
+ "loss": 0.5901,
1229
+ "step": 775
1230
+ },
1231
+ {
1232
+ "epoch": 1.290722928948185,
1233
+ "grad_norm": 58.68545913696289,
1234
+ "learning_rate": 5.475247727158154e-06,
1235
+ "loss": 0.625,
1236
+ "step": 780
1237
+ },
1238
+ {
1239
+ "epoch": 1.29899679387734,
1240
+ "grad_norm": 44.06023406982422,
1241
+ "learning_rate": 5.437128847757554e-06,
1242
+ "loss": 0.5603,
1243
+ "step": 785
1244
+ },
1245
+ {
1246
+ "epoch": 1.307270658806495,
1247
+ "grad_norm": 79.8021011352539,
1248
+ "learning_rate": 5.398939544476004e-06,
1249
+ "loss": 0.5608,
1250
+ "step": 790
1251
+ },
1252
+ {
1253
+ "epoch": 1.31554452373565,
1254
+ "grad_norm": 163.2118377685547,
1255
+ "learning_rate": 5.360682687184554e-06,
1256
+ "loss": 0.5516,
1257
+ "step": 795
1258
+ },
1259
+ {
1260
+ "epoch": 1.323818388664805,
1261
+ "grad_norm": 133.09230041503906,
1262
+ "learning_rate": 5.3223611508308385e-06,
1263
+ "loss": 0.565,
1264
+ "step": 800
1265
+ },
1266
+ {
1267
+ "epoch": 1.323818388664805,
1268
+ "eval_accuracy": 0.6902325581395349,
1269
+ "eval_loss": 0.6557974219322205,
1270
+ "eval_runtime": 136.9094,
1271
+ "eval_samples_per_second": 7.852,
1272
+ "eval_steps_per_second": 7.852,
1273
+ "step": 800
1274
+ },
1275
+ {
1276
+ "epoch": 1.33209225359396,
1277
+ "grad_norm": 269.4532165527344,
1278
+ "learning_rate": 5.2839778152230266e-06,
1279
+ "loss": 0.6325,
1280
+ "step": 805
1281
+ },
1282
+ {
1283
+ "epoch": 1.340366118523115,
1284
+ "grad_norm": 31.07293701171875,
1285
+ "learning_rate": 5.2455355648134174e-06,
1286
+ "loss": 0.6786,
1287
+ "step": 810
1288
+ },
1289
+ {
1290
+ "epoch": 1.3486399834522702,
1291
+ "grad_norm": 110.39642333984375,
1292
+ "learning_rate": 5.207037288481668e-06,
1293
+ "loss": 0.707,
1294
+ "step": 815
1295
+ },
1296
+ {
1297
+ "epoch": 1.3569138483814251,
1298
+ "grad_norm": 119.31515502929688,
1299
+ "learning_rate": 5.168485879317707e-06,
1300
+ "loss": 0.6234,
1301
+ "step": 820
1302
+ },
1303
+ {
1304
+ "epoch": 1.36518771331058,
1305
+ "grad_norm": 152.03526306152344,
1306
+ "learning_rate": 5.129884234404314e-06,
1307
+ "loss": 0.6802,
1308
+ "step": 825
1309
+ },
1310
+ {
1311
+ "epoch": 1.3734615782397352,
1312
+ "grad_norm": 85.00423431396484,
1313
+ "learning_rate": 5.091235254599416e-06,
1314
+ "loss": 0.5709,
1315
+ "step": 830
1316
+ },
1317
+ {
1318
+ "epoch": 1.3817354431688902,
1319
+ "grad_norm": 160.42672729492188,
1320
+ "learning_rate": 5.052541844318089e-06,
1321
+ "loss": 0.5747,
1322
+ "step": 835
1323
+ },
1324
+ {
1325
+ "epoch": 1.3900093080980453,
1326
+ "grad_norm": 57.966976165771484,
1327
+ "learning_rate": 5.013806911314294e-06,
1328
+ "loss": 0.5937,
1329
+ "step": 840
1330
+ },
1331
+ {
1332
+ "epoch": 1.3982831730272003,
1333
+ "grad_norm": 39.06420135498047,
1334
+ "learning_rate": 4.975033366462361e-06,
1335
+ "loss": 0.6496,
1336
+ "step": 845
1337
+ },
1338
+ {
1339
+ "epoch": 1.4065570379563552,
1340
+ "grad_norm": 124.18098449707031,
1341
+ "learning_rate": 4.936224123538254e-06,
1342
+ "loss": 0.6163,
1343
+ "step": 850
1344
+ },
1345
+ {
1346
+ "epoch": 1.4065570379563552,
1347
+ "eval_accuracy": 0.6809302325581396,
1348
+ "eval_loss": 0.6283326745033264,
1349
+ "eval_runtime": 135.7927,
1350
+ "eval_samples_per_second": 7.916,
1351
+ "eval_steps_per_second": 7.916,
1352
+ "step": 850
1353
+ },
1354
+ {
1355
+ "epoch": 1.4148309028855104,
1356
+ "grad_norm": 109.61412048339844,
1357
+ "learning_rate": 4.897382099000587e-06,
1358
+ "loss": 0.5904,
1359
+ "step": 855
1360
+ },
1361
+ {
1362
+ "epoch": 1.4231047678146655,
1363
+ "grad_norm": 97.67943572998047,
1364
+ "learning_rate": 4.858510211771469e-06,
1365
+ "loss": 0.6084,
1366
+ "step": 860
1367
+ },
1368
+ {
1369
+ "epoch": 1.4313786327438205,
1370
+ "grad_norm": 32.73789978027344,
1371
+ "learning_rate": 4.819611383017145e-06,
1372
+ "loss": 0.5371,
1373
+ "step": 865
1374
+ },
1375
+ {
1376
+ "epoch": 1.4396524976729754,
1377
+ "grad_norm": 37.779815673828125,
1378
+ "learning_rate": 4.780688535928477e-06,
1379
+ "loss": 0.5433,
1380
+ "step": 870
1381
+ },
1382
+ {
1383
+ "epoch": 1.4479263626021306,
1384
+ "grad_norm": 65.8934326171875,
1385
+ "learning_rate": 4.741744595501275e-06,
1386
+ "loss": 0.5889,
1387
+ "step": 875
1388
+ },
1389
+ {
1390
+ "epoch": 1.4562002275312855,
1391
+ "grad_norm": 94.01367950439453,
1392
+ "learning_rate": 4.702782488316478e-06,
1393
+ "loss": 0.5833,
1394
+ "step": 880
1395
+ },
1396
+ {
1397
+ "epoch": 1.4644740924604407,
1398
+ "grad_norm": 100.82835388183594,
1399
+ "learning_rate": 4.66380514232023e-06,
1400
+ "loss": 0.6123,
1401
+ "step": 885
1402
+ },
1403
+ {
1404
+ "epoch": 1.4727479573895956,
1405
+ "grad_norm": 118.06622314453125,
1406
+ "learning_rate": 4.6248154866038504e-06,
1407
+ "loss": 0.5869,
1408
+ "step": 890
1409
+ },
1410
+ {
1411
+ "epoch": 1.4810218223187506,
1412
+ "grad_norm": 80.13727569580078,
1413
+ "learning_rate": 4.585816451183721e-06,
1414
+ "loss": 0.5605,
1415
+ "step": 895
1416
+ },
1417
+ {
1418
+ "epoch": 1.4892956872479057,
1419
+ "grad_norm": 64.97086334228516,
1420
+ "learning_rate": 4.546810966781089e-06,
1421
+ "loss": 0.5409,
1422
+ "step": 900
1423
+ },
1424
+ {
1425
+ "epoch": 1.4892956872479057,
1426
+ "eval_accuracy": 0.6893023255813954,
1427
+ "eval_loss": 0.6331726908683777,
1428
+ "eval_runtime": 136.5834,
1429
+ "eval_samples_per_second": 7.871,
1430
+ "eval_steps_per_second": 7.871,
1431
+ "step": 900
1432
+ },
1433
+ {
1434
+ "epoch": 1.4975695521770607,
1435
+ "grad_norm": 70.27656555175781,
1436
+ "learning_rate": 4.507801964601837e-06,
1437
+ "loss": 0.543,
1438
+ "step": 905
1439
+ },
1440
+ {
1441
+ "epoch": 1.5058434171062158,
1442
+ "grad_norm": 44.15786361694336,
1443
+ "learning_rate": 4.468792376116199e-06,
1444
+ "loss": 0.6316,
1445
+ "step": 910
1446
+ },
1447
+ {
1448
+ "epoch": 1.5141172820353708,
1449
+ "grad_norm": 26.016767501831055,
1450
+ "learning_rate": 4.429785132838475e-06,
1451
+ "loss": 0.5746,
1452
+ "step": 915
1453
+ },
1454
+ {
1455
+ "epoch": 1.5223911469645257,
1456
+ "grad_norm": 121.1869888305664,
1457
+ "learning_rate": 4.390783166106721e-06,
1458
+ "loss": 0.7063,
1459
+ "step": 920
1460
+ },
1461
+ {
1462
+ "epoch": 1.5306650118936809,
1463
+ "grad_norm": 45.07545852661133,
1464
+ "learning_rate": 4.3517894068624704e-06,
1465
+ "loss": 0.5817,
1466
+ "step": 925
1467
+ },
1468
+ {
1469
+ "epoch": 1.538938876822836,
1470
+ "grad_norm": 87.12654876708984,
1471
+ "learning_rate": 4.312806785430477e-06,
1472
+ "loss": 0.6364,
1473
+ "step": 930
1474
+ },
1475
+ {
1476
+ "epoch": 1.547212741751991,
1477
+ "grad_norm": 66.20915222167969,
1478
+ "learning_rate": 4.273838231298501e-06,
1479
+ "loss": 0.5762,
1480
+ "step": 935
1481
+ },
1482
+ {
1483
+ "epoch": 1.555486606681146,
1484
+ "grad_norm": 91.70149993896484,
1485
+ "learning_rate": 4.2348866728971695e-06,
1486
+ "loss": 0.6013,
1487
+ "step": 940
1488
+ },
1489
+ {
1490
+ "epoch": 1.5637604716103009,
1491
+ "grad_norm": 61.58055877685547,
1492
+ "learning_rate": 4.195955037379899e-06,
1493
+ "loss": 0.6175,
1494
+ "step": 945
1495
+ },
1496
+ {
1497
+ "epoch": 1.572034336539456,
1498
+ "grad_norm": 208.4636993408203,
1499
+ "learning_rate": 4.157046250402935e-06,
1500
+ "loss": 0.6015,
1501
+ "step": 950
1502
+ },
1503
+ {
1504
+ "epoch": 1.572034336539456,
1505
+ "eval_accuracy": 0.6902325581395349,
1506
+ "eval_loss": 0.621203601360321,
1507
+ "eval_runtime": 135.2354,
1508
+ "eval_samples_per_second": 7.949,
1509
+ "eval_steps_per_second": 7.949,
1510
+ "step": 950
1511
+ },
1512
+ {
1513
+ "epoch": 1.5803082014686112,
1514
+ "grad_norm": 26.804574966430664,
1515
+ "learning_rate": 4.118163235905488e-06,
1516
+ "loss": 0.5719,
1517
+ "step": 955
1518
+ },
1519
+ {
1520
+ "epoch": 1.5885820663977661,
1521
+ "grad_norm": 128.2564239501953,
1522
+ "learning_rate": 4.079308915889999e-06,
1523
+ "loss": 0.5666,
1524
+ "step": 960
1525
+ },
1526
+ {
1527
+ "epoch": 1.596855931326921,
1528
+ "grad_norm": 69.91098022460938,
1529
+ "learning_rate": 4.040486210202567e-06,
1530
+ "loss": 0.5785,
1531
+ "step": 965
1532
+ },
1533
+ {
1534
+ "epoch": 1.605129796256076,
1535
+ "grad_norm": 134.2613067626953,
1536
+ "learning_rate": 4.001698036313514e-06,
1537
+ "loss": 0.6613,
1538
+ "step": 970
1539
+ },
1540
+ {
1541
+ "epoch": 1.6134036611852312,
1542
+ "grad_norm": 104.1937255859375,
1543
+ "learning_rate": 3.96294730909815e-06,
1544
+ "loss": 0.5157,
1545
+ "step": 975
1546
+ },
1547
+ {
1548
+ "epoch": 1.6216775261143863,
1549
+ "grad_norm": 75.18810272216797,
1550
+ "learning_rate": 3.924236940617723e-06,
1551
+ "loss": 0.6356,
1552
+ "step": 980
1553
+ },
1554
+ {
1555
+ "epoch": 1.6299513910435413,
1556
+ "grad_norm": 35.88945007324219,
1557
+ "learning_rate": 3.885569839900576e-06,
1558
+ "loss": 0.6173,
1559
+ "step": 985
1560
+ },
1561
+ {
1562
+ "epoch": 1.6382252559726962,
1563
+ "grad_norm": 29.352340698242188,
1564
+ "learning_rate": 3.846948912723542e-06,
1565
+ "loss": 0.6053,
1566
+ "step": 990
1567
+ },
1568
+ {
1569
+ "epoch": 1.6464991209018511,
1570
+ "grad_norm": 132.6302490234375,
1571
+ "learning_rate": 3.808377061393587e-06,
1572
+ "loss": 0.6604,
1573
+ "step": 995
1574
+ },
1575
+ {
1576
+ "epoch": 1.6547729858310063,
1577
+ "grad_norm": 158.6077423095703,
1578
+ "learning_rate": 3.7698571845296876e-06,
1579
+ "loss": 0.5921,
1580
+ "step": 1000
1581
+ },
1582
+ {
1583
+ "epoch": 1.6547729858310063,
1584
+ "eval_accuracy": 0.6893023255813954,
1585
+ "eval_loss": 0.6269645690917969,
1586
+ "eval_runtime": 132.1367,
1587
+ "eval_samples_per_second": 8.136,
1588
+ "eval_steps_per_second": 8.136,
1589
+ "step": 1000
1590
+ },
1591
+ {
1592
+ "epoch": 1.6630468507601615,
1593
+ "grad_norm": 64.79986572265625,
1594
+ "learning_rate": 3.731392176845023e-06,
1595
+ "loss": 0.5632,
1596
+ "step": 1005
1597
+ },
1598
+ {
1599
+ "epoch": 1.6713207156893164,
1600
+ "grad_norm": 69.55998229980469,
1601
+ "learning_rate": 3.692984928929426e-06,
1602
+ "loss": 0.5757,
1603
+ "step": 1010
1604
+ },
1605
+ {
1606
+ "epoch": 1.6795945806184713,
1607
+ "grad_norm": 58.388427734375,
1608
+ "learning_rate": 3.6546383270321673e-06,
1609
+ "loss": 0.5728,
1610
+ "step": 1015
1611
+ },
1612
+ {
1613
+ "epoch": 1.6878684455476263,
1614
+ "grad_norm": 209.67808532714844,
1615
+ "learning_rate": 3.6163552528450616e-06,
1616
+ "loss": 0.6385,
1617
+ "step": 1020
1618
+ },
1619
+ {
1620
+ "epoch": 1.6961423104767814,
1621
+ "grad_norm": 41.65730667114258,
1622
+ "learning_rate": 3.5781385832859032e-06,
1623
+ "loss": 0.5542,
1624
+ "step": 1025
1625
+ },
1626
+ {
1627
+ "epoch": 1.7044161754059366,
1628
+ "grad_norm": 40.03094482421875,
1629
+ "learning_rate": 3.539991190282274e-06,
1630
+ "loss": 0.5365,
1631
+ "step": 1030
1632
+ },
1633
+ {
1634
+ "epoch": 1.7126900403350915,
1635
+ "grad_norm": 243.15150451660156,
1636
+ "learning_rate": 3.501915940555728e-06,
1637
+ "loss": 0.7109,
1638
+ "step": 1035
1639
+ },
1640
+ {
1641
+ "epoch": 1.7209639052642465,
1642
+ "grad_norm": 42.63383865356445,
1643
+ "learning_rate": 3.4639156954063487e-06,
1644
+ "loss": 0.5262,
1645
+ "step": 1040
1646
+ },
1647
+ {
1648
+ "epoch": 1.7292377701934016,
1649
+ "grad_norm": 110.11820983886719,
1650
+ "learning_rate": 3.4259933104977394e-06,
1651
+ "loss": 0.581,
1652
+ "step": 1045
1653
+ },
1654
+ {
1655
+ "epoch": 1.7375116351225566,
1656
+ "grad_norm": 140.5242919921875,
1657
+ "learning_rate": 3.3881516356424178e-06,
1658
+ "loss": 0.5132,
1659
+ "step": 1050
1660
+ },
1661
+ {
1662
+ "epoch": 1.7375116351225566,
1663
+ "eval_accuracy": 0.6911627906976744,
1664
+ "eval_loss": 0.6242228150367737,
1665
+ "eval_runtime": 135.3409,
1666
+ "eval_samples_per_second": 7.943,
1667
+ "eval_steps_per_second": 7.943,
1668
+ "step": 1050
1669
+ },
1670
+ {
1671
+ "epoch": 1.7457855000517117,
1672
+ "grad_norm": 117.73894500732422,
1673
+ "learning_rate": 3.3503935145876525e-06,
1674
+ "loss": 0.6865,
1675
+ "step": 1055
1676
+ },
1677
+ {
1678
+ "epoch": 1.7540593649808667,
1679
+ "grad_norm": 64.21101379394531,
1680
+ "learning_rate": 3.312721784801774e-06,
1681
+ "loss": 0.5874,
1682
+ "step": 1060
1683
+ },
1684
+ {
1685
+ "epoch": 1.7623332299100216,
1686
+ "grad_norm": 74.21529388427734,
1687
+ "learning_rate": 3.275139277260925e-06,
1688
+ "loss": 0.6284,
1689
+ "step": 1065
1690
+ },
1691
+ {
1692
+ "epoch": 1.7706070948391768,
1693
+ "grad_norm": 63.436607360839844,
1694
+ "learning_rate": 3.2376488162363275e-06,
1695
+ "loss": 0.5562,
1696
+ "step": 1070
1697
+ },
1698
+ {
1699
+ "epoch": 1.778880959768332,
1700
+ "grad_norm": 57.009483337402344,
1701
+ "learning_rate": 3.2002532190820422e-06,
1702
+ "loss": 0.5425,
1703
+ "step": 1075
1704
+ },
1705
+ {
1706
+ "epoch": 1.7871548246974869,
1707
+ "grad_norm": 45.6854362487793,
1708
+ "learning_rate": 3.162955296023247e-06,
1709
+ "loss": 0.8092,
1710
+ "step": 1080
1711
+ },
1712
+ {
1713
+ "epoch": 1.7954286896266418,
1714
+ "grad_norm": 68.54329681396484,
1715
+ "learning_rate": 3.125757849945051e-06,
1716
+ "loss": 0.6519,
1717
+ "step": 1085
1718
+ },
1719
+ {
1720
+ "epoch": 1.8037025545557968,
1721
+ "grad_norm": 62.51036834716797,
1722
+ "learning_rate": 3.088663676181864e-06,
1723
+ "loss": 0.5576,
1724
+ "step": 1090
1725
+ },
1726
+ {
1727
+ "epoch": 1.811976419484952,
1728
+ "grad_norm": 108.84650421142578,
1729
+ "learning_rate": 3.0516755623073273e-06,
1730
+ "loss": 0.5965,
1731
+ "step": 1095
1732
+ },
1733
+ {
1734
+ "epoch": 1.820250284414107,
1735
+ "grad_norm": 39.56748580932617,
1736
+ "learning_rate": 3.0147962879248402e-06,
1737
+ "loss": 0.5851,
1738
+ "step": 1100
1739
+ },
1740
+ {
1741
+ "epoch": 1.820250284414107,
1742
+ "eval_accuracy": 0.6874418604651162,
1743
+ "eval_loss": 0.6186655163764954,
1744
+ "eval_runtime": 132.3,
1745
+ "eval_samples_per_second": 8.125,
1746
+ "eval_steps_per_second": 8.125,
1747
+ "step": 1100
1748
+ },
1749
+ {
1750
+ "epoch": 1.828524149343262,
1751
+ "grad_norm": 115.91883087158203,
1752
+ "learning_rate": 2.978028624458668e-06,
1753
+ "loss": 0.5346,
1754
+ "step": 1105
1755
+ },
1756
+ {
1757
+ "epoch": 1.836798014272417,
1758
+ "grad_norm": 58.3183708190918,
1759
+ "learning_rate": 2.941375334945674e-06,
1760
+ "loss": 0.5022,
1761
+ "step": 1110
1762
+ },
1763
+ {
1764
+ "epoch": 1.845071879201572,
1765
+ "grad_norm": 80.68195343017578,
1766
+ "learning_rate": 2.9048391738276908e-06,
1767
+ "loss": 0.6279,
1768
+ "step": 1115
1769
+ },
1770
+ {
1771
+ "epoch": 1.853345744130727,
1772
+ "grad_norm": 71.54075622558594,
1773
+ "learning_rate": 2.8684228867445133e-06,
1774
+ "loss": 0.5217,
1775
+ "step": 1120
1776
+ },
1777
+ {
1778
+ "epoch": 1.8616196090598822,
1779
+ "grad_norm": 45.025848388671875,
1780
+ "learning_rate": 2.8321292103275813e-06,
1781
+ "loss": 0.5751,
1782
+ "step": 1125
1783
+ },
1784
+ {
1785
+ "epoch": 1.8698934739890372,
1786
+ "grad_norm": 119.63114929199219,
1787
+ "learning_rate": 2.7959608719943193e-06,
1788
+ "loss": 0.5505,
1789
+ "step": 1130
1790
+ },
1791
+ {
1792
+ "epoch": 1.878167338918192,
1793
+ "grad_norm": 29.393390655517578,
1794
+ "learning_rate": 2.7599205897431707e-06,
1795
+ "loss": 0.5426,
1796
+ "step": 1135
1797
+ },
1798
+ {
1799
+ "epoch": 1.886441203847347,
1800
+ "grad_norm": 86.28485870361328,
1801
+ "learning_rate": 2.7240110719493568e-06,
1802
+ "loss": 0.5846,
1803
+ "step": 1140
1804
+ },
1805
+ {
1806
+ "epoch": 1.8947150687765022,
1807
+ "grad_norm": 102.7513656616211,
1808
+ "learning_rate": 2.688235017161331e-06,
1809
+ "loss": 0.6677,
1810
+ "step": 1145
1811
+ },
1812
+ {
1813
+ "epoch": 1.9029889337056574,
1814
+ "grad_norm": 45.93974685668945,
1815
+ "learning_rate": 2.6525951138979987e-06,
1816
+ "loss": 0.5869,
1817
+ "step": 1150
1818
+ },
1819
+ {
1820
+ "epoch": 1.9029889337056574,
1821
+ "eval_accuracy": 0.6827906976744186,
1822
+ "eval_loss": 0.6229726076126099,
1823
+ "eval_runtime": 142.7285,
1824
+ "eval_samples_per_second": 7.532,
1825
+ "eval_steps_per_second": 7.532,
1826
+ "step": 1150
1827
+ },
1828
+ {
1829
+ "epoch": 1.9112627986348123,
1830
+ "grad_norm": 39.167236328125,
1831
+ "learning_rate": 2.617094040446676e-06,
1832
+ "loss": 0.5982,
1833
+ "step": 1155
1834
+ },
1835
+ {
1836
+ "epoch": 1.9195366635639672,
1837
+ "grad_norm": 161.03598022460938,
1838
+ "learning_rate": 2.5817344646618134e-06,
1839
+ "loss": 0.6664,
1840
+ "step": 1160
1841
+ },
1842
+ {
1843
+ "epoch": 1.9278105284931224,
1844
+ "grad_norm": 94.9665298461914,
1845
+ "learning_rate": 2.5465190437645224e-06,
1846
+ "loss": 0.5211,
1847
+ "step": 1165
1848
+ },
1849
+ {
1850
+ "epoch": 1.9360843934222773,
1851
+ "grad_norm": 36.561275482177734,
1852
+ "learning_rate": 2.511450424142878e-06,
1853
+ "loss": 0.6397,
1854
+ "step": 1170
1855
+ },
1856
+ {
1857
+ "epoch": 1.9443582583514325,
1858
+ "grad_norm": 144.52769470214844,
1859
+ "learning_rate": 2.4765312411530504e-06,
1860
+ "loss": 0.6234,
1861
+ "step": 1175
1862
+ },
1863
+ {
1864
+ "epoch": 1.9526321232805874,
1865
+ "grad_norm": 70.71371459960938,
1866
+ "learning_rate": 2.4417641189212683e-06,
1867
+ "loss": 0.6121,
1868
+ "step": 1180
1869
+ },
1870
+ {
1871
+ "epoch": 1.9609059882097424,
1872
+ "grad_norm": 150.60153198242188,
1873
+ "learning_rate": 2.4071516701466072e-06,
1874
+ "loss": 0.5721,
1875
+ "step": 1185
1876
+ },
1877
+ {
1878
+ "epoch": 1.9691798531388975,
1879
+ "grad_norm": 65.08445739746094,
1880
+ "learning_rate": 2.37269649590466e-06,
1881
+ "loss": 0.574,
1882
+ "step": 1190
1883
+ },
1884
+ {
1885
+ "epoch": 1.9774537180680527,
1886
+ "grad_norm": 85.44747161865234,
1887
+ "learning_rate": 2.3384011854520643e-06,
1888
+ "loss": 0.55,
1889
+ "step": 1195
1890
+ },
1891
+ {
1892
+ "epoch": 1.9857275829972076,
1893
+ "grad_norm": 95.13154602050781,
1894
+ "learning_rate": 2.304268316031922e-06,
1895
+ "loss": 0.558,
1896
+ "step": 1200
1897
+ },
1898
+ {
1899
+ "epoch": 1.9857275829972076,
1900
+ "eval_accuracy": 0.693953488372093,
1901
+ "eval_loss": 0.6230591535568237,
1902
+ "eval_runtime": 139.4556,
1903
+ "eval_samples_per_second": 7.709,
1904
+ "eval_steps_per_second": 7.709,
1905
+ "step": 1200
1906
+ },
1907
+ {
1908
+ "epoch": 1.9940014479263626,
1909
+ "grad_norm": 72.64482879638672,
1910
+ "learning_rate": 2.2703004526801315e-06,
1911
+ "loss": 0.6093,
1912
+ "step": 1205
1913
+ },
1914
+ {
1915
+ "epoch": 2.0022753128555175,
1916
+ "grad_norm": 91.53067016601562,
1917
+ "learning_rate": 2.236500148032616e-06,
1918
+ "loss": 0.5926,
1919
+ "step": 1210
1920
+ },
1921
+ {
1922
+ "epoch": 2.0105491777846725,
1923
+ "grad_norm": 101.95372772216797,
1924
+ "learning_rate": 2.2028699421335074e-06,
1925
+ "loss": 0.5415,
1926
+ "step": 1215
1927
+ },
1928
+ {
1929
+ "epoch": 2.018823042713828,
1930
+ "grad_norm": 103.2565689086914,
1931
+ "learning_rate": 2.1694123622442645e-06,
1932
+ "loss": 0.5096,
1933
+ "step": 1220
1934
+ },
1935
+ {
1936
+ "epoch": 2.027096907642983,
1937
+ "grad_norm": 32.46419906616211,
1938
+ "learning_rate": 2.1361299226537465e-06,
1939
+ "loss": 0.4894,
1940
+ "step": 1225
1941
+ },
1942
+ {
1943
+ "epoch": 2.0353707725721377,
1944
+ "grad_norm": 59.69719696044922,
1945
+ "learning_rate": 2.1030251244892713e-06,
1946
+ "loss": 0.5322,
1947
+ "step": 1230
1948
+ },
1949
+ {
1950
+ "epoch": 2.0436446375012927,
1951
+ "grad_norm": 44.855892181396484,
1952
+ "learning_rate": 2.0701004555286637e-06,
1953
+ "loss": 0.5304,
1954
+ "step": 1235
1955
+ },
1956
+ {
1957
+ "epoch": 2.051918502430448,
1958
+ "grad_norm": 124.84123992919922,
1959
+ "learning_rate": 2.0373583900132974e-06,
1960
+ "loss": 0.5007,
1961
+ "step": 1240
1962
+ },
1963
+ {
1964
+ "epoch": 2.060192367359603,
1965
+ "grad_norm": 178.7327117919922,
1966
+ "learning_rate": 2.0048013884621617e-06,
1967
+ "loss": 0.5525,
1968
+ "step": 1245
1969
+ },
1970
+ {
1971
+ "epoch": 2.068466232288758,
1972
+ "grad_norm": 89.66055297851562,
1973
+ "learning_rate": 1.9724318974869516e-06,
1974
+ "loss": 0.5181,
1975
+ "step": 1250
1976
+ },
1977
+ {
1978
+ "epoch": 2.068466232288758,
1979
+ "eval_accuracy": 0.6920930232558139,
1980
+ "eval_loss": 0.6200288534164429,
1981
+ "eval_runtime": 139.1734,
1982
+ "eval_samples_per_second": 7.724,
1983
+ "eval_steps_per_second": 7.724,
1984
+ "step": 1250
1985
+ },
1986
+ {
1987
+ "epoch": 2.076740097217913,
1988
+ "grad_norm": 144.76039123535156,
1989
+ "learning_rate": 1.940252349608215e-06,
1990
+ "loss": 0.5971,
1991
+ "step": 1255
1992
+ },
1993
+ {
1994
+ "epoch": 2.085013962147068,
1995
+ "grad_norm": 78.81449890136719,
1996
+ "learning_rate": 1.908265163072554e-06,
1997
+ "loss": 0.6142,
1998
+ "step": 1260
1999
+ },
2000
+ {
2001
+ "epoch": 2.093287827076223,
2002
+ "grad_norm": 161.04014587402344,
2003
+ "learning_rate": 1.8764727416708883e-06,
2004
+ "loss": 0.5488,
2005
+ "step": 1265
2006
+ },
2007
+ {
2008
+ "epoch": 2.101561692005378,
2009
+ "grad_norm": 175.5146942138672,
2010
+ "learning_rate": 1.8448774745578186e-06,
2011
+ "loss": 0.6606,
2012
+ "step": 1270
2013
+ },
2014
+ {
2015
+ "epoch": 2.109835556934533,
2016
+ "grad_norm": 62.92079544067383,
2017
+ "learning_rate": 1.81348173607209e-06,
2018
+ "loss": 0.5204,
2019
+ "step": 1275
2020
+ },
2021
+ {
2022
+ "epoch": 2.118109421863688,
2023
+ "grad_norm": 150.65797424316406,
2024
+ "learning_rate": 1.7822878855581543e-06,
2025
+ "loss": 0.4931,
2026
+ "step": 1280
2027
+ },
2028
+ {
2029
+ "epoch": 2.126383286792843,
2030
+ "grad_norm": 64.05603790283203,
2031
+ "learning_rate": 1.7512982671888757e-06,
2032
+ "loss": 0.5615,
2033
+ "step": 1285
2034
+ },
2035
+ {
2036
+ "epoch": 2.1346571517219983,
2037
+ "grad_norm": 42.53824234008789,
2038
+ "learning_rate": 1.7205152097893694e-06,
2039
+ "loss": 0.5017,
2040
+ "step": 1290
2041
+ },
2042
+ {
2043
+ "epoch": 2.1429310166511533,
2044
+ "grad_norm": 67.79627990722656,
2045
+ "learning_rate": 1.689941026661986e-06,
2046
+ "loss": 0.5048,
2047
+ "step": 1295
2048
+ },
2049
+ {
2050
+ "epoch": 2.151204881580308,
2051
+ "grad_norm": 45.754302978515625,
2052
+ "learning_rate": 1.6595780154124827e-06,
2053
+ "loss": 0.5542,
2054
+ "step": 1300
2055
+ },
2056
+ {
2057
+ "epoch": 2.151204881580308,
2058
+ "eval_accuracy": 0.6874418604651162,
2059
+ "eval_loss": 0.625083327293396,
2060
+ "eval_runtime": 139.2113,
2061
+ "eval_samples_per_second": 7.722,
2062
+ "eval_steps_per_second": 7.722,
2063
+ "step": 1300
2064
+ },
2065
+ {
2066
+ "epoch": 2.159478746509463,
2067
+ "grad_norm": 53.39175033569336,
2068
+ "learning_rate": 1.6294284577773493e-06,
2069
+ "loss": 0.5467,
2070
+ "step": 1305
2071
+ },
2072
+ {
2073
+ "epoch": 2.167752611438618,
2074
+ "grad_norm": 135.67037963867188,
2075
+ "learning_rate": 1.5994946194523452e-06,
2076
+ "loss": 0.5632,
2077
+ "step": 1310
2078
+ },
2079
+ {
2080
+ "epoch": 2.1760264763677735,
2081
+ "grad_norm": 52.69210433959961,
2082
+ "learning_rate": 1.569778749922239e-06,
2083
+ "loss": 0.6039,
2084
+ "step": 1315
2085
+ },
2086
+ {
2087
+ "epoch": 2.1843003412969284,
2088
+ "grad_norm": 159.80264282226562,
2089
+ "learning_rate": 1.540283082291754e-06,
2090
+ "loss": 0.593,
2091
+ "step": 1320
2092
+ },
2093
+ {
2094
+ "epoch": 2.1925742062260833,
2095
+ "grad_norm": 37.81462097167969,
2096
+ "learning_rate": 1.5110098331177598e-06,
2097
+ "loss": 0.5655,
2098
+ "step": 1325
2099
+ },
2100
+ {
2101
+ "epoch": 2.2008480711552383,
2102
+ "grad_norm": 245.13482666015625,
2103
+ "learning_rate": 1.4819612022427027e-06,
2104
+ "loss": 0.6078,
2105
+ "step": 1330
2106
+ },
2107
+ {
2108
+ "epoch": 2.2091219360843932,
2109
+ "grad_norm": 139.10708618164062,
2110
+ "learning_rate": 1.4531393726292826e-06,
2111
+ "loss": 0.5248,
2112
+ "step": 1335
2113
+ },
2114
+ {
2115
+ "epoch": 2.2173958010135486,
2116
+ "grad_norm": 122.25520324707031,
2117
+ "learning_rate": 1.4245465101964164e-06,
2118
+ "loss": 0.5201,
2119
+ "step": 1340
2120
+ },
2121
+ {
2122
+ "epoch": 2.2256696659427035,
2123
+ "grad_norm": 68.17268371582031,
2124
+ "learning_rate": 1.3961847636564672e-06,
2125
+ "loss": 0.532,
2126
+ "step": 1345
2127
+ },
2128
+ {
2129
+ "epoch": 2.2339435308718585,
2130
+ "grad_norm": 88.55684661865234,
2131
+ "learning_rate": 1.3680562643537693e-06,
2132
+ "loss": 0.4773,
2133
+ "step": 1350
2134
+ },
2135
+ {
2136
+ "epoch": 2.2339435308718585,
2137
+ "eval_accuracy": 0.6874418604651162,
2138
+ "eval_loss": 0.6236450672149658,
2139
+ "eval_runtime": 139.249,
2140
+ "eval_samples_per_second": 7.72,
2141
+ "eval_steps_per_second": 7.72,
2142
+ "step": 1350
2143
+ },
2144
+ {
2145
+ "epoch": 2.2422173958010134,
2146
+ "grad_norm": 67.30037689208984,
2147
+ "learning_rate": 1.3401631261044664e-06,
2148
+ "loss": 0.5876,
2149
+ "step": 1355
2150
+ },
2151
+ {
2152
+ "epoch": 2.2504912607301684,
2153
+ "grad_norm": 66.93975067138672,
2154
+ "learning_rate": 1.3125074450376577e-06,
2155
+ "loss": 0.5292,
2156
+ "step": 1360
2157
+ },
2158
+ {
2159
+ "epoch": 2.2587651256593237,
2160
+ "grad_norm": 103.20861053466797,
2161
+ "learning_rate": 1.285091299437875e-06,
2162
+ "loss": 0.5353,
2163
+ "step": 1365
2164
+ },
2165
+ {
2166
+ "epoch": 2.2670389905884787,
2167
+ "grad_norm": 57.18837356567383,
2168
+ "learning_rate": 1.2579167495889117e-06,
2169
+ "loss": 0.525,
2170
+ "step": 1370
2171
+ },
2172
+ {
2173
+ "epoch": 2.2753128555176336,
2174
+ "grad_norm": 28.59246253967285,
2175
+ "learning_rate": 1.230985837618981e-06,
2176
+ "loss": 0.5938,
2177
+ "step": 1375
2178
+ },
2179
+ {
2180
+ "epoch": 2.2835867204467886,
2181
+ "grad_norm": 71.6416015625,
2182
+ "learning_rate": 1.2043005873472697e-06,
2183
+ "loss": 0.6092,
2184
+ "step": 1380
2185
+ },
2186
+ {
2187
+ "epoch": 2.291860585375944,
2188
+ "grad_norm": 89.52198028564453,
2189
+ "learning_rate": 1.1778630041318344e-06,
2190
+ "loss": 0.5926,
2191
+ "step": 1385
2192
+ },
2193
+ {
2194
+ "epoch": 2.300134450305099,
2195
+ "grad_norm": 119.52609252929688,
2196
+ "learning_rate": 1.1516750747189145e-06,
2197
+ "loss": 0.5102,
2198
+ "step": 1390
2199
+ },
2200
+ {
2201
+ "epoch": 2.308408315234254,
2202
+ "grad_norm": 27.523805618286133,
2203
+ "learning_rate": 1.125738767093626e-06,
2204
+ "loss": 0.5085,
2205
+ "step": 1395
2206
+ },
2207
+ {
2208
+ "epoch": 2.3166821801634088,
2209
+ "grad_norm": 108.35863494873047,
2210
+ "learning_rate": 1.1000560303320687e-06,
2211
+ "loss": 0.6225,
2212
+ "step": 1400
2213
+ },
2214
+ {
2215
+ "epoch": 2.3166821801634088,
2216
+ "eval_accuracy": 0.6837209302325581,
2217
+ "eval_loss": 0.6350362300872803,
2218
+ "eval_runtime": 139.1672,
2219
+ "eval_samples_per_second": 7.725,
2220
+ "eval_steps_per_second": 7.725,
2221
+ "step": 1400
2222
+ },
2223
+ {
2224
+ "epoch": 2.3249560450925637,
2225
+ "grad_norm": 29.207040786743164,
2226
+ "learning_rate": 1.0746287944548576e-06,
2227
+ "loss": 0.5416,
2228
+ "step": 1405
2229
+ },
2230
+ {
2231
+ "epoch": 2.333229910021719,
2232
+ "grad_norm": 67.55722045898438,
2233
+ "learning_rate": 1.049458970282088e-06,
2234
+ "loss": 0.5944,
2235
+ "step": 1410
2236
+ },
2237
+ {
2238
+ "epoch": 2.341503774950874,
2239
+ "grad_norm": 188.1350555419922,
2240
+ "learning_rate": 1.0245484492897327e-06,
2241
+ "loss": 0.5461,
2242
+ "step": 1415
2243
+ },
2244
+ {
2245
+ "epoch": 2.349777639880029,
2246
+ "grad_norm": 74.21896362304688,
2247
+ "learning_rate": 9.998991034675096e-07,
2248
+ "loss": 0.4793,
2249
+ "step": 1420
2250
+ },
2251
+ {
2252
+ "epoch": 2.358051504809184,
2253
+ "grad_norm": 87.09430694580078,
2254
+ "learning_rate": 9.755127851781945e-07,
2255
+ "loss": 0.563,
2256
+ "step": 1425
2257
+ },
2258
+ {
2259
+ "epoch": 2.366325369738339,
2260
+ "grad_norm": 236.31300354003906,
2261
+ "learning_rate": 9.513913270184244e-07,
2262
+ "loss": 0.4763,
2263
+ "step": 1430
2264
+ },
2265
+ {
2266
+ "epoch": 2.3745992346674942,
2267
+ "grad_norm": 131.0776824951172,
2268
+ "learning_rate": 9.275365416809833e-07,
2269
+ "loss": 0.5199,
2270
+ "step": 1435
2271
+ },
2272
+ {
2273
+ "epoch": 2.382873099596649,
2274
+ "grad_norm": 52.74057388305664,
2275
+ "learning_rate": 9.039502218185748e-07,
2276
+ "loss": 0.6553,
2277
+ "step": 1440
2278
+ },
2279
+ {
2280
+ "epoch": 2.391146964525804,
2281
+ "grad_norm": 94.90296936035156,
2282
+ "learning_rate": 8.806341399091078e-07,
2283
+ "loss": 0.5441,
2284
+ "step": 1445
2285
+ },
2286
+ {
2287
+ "epoch": 2.399420829454959,
2288
+ "grad_norm": 40.44979476928711,
2289
+ "learning_rate": 8.575900481225027e-07,
2290
+ "loss": 0.5655,
2291
+ "step": 1450
2292
+ },
2293
+ {
2294
+ "epoch": 2.399420829454959,
2295
+ "eval_accuracy": 0.6893023255813954,
2296
+ "eval_loss": 0.6249026656150818,
2297
+ "eval_runtime": 135.6342,
2298
+ "eval_samples_per_second": 7.926,
2299
+ "eval_steps_per_second": 7.926,
2300
+ "step": 1450
2301
+ },
2302
+ {
2303
+ "epoch": 2.407694694384114,
2304
+ "grad_norm": 45.417179107666016,
2305
+ "learning_rate": 8.348196781890096e-07,
2306
+ "loss": 0.5787,
2307
+ "step": 1455
2308
+ },
2309
+ {
2310
+ "epoch": 2.4159685593132694,
2311
+ "grad_norm": 120.42521667480469,
2312
+ "learning_rate": 8.123247412690823e-07,
2313
+ "loss": 0.5079,
2314
+ "step": 1460
2315
+ },
2316
+ {
2317
+ "epoch": 2.4242424242424243,
2318
+ "grad_norm": 41.423912048339844,
2319
+ "learning_rate": 7.901069278247756e-07,
2320
+ "loss": 0.6137,
2321
+ "step": 1465
2322
+ },
2323
+ {
2324
+ "epoch": 2.4325162891715792,
2325
+ "grad_norm": 73.1041259765625,
2326
+ "learning_rate": 7.681679074927166e-07,
2327
+ "loss": 0.5198,
2328
+ "step": 1470
2329
+ },
2330
+ {
2331
+ "epoch": 2.440790154100734,
2332
+ "grad_norm": 129.06178283691406,
2333
+ "learning_rate": 7.465093289586345e-07,
2334
+ "loss": 0.5538,
2335
+ "step": 1475
2336
+ },
2337
+ {
2338
+ "epoch": 2.4490640190298896,
2339
+ "grad_norm": 171.7907257080078,
2340
+ "learning_rate": 7.25132819833459e-07,
2341
+ "loss": 0.5026,
2342
+ "step": 1480
2343
+ },
2344
+ {
2345
+ "epoch": 2.4573378839590445,
2346
+ "grad_norm": 85.45380401611328,
2347
+ "learning_rate": 7.04039986531011e-07,
2348
+ "loss": 0.6185,
2349
+ "step": 1485
2350
+ },
2351
+ {
2352
+ "epoch": 2.4656117488881995,
2353
+ "grad_norm": 139.45574951171875,
2354
+ "learning_rate": 6.832324141472841e-07,
2355
+ "loss": 0.5379,
2356
+ "step": 1490
2357
+ },
2358
+ {
2359
+ "epoch": 2.4738856138173544,
2360
+ "grad_norm": 150.1260986328125,
2361
+ "learning_rate": 6.627116663413249e-07,
2362
+ "loss": 0.5337,
2363
+ "step": 1495
2364
+ },
2365
+ {
2366
+ "epoch": 2.4821594787465093,
2367
+ "grad_norm": 62.410194396972656,
2368
+ "learning_rate": 6.424792852177275e-07,
2369
+ "loss": 0.5692,
2370
+ "step": 1500
2371
+ },
2372
+ {
2373
+ "epoch": 2.4821594787465093,
2374
+ "eval_accuracy": 0.6883720930232559,
2375
+ "eval_loss": 0.6257066130638123,
2376
+ "eval_runtime": 137.001,
2377
+ "eval_samples_per_second": 7.847,
2378
+ "eval_steps_per_second": 7.847,
2379
+ "step": 1500
2380
+ }
2381
+ ],
2382
+ "logging_steps": 5,
2383
+ "max_steps": 1812,
2384
+ "num_input_tokens_seen": 0,
2385
+ "num_train_epochs": 3,
2386
+ "save_steps": 500,
2387
+ "total_flos": 5578659201024000.0,
2388
+ "train_batch_size": 1,
2389
+ "trial_name": null,
2390
+ "trial_params": null
2391
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2419f00512ba281543dd41fa6de85621e29bc67e3e2c05212bd1146dae1f3d56
3
+ size 5048