emendes3 committed on
Commit
b4ce372
1 Parent(s): cce797b

Model save

Browse files
README.md CHANGED
@@ -1,36 +1,19 @@
1
  ---
2
  library_name: peft
3
  tags:
4
- - liuhaotian/llava-v1.5-13b_10.0_602276
5
- - liuhaotian/llava-v1.5-13b_9.0_602276
6
- - liuhaotian/llava-v1.5-13b_8.0_602276
7
- - liuhaotian/llava-v1.5-13b_7.0_602276
8
- - liuhaotian/llava-v1.5-13b_6.0_602276
9
- - liuhaotian/llava-v1.5-13b_5.0_602276
10
- - liuhaotian/llava-v1.5-13b_4.0_602276
11
- - liuhaotian/llava-v1.5-13b_3.0_602276
12
- - liuhaotian/llava-v1.5-13b_2.0_602276
13
- - liuhaotian/llava-v1.5-13b_1.0_602276
14
  - generated_from_trainer
15
  base_model: liuhaotian/llava-v1.5-13b
16
  model-index:
17
- - name: liuhaotian/llava-v1.5-13b_10.0_602276
18
  results: []
19
  ---
20
 
21
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
22
  should probably proofread and complete it, then remove this comment. -->
23
 
24
- # liuhaotian/llava-v1.5-13b_10.0_602276
25
 
26
- This model is a fine-tuned version of [liuhaotian/llava-v1.5-13b_10.0_602276](https://huggingface.co/liuhaotian/llava-v1.5-13b_10.0_602276) on an unknown dataset.
27
- It achieves the following results on the evaluation set:
28
- - eval_loss: 0.0022
29
- - eval_runtime: 55.1607
30
- - eval_samples_per_second: 15.718
31
- - eval_steps_per_second: 0.508
32
- - epoch: 9.0
33
- - step: 252
34
 
35
  ## Model description
36
 
 
1
  ---
2
  library_name: peft
3
  tags:
 
 
 
 
 
 
 
 
 
 
4
  - generated_from_trainer
5
  base_model: liuhaotian/llava-v1.5-13b
6
  model-index:
7
+ - name: llava_13b_exact_location_name_synthetic
8
  results: []
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
  should probably proofread and complete it, then remove this comment. -->
13
 
14
+ # llava_13b_exact_location_name_synthetic
15
 
16
+ This model is a fine-tuned version of [liuhaotian/llava-v1.5-13b](https://huggingface.co/liuhaotian/llava-v1.5-13b) on an unknown dataset.
 
 
 
 
 
 
 
17
 
18
  ## Model description
19
 
adapter_config.json CHANGED
@@ -21,12 +21,12 @@
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
24
- "down_proj",
25
- "v_proj",
26
  "up_proj",
27
- "gate_proj",
28
  "o_proj",
29
- "q_proj"
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
 
 
24
  "up_proj",
 
25
  "o_proj",
26
+ "q_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
+ "v_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1ddaa241787f6409e37a5f48e2f063a18dd9b022874902be759093643f4d6ee
3
  size 1001466944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1b2582b692b13611e3505defb6e50477ccb75c89fbdcca86cf157e1a7ff687f
3
  size 1001466944
liuhaotian/llava-v1.5-13b_1.0/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: liuhaotian/llava-v1.5-13b
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
liuhaotian/llava-v1.5-13b_1.0/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "liuhaotian/llava-v1.5-13b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 256,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 128,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "up_proj",
25
+ "o_proj",
26
+ "q_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
+ "v_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
liuhaotian/llava-v1.5-13b_1.0/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1b2582b692b13611e3505defb6e50477ccb75c89fbdcca86cf157e1a7ff687f
3
+ size 1001466944
liuhaotian/llava-v1.5-13b_1.0/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
liuhaotian/llava-v1.5-13b_1.0/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
liuhaotian/llava-v1.5-13b_1.0/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "bos_token": "<s>",
31
+ "clean_up_tokenization_spaces": false,
32
+ "eos_token": "</s>",
33
+ "legacy": false,
34
+ "model_max_length": 2048,
35
+ "pad_token": "<unk>",
36
+ "padding_side": "right",
37
+ "sp_model_kwargs": {},
38
+ "spaces_between_special_tokens": false,
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
liuhaotian/llava-v1.5-13b_1.0/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8e7002d9da22439cd2142660ed0913d77b3af07a95d83f820f2b137e4beb14e
3
+ size 6840
trainer_state.json CHANGED
@@ -17,145 +17,145 @@
17
  {
18
  "epoch": 0.07,
19
  "learning_rate": 6.309297535714573e-05,
20
- "loss": 1.202,
21
  "step": 2
22
  },
23
  {
24
  "epoch": 0.11,
25
  "learning_rate": 0.0001,
26
- "loss": 1.1727,
27
  "step": 3
28
  },
29
  {
30
  "epoch": 0.14,
31
  "learning_rate": 0.00012618595071429146,
32
- "loss": 1.1316,
33
  "step": 4
34
  },
35
  {
36
  "epoch": 0.18,
37
  "learning_rate": 0.0001464973520717927,
38
- "loss": 1.0778,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.21,
43
  "learning_rate": 0.00016309297535714573,
44
- "loss": 1.0263,
45
  "step": 6
46
  },
47
  {
48
  "epoch": 0.25,
49
  "learning_rate": 0.00017712437491614223,
50
- "loss": 0.9944,
51
  "step": 7
52
  },
53
  {
54
  "epoch": 0.29,
55
  "learning_rate": 0.0001892789260714372,
56
- "loss": 0.9553,
57
  "step": 8
58
  },
59
  {
60
  "epoch": 0.32,
61
  "learning_rate": 0.0002,
62
- "loss": 0.9323,
63
  "step": 9
64
  },
65
  {
66
  "epoch": 0.36,
67
  "learning_rate": 0.0002,
68
- "loss": 0.95,
69
  "step": 10
70
  },
71
  {
72
  "epoch": 0.39,
73
  "learning_rate": 0.0002,
74
- "loss": 0.881,
75
  "step": 11
76
  },
77
  {
78
  "epoch": 0.43,
79
  "learning_rate": 0.0002,
80
- "loss": 0.844,
81
  "step": 12
82
  },
83
  {
84
  "epoch": 0.46,
85
  "learning_rate": 0.0002,
86
- "loss": 0.8188,
87
  "step": 13
88
  },
89
  {
90
  "epoch": 0.5,
91
  "learning_rate": 0.0002,
92
- "loss": 0.7805,
93
  "step": 14
94
  },
95
  {
96
  "epoch": 0.54,
97
  "learning_rate": 0.0002,
98
- "loss": 0.7746,
99
  "step": 15
100
  },
101
  {
102
  "epoch": 0.57,
103
  "learning_rate": 0.0002,
104
- "loss": 0.7301,
105
  "step": 16
106
  },
107
  {
108
  "epoch": 0.61,
109
  "learning_rate": 0.0002,
110
- "loss": 0.6493,
111
  "step": 17
112
  },
113
  {
114
  "epoch": 0.64,
115
  "learning_rate": 0.0002,
116
- "loss": 0.7035,
117
  "step": 18
118
  },
119
  {
120
  "epoch": 0.68,
121
  "learning_rate": 0.0002,
122
- "loss": 0.6117,
123
  "step": 19
124
  },
125
  {
126
  "epoch": 0.71,
127
  "learning_rate": 0.0002,
128
- "loss": 0.6592,
129
  "step": 20
130
  },
131
  {
132
  "epoch": 0.75,
133
  "learning_rate": 0.0002,
134
- "loss": 0.5196,
135
  "step": 21
136
  },
137
  {
138
  "epoch": 0.79,
139
  "learning_rate": 0.0002,
140
- "loss": 0.4843,
141
  "step": 22
142
  },
143
  {
144
  "epoch": 0.82,
145
  "learning_rate": 0.0002,
146
- "loss": 0.4495,
147
  "step": 23
148
  },
149
  {
150
  "epoch": 0.86,
151
  "learning_rate": 0.0002,
152
- "loss": 0.3754,
153
  "step": 24
154
  },
155
  {
156
  "epoch": 0.89,
157
  "learning_rate": 0.0002,
158
- "loss": 0.4255,
159
  "step": 25
160
  },
161
  {
@@ -167,153 +167,153 @@
167
  {
168
  "epoch": 0.96,
169
  "learning_rate": 0.0002,
170
- "loss": 0.3981,
171
  "step": 27
172
  },
173
  {
174
  "epoch": 1.0,
175
  "learning_rate": 0.0002,
176
- "loss": 0.3214,
177
  "step": 28
178
  },
179
  {
180
  "epoch": 1.0,
181
- "eval_loss": 0.24818769097328186,
182
- "eval_runtime": 55.2393,
183
- "eval_samples_per_second": 15.695,
184
- "eval_steps_per_second": 0.507,
185
  "step": 28
186
  },
187
  {
188
  "epoch": 1.04,
189
  "learning_rate": 0.0002,
190
- "loss": 0.2308,
191
  "step": 29
192
  },
193
  {
194
  "epoch": 1.07,
195
  "learning_rate": 0.0002,
196
- "loss": 0.2436,
197
  "step": 30
198
  },
199
  {
200
  "epoch": 1.11,
201
  "learning_rate": 0.0002,
202
- "loss": 0.2235,
203
  "step": 31
204
  },
205
  {
206
  "epoch": 1.14,
207
  "learning_rate": 0.0002,
208
- "loss": 0.1936,
209
  "step": 32
210
  },
211
  {
212
  "epoch": 1.18,
213
  "learning_rate": 0.0002,
214
- "loss": 0.2112,
215
  "step": 33
216
  },
217
  {
218
  "epoch": 1.21,
219
  "learning_rate": 0.0002,
220
- "loss": 0.1917,
221
  "step": 34
222
  },
223
  {
224
  "epoch": 1.25,
225
  "learning_rate": 0.0002,
226
- "loss": 0.2321,
227
  "step": 35
228
  },
229
  {
230
  "epoch": 1.29,
231
  "learning_rate": 0.0002,
232
- "loss": 0.1529,
233
  "step": 36
234
  },
235
  {
236
  "epoch": 1.32,
237
  "learning_rate": 0.0002,
238
- "loss": 0.153,
239
  "step": 37
240
  },
241
  {
242
  "epoch": 1.36,
243
  "learning_rate": 0.0002,
244
- "loss": 0.0953,
245
  "step": 38
246
  },
247
  {
248
  "epoch": 1.39,
249
  "learning_rate": 0.0002,
250
- "loss": 0.1034,
251
  "step": 39
252
  },
253
  {
254
  "epoch": 1.43,
255
  "learning_rate": 0.0002,
256
- "loss": 0.1744,
257
  "step": 40
258
  },
259
  {
260
  "epoch": 1.46,
261
  "learning_rate": 0.0002,
262
- "loss": 0.114,
263
  "step": 41
264
  },
265
  {
266
  "epoch": 1.5,
267
  "learning_rate": 0.0002,
268
- "loss": 0.1056,
269
  "step": 42
270
  },
271
  {
272
  "epoch": 1.54,
273
  "learning_rate": 0.0002,
274
- "loss": 0.1304,
275
  "step": 43
276
  },
277
  {
278
  "epoch": 1.57,
279
  "learning_rate": 0.0002,
280
- "loss": 0.1312,
281
  "step": 44
282
  },
283
  {
284
  "epoch": 1.61,
285
  "learning_rate": 0.0002,
286
- "loss": 0.0674,
287
  "step": 45
288
  },
289
  {
290
  "epoch": 1.64,
291
  "learning_rate": 0.0002,
292
- "loss": 0.0739,
293
  "step": 46
294
  },
295
  {
296
  "epoch": 1.68,
297
  "learning_rate": 0.0002,
298
- "loss": 0.0908,
299
  "step": 47
300
  },
301
  {
302
  "epoch": 1.71,
303
  "learning_rate": 0.0002,
304
- "loss": 0.1123,
305
  "step": 48
306
  },
307
  {
308
  "epoch": 1.75,
309
  "learning_rate": 0.0002,
310
- "loss": 0.0984,
311
  "step": 49
312
  },
313
  {
314
  "epoch": 1.79,
315
  "learning_rate": 0.0002,
316
- "loss": 0.0546,
317
  "step": 50
318
  },
319
  {
@@ -325,457 +325,457 @@
325
  {
326
  "epoch": 1.86,
327
  "learning_rate": 0.0002,
328
- "loss": 0.0477,
329
  "step": 52
330
  },
331
  {
332
  "epoch": 1.89,
333
  "learning_rate": 0.0002,
334
- "loss": 0.1093,
335
  "step": 53
336
  },
337
  {
338
  "epoch": 1.93,
339
  "learning_rate": 0.0002,
340
- "loss": 0.1574,
341
  "step": 54
342
  },
343
  {
344
  "epoch": 1.96,
345
  "learning_rate": 0.0002,
346
- "loss": 0.1059,
347
  "step": 55
348
  },
349
  {
350
  "epoch": 2.0,
351
  "learning_rate": 0.0002,
352
- "loss": 0.0395,
353
  "step": 56
354
  },
355
  {
356
  "epoch": 2.0,
357
- "eval_loss": 0.04451237618923187,
358
- "eval_runtime": 55.2461,
359
- "eval_samples_per_second": 15.693,
360
- "eval_steps_per_second": 0.507,
361
  "step": 56
362
  },
363
  {
364
  "epoch": 2.04,
365
  "learning_rate": 0.0002,
366
- "loss": 0.0381,
367
  "step": 57
368
  },
369
  {
370
  "epoch": 2.07,
371
  "learning_rate": 0.0002,
372
- "loss": 0.0298,
373
  "step": 58
374
  },
375
  {
376
  "epoch": 2.11,
377
  "learning_rate": 0.0002,
378
- "loss": 0.0321,
379
  "step": 59
380
  },
381
  {
382
  "epoch": 2.14,
383
  "learning_rate": 0.0002,
384
- "loss": 0.0437,
385
  "step": 60
386
  },
387
  {
388
  "epoch": 2.18,
389
  "learning_rate": 0.0002,
390
- "loss": 0.0272,
391
  "step": 61
392
  },
393
  {
394
  "epoch": 2.21,
395
  "learning_rate": 0.0002,
396
- "loss": 0.0381,
397
  "step": 62
398
  },
399
  {
400
  "epoch": 2.25,
401
  "learning_rate": 0.0002,
402
- "loss": 0.0266,
403
  "step": 63
404
  },
405
  {
406
  "epoch": 2.29,
407
  "learning_rate": 0.0002,
408
- "loss": 0.0228,
409
  "step": 64
410
  },
411
  {
412
  "epoch": 2.32,
413
  "learning_rate": 0.0002,
414
- "loss": 0.029,
415
  "step": 65
416
  },
417
  {
418
  "epoch": 2.36,
419
  "learning_rate": 0.0002,
420
- "loss": 0.0288,
421
  "step": 66
422
  },
423
  {
424
  "epoch": 2.39,
425
  "learning_rate": 0.0002,
426
- "loss": 0.0155,
427
  "step": 67
428
  },
429
  {
430
  "epoch": 2.43,
431
  "learning_rate": 0.0002,
432
- "loss": 0.0294,
433
  "step": 68
434
  },
435
  {
436
  "epoch": 2.46,
437
  "learning_rate": 0.0002,
438
- "loss": 0.0368,
439
  "step": 69
440
  },
441
  {
442
  "epoch": 2.5,
443
  "learning_rate": 0.0002,
444
- "loss": 0.0278,
445
  "step": 70
446
  },
447
  {
448
  "epoch": 2.54,
449
  "learning_rate": 0.0002,
450
- "loss": 0.0194,
451
  "step": 71
452
  },
453
  {
454
  "epoch": 2.57,
455
  "learning_rate": 0.0002,
456
- "loss": 0.0216,
457
  "step": 72
458
  },
459
  {
460
  "epoch": 2.61,
461
  "learning_rate": 0.0002,
462
- "loss": 0.0142,
463
  "step": 73
464
  },
465
  {
466
  "epoch": 2.64,
467
  "learning_rate": 0.0002,
468
- "loss": 0.0362,
469
  "step": 74
470
  },
471
  {
472
  "epoch": 2.68,
473
  "learning_rate": 0.0002,
474
- "loss": 0.0279,
475
  "step": 75
476
  },
477
  {
478
  "epoch": 2.71,
479
  "learning_rate": 0.0002,
480
- "loss": 0.0171,
481
  "step": 76
482
  },
483
  {
484
  "epoch": 2.75,
485
  "learning_rate": 0.0002,
486
- "loss": 0.0126,
487
  "step": 77
488
  },
489
  {
490
  "epoch": 2.79,
491
  "learning_rate": 0.0002,
492
- "loss": 0.026,
493
  "step": 78
494
  },
495
  {
496
  "epoch": 2.82,
497
  "learning_rate": 0.0002,
498
- "loss": 0.03,
499
  "step": 79
500
  },
501
  {
502
  "epoch": 2.86,
503
  "learning_rate": 0.0002,
504
- "loss": 0.0253,
505
  "step": 80
506
  },
507
  {
508
  "epoch": 2.89,
509
  "learning_rate": 0.0002,
510
- "loss": 0.0229,
511
  "step": 81
512
  },
513
  {
514
  "epoch": 2.93,
515
  "learning_rate": 0.0002,
516
- "loss": 0.0245,
517
  "step": 82
518
  },
519
  {
520
  "epoch": 2.96,
521
  "learning_rate": 0.0002,
522
- "loss": 0.0309,
523
  "step": 83
524
  },
525
  {
526
  "epoch": 3.0,
527
  "learning_rate": 0.0002,
528
- "loss": 0.0125,
529
  "step": 84
530
  },
531
  {
532
  "epoch": 3.0,
533
- "eval_loss": 0.012688164599239826,
534
- "eval_runtime": 55.285,
535
- "eval_samples_per_second": 15.682,
536
- "eval_steps_per_second": 0.506,
537
  "step": 84
538
  },
539
  {
540
  "epoch": 3.04,
541
  "learning_rate": 0.0002,
542
- "loss": 0.0061,
543
  "step": 85
544
  },
545
  {
546
  "epoch": 3.07,
547
  "learning_rate": 0.0002,
548
- "loss": 0.012,
549
  "step": 86
550
  },
551
  {
552
  "epoch": 3.11,
553
  "learning_rate": 0.0002,
554
- "loss": 0.0101,
555
  "step": 87
556
  },
557
  {
558
  "epoch": 3.14,
559
  "learning_rate": 0.0002,
560
- "loss": 0.0051,
561
  "step": 88
562
  },
563
  {
564
  "epoch": 3.18,
565
  "learning_rate": 0.0002,
566
- "loss": 0.0099,
567
  "step": 89
568
  },
569
  {
570
  "epoch": 3.21,
571
  "learning_rate": 0.0002,
572
- "loss": 0.0128,
573
  "step": 90
574
  },
575
  {
576
  "epoch": 3.25,
577
  "learning_rate": 0.0002,
578
- "loss": 0.0107,
579
  "step": 91
580
  },
581
  {
582
  "epoch": 3.29,
583
  "learning_rate": 0.0002,
584
- "loss": 0.0051,
585
  "step": 92
586
  },
587
  {
588
  "epoch": 3.32,
589
  "learning_rate": 0.0002,
590
- "loss": 0.0053,
591
  "step": 93
592
  },
593
  {
594
  "epoch": 3.36,
595
  "learning_rate": 0.0002,
596
- "loss": 0.0072,
597
  "step": 94
598
  },
599
  {
600
  "epoch": 3.39,
601
  "learning_rate": 0.0002,
602
- "loss": 0.0105,
603
  "step": 95
604
  },
605
  {
606
  "epoch": 3.43,
607
  "learning_rate": 0.0002,
608
- "loss": 0.0118,
609
  "step": 96
610
  },
611
  {
612
  "epoch": 3.46,
613
  "learning_rate": 0.0002,
614
- "loss": 0.0087,
615
  "step": 97
616
  },
617
  {
618
  "epoch": 3.5,
619
  "learning_rate": 0.0002,
620
- "loss": 0.0117,
621
  "step": 98
622
  },
623
  {
624
  "epoch": 3.54,
625
  "learning_rate": 0.0002,
626
- "loss": 0.0174,
627
  "step": 99
628
  },
629
  {
630
  "epoch": 3.57,
631
  "learning_rate": 0.0002,
632
- "loss": 0.0053,
633
  "step": 100
634
  },
635
  {
636
  "epoch": 3.61,
637
  "learning_rate": 0.0002,
638
- "loss": 0.0189,
639
  "step": 101
640
  },
641
  {
642
  "epoch": 3.64,
643
  "learning_rate": 0.0002,
644
- "loss": 0.0093,
645
  "step": 102
646
  },
647
  {
648
  "epoch": 3.68,
649
  "learning_rate": 0.0002,
650
- "loss": 0.0142,
651
  "step": 103
652
  },
653
  {
654
  "epoch": 3.71,
655
  "learning_rate": 0.0002,
656
- "loss": 0.0151,
657
  "step": 104
658
  },
659
  {
660
  "epoch": 3.75,
661
  "learning_rate": 0.0002,
662
- "loss": 0.0162,
663
  "step": 105
664
  },
665
  {
666
  "epoch": 3.79,
667
  "learning_rate": 0.0002,
668
- "loss": 0.0134,
669
  "step": 106
670
  },
671
  {
672
  "epoch": 3.82,
673
  "learning_rate": 0.0002,
674
- "loss": 0.0071,
675
  "step": 107
676
  },
677
  {
678
  "epoch": 3.86,
679
  "learning_rate": 0.0002,
680
- "loss": 0.0113,
681
  "step": 108
682
  },
683
  {
684
  "epoch": 3.89,
685
  "learning_rate": 0.0002,
686
- "loss": 0.0084,
687
  "step": 109
688
  },
689
  {
690
  "epoch": 3.93,
691
  "learning_rate": 0.0002,
692
- "loss": 0.0171,
693
  "step": 110
694
  },
695
  {
696
  "epoch": 3.96,
697
  "learning_rate": 0.0002,
698
- "loss": 0.0032,
699
  "step": 111
700
  },
701
  {
702
  "epoch": 4.0,
703
  "learning_rate": 0.0002,
704
- "loss": 0.0046,
705
  "step": 112
706
  },
707
  {
708
  "epoch": 4.0,
709
- "eval_loss": 0.005700697656720877,
710
- "eval_runtime": 55.3607,
711
- "eval_samples_per_second": 15.661,
712
- "eval_steps_per_second": 0.506,
713
  "step": 112
714
  },
715
  {
716
  "epoch": 4.04,
717
  "learning_rate": 0.0002,
718
- "loss": 0.0057,
719
  "step": 113
720
  },
721
  {
722
  "epoch": 4.07,
723
  "learning_rate": 0.0002,
724
- "loss": 0.0052,
725
  "step": 114
726
  },
727
  {
728
  "epoch": 4.11,
729
  "learning_rate": 0.0002,
730
- "loss": 0.0047,
731
  "step": 115
732
  },
733
  {
734
  "epoch": 4.14,
735
  "learning_rate": 0.0002,
736
- "loss": 0.0073,
737
  "step": 116
738
  },
739
  {
740
  "epoch": 4.18,
741
  "learning_rate": 0.0002,
742
- "loss": 0.0047,
743
  "step": 117
744
  },
745
  {
746
  "epoch": 4.21,
747
  "learning_rate": 0.0002,
748
- "loss": 0.0061,
749
  "step": 118
750
  },
751
  {
752
  "epoch": 4.25,
753
  "learning_rate": 0.0002,
754
- "loss": 0.0106,
755
  "step": 119
756
  },
757
  {
758
  "epoch": 4.29,
759
  "learning_rate": 0.0002,
760
- "loss": 0.0082,
761
  "step": 120
762
  },
763
  {
764
  "epoch": 4.32,
765
  "learning_rate": 0.0002,
766
- "loss": 0.006,
767
  "step": 121
768
  },
769
  {
770
  "epoch": 4.36,
771
  "learning_rate": 0.0002,
772
- "loss": 0.0059,
773
  "step": 122
774
  },
775
  {
776
  "epoch": 4.39,
777
  "learning_rate": 0.0002,
778
- "loss": 0.0064,
779
  "step": 123
780
  },
781
  {
@@ -787,995 +787,995 @@
787
  {
788
  "epoch": 4.46,
789
  "learning_rate": 0.0002,
790
- "loss": 0.0075,
791
  "step": 125
792
  },
793
  {
794
  "epoch": 4.5,
795
  "learning_rate": 0.0002,
796
- "loss": 0.0051,
797
  "step": 126
798
  },
799
  {
800
  "epoch": 4.54,
801
  "learning_rate": 0.0002,
802
- "loss": 0.0024,
803
  "step": 127
804
  },
805
  {
806
  "epoch": 4.57,
807
  "learning_rate": 0.0002,
808
- "loss": 0.0034,
809
  "step": 128
810
  },
811
  {
812
  "epoch": 4.61,
813
  "learning_rate": 0.0002,
814
- "loss": 0.0039,
815
  "step": 129
816
  },
817
  {
818
  "epoch": 4.64,
819
  "learning_rate": 0.0002,
820
- "loss": 0.0039,
821
  "step": 130
822
  },
823
  {
824
  "epoch": 4.68,
825
  "learning_rate": 0.0002,
826
- "loss": 0.0035,
827
  "step": 131
828
  },
829
  {
830
  "epoch": 4.71,
831
  "learning_rate": 0.0002,
832
- "loss": 0.0033,
833
  "step": 132
834
  },
835
  {
836
  "epoch": 4.75,
837
  "learning_rate": 0.0002,
838
- "loss": 0.0088,
839
  "step": 133
840
  },
841
  {
842
  "epoch": 4.79,
843
  "learning_rate": 0.0002,
844
- "loss": 0.0022,
845
  "step": 134
846
  },
847
  {
848
  "epoch": 4.82,
849
  "learning_rate": 0.0002,
850
- "loss": 0.0049,
851
  "step": 135
852
  },
853
  {
854
  "epoch": 4.86,
855
  "learning_rate": 0.0002,
856
- "loss": 0.0036,
857
  "step": 136
858
  },
859
  {
860
  "epoch": 4.89,
861
  "learning_rate": 0.0002,
862
- "loss": 0.0035,
863
  "step": 137
864
  },
865
  {
866
  "epoch": 4.93,
867
  "learning_rate": 0.0002,
868
- "loss": 0.0077,
869
  "step": 138
870
  },
871
  {
872
  "epoch": 4.96,
873
  "learning_rate": 0.0002,
874
- "loss": 0.0046,
875
  "step": 139
876
  },
877
  {
878
  "epoch": 5.0,
879
  "learning_rate": 0.0002,
880
- "loss": 0.0044,
881
  "step": 140
882
  },
883
  {
884
  "epoch": 5.0,
885
- "eval_loss": 0.0035695512779057026,
886
- "eval_runtime": 55.238,
887
- "eval_samples_per_second": 15.696,
888
- "eval_steps_per_second": 0.507,
889
  "step": 140
890
  },
891
  {
892
  "epoch": 5.04,
893
  "learning_rate": 0.0002,
894
- "loss": 0.0013,
895
  "step": 141
896
  },
897
  {
898
  "epoch": 5.07,
899
  "learning_rate": 0.0002,
900
- "loss": 0.0029,
901
  "step": 142
902
  },
903
  {
904
  "epoch": 5.11,
905
  "learning_rate": 0.0002,
906
- "loss": 0.0056,
907
  "step": 143
908
  },
909
  {
910
  "epoch": 5.14,
911
  "learning_rate": 0.0002,
912
- "loss": 0.0072,
913
  "step": 144
914
  },
915
  {
916
  "epoch": 5.18,
917
  "learning_rate": 0.0002,
918
- "loss": 0.0051,
919
  "step": 145
920
  },
921
  {
922
  "epoch": 5.21,
923
  "learning_rate": 0.0002,
924
- "loss": 0.0046,
925
  "step": 146
926
  },
927
  {
928
  "epoch": 5.25,
929
  "learning_rate": 0.0002,
930
- "loss": 0.0062,
931
  "step": 147
932
  },
933
  {
934
  "epoch": 5.29,
935
  "learning_rate": 0.0002,
936
- "loss": 0.0029,
937
  "step": 148
938
  },
939
  {
940
  "epoch": 5.32,
941
  "learning_rate": 0.0002,
942
- "loss": 0.0088,
943
  "step": 149
944
  },
945
  {
946
  "epoch": 5.36,
947
  "learning_rate": 0.0002,
948
- "loss": 0.0035,
949
  "step": 150
950
  },
951
  {
952
  "epoch": 5.39,
953
  "learning_rate": 0.0002,
954
- "loss": 0.0019,
955
  "step": 151
956
  },
957
  {
958
  "epoch": 5.43,
959
  "learning_rate": 0.0002,
960
- "loss": 0.0021,
961
  "step": 152
962
  },
963
  {
964
  "epoch": 5.46,
965
  "learning_rate": 0.0002,
966
- "loss": 0.0049,
967
  "step": 153
968
  },
969
  {
970
  "epoch": 5.5,
971
  "learning_rate": 0.0002,
972
- "loss": 0.0034,
973
  "step": 154
974
  },
975
  {
976
  "epoch": 5.54,
977
  "learning_rate": 0.0002,
978
- "loss": 0.0037,
979
  "step": 155
980
  },
981
  {
982
  "epoch": 5.57,
983
  "learning_rate": 0.0002,
984
- "loss": 0.0059,
985
  "step": 156
986
  },
987
  {
988
  "epoch": 5.61,
989
  "learning_rate": 0.0002,
990
- "loss": 0.0055,
991
  "step": 157
992
  },
993
  {
994
  "epoch": 5.64,
995
  "learning_rate": 0.0002,
996
- "loss": 0.004,
997
  "step": 158
998
  },
999
  {
1000
  "epoch": 5.68,
1001
  "learning_rate": 0.0002,
1002
- "loss": 0.0043,
1003
  "step": 159
1004
  },
1005
  {
1006
  "epoch": 5.71,
1007
  "learning_rate": 0.0002,
1008
- "loss": 0.0051,
1009
  "step": 160
1010
  },
1011
  {
1012
  "epoch": 5.75,
1013
  "learning_rate": 0.0002,
1014
- "loss": 0.0054,
1015
  "step": 161
1016
  },
1017
  {
1018
  "epoch": 5.79,
1019
  "learning_rate": 0.0002,
1020
- "loss": 0.0046,
1021
  "step": 162
1022
  },
1023
  {
1024
  "epoch": 5.82,
1025
  "learning_rate": 0.0002,
1026
- "loss": 0.0062,
1027
  "step": 163
1028
  },
1029
  {
1030
  "epoch": 5.86,
1031
  "learning_rate": 0.0002,
1032
- "loss": 0.0081,
1033
  "step": 164
1034
  },
1035
  {
1036
  "epoch": 5.89,
1037
  "learning_rate": 0.0002,
1038
- "loss": 0.0053,
1039
  "step": 165
1040
  },
1041
  {
1042
  "epoch": 5.93,
1043
  "learning_rate": 0.0002,
1044
- "loss": 0.0057,
1045
  "step": 166
1046
  },
1047
  {
1048
  "epoch": 5.96,
1049
  "learning_rate": 0.0002,
1050
- "loss": 0.01,
1051
  "step": 167
1052
  },
1053
  {
1054
  "epoch": 6.0,
1055
  "learning_rate": 0.0002,
1056
- "loss": 0.0044,
1057
  "step": 168
1058
  },
1059
  {
1060
  "epoch": 6.0,
1061
- "eval_loss": 0.004916314501315355,
1062
- "eval_runtime": 55.2584,
1063
- "eval_samples_per_second": 15.69,
1064
- "eval_steps_per_second": 0.507,
1065
  "step": 168
1066
  },
1067
  {
1068
  "epoch": 6.04,
1069
  "learning_rate": 0.0002,
1070
- "loss": 0.0035,
1071
  "step": 169
1072
  },
1073
  {
1074
  "epoch": 6.07,
1075
  "learning_rate": 0.0002,
1076
- "loss": 0.0051,
1077
  "step": 170
1078
  },
1079
  {
1080
  "epoch": 6.11,
1081
  "learning_rate": 0.0002,
1082
- "loss": 0.0036,
1083
  "step": 171
1084
  },
1085
  {
1086
  "epoch": 6.14,
1087
  "learning_rate": 0.0002,
1088
- "loss": 0.0032,
1089
  "step": 172
1090
  },
1091
  {
1092
  "epoch": 6.18,
1093
  "learning_rate": 0.0002,
1094
- "loss": 0.0078,
1095
  "step": 173
1096
  },
1097
  {
1098
  "epoch": 6.21,
1099
  "learning_rate": 0.0002,
1100
- "loss": 0.0061,
1101
  "step": 174
1102
  },
1103
  {
1104
  "epoch": 6.25,
1105
  "learning_rate": 0.0002,
1106
- "loss": 0.0081,
1107
  "step": 175
1108
  },
1109
  {
1110
  "epoch": 6.29,
1111
  "learning_rate": 0.0002,
1112
- "loss": 0.0034,
1113
  "step": 176
1114
  },
1115
  {
1116
  "epoch": 6.32,
1117
  "learning_rate": 0.0002,
1118
- "loss": 0.0035,
1119
  "step": 177
1120
  },
1121
  {
1122
  "epoch": 6.36,
1123
  "learning_rate": 0.0002,
1124
- "loss": 0.0041,
1125
  "step": 178
1126
  },
1127
  {
1128
  "epoch": 6.39,
1129
  "learning_rate": 0.0002,
1130
- "loss": 0.003,
1131
  "step": 179
1132
  },
1133
  {
1134
  "epoch": 6.43,
1135
  "learning_rate": 0.0002,
1136
- "loss": 0.0025,
1137
  "step": 180
1138
  },
1139
  {
1140
  "epoch": 6.46,
1141
  "learning_rate": 0.0002,
1142
- "loss": 0.0032,
1143
  "step": 181
1144
  },
1145
  {
1146
  "epoch": 6.5,
1147
  "learning_rate": 0.0002,
1148
- "loss": 0.004,
1149
  "step": 182
1150
  },
1151
  {
1152
  "epoch": 6.54,
1153
  "learning_rate": 0.0002,
1154
- "loss": 0.0042,
1155
  "step": 183
1156
  },
1157
  {
1158
  "epoch": 6.57,
1159
  "learning_rate": 0.0002,
1160
- "loss": 0.0058,
1161
  "step": 184
1162
  },
1163
  {
1164
  "epoch": 6.61,
1165
  "learning_rate": 0.0002,
1166
- "loss": 0.0027,
1167
  "step": 185
1168
  },
1169
  {
1170
  "epoch": 6.64,
1171
  "learning_rate": 0.0002,
1172
- "loss": 0.0043,
1173
  "step": 186
1174
  },
1175
  {
1176
  "epoch": 6.68,
1177
  "learning_rate": 0.0002,
1178
- "loss": 0.0025,
1179
  "step": 187
1180
  },
1181
  {
1182
  "epoch": 6.71,
1183
  "learning_rate": 0.0002,
1184
- "loss": 0.0039,
1185
  "step": 188
1186
  },
1187
  {
1188
  "epoch": 6.75,
1189
  "learning_rate": 0.0002,
1190
- "loss": 0.0024,
1191
  "step": 189
1192
  },
1193
  {
1194
  "epoch": 6.79,
1195
  "learning_rate": 0.0002,
1196
- "loss": 0.0013,
1197
  "step": 190
1198
  },
1199
  {
1200
  "epoch": 6.82,
1201
  "learning_rate": 0.0002,
1202
- "loss": 0.0027,
1203
  "step": 191
1204
  },
1205
  {
1206
  "epoch": 6.86,
1207
  "learning_rate": 0.0002,
1208
- "loss": 0.0075,
1209
  "step": 192
1210
  },
1211
  {
1212
  "epoch": 6.89,
1213
  "learning_rate": 0.0002,
1214
- "loss": 0.0028,
1215
  "step": 193
1216
  },
1217
  {
1218
  "epoch": 6.93,
1219
  "learning_rate": 0.0002,
1220
- "loss": 0.0029,
1221
  "step": 194
1222
  },
1223
  {
1224
  "epoch": 6.96,
1225
  "learning_rate": 0.0002,
1226
- "loss": 0.0017,
1227
  "step": 195
1228
  },
1229
  {
1230
  "epoch": 7.0,
1231
  "learning_rate": 0.0002,
1232
- "loss": 0.0014,
1233
  "step": 196
1234
  },
1235
  {
1236
  "epoch": 7.0,
1237
- "eval_loss": 0.0020930657628923655,
1238
- "eval_runtime": 55.303,
1239
- "eval_samples_per_second": 15.677,
1240
- "eval_steps_per_second": 0.506,
1241
  "step": 196
1242
  },
1243
  {
1244
  "epoch": 7.04,
1245
  "learning_rate": 0.0002,
1246
- "loss": 0.0009,
1247
  "step": 197
1248
  },
1249
  {
1250
  "epoch": 7.07,
1251
  "learning_rate": 0.0002,
1252
- "loss": 0.0074,
1253
  "step": 198
1254
  },
1255
  {
1256
  "epoch": 7.11,
1257
  "learning_rate": 0.0002,
1258
- "loss": 0.0018,
1259
  "step": 199
1260
  },
1261
  {
1262
  "epoch": 7.14,
1263
  "learning_rate": 0.0002,
1264
- "loss": 0.0017,
1265
  "step": 200
1266
  },
1267
  {
1268
  "epoch": 7.18,
1269
  "learning_rate": 0.0002,
1270
- "loss": 0.0008,
1271
  "step": 201
1272
  },
1273
  {
1274
  "epoch": 7.21,
1275
  "learning_rate": 0.0002,
1276
- "loss": 0.0017,
1277
  "step": 202
1278
  },
1279
  {
1280
  "epoch": 7.25,
1281
  "learning_rate": 0.0002,
1282
- "loss": 0.0028,
1283
  "step": 203
1284
  },
1285
  {
1286
  "epoch": 7.29,
1287
  "learning_rate": 0.0002,
1288
- "loss": 0.0024,
1289
  "step": 204
1290
  },
1291
  {
1292
  "epoch": 7.32,
1293
  "learning_rate": 0.0002,
1294
- "loss": 0.0014,
1295
  "step": 205
1296
  },
1297
  {
1298
  "epoch": 7.36,
1299
  "learning_rate": 0.0002,
1300
- "loss": 0.0059,
1301
  "step": 206
1302
  },
1303
  {
1304
  "epoch": 7.39,
1305
  "learning_rate": 0.0002,
1306
- "loss": 0.0012,
1307
  "step": 207
1308
  },
1309
  {
1310
  "epoch": 7.43,
1311
  "learning_rate": 0.0002,
1312
- "loss": 0.0019,
1313
  "step": 208
1314
  },
1315
  {
1316
  "epoch": 7.46,
1317
  "learning_rate": 0.0002,
1318
- "loss": 0.0028,
1319
  "step": 209
1320
  },
1321
  {
1322
  "epoch": 7.5,
1323
  "learning_rate": 0.0002,
1324
- "loss": 0.0025,
1325
  "step": 210
1326
  },
1327
  {
1328
  "epoch": 7.54,
1329
  "learning_rate": 0.0002,
1330
- "loss": 0.0003,
1331
  "step": 211
1332
  },
1333
  {
1334
  "epoch": 7.57,
1335
  "learning_rate": 0.0002,
1336
- "loss": 0.0006,
1337
  "step": 212
1338
  },
1339
  {
1340
  "epoch": 7.61,
1341
  "learning_rate": 0.0002,
1342
- "loss": 0.001,
1343
  "step": 213
1344
  },
1345
  {
1346
  "epoch": 7.64,
1347
  "learning_rate": 0.0002,
1348
- "loss": 0.0016,
1349
  "step": 214
1350
  },
1351
  {
1352
  "epoch": 7.68,
1353
  "learning_rate": 0.0002,
1354
- "loss": 0.0058,
1355
  "step": 215
1356
  },
1357
  {
1358
  "epoch": 7.71,
1359
  "learning_rate": 0.0002,
1360
- "loss": 0.0011,
1361
  "step": 216
1362
  },
1363
  {
1364
  "epoch": 7.75,
1365
  "learning_rate": 0.0002,
1366
- "loss": 0.0035,
1367
  "step": 217
1368
  },
1369
  {
1370
  "epoch": 7.79,
1371
  "learning_rate": 0.0002,
1372
- "loss": 0.0015,
1373
  "step": 218
1374
  },
1375
  {
1376
  "epoch": 7.82,
1377
  "learning_rate": 0.0002,
1378
- "loss": 0.0061,
1379
  "step": 219
1380
  },
1381
  {
1382
  "epoch": 7.86,
1383
  "learning_rate": 0.0002,
1384
- "loss": 0.0015,
1385
  "step": 220
1386
  },
1387
  {
1388
  "epoch": 7.89,
1389
  "learning_rate": 0.0002,
1390
- "loss": 0.0025,
1391
  "step": 221
1392
  },
1393
  {
1394
  "epoch": 7.93,
1395
  "learning_rate": 0.0002,
1396
- "loss": 0.0008,
1397
  "step": 222
1398
  },
1399
  {
1400
  "epoch": 7.96,
1401
  "learning_rate": 0.0002,
1402
- "loss": 0.0016,
1403
  "step": 223
1404
  },
1405
  {
1406
  "epoch": 8.0,
1407
  "learning_rate": 0.0002,
1408
- "loss": 0.003,
1409
  "step": 224
1410
  },
1411
  {
1412
  "epoch": 8.0,
1413
- "eval_loss": 0.0014942155685275793,
1414
- "eval_runtime": 55.2573,
1415
- "eval_samples_per_second": 15.69,
1416
- "eval_steps_per_second": 0.507,
1417
  "step": 224
1418
  },
1419
  {
1420
  "epoch": 8.04,
1421
  "learning_rate": 0.0002,
1422
- "loss": 0.0011,
1423
  "step": 225
1424
  },
1425
  {
1426
  "epoch": 8.07,
1427
  "learning_rate": 0.0002,
1428
- "loss": 0.0044,
1429
  "step": 226
1430
  },
1431
  {
1432
  "epoch": 8.11,
1433
  "learning_rate": 0.0002,
1434
- "loss": 0.0013,
1435
  "step": 227
1436
  },
1437
  {
1438
  "epoch": 8.14,
1439
  "learning_rate": 0.0002,
1440
- "loss": 0.0007,
1441
  "step": 228
1442
  },
1443
  {
1444
  "epoch": 8.18,
1445
  "learning_rate": 0.0002,
1446
- "loss": 0.0005,
1447
  "step": 229
1448
  },
1449
  {
1450
  "epoch": 8.21,
1451
  "learning_rate": 0.0002,
1452
- "loss": 0.0012,
1453
  "step": 230
1454
  },
1455
  {
1456
  "epoch": 8.25,
1457
  "learning_rate": 0.0002,
1458
- "loss": 0.0013,
1459
  "step": 231
1460
  },
1461
  {
1462
  "epoch": 8.29,
1463
  "learning_rate": 0.0002,
1464
- "loss": 0.0026,
1465
  "step": 232
1466
  },
1467
  {
1468
  "epoch": 8.32,
1469
  "learning_rate": 0.0002,
1470
- "loss": 0.0015,
1471
  "step": 233
1472
  },
1473
  {
1474
  "epoch": 8.36,
1475
  "learning_rate": 0.0002,
1476
- "loss": 0.0025,
1477
  "step": 234
1478
  },
1479
  {
1480
  "epoch": 8.39,
1481
  "learning_rate": 0.0002,
1482
- "loss": 0.0008,
1483
  "step": 235
1484
  },
1485
  {
1486
  "epoch": 8.43,
1487
  "learning_rate": 0.0002,
1488
- "loss": 0.0026,
1489
  "step": 236
1490
  },
1491
  {
1492
  "epoch": 8.46,
1493
  "learning_rate": 0.0002,
1494
- "loss": 0.0003,
1495
  "step": 237
1496
  },
1497
  {
1498
  "epoch": 8.5,
1499
  "learning_rate": 0.0002,
1500
- "loss": 0.0005,
1501
  "step": 238
1502
  },
1503
  {
1504
  "epoch": 8.54,
1505
  "learning_rate": 0.0002,
1506
- "loss": 0.0023,
1507
  "step": 239
1508
  },
1509
  {
1510
  "epoch": 8.57,
1511
  "learning_rate": 0.0002,
1512
- "loss": 0.002,
1513
  "step": 240
1514
  },
1515
  {
1516
  "epoch": 8.61,
1517
  "learning_rate": 0.0002,
1518
- "loss": 0.0017,
1519
  "step": 241
1520
  },
1521
  {
1522
  "epoch": 8.64,
1523
  "learning_rate": 0.0002,
1524
- "loss": 0.0025,
1525
  "step": 242
1526
  },
1527
  {
1528
  "epoch": 8.68,
1529
  "learning_rate": 0.0002,
1530
- "loss": 0.0006,
1531
  "step": 243
1532
  },
1533
  {
1534
  "epoch": 8.71,
1535
  "learning_rate": 0.0002,
1536
- "loss": 0.0014,
1537
  "step": 244
1538
  },
1539
  {
1540
  "epoch": 8.75,
1541
  "learning_rate": 0.0002,
1542
- "loss": 0.0018,
1543
  "step": 245
1544
  },
1545
  {
1546
  "epoch": 8.79,
1547
  "learning_rate": 0.0002,
1548
- "loss": 0.0006,
1549
  "step": 246
1550
  },
1551
  {
1552
  "epoch": 8.82,
1553
  "learning_rate": 0.0002,
1554
- "loss": 0.0012,
1555
  "step": 247
1556
  },
1557
  {
1558
  "epoch": 8.86,
1559
  "learning_rate": 0.0002,
1560
- "loss": 0.0005,
1561
  "step": 248
1562
  },
1563
  {
1564
  "epoch": 8.89,
1565
  "learning_rate": 0.0002,
1566
- "loss": 0.0007,
1567
  "step": 249
1568
  },
1569
  {
1570
  "epoch": 8.93,
1571
  "learning_rate": 0.0002,
1572
- "loss": 0.0027,
1573
  "step": 250
1574
  },
1575
  {
1576
  "epoch": 8.96,
1577
  "learning_rate": 0.0002,
1578
- "loss": 0.0018,
1579
  "step": 251
1580
  },
1581
  {
1582
  "epoch": 9.0,
1583
  "learning_rate": 0.0002,
1584
- "loss": 0.0029,
1585
  "step": 252
1586
  },
1587
  {
1588
  "epoch": 9.0,
1589
- "eval_loss": 0.0013407374499365687,
1590
- "eval_runtime": 54.9274,
1591
- "eval_samples_per_second": 15.784,
1592
- "eval_steps_per_second": 0.51,
1593
  "step": 252
1594
  },
1595
  {
1596
  "epoch": 9.04,
1597
  "learning_rate": 0.0002,
1598
- "loss": 0.0041,
1599
  "step": 253
1600
  },
1601
  {
1602
  "epoch": 9.07,
1603
  "learning_rate": 0.0002,
1604
- "loss": 0.003,
1605
  "step": 254
1606
  },
1607
  {
1608
  "epoch": 9.11,
1609
  "learning_rate": 0.0002,
1610
- "loss": 0.0004,
1611
  "step": 255
1612
  },
1613
  {
1614
  "epoch": 9.14,
1615
  "learning_rate": 0.0002,
1616
- "loss": 0.0027,
1617
  "step": 256
1618
  },
1619
  {
1620
  "epoch": 9.18,
1621
  "learning_rate": 0.0002,
1622
- "loss": 0.003,
1623
  "step": 257
1624
  },
1625
  {
1626
  "epoch": 9.21,
1627
  "learning_rate": 0.0002,
1628
- "loss": 0.0005,
1629
  "step": 258
1630
  },
1631
  {
1632
  "epoch": 9.25,
1633
  "learning_rate": 0.0002,
1634
- "loss": 0.0019,
1635
  "step": 259
1636
  },
1637
  {
1638
  "epoch": 9.29,
1639
  "learning_rate": 0.0002,
1640
- "loss": 0.0028,
1641
  "step": 260
1642
  },
1643
  {
1644
  "epoch": 9.32,
1645
  "learning_rate": 0.0002,
1646
- "loss": 0.0014,
1647
  "step": 261
1648
  },
1649
  {
1650
  "epoch": 9.36,
1651
  "learning_rate": 0.0002,
1652
- "loss": 0.0018,
1653
  "step": 262
1654
  },
1655
  {
1656
  "epoch": 9.39,
1657
  "learning_rate": 0.0002,
1658
- "loss": 0.0038,
1659
  "step": 263
1660
  },
1661
  {
1662
  "epoch": 9.43,
1663
  "learning_rate": 0.0002,
1664
- "loss": 0.0019,
1665
  "step": 264
1666
  },
1667
  {
1668
  "epoch": 9.46,
1669
  "learning_rate": 0.0002,
1670
- "loss": 0.0008,
1671
  "step": 265
1672
  },
1673
  {
1674
  "epoch": 9.5,
1675
  "learning_rate": 0.0002,
1676
- "loss": 0.0021,
1677
  "step": 266
1678
  },
1679
  {
1680
  "epoch": 9.54,
1681
  "learning_rate": 0.0002,
1682
- "loss": 0.0013,
1683
  "step": 267
1684
  },
1685
  {
1686
  "epoch": 9.57,
1687
  "learning_rate": 0.0002,
1688
- "loss": 0.0024,
1689
  "step": 268
1690
  },
1691
  {
1692
  "epoch": 9.61,
1693
  "learning_rate": 0.0002,
1694
- "loss": 0.0031,
1695
  "step": 269
1696
  },
1697
  {
1698
  "epoch": 9.64,
1699
  "learning_rate": 0.0002,
1700
- "loss": 0.0013,
1701
  "step": 270
1702
  },
1703
  {
1704
  "epoch": 9.68,
1705
  "learning_rate": 0.0002,
1706
- "loss": 0.0014,
1707
  "step": 271
1708
  },
1709
  {
1710
  "epoch": 9.71,
1711
  "learning_rate": 0.0002,
1712
- "loss": 0.0023,
1713
  "step": 272
1714
  },
1715
  {
1716
  "epoch": 9.75,
1717
  "learning_rate": 0.0002,
1718
- "loss": 0.0006,
1719
  "step": 273
1720
  },
1721
  {
1722
  "epoch": 9.79,
1723
  "learning_rate": 0.0002,
1724
- "loss": 0.0015,
1725
  "step": 274
1726
  },
1727
  {
1728
  "epoch": 9.82,
1729
  "learning_rate": 0.0002,
1730
- "loss": 0.0011,
1731
  "step": 275
1732
  },
1733
  {
1734
  "epoch": 9.86,
1735
  "learning_rate": 0.0002,
1736
- "loss": 0.0013,
1737
  "step": 276
1738
  },
1739
  {
1740
  "epoch": 9.89,
1741
  "learning_rate": 0.0002,
1742
- "loss": 0.0014,
1743
  "step": 277
1744
  },
1745
  {
1746
  "epoch": 9.93,
1747
  "learning_rate": 0.0002,
1748
- "loss": 0.0016,
1749
  "step": 278
1750
  },
1751
  {
1752
  "epoch": 9.96,
1753
  "learning_rate": 0.0002,
1754
- "loss": 0.0007,
1755
  "step": 279
1756
  },
1757
  {
1758
  "epoch": 10.0,
1759
  "learning_rate": 0.0002,
1760
- "loss": 0.0011,
1761
  "step": 280
1762
  },
1763
  {
1764
  "epoch": 10.0,
1765
- "eval_loss": 0.0036755690816789865,
1766
- "eval_runtime": 55.2858,
1767
- "eval_samples_per_second": 15.682,
1768
- "eval_steps_per_second": 0.506,
1769
  "step": 280
1770
  },
1771
  {
1772
  "epoch": 10.0,
1773
  "step": 280,
1774
  "total_flos": 8.298694499798876e+17,
1775
- "train_loss": 0.09669673074760275,
1776
- "train_runtime": 3076.0317,
1777
- "train_samples_per_second": 2.819,
1778
- "train_steps_per_second": 0.091
1779
  }
1780
  ],
1781
  "logging_steps": 1.0,
 
17
  {
18
  "epoch": 0.07,
19
  "learning_rate": 6.309297535714573e-05,
20
+ "loss": 1.2013,
21
  "step": 2
22
  },
23
  {
24
  "epoch": 0.11,
25
  "learning_rate": 0.0001,
26
+ "loss": 1.1723,
27
  "step": 3
28
  },
29
  {
30
  "epoch": 0.14,
31
  "learning_rate": 0.00012618595071429146,
32
+ "loss": 1.1317,
33
  "step": 4
34
  },
35
  {
36
  "epoch": 0.18,
37
  "learning_rate": 0.0001464973520717927,
38
+ "loss": 1.0776,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.21,
43
  "learning_rate": 0.00016309297535714573,
44
+ "loss": 1.0266,
45
  "step": 6
46
  },
47
  {
48
  "epoch": 0.25,
49
  "learning_rate": 0.00017712437491614223,
50
+ "loss": 0.9953,
51
  "step": 7
52
  },
53
  {
54
  "epoch": 0.29,
55
  "learning_rate": 0.0001892789260714372,
56
+ "loss": 0.9563,
57
  "step": 8
58
  },
59
  {
60
  "epoch": 0.32,
61
  "learning_rate": 0.0002,
62
+ "loss": 0.9337,
63
  "step": 9
64
  },
65
  {
66
  "epoch": 0.36,
67
  "learning_rate": 0.0002,
68
+ "loss": 0.9513,
69
  "step": 10
70
  },
71
  {
72
  "epoch": 0.39,
73
  "learning_rate": 0.0002,
74
+ "loss": 0.8823,
75
  "step": 11
76
  },
77
  {
78
  "epoch": 0.43,
79
  "learning_rate": 0.0002,
80
+ "loss": 0.8438,
81
  "step": 12
82
  },
83
  {
84
  "epoch": 0.46,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.8205,
87
  "step": 13
88
  },
89
  {
90
  "epoch": 0.5,
91
  "learning_rate": 0.0002,
92
+ "loss": 0.7802,
93
  "step": 14
94
  },
95
  {
96
  "epoch": 0.54,
97
  "learning_rate": 0.0002,
98
+ "loss": 0.7714,
99
  "step": 15
100
  },
101
  {
102
  "epoch": 0.57,
103
  "learning_rate": 0.0002,
104
+ "loss": 0.732,
105
  "step": 16
106
  },
107
  {
108
  "epoch": 0.61,
109
  "learning_rate": 0.0002,
110
+ "loss": 0.6475,
111
  "step": 17
112
  },
113
  {
114
  "epoch": 0.64,
115
  "learning_rate": 0.0002,
116
+ "loss": 0.7026,
117
  "step": 18
118
  },
119
  {
120
  "epoch": 0.68,
121
  "learning_rate": 0.0002,
122
+ "loss": 0.611,
123
  "step": 19
124
  },
125
  {
126
  "epoch": 0.71,
127
  "learning_rate": 0.0002,
128
+ "loss": 0.6581,
129
  "step": 20
130
  },
131
  {
132
  "epoch": 0.75,
133
  "learning_rate": 0.0002,
134
+ "loss": 0.5189,
135
  "step": 21
136
  },
137
  {
138
  "epoch": 0.79,
139
  "learning_rate": 0.0002,
140
+ "loss": 0.4852,
141
  "step": 22
142
  },
143
  {
144
  "epoch": 0.82,
145
  "learning_rate": 0.0002,
146
+ "loss": 0.4482,
147
  "step": 23
148
  },
149
  {
150
  "epoch": 0.86,
151
  "learning_rate": 0.0002,
152
+ "loss": 0.3752,
153
  "step": 24
154
  },
155
  {
156
  "epoch": 0.89,
157
  "learning_rate": 0.0002,
158
+ "loss": 0.4258,
159
  "step": 25
160
  },
161
  {
 
167
  {
168
  "epoch": 0.96,
169
  "learning_rate": 0.0002,
170
+ "loss": 0.3977,
171
  "step": 27
172
  },
173
  {
174
  "epoch": 1.0,
175
  "learning_rate": 0.0002,
176
+ "loss": 0.3239,
177
  "step": 28
178
  },
179
  {
180
  "epoch": 1.0,
181
+ "eval_loss": 0.24691244959831238,
182
+ "eval_runtime": 54.9798,
183
+ "eval_samples_per_second": 15.769,
184
+ "eval_steps_per_second": 0.509,
185
  "step": 28
186
  },
187
  {
188
  "epoch": 1.04,
189
  "learning_rate": 0.0002,
190
+ "loss": 0.2309,
191
  "step": 29
192
  },
193
  {
194
  "epoch": 1.07,
195
  "learning_rate": 0.0002,
196
+ "loss": 0.2426,
197
  "step": 30
198
  },
199
  {
200
  "epoch": 1.11,
201
  "learning_rate": 0.0002,
202
+ "loss": 0.2226,
203
  "step": 31
204
  },
205
  {
206
  "epoch": 1.14,
207
  "learning_rate": 0.0002,
208
+ "loss": 0.1932,
209
  "step": 32
210
  },
211
  {
212
  "epoch": 1.18,
213
  "learning_rate": 0.0002,
214
+ "loss": 0.212,
215
  "step": 33
216
  },
217
  {
218
  "epoch": 1.21,
219
  "learning_rate": 0.0002,
220
+ "loss": 0.1909,
221
  "step": 34
222
  },
223
  {
224
  "epoch": 1.25,
225
  "learning_rate": 0.0002,
226
+ "loss": 0.2315,
227
  "step": 35
228
  },
229
  {
230
  "epoch": 1.29,
231
  "learning_rate": 0.0002,
232
+ "loss": 0.1531,
233
  "step": 36
234
  },
235
  {
236
  "epoch": 1.32,
237
  "learning_rate": 0.0002,
238
+ "loss": 0.1523,
239
  "step": 37
240
  },
241
  {
242
  "epoch": 1.36,
243
  "learning_rate": 0.0002,
244
+ "loss": 0.0959,
245
  "step": 38
246
  },
247
  {
248
  "epoch": 1.39,
249
  "learning_rate": 0.0002,
250
+ "loss": 0.1044,
251
  "step": 39
252
  },
253
  {
254
  "epoch": 1.43,
255
  "learning_rate": 0.0002,
256
+ "loss": 0.1752,
257
  "step": 40
258
  },
259
  {
260
  "epoch": 1.46,
261
  "learning_rate": 0.0002,
262
+ "loss": 0.1159,
263
  "step": 41
264
  },
265
  {
266
  "epoch": 1.5,
267
  "learning_rate": 0.0002,
268
+ "loss": 0.1041,
269
  "step": 42
270
  },
271
  {
272
  "epoch": 1.54,
273
  "learning_rate": 0.0002,
274
+ "loss": 0.1311,
275
  "step": 43
276
  },
277
  {
278
  "epoch": 1.57,
279
  "learning_rate": 0.0002,
280
+ "loss": 0.1307,
281
  "step": 44
282
  },
283
  {
284
  "epoch": 1.61,
285
  "learning_rate": 0.0002,
286
+ "loss": 0.0678,
287
  "step": 45
288
  },
289
  {
290
  "epoch": 1.64,
291
  "learning_rate": 0.0002,
292
+ "loss": 0.0746,
293
  "step": 46
294
  },
295
  {
296
  "epoch": 1.68,
297
  "learning_rate": 0.0002,
298
+ "loss": 0.0903,
299
  "step": 47
300
  },
301
  {
302
  "epoch": 1.71,
303
  "learning_rate": 0.0002,
304
+ "loss": 0.1146,
305
  "step": 48
306
  },
307
  {
308
  "epoch": 1.75,
309
  "learning_rate": 0.0002,
310
+ "loss": 0.1019,
311
  "step": 49
312
  },
313
  {
314
  "epoch": 1.79,
315
  "learning_rate": 0.0002,
316
+ "loss": 0.056,
317
  "step": 50
318
  },
319
  {
 
325
  {
326
  "epoch": 1.86,
327
  "learning_rate": 0.0002,
328
+ "loss": 0.0487,
329
  "step": 52
330
  },
331
  {
332
  "epoch": 1.89,
333
  "learning_rate": 0.0002,
334
+ "loss": 0.1094,
335
  "step": 53
336
  },
337
  {
338
  "epoch": 1.93,
339
  "learning_rate": 0.0002,
340
+ "loss": 0.1581,
341
  "step": 54
342
  },
343
  {
344
  "epoch": 1.96,
345
  "learning_rate": 0.0002,
346
+ "loss": 0.1083,
347
  "step": 55
348
  },
349
  {
350
  "epoch": 2.0,
351
  "learning_rate": 0.0002,
352
+ "loss": 0.0396,
353
  "step": 56
354
  },
355
  {
356
  "epoch": 2.0,
357
+ "eval_loss": 0.04381483793258667,
358
+ "eval_runtime": 55.1711,
359
+ "eval_samples_per_second": 15.715,
360
+ "eval_steps_per_second": 0.508,
361
  "step": 56
362
  },
363
  {
364
  "epoch": 2.04,
365
  "learning_rate": 0.0002,
366
+ "loss": 0.0368,
367
  "step": 57
368
  },
369
  {
370
  "epoch": 2.07,
371
  "learning_rate": 0.0002,
372
+ "loss": 0.0299,
373
  "step": 58
374
  },
375
  {
376
  "epoch": 2.11,
377
  "learning_rate": 0.0002,
378
+ "loss": 0.0304,
379
  "step": 59
380
  },
381
  {
382
  "epoch": 2.14,
383
  "learning_rate": 0.0002,
384
+ "loss": 0.0429,
385
  "step": 60
386
  },
387
  {
388
  "epoch": 2.18,
389
  "learning_rate": 0.0002,
390
+ "loss": 0.0256,
391
  "step": 61
392
  },
393
  {
394
  "epoch": 2.21,
395
  "learning_rate": 0.0002,
396
+ "loss": 0.0375,
397
  "step": 62
398
  },
399
  {
400
  "epoch": 2.25,
401
  "learning_rate": 0.0002,
402
+ "loss": 0.0248,
403
  "step": 63
404
  },
405
  {
406
  "epoch": 2.29,
407
  "learning_rate": 0.0002,
408
+ "loss": 0.022,
409
  "step": 64
410
  },
411
  {
412
  "epoch": 2.32,
413
  "learning_rate": 0.0002,
414
+ "loss": 0.0292,
415
  "step": 65
416
  },
417
  {
418
  "epoch": 2.36,
419
  "learning_rate": 0.0002,
420
+ "loss": 0.0273,
421
  "step": 66
422
  },
423
  {
424
  "epoch": 2.39,
425
  "learning_rate": 0.0002,
426
+ "loss": 0.0152,
427
  "step": 67
428
  },
429
  {
430
  "epoch": 2.43,
431
  "learning_rate": 0.0002,
432
+ "loss": 0.0301,
433
  "step": 68
434
  },
435
  {
436
  "epoch": 2.46,
437
  "learning_rate": 0.0002,
438
+ "loss": 0.0358,
439
  "step": 69
440
  },
441
  {
442
  "epoch": 2.5,
443
  "learning_rate": 0.0002,
444
+ "loss": 0.0269,
445
  "step": 70
446
  },
447
  {
448
  "epoch": 2.54,
449
  "learning_rate": 0.0002,
450
+ "loss": 0.0179,
451
  "step": 71
452
  },
453
  {
454
  "epoch": 2.57,
455
  "learning_rate": 0.0002,
456
+ "loss": 0.0183,
457
  "step": 72
458
  },
459
  {
460
  "epoch": 2.61,
461
  "learning_rate": 0.0002,
462
+ "loss": 0.0132,
463
  "step": 73
464
  },
465
  {
466
  "epoch": 2.64,
467
  "learning_rate": 0.0002,
468
+ "loss": 0.0329,
469
  "step": 74
470
  },
471
  {
472
  "epoch": 2.68,
473
  "learning_rate": 0.0002,
474
+ "loss": 0.0267,
475
  "step": 75
476
  },
477
  {
478
  "epoch": 2.71,
479
  "learning_rate": 0.0002,
480
+ "loss": 0.0193,
481
  "step": 76
482
  },
483
  {
484
  "epoch": 2.75,
485
  "learning_rate": 0.0002,
486
+ "loss": 0.0101,
487
  "step": 77
488
  },
489
  {
490
  "epoch": 2.79,
491
  "learning_rate": 0.0002,
492
+ "loss": 0.023,
493
  "step": 78
494
  },
495
  {
496
  "epoch": 2.82,
497
  "learning_rate": 0.0002,
498
+ "loss": 0.0317,
499
  "step": 79
500
  },
501
  {
502
  "epoch": 2.86,
503
  "learning_rate": 0.0002,
504
+ "loss": 0.0225,
505
  "step": 80
506
  },
507
  {
508
  "epoch": 2.89,
509
  "learning_rate": 0.0002,
510
+ "loss": 0.0198,
511
  "step": 81
512
  },
513
  {
514
  "epoch": 2.93,
515
  "learning_rate": 0.0002,
516
+ "loss": 0.0232,
517
  "step": 82
518
  },
519
  {
520
  "epoch": 2.96,
521
  "learning_rate": 0.0002,
522
+ "loss": 0.0287,
523
  "step": 83
524
  },
525
  {
526
  "epoch": 3.0,
527
  "learning_rate": 0.0002,
528
+ "loss": 0.0086,
529
  "step": 84
530
  },
531
  {
532
  "epoch": 3.0,
533
+ "eval_loss": 0.01250830665230751,
534
+ "eval_runtime": 54.9958,
535
+ "eval_samples_per_second": 15.765,
536
+ "eval_steps_per_second": 0.509,
537
  "step": 84
538
  },
539
  {
540
  "epoch": 3.04,
541
  "learning_rate": 0.0002,
542
+ "loss": 0.0059,
543
  "step": 85
544
  },
545
  {
546
  "epoch": 3.07,
547
  "learning_rate": 0.0002,
548
+ "loss": 0.0121,
549
  "step": 86
550
  },
551
  {
552
  "epoch": 3.11,
553
  "learning_rate": 0.0002,
554
+ "loss": 0.0091,
555
  "step": 87
556
  },
557
  {
558
  "epoch": 3.14,
559
  "learning_rate": 0.0002,
560
+ "loss": 0.0078,
561
  "step": 88
562
  },
563
  {
564
  "epoch": 3.18,
565
  "learning_rate": 0.0002,
566
+ "loss": 0.0124,
567
  "step": 89
568
  },
569
  {
570
  "epoch": 3.21,
571
  "learning_rate": 0.0002,
572
+ "loss": 0.0107,
573
  "step": 90
574
  },
575
  {
576
  "epoch": 3.25,
577
  "learning_rate": 0.0002,
578
+ "loss": 0.0106,
579
  "step": 91
580
  },
581
  {
582
  "epoch": 3.29,
583
  "learning_rate": 0.0002,
584
+ "loss": 0.0107,
585
  "step": 92
586
  },
587
  {
588
  "epoch": 3.32,
589
  "learning_rate": 0.0002,
590
+ "loss": 0.0119,
591
  "step": 93
592
  },
593
  {
594
  "epoch": 3.36,
595
  "learning_rate": 0.0002,
596
+ "loss": 0.0054,
597
  "step": 94
598
  },
599
  {
600
  "epoch": 3.39,
601
  "learning_rate": 0.0002,
602
+ "loss": 0.0078,
603
  "step": 95
604
  },
605
  {
606
  "epoch": 3.43,
607
  "learning_rate": 0.0002,
608
+ "loss": 0.0132,
609
  "step": 96
610
  },
611
  {
612
  "epoch": 3.46,
613
  "learning_rate": 0.0002,
614
+ "loss": 0.0123,
615
  "step": 97
616
  },
617
  {
618
  "epoch": 3.5,
619
  "learning_rate": 0.0002,
620
+ "loss": 0.0144,
621
  "step": 98
622
  },
623
  {
624
  "epoch": 3.54,
625
  "learning_rate": 0.0002,
626
+ "loss": 0.0099,
627
  "step": 99
628
  },
629
  {
630
  "epoch": 3.57,
631
  "learning_rate": 0.0002,
632
+ "loss": 0.0075,
633
  "step": 100
634
  },
635
  {
636
  "epoch": 3.61,
637
  "learning_rate": 0.0002,
638
+ "loss": 0.0131,
639
  "step": 101
640
  },
641
  {
642
  "epoch": 3.64,
643
  "learning_rate": 0.0002,
644
+ "loss": 0.0076,
645
  "step": 102
646
  },
647
  {
648
  "epoch": 3.68,
649
  "learning_rate": 0.0002,
650
+ "loss": 0.0129,
651
  "step": 103
652
  },
653
  {
654
  "epoch": 3.71,
655
  "learning_rate": 0.0002,
656
+ "loss": 0.0122,
657
  "step": 104
658
  },
659
  {
660
  "epoch": 3.75,
661
  "learning_rate": 0.0002,
662
+ "loss": 0.0113,
663
  "step": 105
664
  },
665
  {
666
  "epoch": 3.79,
667
  "learning_rate": 0.0002,
668
+ "loss": 0.0101,
669
  "step": 106
670
  },
671
  {
672
  "epoch": 3.82,
673
  "learning_rate": 0.0002,
674
+ "loss": 0.006,
675
  "step": 107
676
  },
677
  {
678
  "epoch": 3.86,
679
  "learning_rate": 0.0002,
680
+ "loss": 0.0078,
681
  "step": 108
682
  },
683
  {
684
  "epoch": 3.89,
685
  "learning_rate": 0.0002,
686
+ "loss": 0.0052,
687
  "step": 109
688
  },
689
  {
690
  "epoch": 3.93,
691
  "learning_rate": 0.0002,
692
+ "loss": 0.0155,
693
  "step": 110
694
  },
695
  {
696
  "epoch": 3.96,
697
  "learning_rate": 0.0002,
698
+ "loss": 0.0043,
699
  "step": 111
700
  },
701
  {
702
  "epoch": 4.0,
703
  "learning_rate": 0.0002,
704
+ "loss": 0.005,
705
  "step": 112
706
  },
707
  {
708
  "epoch": 4.0,
709
+ "eval_loss": 0.005612094886600971,
710
+ "eval_runtime": 55.1695,
711
+ "eval_samples_per_second": 15.715,
712
+ "eval_steps_per_second": 0.508,
713
  "step": 112
714
  },
715
  {
716
  "epoch": 4.04,
717
  "learning_rate": 0.0002,
718
+ "loss": 0.0061,
719
  "step": 113
720
  },
721
  {
722
  "epoch": 4.07,
723
  "learning_rate": 0.0002,
724
+ "loss": 0.0056,
725
  "step": 114
726
  },
727
  {
728
  "epoch": 4.11,
729
  "learning_rate": 0.0002,
730
+ "loss": 0.0034,
731
  "step": 115
732
  },
733
  {
734
  "epoch": 4.14,
735
  "learning_rate": 0.0002,
736
+ "loss": 0.0048,
737
  "step": 116
738
  },
739
  {
740
  "epoch": 4.18,
741
  "learning_rate": 0.0002,
742
+ "loss": 0.005,
743
  "step": 117
744
  },
745
  {
746
  "epoch": 4.21,
747
  "learning_rate": 0.0002,
748
+ "loss": 0.0062,
749
  "step": 118
750
  },
751
  {
752
  "epoch": 4.25,
753
  "learning_rate": 0.0002,
754
+ "loss": 0.0131,
755
  "step": 119
756
  },
757
  {
758
  "epoch": 4.29,
759
  "learning_rate": 0.0002,
760
+ "loss": 0.0064,
761
  "step": 120
762
  },
763
  {
764
  "epoch": 4.32,
765
  "learning_rate": 0.0002,
766
+ "loss": 0.005,
767
  "step": 121
768
  },
769
  {
770
  "epoch": 4.36,
771
  "learning_rate": 0.0002,
772
+ "loss": 0.0044,
773
  "step": 122
774
  },
775
  {
776
  "epoch": 4.39,
777
  "learning_rate": 0.0002,
778
+ "loss": 0.0069,
779
  "step": 123
780
  },
781
  {
 
787
  {
788
  "epoch": 4.46,
789
  "learning_rate": 0.0002,
790
+ "loss": 0.0043,
791
  "step": 125
792
  },
793
  {
794
  "epoch": 4.5,
795
  "learning_rate": 0.0002,
796
+ "loss": 0.0032,
797
  "step": 126
798
  },
799
  {
800
  "epoch": 4.54,
801
  "learning_rate": 0.0002,
802
+ "loss": 0.0028,
803
  "step": 127
804
  },
805
  {
806
  "epoch": 4.57,
807
  "learning_rate": 0.0002,
808
+ "loss": 0.0053,
809
  "step": 128
810
  },
811
  {
812
  "epoch": 4.61,
813
  "learning_rate": 0.0002,
814
+ "loss": 0.0052,
815
  "step": 129
816
  },
817
  {
818
  "epoch": 4.64,
819
  "learning_rate": 0.0002,
820
+ "loss": 0.0106,
821
  "step": 130
822
  },
823
  {
824
  "epoch": 4.68,
825
  "learning_rate": 0.0002,
826
+ "loss": 0.0071,
827
  "step": 131
828
  },
829
  {
830
  "epoch": 4.71,
831
  "learning_rate": 0.0002,
832
+ "loss": 0.0044,
833
  "step": 132
834
  },
835
  {
836
  "epoch": 4.75,
837
  "learning_rate": 0.0002,
838
+ "loss": 0.0075,
839
  "step": 133
840
  },
841
  {
842
  "epoch": 4.79,
843
  "learning_rate": 0.0002,
844
+ "loss": 0.0026,
845
  "step": 134
846
  },
847
  {
848
  "epoch": 4.82,
849
  "learning_rate": 0.0002,
850
+ "loss": 0.0071,
851
  "step": 135
852
  },
853
  {
854
  "epoch": 4.86,
855
  "learning_rate": 0.0002,
856
+ "loss": 0.005,
857
  "step": 136
858
  },
859
  {
860
  "epoch": 4.89,
861
  "learning_rate": 0.0002,
862
+ "loss": 0.0042,
863
  "step": 137
864
  },
865
  {
866
  "epoch": 4.93,
867
  "learning_rate": 0.0002,
868
+ "loss": 0.0059,
869
  "step": 138
870
  },
871
  {
872
  "epoch": 4.96,
873
  "learning_rate": 0.0002,
874
+ "loss": 0.004,
875
  "step": 139
876
  },
877
  {
878
  "epoch": 5.0,
879
  "learning_rate": 0.0002,
880
+ "loss": 0.0051,
881
  "step": 140
882
  },
883
  {
884
  "epoch": 5.0,
885
+ "eval_loss": 0.004839635919779539,
886
+ "eval_runtime": 55.1037,
887
+ "eval_samples_per_second": 15.734,
888
+ "eval_steps_per_second": 0.508,
889
  "step": 140
890
  },
891
  {
892
  "epoch": 5.04,
893
  "learning_rate": 0.0002,
894
+ "loss": 0.0045,
895
  "step": 141
896
  },
897
  {
898
  "epoch": 5.07,
899
  "learning_rate": 0.0002,
900
+ "loss": 0.0066,
901
  "step": 142
902
  },
903
  {
904
  "epoch": 5.11,
905
  "learning_rate": 0.0002,
906
+ "loss": 0.0023,
907
  "step": 143
908
  },
909
  {
910
  "epoch": 5.14,
911
  "learning_rate": 0.0002,
912
+ "loss": 0.0047,
913
  "step": 144
914
  },
915
  {
916
  "epoch": 5.18,
917
  "learning_rate": 0.0002,
918
+ "loss": 0.0081,
919
  "step": 145
920
  },
921
  {
922
  "epoch": 5.21,
923
  "learning_rate": 0.0002,
924
+ "loss": 0.0074,
925
  "step": 146
926
  },
927
  {
928
  "epoch": 5.25,
929
  "learning_rate": 0.0002,
930
+ "loss": 0.002,
931
  "step": 147
932
  },
933
  {
934
  "epoch": 5.29,
935
  "learning_rate": 0.0002,
936
+ "loss": 0.0049,
937
  "step": 148
938
  },
939
  {
940
  "epoch": 5.32,
941
  "learning_rate": 0.0002,
942
+ "loss": 0.0093,
943
  "step": 149
944
  },
945
  {
946
  "epoch": 5.36,
947
  "learning_rate": 0.0002,
948
+ "loss": 0.0028,
949
  "step": 150
950
  },
951
  {
952
  "epoch": 5.39,
953
  "learning_rate": 0.0002,
954
+ "loss": 0.005,
955
  "step": 151
956
  },
957
  {
958
  "epoch": 5.43,
959
  "learning_rate": 0.0002,
960
+ "loss": 0.0027,
961
  "step": 152
962
  },
963
  {
964
  "epoch": 5.46,
965
  "learning_rate": 0.0002,
966
+ "loss": 0.0024,
967
  "step": 153
968
  },
969
  {
970
  "epoch": 5.5,
971
  "learning_rate": 0.0002,
972
+ "loss": 0.0051,
973
  "step": 154
974
  },
975
  {
976
  "epoch": 5.54,
977
  "learning_rate": 0.0002,
978
+ "loss": 0.0057,
979
  "step": 155
980
  },
981
  {
982
  "epoch": 5.57,
983
  "learning_rate": 0.0002,
984
+ "loss": 0.0042,
985
  "step": 156
986
  },
987
  {
988
  "epoch": 5.61,
989
  "learning_rate": 0.0002,
990
+ "loss": 0.0076,
991
  "step": 157
992
  },
993
  {
994
  "epoch": 5.64,
995
  "learning_rate": 0.0002,
996
+ "loss": 0.0026,
997
  "step": 158
998
  },
999
  {
1000
  "epoch": 5.68,
1001
  "learning_rate": 0.0002,
1002
+ "loss": 0.0062,
1003
  "step": 159
1004
  },
1005
  {
1006
  "epoch": 5.71,
1007
  "learning_rate": 0.0002,
1008
+ "loss": 0.0039,
1009
  "step": 160
1010
  },
1011
  {
1012
  "epoch": 5.75,
1013
  "learning_rate": 0.0002,
1014
+ "loss": 0.0028,
1015
  "step": 161
1016
  },
1017
  {
1018
  "epoch": 5.79,
1019
  "learning_rate": 0.0002,
1020
+ "loss": 0.0068,
1021
  "step": 162
1022
  },
1023
  {
1024
  "epoch": 5.82,
1025
  "learning_rate": 0.0002,
1026
+ "loss": 0.001,
1027
  "step": 163
1028
  },
1029
  {
1030
  "epoch": 5.86,
1031
  "learning_rate": 0.0002,
1032
+ "loss": 0.0036,
1033
  "step": 164
1034
  },
1035
  {
1036
  "epoch": 5.89,
1037
  "learning_rate": 0.0002,
1038
+ "loss": 0.004,
1039
  "step": 165
1040
  },
1041
  {
1042
  "epoch": 5.93,
1043
  "learning_rate": 0.0002,
1044
+ "loss": 0.0028,
1045
  "step": 166
1046
  },
1047
  {
1048
  "epoch": 5.96,
1049
  "learning_rate": 0.0002,
1050
+ "loss": 0.0046,
1051
  "step": 167
1052
  },
1053
  {
1054
  "epoch": 6.0,
1055
  "learning_rate": 0.0002,
1056
+ "loss": 0.0036,
1057
  "step": 168
1058
  },
1059
  {
1060
  "epoch": 6.0,
1061
+ "eval_loss": 0.0032981247641146183,
1062
+ "eval_runtime": 55.1126,
1063
+ "eval_samples_per_second": 15.731,
1064
+ "eval_steps_per_second": 0.508,
1065
  "step": 168
1066
  },
1067
  {
1068
  "epoch": 6.04,
1069
  "learning_rate": 0.0002,
1070
+ "loss": 0.0021,
1071
  "step": 169
1072
  },
1073
  {
1074
  "epoch": 6.07,
1075
  "learning_rate": 0.0002,
1076
+ "loss": 0.0053,
1077
  "step": 170
1078
  },
1079
  {
1080
  "epoch": 6.11,
1081
  "learning_rate": 0.0002,
1082
+ "loss": 0.0034,
1083
  "step": 171
1084
  },
1085
  {
1086
  "epoch": 6.14,
1087
  "learning_rate": 0.0002,
1088
+ "loss": 0.0051,
1089
  "step": 172
1090
  },
1091
  {
1092
  "epoch": 6.18,
1093
  "learning_rate": 0.0002,
1094
+ "loss": 0.004,
1095
  "step": 173
1096
  },
1097
  {
1098
  "epoch": 6.21,
1099
  "learning_rate": 0.0002,
1100
+ "loss": 0.0032,
1101
  "step": 174
1102
  },
1103
  {
1104
  "epoch": 6.25,
1105
  "learning_rate": 0.0002,
1106
+ "loss": 0.0039,
1107
  "step": 175
1108
  },
1109
  {
1110
  "epoch": 6.29,
1111
  "learning_rate": 0.0002,
1112
+ "loss": 0.0045,
1113
  "step": 176
1114
  },
1115
  {
1116
  "epoch": 6.32,
1117
  "learning_rate": 0.0002,
1118
+ "loss": 0.0105,
1119
  "step": 177
1120
  },
1121
  {
1122
  "epoch": 6.36,
1123
  "learning_rate": 0.0002,
1124
+ "loss": 0.0017,
1125
  "step": 178
1126
  },
1127
  {
1128
  "epoch": 6.39,
1129
  "learning_rate": 0.0002,
1130
+ "loss": 0.0073,
1131
  "step": 179
1132
  },
1133
  {
1134
  "epoch": 6.43,
1135
  "learning_rate": 0.0002,
1136
+ "loss": 0.0088,
1137
  "step": 180
1138
  },
1139
  {
1140
  "epoch": 6.46,
1141
  "learning_rate": 0.0002,
1142
+ "loss": 0.0029,
1143
  "step": 181
1144
  },
1145
  {
1146
  "epoch": 6.5,
1147
  "learning_rate": 0.0002,
1148
+ "loss": 0.0087,
1149
  "step": 182
1150
  },
1151
  {
1152
  "epoch": 6.54,
1153
  "learning_rate": 0.0002,
1154
+ "loss": 0.0045,
1155
  "step": 183
1156
  },
1157
  {
1158
  "epoch": 6.57,
1159
  "learning_rate": 0.0002,
1160
+ "loss": 0.0096,
1161
  "step": 184
1162
  },
1163
  {
1164
  "epoch": 6.61,
1165
  "learning_rate": 0.0002,
1166
+ "loss": 0.0036,
1167
  "step": 185
1168
  },
1169
  {
1170
  "epoch": 6.64,
1171
  "learning_rate": 0.0002,
1172
+ "loss": 0.0061,
1173
  "step": 186
1174
  },
1175
  {
1176
  "epoch": 6.68,
1177
  "learning_rate": 0.0002,
1178
+ "loss": 0.0052,
1179
  "step": 187
1180
  },
1181
  {
1182
  "epoch": 6.71,
1183
  "learning_rate": 0.0002,
1184
+ "loss": 0.0033,
1185
  "step": 188
1186
  },
1187
  {
1188
  "epoch": 6.75,
1189
  "learning_rate": 0.0002,
1190
+ "loss": 0.0072,
1191
  "step": 189
1192
  },
1193
  {
1194
  "epoch": 6.79,
1195
  "learning_rate": 0.0002,
1196
+ "loss": 0.0016,
1197
  "step": 190
1198
  },
1199
  {
1200
  "epoch": 6.82,
1201
  "learning_rate": 0.0002,
1202
+ "loss": 0.0024,
1203
  "step": 191
1204
  },
1205
  {
1206
  "epoch": 6.86,
1207
  "learning_rate": 0.0002,
1208
+ "loss": 0.0044,
1209
  "step": 192
1210
  },
1211
  {
1212
  "epoch": 6.89,
1213
  "learning_rate": 0.0002,
1214
+ "loss": 0.0011,
1215
  "step": 193
1216
  },
1217
  {
1218
  "epoch": 6.93,
1219
  "learning_rate": 0.0002,
1220
+ "loss": 0.0054,
1221
  "step": 194
1222
  },
1223
  {
1224
  "epoch": 6.96,
1225
  "learning_rate": 0.0002,
1226
+ "loss": 0.0023,
1227
  "step": 195
1228
  },
1229
  {
1230
  "epoch": 7.0,
1231
  "learning_rate": 0.0002,
1232
+ "loss": 0.006,
1233
  "step": 196
1234
  },
1235
  {
1236
  "epoch": 7.0,
1237
+ "eval_loss": 0.0035514547489583492,
1238
+ "eval_runtime": 55.2585,
1239
+ "eval_samples_per_second": 15.69,
1240
+ "eval_steps_per_second": 0.507,
1241
  "step": 196
1242
  },
1243
  {
1244
  "epoch": 7.04,
1245
  "learning_rate": 0.0002,
1246
+ "loss": 0.0016,
1247
  "step": 197
1248
  },
1249
  {
1250
  "epoch": 7.07,
1251
  "learning_rate": 0.0002,
1252
+ "loss": 0.0049,
1253
  "step": 198
1254
  },
1255
  {
1256
  "epoch": 7.11,
1257
  "learning_rate": 0.0002,
1258
+ "loss": 0.002,
1259
  "step": 199
1260
  },
1261
  {
1262
  "epoch": 7.14,
1263
  "learning_rate": 0.0002,
1264
+ "loss": 0.0067,
1265
  "step": 200
1266
  },
1267
  {
1268
  "epoch": 7.18,
1269
  "learning_rate": 0.0002,
1270
+ "loss": 0.002,
1271
  "step": 201
1272
  },
1273
  {
1274
  "epoch": 7.21,
1275
  "learning_rate": 0.0002,
1276
+ "loss": 0.0024,
1277
  "step": 202
1278
  },
1279
  {
1280
  "epoch": 7.25,
1281
  "learning_rate": 0.0002,
1282
+ "loss": 0.0032,
1283
  "step": 203
1284
  },
1285
  {
1286
  "epoch": 7.29,
1287
  "learning_rate": 0.0002,
1288
+ "loss": 0.0088,
1289
  "step": 204
1290
  },
1291
  {
1292
  "epoch": 7.32,
1293
  "learning_rate": 0.0002,
1294
+ "loss": 0.0013,
1295
  "step": 205
1296
  },
1297
  {
1298
  "epoch": 7.36,
1299
  "learning_rate": 0.0002,
1300
+ "loss": 0.0034,
1301
  "step": 206
1302
  },
1303
  {
1304
  "epoch": 7.39,
1305
  "learning_rate": 0.0002,
1306
+ "loss": 0.0022,
1307
  "step": 207
1308
  },
1309
  {
1310
  "epoch": 7.43,
1311
  "learning_rate": 0.0002,
1312
+ "loss": 0.0016,
1313
  "step": 208
1314
  },
1315
  {
1316
  "epoch": 7.46,
1317
  "learning_rate": 0.0002,
1318
+ "loss": 0.0024,
1319
  "step": 209
1320
  },
1321
  {
1322
  "epoch": 7.5,
1323
  "learning_rate": 0.0002,
1324
+ "loss": 0.0023,
1325
  "step": 210
1326
  },
1327
  {
1328
  "epoch": 7.54,
1329
  "learning_rate": 0.0002,
1330
+ "loss": 0.0022,
1331
  "step": 211
1332
  },
1333
  {
1334
  "epoch": 7.57,
1335
  "learning_rate": 0.0002,
1336
+ "loss": 0.0022,
1337
  "step": 212
1338
  },
1339
  {
1340
  "epoch": 7.61,
1341
  "learning_rate": 0.0002,
1342
+ "loss": 0.0084,
1343
  "step": 213
1344
  },
1345
  {
1346
  "epoch": 7.64,
1347
  "learning_rate": 0.0002,
1348
+ "loss": 0.0039,
1349
  "step": 214
1350
  },
1351
  {
1352
  "epoch": 7.68,
1353
  "learning_rate": 0.0002,
1354
+ "loss": 0.0035,
1355
  "step": 215
1356
  },
1357
  {
1358
  "epoch": 7.71,
1359
  "learning_rate": 0.0002,
1360
+ "loss": 0.0021,
1361
  "step": 216
1362
  },
1363
  {
1364
  "epoch": 7.75,
1365
  "learning_rate": 0.0002,
1366
+ "loss": 0.004,
1367
  "step": 217
1368
  },
1369
  {
1370
  "epoch": 7.79,
1371
  "learning_rate": 0.0002,
1372
+ "loss": 0.0027,
1373
  "step": 218
1374
  },
1375
  {
1376
  "epoch": 7.82,
1377
  "learning_rate": 0.0002,
1378
+ "loss": 0.0019,
1379
  "step": 219
1380
  },
1381
  {
1382
  "epoch": 7.86,
1383
  "learning_rate": 0.0002,
1384
+ "loss": 0.0013,
1385
  "step": 220
1386
  },
1387
  {
1388
  "epoch": 7.89,
1389
  "learning_rate": 0.0002,
1390
+ "loss": 0.0043,
1391
  "step": 221
1392
  },
1393
  {
1394
  "epoch": 7.93,
1395
  "learning_rate": 0.0002,
1396
+ "loss": 0.0012,
1397
  "step": 222
1398
  },
1399
  {
1400
  "epoch": 7.96,
1401
  "learning_rate": 0.0002,
1402
+ "loss": 0.0008,
1403
  "step": 223
1404
  },
1405
  {
1406
  "epoch": 8.0,
1407
  "learning_rate": 0.0002,
1408
+ "loss": 0.0017,
1409
  "step": 224
1410
  },
1411
  {
1412
  "epoch": 8.0,
1413
+ "eval_loss": 0.0020963489077985287,
1414
+ "eval_runtime": 55.1683,
1415
+ "eval_samples_per_second": 15.716,
1416
+ "eval_steps_per_second": 0.508,
1417
  "step": 224
1418
  },
1419
  {
1420
  "epoch": 8.04,
1421
  "learning_rate": 0.0002,
1422
+ "loss": 0.002,
1423
  "step": 225
1424
  },
1425
  {
1426
  "epoch": 8.07,
1427
  "learning_rate": 0.0002,
1428
+ "loss": 0.0008,
1429
  "step": 226
1430
  },
1431
  {
1432
  "epoch": 8.11,
1433
  "learning_rate": 0.0002,
1434
+ "loss": 0.0017,
1435
  "step": 227
1436
  },
1437
  {
1438
  "epoch": 8.14,
1439
  "learning_rate": 0.0002,
1440
+ "loss": 0.0013,
1441
  "step": 228
1442
  },
1443
  {
1444
  "epoch": 8.18,
1445
  "learning_rate": 0.0002,
1446
+ "loss": 0.0007,
1447
  "step": 229
1448
  },
1449
  {
1450
  "epoch": 8.21,
1451
  "learning_rate": 0.0002,
1452
+ "loss": 0.0005,
1453
  "step": 230
1454
  },
1455
  {
1456
  "epoch": 8.25,
1457
  "learning_rate": 0.0002,
1458
+ "loss": 0.0021,
1459
  "step": 231
1460
  },
1461
  {
1462
  "epoch": 8.29,
1463
  "learning_rate": 0.0002,
1464
+ "loss": 0.0037,
1465
  "step": 232
1466
  },
1467
  {
1468
  "epoch": 8.32,
1469
  "learning_rate": 0.0002,
1470
+ "loss": 0.0034,
1471
  "step": 233
1472
  },
1473
  {
1474
  "epoch": 8.36,
1475
  "learning_rate": 0.0002,
1476
+ "loss": 0.0038,
1477
  "step": 234
1478
  },
1479
  {
1480
  "epoch": 8.39,
1481
  "learning_rate": 0.0002,
1482
+ "loss": 0.0018,
1483
  "step": 235
1484
  },
1485
  {
1486
  "epoch": 8.43,
1487
  "learning_rate": 0.0002,
1488
+ "loss": 0.0006,
1489
  "step": 236
1490
  },
1491
  {
1492
  "epoch": 8.46,
1493
  "learning_rate": 0.0002,
1494
+ "loss": 0.0118,
1495
  "step": 237
1496
  },
1497
  {
1498
  "epoch": 8.5,
1499
  "learning_rate": 0.0002,
1500
+ "loss": 0.0046,
1501
  "step": 238
1502
  },
1503
  {
1504
  "epoch": 8.54,
1505
  "learning_rate": 0.0002,
1506
+ "loss": 0.0043,
1507
  "step": 239
1508
  },
1509
  {
1510
  "epoch": 8.57,
1511
  "learning_rate": 0.0002,
1512
+ "loss": 0.0023,
1513
  "step": 240
1514
  },
1515
  {
1516
  "epoch": 8.61,
1517
  "learning_rate": 0.0002,
1518
+ "loss": 0.0038,
1519
  "step": 241
1520
  },
1521
  {
1522
  "epoch": 8.64,
1523
  "learning_rate": 0.0002,
1524
+ "loss": 0.0056,
1525
  "step": 242
1526
  },
1527
  {
1528
  "epoch": 8.68,
1529
  "learning_rate": 0.0002,
1530
+ "loss": 0.0083,
1531
  "step": 243
1532
  },
1533
  {
1534
  "epoch": 8.71,
1535
  "learning_rate": 0.0002,
1536
+ "loss": 0.0039,
1537
  "step": 244
1538
  },
1539
  {
1540
  "epoch": 8.75,
1541
  "learning_rate": 0.0002,
1542
+ "loss": 0.003,
1543
  "step": 245
1544
  },
1545
  {
1546
  "epoch": 8.79,
1547
  "learning_rate": 0.0002,
1548
+ "loss": 0.0021,
1549
  "step": 246
1550
  },
1551
  {
1552
  "epoch": 8.82,
1553
  "learning_rate": 0.0002,
1554
+ "loss": 0.002,
1555
  "step": 247
1556
  },
1557
  {
1558
  "epoch": 8.86,
1559
  "learning_rate": 0.0002,
1560
+ "loss": 0.0037,
1561
  "step": 248
1562
  },
1563
  {
1564
  "epoch": 8.89,
1565
  "learning_rate": 0.0002,
1566
+ "loss": 0.0014,
1567
  "step": 249
1568
  },
1569
  {
1570
  "epoch": 8.93,
1571
  "learning_rate": 0.0002,
1572
+ "loss": 0.0037,
1573
  "step": 250
1574
  },
1575
  {
1576
  "epoch": 8.96,
1577
  "learning_rate": 0.0002,
1578
+ "loss": 0.0031,
1579
  "step": 251
1580
  },
1581
  {
1582
  "epoch": 9.0,
1583
  "learning_rate": 0.0002,
1584
+ "loss": 0.0041,
1585
  "step": 252
1586
  },
1587
  {
1588
  "epoch": 9.0,
1589
+ "eval_loss": 0.0022224283311516047,
1590
+ "eval_runtime": 55.1607,
1591
+ "eval_samples_per_second": 15.718,
1592
+ "eval_steps_per_second": 0.508,
1593
  "step": 252
1594
  },
1595
  {
1596
  "epoch": 9.04,
1597
  "learning_rate": 0.0002,
1598
+ "loss": 0.0009,
1599
  "step": 253
1600
  },
1601
  {
1602
  "epoch": 9.07,
1603
  "learning_rate": 0.0002,
1604
+ "loss": 0.0011,
1605
  "step": 254
1606
  },
1607
  {
1608
  "epoch": 9.11,
1609
  "learning_rate": 0.0002,
1610
+ "loss": 0.0038,
1611
  "step": 255
1612
  },
1613
  {
1614
  "epoch": 9.14,
1615
  "learning_rate": 0.0002,
1616
+ "loss": 0.0026,
1617
  "step": 256
1618
  },
1619
  {
1620
  "epoch": 9.18,
1621
  "learning_rate": 0.0002,
1622
+ "loss": 0.004,
1623
  "step": 257
1624
  },
1625
  {
1626
  "epoch": 9.21,
1627
  "learning_rate": 0.0002,
1628
+ "loss": 0.0047,
1629
  "step": 258
1630
  },
1631
  {
1632
  "epoch": 9.25,
1633
  "learning_rate": 0.0002,
1634
+ "loss": 0.0023,
1635
  "step": 259
1636
  },
1637
  {
1638
  "epoch": 9.29,
1639
  "learning_rate": 0.0002,
1640
+ "loss": 0.003,
1641
  "step": 260
1642
  },
1643
  {
1644
  "epoch": 9.32,
1645
  "learning_rate": 0.0002,
1646
+ "loss": 0.0007,
1647
  "step": 261
1648
  },
1649
  {
1650
  "epoch": 9.36,
1651
  "learning_rate": 0.0002,
1652
+ "loss": 0.0073,
1653
  "step": 262
1654
  },
1655
  {
1656
  "epoch": 9.39,
1657
  "learning_rate": 0.0002,
1658
+ "loss": 0.0023,
1659
  "step": 263
1660
  },
1661
  {
1662
  "epoch": 9.43,
1663
  "learning_rate": 0.0002,
1664
+ "loss": 0.0081,
1665
  "step": 264
1666
  },
1667
  {
1668
  "epoch": 9.46,
1669
  "learning_rate": 0.0002,
1670
+ "loss": 0.0043,
1671
  "step": 265
1672
  },
1673
  {
1674
  "epoch": 9.5,
1675
  "learning_rate": 0.0002,
1676
+ "loss": 0.0039,
1677
  "step": 266
1678
  },
1679
  {
1680
  "epoch": 9.54,
1681
  "learning_rate": 0.0002,
1682
+ "loss": 0.0052,
1683
  "step": 267
1684
  },
1685
  {
1686
  "epoch": 9.57,
1687
  "learning_rate": 0.0002,
1688
+ "loss": 0.0049,
1689
  "step": 268
1690
  },
1691
  {
1692
  "epoch": 9.61,
1693
  "learning_rate": 0.0002,
1694
+ "loss": 0.0076,
1695
  "step": 269
1696
  },
1697
  {
1698
  "epoch": 9.64,
1699
  "learning_rate": 0.0002,
1700
+ "loss": 0.0062,
1701
  "step": 270
1702
  },
1703
  {
1704
  "epoch": 9.68,
1705
  "learning_rate": 0.0002,
1706
+ "loss": 0.005,
1707
  "step": 271
1708
  },
1709
  {
1710
  "epoch": 9.71,
1711
  "learning_rate": 0.0002,
1712
+ "loss": 0.0063,
1713
  "step": 272
1714
  },
1715
  {
1716
  "epoch": 9.75,
1717
  "learning_rate": 0.0002,
1718
+ "loss": 0.0052,
1719
  "step": 273
1720
  },
1721
  {
1722
  "epoch": 9.79,
1723
  "learning_rate": 0.0002,
1724
+ "loss": 0.0042,
1725
  "step": 274
1726
  },
1727
  {
1728
  "epoch": 9.82,
1729
  "learning_rate": 0.0002,
1730
+ "loss": 0.0087,
1731
  "step": 275
1732
  },
1733
  {
1734
  "epoch": 9.86,
1735
  "learning_rate": 0.0002,
1736
+ "loss": 0.0064,
1737
  "step": 276
1738
  },
1739
  {
1740
  "epoch": 9.89,
1741
  "learning_rate": 0.0002,
1742
+ "loss": 0.0047,
1743
  "step": 277
1744
  },
1745
  {
1746
  "epoch": 9.93,
1747
  "learning_rate": 0.0002,
1748
+ "loss": 0.0034,
1749
  "step": 278
1750
  },
1751
  {
1752
  "epoch": 9.96,
1753
  "learning_rate": 0.0002,
1754
+ "loss": 0.0054,
1755
  "step": 279
1756
  },
1757
  {
1758
  "epoch": 10.0,
1759
  "learning_rate": 0.0002,
1760
+ "loss": 0.0031,
1761
  "step": 280
1762
  },
1763
  {
1764
  "epoch": 10.0,
1765
+ "eval_loss": 0.004367400426417589,
1766
+ "eval_runtime": 55.2334,
1767
+ "eval_samples_per_second": 15.697,
1768
+ "eval_steps_per_second": 0.507,
1769
  "step": 280
1770
  },
1771
  {
1772
  "epoch": 10.0,
1773
  "step": 280,
1774
  "total_flos": 8.298694499798876e+17,
1775
+ "train_loss": 0.09713323155904488,
1776
+ "train_runtime": 3137.8873,
1777
+ "train_samples_per_second": 2.763,
1778
+ "train_steps_per_second": 0.089
1779
  }
1780
  ],
1781
  "logging_steps": 1.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94bf3956f8afb1a220e42c04100c7f1281e8d2abcaf4f1bdc986598710eb313d
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8e7002d9da22439cd2142660ed0913d77b3af07a95d83f820f2b137e4beb14e
3
  size 6840