theblackcat102 commited on
Commit
59a004f
1 Parent(s): a13772e

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bigscience/mt0-large",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2816,
7
+ "d_kv": 64,
8
+ "d_model": 1024,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 24,
20
+ "num_heads": 16,
21
+ "num_layers": 24,
22
+ "output_past": true,
23
+ "pad_token_id": 0,
24
+ "relative_attention_max_distance": 128,
25
+ "relative_attention_num_buckets": 32,
26
+ "tie_word_embeddings": false,
27
+ "tokenizer_class": "T5Tokenizer",
28
+ "torch_dtype": "float16",
29
+ "transformers_version": "4.25.1",
30
+ "use_cache": true,
31
+ "vocab_size": 250112
32
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c60c9ae72fd6881459a29a78081a6fe74bb9daa049db046407128cdca9b1eb4a
3
+ size 2459242663
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
3
+ size 4309802
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6502d07619068a98aa2d3bb531332a694ffe108ca6c6fe62a467ccfe98d666b9
3
+ size 16315219
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "eos_token": "</s>",
4
+ "extra_ids": 0,
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "bigscience/mt0-large",
7
+ "pad_token": "<pad>",
8
+ "sp_model_kwargs": {},
9
+ "special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276",
10
+ "tokenizer_class": "T5Tokenizer",
11
+ "unk_token": "<unk>"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,1716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 19.83629191321499,
5
+ "global_step": 2500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.08,
12
+ "learning_rate": 6.2827061767850974e-06,
13
+ "loss": 3.8041,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.16,
18
+ "learning_rate": 9.721930086076435e-06,
19
+ "loss": 3.5706,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.24,
24
+ "learning_rate": 1.1424336578396131e-05,
25
+ "loss": 3.2293,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.32,
30
+ "learning_rate": 1.2565412353570195e-05,
31
+ "loss": 2.8198,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.39,
36
+ "learning_rate": 1.3424920790724471e-05,
37
+ "loss": 2.6382,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.47,
42
+ "learning_rate": 1.4114674634318977e-05,
43
+ "loss": 2.5852,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.55,
48
+ "learning_rate": 1.4690794260273606e-05,
49
+ "loss": 2.4718,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.63,
54
+ "learning_rate": 1.5185478617142983e-05,
55
+ "loss": 2.3908,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.71,
60
+ "learning_rate": 1.561892416495773e-05,
61
+ "loss": 2.3321,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.79,
66
+ "learning_rate": 1.600463626286153e-05,
67
+ "loss": 2.3622,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.79,
72
+ "eval_loss": 2.0142996311187744,
73
+ "eval_runtime": 13.3285,
74
+ "eval_samples_per_second": 92.958,
75
+ "eval_steps_per_second": 18.607,
76
+ "step": 100
77
+ },
78
+ {
79
+ "epoch": 0.87,
80
+ "learning_rate": 1.6352093070986755e-05,
81
+ "loss": 2.2679,
82
+ "step": 110
83
+ },
84
+ {
85
+ "epoch": 0.95,
86
+ "learning_rate": 1.666820294156779e-05,
87
+ "loss": 2.2493,
88
+ "step": 120
89
+ },
90
+ {
91
+ "epoch": 1.03,
92
+ "learning_rate": 1.6958156901812732e-05,
93
+ "loss": 2.3766,
94
+ "step": 130
95
+ },
96
+ {
97
+ "epoch": 1.11,
98
+ "learning_rate": 1.7225954557575116e-05,
99
+ "loss": 2.1449,
100
+ "step": 140
101
+ },
102
+ {
103
+ "epoch": 1.19,
104
+ "learning_rate": 1.747474292314672e-05,
105
+ "loss": 2.1136,
106
+ "step": 150
107
+ },
108
+ {
109
+ "epoch": 1.27,
110
+ "learning_rate": 1.770704275518123e-05,
111
+ "loss": 2.107,
112
+ "step": 160
113
+ },
114
+ {
115
+ "epoch": 1.35,
116
+ "learning_rate": 1.7924904433402933e-05,
117
+ "loss": 2.1469,
118
+ "step": 170
119
+ },
120
+ {
121
+ "epoch": 1.43,
122
+ "learning_rate": 1.8130018169564944e-05,
123
+ "loss": 2.1229,
124
+ "step": 180
125
+ },
126
+ {
127
+ "epoch": 1.5,
128
+ "learning_rate": 1.832379371735486e-05,
129
+ "loss": 2.0893,
130
+ "step": 190
131
+ },
132
+ {
133
+ "epoch": 1.58,
134
+ "learning_rate": 1.8507419182561513e-05,
135
+ "loss": 2.0836,
136
+ "step": 200
137
+ },
138
+ {
139
+ "epoch": 1.58,
140
+ "eval_loss": 1.813197135925293,
141
+ "eval_runtime": 13.3546,
142
+ "eval_samples_per_second": 92.777,
143
+ "eval_steps_per_second": 18.57,
144
+ "step": 200
145
+ },
146
+ {
147
+ "epoch": 1.66,
148
+ "learning_rate": 1.868190518296623e-05,
149
+ "loss": 2.0307,
150
+ "step": 210
151
+ },
152
+ {
153
+ "epoch": 1.74,
154
+ "learning_rate": 1.8848118530355293e-05,
155
+ "loss": 2.0025,
156
+ "step": 220
157
+ },
158
+ {
159
+ "epoch": 1.82,
160
+ "learning_rate": 1.9006808283301558e-05,
161
+ "loss": 2.0116,
162
+ "step": 230
163
+ },
164
+ {
165
+ "epoch": 1.9,
166
+ "learning_rate": 1.915862615484805e-05,
167
+ "loss": 2.0355,
168
+ "step": 240
169
+ },
170
+ {
171
+ "epoch": 1.98,
172
+ "learning_rate": 1.9304142682139013e-05,
173
+ "loss": 2.0056,
174
+ "step": 250
175
+ },
176
+ {
177
+ "epoch": 2.06,
178
+ "learning_rate": 1.945753054066002e-05,
179
+ "loss": 2.1508,
180
+ "step": 260
181
+ },
182
+ {
183
+ "epoch": 2.14,
184
+ "learning_rate": 1.959138057324679e-05,
185
+ "loss": 1.9203,
186
+ "step": 270
187
+ },
188
+ {
189
+ "epoch": 2.22,
190
+ "learning_rate": 1.9720308518695147e-05,
191
+ "loss": 1.9358,
192
+ "step": 280
193
+ },
194
+ {
195
+ "epoch": 2.3,
196
+ "learning_rate": 1.9844663575115566e-05,
197
+ "loss": 1.877,
198
+ "step": 290
199
+ },
200
+ {
201
+ "epoch": 2.38,
202
+ "learning_rate": 1.9964759049286476e-05,
203
+ "loss": 1.9239,
204
+ "step": 300
205
+ },
206
+ {
207
+ "epoch": 2.38,
208
+ "eval_loss": 1.726140022277832,
209
+ "eval_runtime": 13.3534,
210
+ "eval_samples_per_second": 92.785,
211
+ "eval_steps_per_second": 18.572,
212
+ "step": 300
213
+ },
214
+ {
215
+ "epoch": 2.46,
216
+ "learning_rate": 1.994594594594595e-05,
217
+ "loss": 1.9449,
218
+ "step": 310
219
+ },
220
+ {
221
+ "epoch": 2.54,
222
+ "learning_rate": 1.9855855855855857e-05,
223
+ "loss": 1.895,
224
+ "step": 320
225
+ },
226
+ {
227
+ "epoch": 2.62,
228
+ "learning_rate": 1.9765765765765768e-05,
229
+ "loss": 1.864,
230
+ "step": 330
231
+ },
232
+ {
233
+ "epoch": 2.69,
234
+ "learning_rate": 1.967567567567568e-05,
235
+ "loss": 1.8928,
236
+ "step": 340
237
+ },
238
+ {
239
+ "epoch": 2.77,
240
+ "learning_rate": 1.9585585585585586e-05,
241
+ "loss": 1.8801,
242
+ "step": 350
243
+ },
244
+ {
245
+ "epoch": 2.85,
246
+ "learning_rate": 1.9495495495495497e-05,
247
+ "loss": 1.911,
248
+ "step": 360
249
+ },
250
+ {
251
+ "epoch": 2.93,
252
+ "learning_rate": 1.9405405405405408e-05,
253
+ "loss": 1.8945,
254
+ "step": 370
255
+ },
256
+ {
257
+ "epoch": 3.02,
258
+ "learning_rate": 1.930630630630631e-05,
259
+ "loss": 1.9508,
260
+ "step": 380
261
+ },
262
+ {
263
+ "epoch": 3.09,
264
+ "learning_rate": 1.9216216216216216e-05,
265
+ "loss": 1.7709,
266
+ "step": 390
267
+ },
268
+ {
269
+ "epoch": 3.17,
270
+ "learning_rate": 1.9126126126126127e-05,
271
+ "loss": 1.7693,
272
+ "step": 400
273
+ },
274
+ {
275
+ "epoch": 3.17,
276
+ "eval_loss": 1.673058032989502,
277
+ "eval_runtime": 13.3468,
278
+ "eval_samples_per_second": 92.832,
279
+ "eval_steps_per_second": 18.581,
280
+ "step": 400
281
+ },
282
+ {
283
+ "epoch": 3.25,
284
+ "learning_rate": 1.9036036036036038e-05,
285
+ "loss": 1.7773,
286
+ "step": 410
287
+ },
288
+ {
289
+ "epoch": 3.33,
290
+ "learning_rate": 1.8945945945945945e-05,
291
+ "loss": 1.8198,
292
+ "step": 420
293
+ },
294
+ {
295
+ "epoch": 3.41,
296
+ "learning_rate": 1.8855855855855856e-05,
297
+ "loss": 1.7813,
298
+ "step": 430
299
+ },
300
+ {
301
+ "epoch": 3.49,
302
+ "learning_rate": 1.8765765765765767e-05,
303
+ "loss": 1.7736,
304
+ "step": 440
305
+ },
306
+ {
307
+ "epoch": 3.57,
308
+ "learning_rate": 1.8675675675675678e-05,
309
+ "loss": 1.8544,
310
+ "step": 450
311
+ },
312
+ {
313
+ "epoch": 3.65,
314
+ "learning_rate": 1.8585585585585585e-05,
315
+ "loss": 1.7672,
316
+ "step": 460
317
+ },
318
+ {
319
+ "epoch": 3.73,
320
+ "learning_rate": 1.8495495495495496e-05,
321
+ "loss": 1.8223,
322
+ "step": 470
323
+ },
324
+ {
325
+ "epoch": 3.8,
326
+ "learning_rate": 1.8405405405405407e-05,
327
+ "loss": 1.7832,
328
+ "step": 480
329
+ },
330
+ {
331
+ "epoch": 3.88,
332
+ "learning_rate": 1.8315315315315318e-05,
333
+ "loss": 1.7937,
334
+ "step": 490
335
+ },
336
+ {
337
+ "epoch": 3.96,
338
+ "learning_rate": 1.822522522522523e-05,
339
+ "loss": 1.8078,
340
+ "step": 500
341
+ },
342
+ {
343
+ "epoch": 3.96,
344
+ "eval_loss": 1.6267061233520508,
345
+ "eval_runtime": 13.3535,
346
+ "eval_samples_per_second": 92.784,
347
+ "eval_steps_per_second": 18.572,
348
+ "step": 500
349
+ },
350
+ {
351
+ "epoch": 4.05,
352
+ "learning_rate": 1.8126126126126127e-05,
353
+ "loss": 1.8374,
354
+ "step": 510
355
+ },
356
+ {
357
+ "epoch": 4.13,
358
+ "learning_rate": 1.8036036036036037e-05,
359
+ "loss": 1.7228,
360
+ "step": 520
361
+ },
362
+ {
363
+ "epoch": 4.21,
364
+ "learning_rate": 1.7945945945945948e-05,
365
+ "loss": 1.6986,
366
+ "step": 530
367
+ },
368
+ {
369
+ "epoch": 4.28,
370
+ "learning_rate": 1.7855855855855856e-05,
371
+ "loss": 1.6858,
372
+ "step": 540
373
+ },
374
+ {
375
+ "epoch": 4.36,
376
+ "learning_rate": 1.7765765765765767e-05,
377
+ "loss": 1.7169,
378
+ "step": 550
379
+ },
380
+ {
381
+ "epoch": 4.44,
382
+ "learning_rate": 1.7675675675675677e-05,
383
+ "loss": 1.7069,
384
+ "step": 560
385
+ },
386
+ {
387
+ "epoch": 4.52,
388
+ "learning_rate": 1.7585585585585588e-05,
389
+ "loss": 1.6725,
390
+ "step": 570
391
+ },
392
+ {
393
+ "epoch": 4.6,
394
+ "learning_rate": 1.7495495495495496e-05,
395
+ "loss": 1.7052,
396
+ "step": 580
397
+ },
398
+ {
399
+ "epoch": 4.68,
400
+ "learning_rate": 1.7405405405405406e-05,
401
+ "loss": 1.6597,
402
+ "step": 590
403
+ },
404
+ {
405
+ "epoch": 4.76,
406
+ "learning_rate": 1.7315315315315317e-05,
407
+ "loss": 1.7565,
408
+ "step": 600
409
+ },
410
+ {
411
+ "epoch": 4.76,
412
+ "eval_loss": 1.5994915962219238,
413
+ "eval_runtime": 13.2972,
414
+ "eval_samples_per_second": 93.177,
415
+ "eval_steps_per_second": 18.651,
416
+ "step": 600
417
+ },
418
+ {
419
+ "epoch": 4.84,
420
+ "learning_rate": 1.7225225225225225e-05,
421
+ "loss": 1.723,
422
+ "step": 610
423
+ },
424
+ {
425
+ "epoch": 4.92,
426
+ "learning_rate": 1.7135135135135135e-05,
427
+ "loss": 1.6777,
428
+ "step": 620
429
+ },
430
+ {
431
+ "epoch": 4.99,
432
+ "learning_rate": 1.7045045045045046e-05,
433
+ "loss": 1.6943,
434
+ "step": 630
435
+ },
436
+ {
437
+ "epoch": 5.08,
438
+ "learning_rate": 1.6954954954954957e-05,
439
+ "loss": 1.7274,
440
+ "step": 640
441
+ },
442
+ {
443
+ "epoch": 5.16,
444
+ "learning_rate": 1.6864864864864868e-05,
445
+ "loss": 1.7062,
446
+ "step": 650
447
+ },
448
+ {
449
+ "epoch": 5.24,
450
+ "learning_rate": 1.6774774774774775e-05,
451
+ "loss": 1.6078,
452
+ "step": 660
453
+ },
454
+ {
455
+ "epoch": 5.32,
456
+ "learning_rate": 1.6684684684684686e-05,
457
+ "loss": 1.6484,
458
+ "step": 670
459
+ },
460
+ {
461
+ "epoch": 5.39,
462
+ "learning_rate": 1.6594594594594597e-05,
463
+ "loss": 1.6097,
464
+ "step": 680
465
+ },
466
+ {
467
+ "epoch": 5.47,
468
+ "learning_rate": 1.6504504504504508e-05,
469
+ "loss": 1.6255,
470
+ "step": 690
471
+ },
472
+ {
473
+ "epoch": 5.55,
474
+ "learning_rate": 1.641441441441442e-05,
475
+ "loss": 1.6188,
476
+ "step": 700
477
+ },
478
+ {
479
+ "epoch": 5.55,
480
+ "eval_loss": 1.5767285823822021,
481
+ "eval_runtime": 13.2952,
482
+ "eval_samples_per_second": 93.191,
483
+ "eval_steps_per_second": 18.653,
484
+ "step": 700
485
+ },
486
+ {
487
+ "epoch": 5.63,
488
+ "learning_rate": 1.6324324324324326e-05,
489
+ "loss": 1.68,
490
+ "step": 710
491
+ },
492
+ {
493
+ "epoch": 5.71,
494
+ "learning_rate": 1.6234234234234237e-05,
495
+ "loss": 1.618,
496
+ "step": 720
497
+ },
498
+ {
499
+ "epoch": 5.79,
500
+ "learning_rate": 1.6144144144144144e-05,
501
+ "loss": 1.5747,
502
+ "step": 730
503
+ },
504
+ {
505
+ "epoch": 5.87,
506
+ "learning_rate": 1.6054054054054055e-05,
507
+ "loss": 1.6617,
508
+ "step": 740
509
+ },
510
+ {
511
+ "epoch": 5.95,
512
+ "learning_rate": 1.5963963963963966e-05,
513
+ "loss": 1.6466,
514
+ "step": 750
515
+ },
516
+ {
517
+ "epoch": 6.03,
518
+ "learning_rate": 1.5864864864864867e-05,
519
+ "loss": 1.7621,
520
+ "step": 760
521
+ },
522
+ {
523
+ "epoch": 6.11,
524
+ "learning_rate": 1.5774774774774778e-05,
525
+ "loss": 1.5679,
526
+ "step": 770
527
+ },
528
+ {
529
+ "epoch": 6.19,
530
+ "learning_rate": 1.5684684684684685e-05,
531
+ "loss": 1.6356,
532
+ "step": 780
533
+ },
534
+ {
535
+ "epoch": 6.27,
536
+ "learning_rate": 1.5594594594594596e-05,
537
+ "loss": 1.5499,
538
+ "step": 790
539
+ },
540
+ {
541
+ "epoch": 6.35,
542
+ "learning_rate": 1.5504504504504504e-05,
543
+ "loss": 1.5783,
544
+ "step": 800
545
+ },
546
+ {
547
+ "epoch": 6.35,
548
+ "eval_loss": 1.5619192123413086,
549
+ "eval_runtime": 13.2986,
550
+ "eval_samples_per_second": 93.168,
551
+ "eval_steps_per_second": 18.649,
552
+ "step": 800
553
+ },
554
+ {
555
+ "epoch": 6.43,
556
+ "learning_rate": 1.5414414414414414e-05,
557
+ "loss": 1.5848,
558
+ "step": 810
559
+ },
560
+ {
561
+ "epoch": 6.5,
562
+ "learning_rate": 1.5324324324324325e-05,
563
+ "loss": 1.5019,
564
+ "step": 820
565
+ },
566
+ {
567
+ "epoch": 6.58,
568
+ "learning_rate": 1.5234234234234236e-05,
569
+ "loss": 1.6008,
570
+ "step": 830
571
+ },
572
+ {
573
+ "epoch": 6.66,
574
+ "learning_rate": 1.5144144144144147e-05,
575
+ "loss": 1.5757,
576
+ "step": 840
577
+ },
578
+ {
579
+ "epoch": 6.74,
580
+ "learning_rate": 1.5054054054054054e-05,
581
+ "loss": 1.5855,
582
+ "step": 850
583
+ },
584
+ {
585
+ "epoch": 6.82,
586
+ "learning_rate": 1.4963963963963965e-05,
587
+ "loss": 1.629,
588
+ "step": 860
589
+ },
590
+ {
591
+ "epoch": 6.9,
592
+ "learning_rate": 1.4873873873873874e-05,
593
+ "loss": 1.5948,
594
+ "step": 870
595
+ },
596
+ {
597
+ "epoch": 6.98,
598
+ "learning_rate": 1.4783783783783785e-05,
599
+ "loss": 1.5814,
600
+ "step": 880
601
+ },
602
+ {
603
+ "epoch": 7.06,
604
+ "learning_rate": 1.4684684684684686e-05,
605
+ "loss": 1.7112,
606
+ "step": 890
607
+ },
608
+ {
609
+ "epoch": 7.14,
610
+ "learning_rate": 1.4594594594594596e-05,
611
+ "loss": 1.4981,
612
+ "step": 900
613
+ },
614
+ {
615
+ "epoch": 7.14,
616
+ "eval_loss": 1.546679139137268,
617
+ "eval_runtime": 13.2814,
618
+ "eval_samples_per_second": 93.288,
619
+ "eval_steps_per_second": 18.673,
620
+ "step": 900
621
+ },
622
+ {
623
+ "epoch": 7.22,
624
+ "learning_rate": 1.4504504504504506e-05,
625
+ "loss": 1.5341,
626
+ "step": 910
627
+ },
628
+ {
629
+ "epoch": 7.3,
630
+ "learning_rate": 1.4414414414414416e-05,
631
+ "loss": 1.5301,
632
+ "step": 920
633
+ },
634
+ {
635
+ "epoch": 7.38,
636
+ "learning_rate": 1.4324324324324326e-05,
637
+ "loss": 1.5253,
638
+ "step": 930
639
+ },
640
+ {
641
+ "epoch": 7.46,
642
+ "learning_rate": 1.4234234234234234e-05,
643
+ "loss": 1.5697,
644
+ "step": 940
645
+ },
646
+ {
647
+ "epoch": 7.54,
648
+ "learning_rate": 1.4144144144144145e-05,
649
+ "loss": 1.5482,
650
+ "step": 950
651
+ },
652
+ {
653
+ "epoch": 7.62,
654
+ "learning_rate": 1.4054054054054055e-05,
655
+ "loss": 1.4849,
656
+ "step": 960
657
+ },
658
+ {
659
+ "epoch": 7.69,
660
+ "learning_rate": 1.3963963963963964e-05,
661
+ "loss": 1.536,
662
+ "step": 970
663
+ },
664
+ {
665
+ "epoch": 7.77,
666
+ "learning_rate": 1.3873873873873875e-05,
667
+ "loss": 1.5267,
668
+ "step": 980
669
+ },
670
+ {
671
+ "epoch": 7.85,
672
+ "learning_rate": 1.3783783783783784e-05,
673
+ "loss": 1.5003,
674
+ "step": 990
675
+ },
676
+ {
677
+ "epoch": 7.93,
678
+ "learning_rate": 1.3693693693693695e-05,
679
+ "loss": 1.5296,
680
+ "step": 1000
681
+ },
682
+ {
683
+ "epoch": 7.93,
684
+ "eval_loss": 1.5358564853668213,
685
+ "eval_runtime": 13.2903,
686
+ "eval_samples_per_second": 93.226,
687
+ "eval_steps_per_second": 18.66,
688
+ "step": 1000
689
+ },
690
+ {
691
+ "epoch": 8.02,
692
+ "learning_rate": 1.3594594594594597e-05,
693
+ "loss": 1.6728,
694
+ "step": 1010
695
+ },
696
+ {
697
+ "epoch": 8.09,
698
+ "learning_rate": 1.3504504504504506e-05,
699
+ "loss": 1.4944,
700
+ "step": 1020
701
+ },
702
+ {
703
+ "epoch": 8.17,
704
+ "learning_rate": 1.3414414414414417e-05,
705
+ "loss": 1.5016,
706
+ "step": 1030
707
+ },
708
+ {
709
+ "epoch": 8.25,
710
+ "learning_rate": 1.3324324324324324e-05,
711
+ "loss": 1.5407,
712
+ "step": 1040
713
+ },
714
+ {
715
+ "epoch": 8.33,
716
+ "learning_rate": 1.3234234234234235e-05,
717
+ "loss": 1.5129,
718
+ "step": 1050
719
+ },
720
+ {
721
+ "epoch": 8.41,
722
+ "learning_rate": 1.3144144144144144e-05,
723
+ "loss": 1.4774,
724
+ "step": 1060
725
+ },
726
+ {
727
+ "epoch": 8.49,
728
+ "learning_rate": 1.3054054054054055e-05,
729
+ "loss": 1.5181,
730
+ "step": 1070
731
+ },
732
+ {
733
+ "epoch": 8.57,
734
+ "learning_rate": 1.2963963963963966e-05,
735
+ "loss": 1.4897,
736
+ "step": 1080
737
+ },
738
+ {
739
+ "epoch": 8.65,
740
+ "learning_rate": 1.2873873873873875e-05,
741
+ "loss": 1.4918,
742
+ "step": 1090
743
+ },
744
+ {
745
+ "epoch": 8.73,
746
+ "learning_rate": 1.2783783783783785e-05,
747
+ "loss": 1.4734,
748
+ "step": 1100
749
+ },
750
+ {
751
+ "epoch": 8.73,
752
+ "eval_loss": 1.5299837589263916,
753
+ "eval_runtime": 13.3097,
754
+ "eval_samples_per_second": 93.09,
755
+ "eval_steps_per_second": 18.633,
756
+ "step": 1100
757
+ },
758
+ {
759
+ "epoch": 8.8,
760
+ "learning_rate": 1.2693693693693695e-05,
761
+ "loss": 1.4425,
762
+ "step": 1110
763
+ },
764
+ {
765
+ "epoch": 8.88,
766
+ "learning_rate": 1.2603603603603605e-05,
767
+ "loss": 1.5102,
768
+ "step": 1120
769
+ },
770
+ {
771
+ "epoch": 8.96,
772
+ "learning_rate": 1.2513513513513516e-05,
773
+ "loss": 1.4614,
774
+ "step": 1130
775
+ },
776
+ {
777
+ "epoch": 9.05,
778
+ "learning_rate": 1.2423423423423424e-05,
779
+ "loss": 1.5378,
780
+ "step": 1140
781
+ },
782
+ {
783
+ "epoch": 9.13,
784
+ "learning_rate": 1.2333333333333334e-05,
785
+ "loss": 1.5182,
786
+ "step": 1150
787
+ },
788
+ {
789
+ "epoch": 9.21,
790
+ "learning_rate": 1.2243243243243244e-05,
791
+ "loss": 1.4606,
792
+ "step": 1160
793
+ },
794
+ {
795
+ "epoch": 9.28,
796
+ "learning_rate": 1.2153153153153154e-05,
797
+ "loss": 1.4675,
798
+ "step": 1170
799
+ },
800
+ {
801
+ "epoch": 9.36,
802
+ "learning_rate": 1.2063063063063063e-05,
803
+ "loss": 1.5261,
804
+ "step": 1180
805
+ },
806
+ {
807
+ "epoch": 9.44,
808
+ "learning_rate": 1.1972972972972974e-05,
809
+ "loss": 1.4602,
810
+ "step": 1190
811
+ },
812
+ {
813
+ "epoch": 9.52,
814
+ "learning_rate": 1.1882882882882885e-05,
815
+ "loss": 1.4415,
816
+ "step": 1200
817
+ },
818
+ {
819
+ "epoch": 9.52,
820
+ "eval_loss": 1.521682620048523,
821
+ "eval_runtime": 13.2939,
822
+ "eval_samples_per_second": 93.2,
823
+ "eval_steps_per_second": 18.655,
824
+ "step": 1200
825
+ },
826
+ {
827
+ "epoch": 9.6,
828
+ "learning_rate": 1.1792792792792792e-05,
829
+ "loss": 1.4601,
830
+ "step": 1210
831
+ },
832
+ {
833
+ "epoch": 9.68,
834
+ "learning_rate": 1.1702702702702703e-05,
835
+ "loss": 1.4676,
836
+ "step": 1220
837
+ },
838
+ {
839
+ "epoch": 9.76,
840
+ "learning_rate": 1.1612612612612612e-05,
841
+ "loss": 1.4213,
842
+ "step": 1230
843
+ },
844
+ {
845
+ "epoch": 9.84,
846
+ "learning_rate": 1.1522522522522523e-05,
847
+ "loss": 1.4096,
848
+ "step": 1240
849
+ },
850
+ {
851
+ "epoch": 9.92,
852
+ "learning_rate": 1.1432432432432434e-05,
853
+ "loss": 1.4475,
854
+ "step": 1250
855
+ },
856
+ {
857
+ "epoch": 9.99,
858
+ "learning_rate": 1.1342342342342343e-05,
859
+ "loss": 1.4477,
860
+ "step": 1260
861
+ },
862
+ {
863
+ "epoch": 10.08,
864
+ "learning_rate": 1.1243243243243245e-05,
865
+ "loss": 1.4979,
866
+ "step": 1270
867
+ },
868
+ {
869
+ "epoch": 10.16,
870
+ "learning_rate": 1.1153153153153154e-05,
871
+ "loss": 1.4436,
872
+ "step": 1280
873
+ },
874
+ {
875
+ "epoch": 10.24,
876
+ "learning_rate": 1.1063063063063065e-05,
877
+ "loss": 1.4913,
878
+ "step": 1290
879
+ },
880
+ {
881
+ "epoch": 10.32,
882
+ "learning_rate": 1.0972972972972974e-05,
883
+ "loss": 1.4513,
884
+ "step": 1300
885
+ },
886
+ {
887
+ "epoch": 10.32,
888
+ "eval_loss": 1.5171560049057007,
889
+ "eval_runtime": 13.2984,
890
+ "eval_samples_per_second": 93.169,
891
+ "eval_steps_per_second": 18.649,
892
+ "step": 1300
893
+ },
894
+ {
895
+ "epoch": 10.39,
896
+ "learning_rate": 1.0882882882882884e-05,
897
+ "loss": 1.4649,
898
+ "step": 1310
899
+ },
900
+ {
901
+ "epoch": 10.47,
902
+ "learning_rate": 1.0792792792792795e-05,
903
+ "loss": 1.4126,
904
+ "step": 1320
905
+ },
906
+ {
907
+ "epoch": 10.55,
908
+ "learning_rate": 1.0702702702702703e-05,
909
+ "loss": 1.3916,
910
+ "step": 1330
911
+ },
912
+ {
913
+ "epoch": 10.63,
914
+ "learning_rate": 1.0612612612612613e-05,
915
+ "loss": 1.393,
916
+ "step": 1340
917
+ },
918
+ {
919
+ "epoch": 10.71,
920
+ "learning_rate": 1.0522522522522523e-05,
921
+ "loss": 1.3972,
922
+ "step": 1350
923
+ },
924
+ {
925
+ "epoch": 10.79,
926
+ "learning_rate": 1.0432432432432433e-05,
927
+ "loss": 1.4867,
928
+ "step": 1360
929
+ },
930
+ {
931
+ "epoch": 10.87,
932
+ "learning_rate": 1.0342342342342344e-05,
933
+ "loss": 1.4109,
934
+ "step": 1370
935
+ },
936
+ {
937
+ "epoch": 10.95,
938
+ "learning_rate": 1.0252252252252253e-05,
939
+ "loss": 1.4215,
940
+ "step": 1380
941
+ },
942
+ {
943
+ "epoch": 11.03,
944
+ "learning_rate": 1.0153153153153155e-05,
945
+ "loss": 1.5288,
946
+ "step": 1390
947
+ },
948
+ {
949
+ "epoch": 11.11,
950
+ "learning_rate": 1.0063063063063064e-05,
951
+ "loss": 1.3782,
952
+ "step": 1400
953
+ },
954
+ {
955
+ "epoch": 11.11,
956
+ "eval_loss": 1.5077377557754517,
957
+ "eval_runtime": 13.2758,
958
+ "eval_samples_per_second": 93.328,
959
+ "eval_steps_per_second": 18.681,
960
+ "step": 1400
961
+ },
962
+ {
963
+ "epoch": 11.19,
964
+ "learning_rate": 9.972972972972975e-06,
965
+ "loss": 1.4292,
966
+ "step": 1410
967
+ },
968
+ {
969
+ "epoch": 11.27,
970
+ "learning_rate": 9.882882882882884e-06,
971
+ "loss": 1.4404,
972
+ "step": 1420
973
+ },
974
+ {
975
+ "epoch": 11.35,
976
+ "learning_rate": 9.792792792792793e-06,
977
+ "loss": 1.4192,
978
+ "step": 1430
979
+ },
980
+ {
981
+ "epoch": 11.43,
982
+ "learning_rate": 9.702702702702704e-06,
983
+ "loss": 1.324,
984
+ "step": 1440
985
+ },
986
+ {
987
+ "epoch": 11.5,
988
+ "learning_rate": 9.612612612612613e-06,
989
+ "loss": 1.4342,
990
+ "step": 1450
991
+ },
992
+ {
993
+ "epoch": 11.58,
994
+ "learning_rate": 9.522522522522524e-06,
995
+ "loss": 1.4233,
996
+ "step": 1460
997
+ },
998
+ {
999
+ "epoch": 11.66,
1000
+ "learning_rate": 9.432432432432433e-06,
1001
+ "loss": 1.3475,
1002
+ "step": 1470
1003
+ },
1004
+ {
1005
+ "epoch": 11.74,
1006
+ "learning_rate": 9.342342342342344e-06,
1007
+ "loss": 1.4178,
1008
+ "step": 1480
1009
+ },
1010
+ {
1011
+ "epoch": 11.82,
1012
+ "learning_rate": 9.252252252252253e-06,
1013
+ "loss": 1.3842,
1014
+ "step": 1490
1015
+ },
1016
+ {
1017
+ "epoch": 11.9,
1018
+ "learning_rate": 9.162162162162162e-06,
1019
+ "loss": 1.4103,
1020
+ "step": 1500
1021
+ },
1022
+ {
1023
+ "epoch": 11.9,
1024
+ "eval_loss": 1.507853388786316,
1025
+ "eval_runtime": 13.2911,
1026
+ "eval_samples_per_second": 93.22,
1027
+ "eval_steps_per_second": 18.659,
1028
+ "step": 1500
1029
+ },
1030
+ {
1031
+ "epoch": 11.98,
1032
+ "learning_rate": 9.072072072072073e-06,
1033
+ "loss": 1.3918,
1034
+ "step": 1510
1035
+ },
1036
+ {
1037
+ "epoch": 12.06,
1038
+ "learning_rate": 8.972972972972974e-06,
1039
+ "loss": 1.4841,
1040
+ "step": 1520
1041
+ },
1042
+ {
1043
+ "epoch": 12.14,
1044
+ "learning_rate": 8.882882882882883e-06,
1045
+ "loss": 1.3866,
1046
+ "step": 1530
1047
+ },
1048
+ {
1049
+ "epoch": 12.22,
1050
+ "learning_rate": 8.792792792792794e-06,
1051
+ "loss": 1.3713,
1052
+ "step": 1540
1053
+ },
1054
+ {
1055
+ "epoch": 12.3,
1056
+ "learning_rate": 8.702702702702703e-06,
1057
+ "loss": 1.3384,
1058
+ "step": 1550
1059
+ },
1060
+ {
1061
+ "epoch": 12.38,
1062
+ "learning_rate": 8.612612612612612e-06,
1063
+ "loss": 1.4079,
1064
+ "step": 1560
1065
+ },
1066
+ {
1067
+ "epoch": 12.46,
1068
+ "learning_rate": 8.522522522522523e-06,
1069
+ "loss": 1.3715,
1070
+ "step": 1570
1071
+ },
1072
+ {
1073
+ "epoch": 12.54,
1074
+ "learning_rate": 8.432432432432434e-06,
1075
+ "loss": 1.3541,
1076
+ "step": 1580
1077
+ },
1078
+ {
1079
+ "epoch": 12.62,
1080
+ "learning_rate": 8.342342342342343e-06,
1081
+ "loss": 1.3764,
1082
+ "step": 1590
1083
+ },
1084
+ {
1085
+ "epoch": 12.69,
1086
+ "learning_rate": 8.252252252252254e-06,
1087
+ "loss": 1.3907,
1088
+ "step": 1600
1089
+ },
1090
+ {
1091
+ "epoch": 12.69,
1092
+ "eval_loss": 1.5033966302871704,
1093
+ "eval_runtime": 13.2864,
1094
+ "eval_samples_per_second": 93.254,
1095
+ "eval_steps_per_second": 18.666,
1096
+ "step": 1600
1097
+ },
1098
+ {
1099
+ "epoch": 12.77,
1100
+ "learning_rate": 8.162162162162163e-06,
1101
+ "loss": 1.3626,
1102
+ "step": 1610
1103
+ },
1104
+ {
1105
+ "epoch": 12.85,
1106
+ "learning_rate": 8.072072072072072e-06,
1107
+ "loss": 1.4094,
1108
+ "step": 1620
1109
+ },
1110
+ {
1111
+ "epoch": 12.93,
1112
+ "learning_rate": 7.981981981981983e-06,
1113
+ "loss": 1.3579,
1114
+ "step": 1630
1115
+ },
1116
+ {
1117
+ "epoch": 13.02,
1118
+ "learning_rate": 7.891891891891894e-06,
1119
+ "loss": 1.4517,
1120
+ "step": 1640
1121
+ },
1122
+ {
1123
+ "epoch": 13.09,
1124
+ "learning_rate": 7.801801801801803e-06,
1125
+ "loss": 1.301,
1126
+ "step": 1650
1127
+ },
1128
+ {
1129
+ "epoch": 13.17,
1130
+ "learning_rate": 7.711711711711712e-06,
1131
+ "loss": 1.3513,
1132
+ "step": 1660
1133
+ },
1134
+ {
1135
+ "epoch": 13.25,
1136
+ "learning_rate": 7.621621621621622e-06,
1137
+ "loss": 1.3487,
1138
+ "step": 1670
1139
+ },
1140
+ {
1141
+ "epoch": 13.33,
1142
+ "learning_rate": 7.531531531531532e-06,
1143
+ "loss": 1.3894,
1144
+ "step": 1680
1145
+ },
1146
+ {
1147
+ "epoch": 13.41,
1148
+ "learning_rate": 7.441441441441442e-06,
1149
+ "loss": 1.3619,
1150
+ "step": 1690
1151
+ },
1152
+ {
1153
+ "epoch": 13.49,
1154
+ "learning_rate": 7.3513513513513525e-06,
1155
+ "loss": 1.3663,
1156
+ "step": 1700
1157
+ },
1158
+ {
1159
+ "epoch": 13.49,
1160
+ "eval_loss": 1.501574993133545,
1161
+ "eval_runtime": 13.3092,
1162
+ "eval_samples_per_second": 93.094,
1163
+ "eval_steps_per_second": 18.634,
1164
+ "step": 1700
1165
+ },
1166
+ {
1167
+ "epoch": 13.57,
1168
+ "learning_rate": 7.2612612612612625e-06,
1169
+ "loss": 1.3268,
1170
+ "step": 1710
1171
+ },
1172
+ {
1173
+ "epoch": 13.65,
1174
+ "learning_rate": 7.1711711711711716e-06,
1175
+ "loss": 1.3703,
1176
+ "step": 1720
1177
+ },
1178
+ {
1179
+ "epoch": 13.73,
1180
+ "learning_rate": 7.0810810810810815e-06,
1181
+ "loss": 1.4246,
1182
+ "step": 1730
1183
+ },
1184
+ {
1185
+ "epoch": 13.8,
1186
+ "learning_rate": 6.9909909909909915e-06,
1187
+ "loss": 1.3642,
1188
+ "step": 1740
1189
+ },
1190
+ {
1191
+ "epoch": 13.88,
1192
+ "learning_rate": 6.900900900900901e-06,
1193
+ "loss": 1.3467,
1194
+ "step": 1750
1195
+ },
1196
+ {
1197
+ "epoch": 13.96,
1198
+ "learning_rate": 6.810810810810811e-06,
1199
+ "loss": 1.3802,
1200
+ "step": 1760
1201
+ },
1202
+ {
1203
+ "epoch": 14.05,
1204
+ "learning_rate": 6.711711711711713e-06,
1205
+ "loss": 1.4328,
1206
+ "step": 1770
1207
+ },
1208
+ {
1209
+ "epoch": 14.13,
1210
+ "learning_rate": 6.621621621621622e-06,
1211
+ "loss": 1.3316,
1212
+ "step": 1780
1213
+ },
1214
+ {
1215
+ "epoch": 14.21,
1216
+ "learning_rate": 6.531531531531532e-06,
1217
+ "loss": 1.3634,
1218
+ "step": 1790
1219
+ },
1220
+ {
1221
+ "epoch": 14.28,
1222
+ "learning_rate": 6.441441441441442e-06,
1223
+ "loss": 1.3565,
1224
+ "step": 1800
1225
+ },
1226
+ {
1227
+ "epoch": 14.28,
1228
+ "eval_loss": 1.4980181455612183,
1229
+ "eval_runtime": 13.2875,
1230
+ "eval_samples_per_second": 93.246,
1231
+ "eval_steps_per_second": 18.664,
1232
+ "step": 1800
1233
+ },
1234
+ {
1235
+ "epoch": 14.36,
1236
+ "learning_rate": 6.351351351351351e-06,
1237
+ "loss": 1.3217,
1238
+ "step": 1810
1239
+ },
1240
+ {
1241
+ "epoch": 14.44,
1242
+ "learning_rate": 6.261261261261262e-06,
1243
+ "loss": 1.33,
1244
+ "step": 1820
1245
+ },
1246
+ {
1247
+ "epoch": 14.52,
1248
+ "learning_rate": 6.171171171171172e-06,
1249
+ "loss": 1.3513,
1250
+ "step": 1830
1251
+ },
1252
+ {
1253
+ "epoch": 14.6,
1254
+ "learning_rate": 6.081081081081082e-06,
1255
+ "loss": 1.3649,
1256
+ "step": 1840
1257
+ },
1258
+ {
1259
+ "epoch": 14.68,
1260
+ "learning_rate": 5.990990990990992e-06,
1261
+ "loss": 1.3462,
1262
+ "step": 1850
1263
+ },
1264
+ {
1265
+ "epoch": 14.76,
1266
+ "learning_rate": 5.900900900900901e-06,
1267
+ "loss": 1.3454,
1268
+ "step": 1860
1269
+ },
1270
+ {
1271
+ "epoch": 14.84,
1272
+ "learning_rate": 5.810810810810811e-06,
1273
+ "loss": 1.3316,
1274
+ "step": 1870
1275
+ },
1276
+ {
1277
+ "epoch": 14.92,
1278
+ "learning_rate": 5.720720720720722e-06,
1279
+ "loss": 1.3347,
1280
+ "step": 1880
1281
+ },
1282
+ {
1283
+ "epoch": 14.99,
1284
+ "learning_rate": 5.6306306306306316e-06,
1285
+ "loss": 1.3039,
1286
+ "step": 1890
1287
+ },
1288
+ {
1289
+ "epoch": 15.08,
1290
+ "learning_rate": 5.531531531531532e-06,
1291
+ "loss": 1.4057,
1292
+ "step": 1900
1293
+ },
1294
+ {
1295
+ "epoch": 15.08,
1296
+ "eval_loss": 1.4985554218292236,
1297
+ "eval_runtime": 13.2747,
1298
+ "eval_samples_per_second": 93.335,
1299
+ "eval_steps_per_second": 18.682,
1300
+ "step": 1900
1301
+ },
1302
+ {
1303
+ "epoch": 15.16,
1304
+ "learning_rate": 5.441441441441442e-06,
1305
+ "loss": 1.3082,
1306
+ "step": 1910
1307
+ },
1308
+ {
1309
+ "epoch": 15.24,
1310
+ "learning_rate": 5.351351351351351e-06,
1311
+ "loss": 1.3589,
1312
+ "step": 1920
1313
+ },
1314
+ {
1315
+ "epoch": 15.32,
1316
+ "learning_rate": 5.261261261261261e-06,
1317
+ "loss": 1.3235,
1318
+ "step": 1930
1319
+ },
1320
+ {
1321
+ "epoch": 15.39,
1322
+ "learning_rate": 5.171171171171172e-06,
1323
+ "loss": 1.3153,
1324
+ "step": 1940
1325
+ },
1326
+ {
1327
+ "epoch": 15.47,
1328
+ "learning_rate": 5.081081081081082e-06,
1329
+ "loss": 1.3345,
1330
+ "step": 1950
1331
+ },
1332
+ {
1333
+ "epoch": 15.55,
1334
+ "learning_rate": 4.990990990990991e-06,
1335
+ "loss": 1.3824,
1336
+ "step": 1960
1337
+ },
1338
+ {
1339
+ "epoch": 15.63,
1340
+ "learning_rate": 4.900900900900901e-06,
1341
+ "loss": 1.291,
1342
+ "step": 1970
1343
+ },
1344
+ {
1345
+ "epoch": 15.71,
1346
+ "learning_rate": 4.810810810810811e-06,
1347
+ "loss": 1.3106,
1348
+ "step": 1980
1349
+ },
1350
+ {
1351
+ "epoch": 15.79,
1352
+ "learning_rate": 4.720720720720721e-06,
1353
+ "loss": 1.3559,
1354
+ "step": 1990
1355
+ },
1356
+ {
1357
+ "epoch": 15.87,
1358
+ "learning_rate": 4.630630630630631e-06,
1359
+ "loss": 1.3406,
1360
+ "step": 2000
1361
+ },
1362
+ {
1363
+ "epoch": 15.87,
1364
+ "eval_loss": 1.4952143430709839,
1365
+ "eval_runtime": 13.2782,
1366
+ "eval_samples_per_second": 93.31,
1367
+ "eval_steps_per_second": 18.677,
1368
+ "step": 2000
1369
+ },
1370
+ {
1371
+ "epoch": 15.95,
1372
+ "learning_rate": 4.540540540540541e-06,
1373
+ "loss": 1.328,
1374
+ "step": 2010
1375
+ },
1376
+ {
1377
+ "epoch": 16.03,
1378
+ "learning_rate": 4.441441441441442e-06,
1379
+ "loss": 1.4218,
1380
+ "step": 2020
1381
+ },
1382
+ {
1383
+ "epoch": 16.11,
1384
+ "learning_rate": 4.351351351351352e-06,
1385
+ "loss": 1.2962,
1386
+ "step": 2030
1387
+ },
1388
+ {
1389
+ "epoch": 16.19,
1390
+ "learning_rate": 4.2612612612612615e-06,
1391
+ "loss": 1.3122,
1392
+ "step": 2040
1393
+ },
1394
+ {
1395
+ "epoch": 16.27,
1396
+ "learning_rate": 4.1711711711711715e-06,
1397
+ "loss": 1.3641,
1398
+ "step": 2050
1399
+ },
1400
+ {
1401
+ "epoch": 16.35,
1402
+ "learning_rate": 4.0810810810810815e-06,
1403
+ "loss": 1.3058,
1404
+ "step": 2060
1405
+ },
1406
+ {
1407
+ "epoch": 16.43,
1408
+ "learning_rate": 3.990990990990991e-06,
1409
+ "loss": 1.2986,
1410
+ "step": 2070
1411
+ },
1412
+ {
1413
+ "epoch": 16.5,
1414
+ "learning_rate": 3.900900900900901e-06,
1415
+ "loss": 1.2902,
1416
+ "step": 2080
1417
+ },
1418
+ {
1419
+ "epoch": 16.58,
1420
+ "learning_rate": 3.810810810810811e-06,
1421
+ "loss": 1.3725,
1422
+ "step": 2090
1423
+ },
1424
+ {
1425
+ "epoch": 16.66,
1426
+ "learning_rate": 3.720720720720721e-06,
1427
+ "loss": 1.3031,
1428
+ "step": 2100
1429
+ },
1430
+ {
1431
+ "epoch": 16.66,
1432
+ "eval_loss": 1.495803713798523,
1433
+ "eval_runtime": 13.2846,
1434
+ "eval_samples_per_second": 93.266,
1435
+ "eval_steps_per_second": 18.668,
1436
+ "step": 2100
1437
+ },
1438
+ {
1439
+ "epoch": 16.74,
1440
+ "learning_rate": 3.6306306306306312e-06,
1441
+ "loss": 1.3091,
1442
+ "step": 2110
1443
+ },
1444
+ {
1445
+ "epoch": 16.82,
1446
+ "learning_rate": 3.5405405405405408e-06,
1447
+ "loss": 1.3003,
1448
+ "step": 2120
1449
+ },
1450
+ {
1451
+ "epoch": 16.9,
1452
+ "learning_rate": 3.4504504504504503e-06,
1453
+ "loss": 1.2694,
1454
+ "step": 2130
1455
+ },
1456
+ {
1457
+ "epoch": 16.98,
1458
+ "learning_rate": 3.3603603603603607e-06,
1459
+ "loss": 1.3349,
1460
+ "step": 2140
1461
+ },
1462
+ {
1463
+ "epoch": 17.06,
1464
+ "learning_rate": 3.2702702702702706e-06,
1465
+ "loss": 1.3553,
1466
+ "step": 2150
1467
+ },
1468
+ {
1469
+ "epoch": 17.14,
1470
+ "learning_rate": 3.1801801801801806e-06,
1471
+ "loss": 1.2844,
1472
+ "step": 2160
1473
+ },
1474
+ {
1475
+ "epoch": 17.22,
1476
+ "learning_rate": 3.0900900900900905e-06,
1477
+ "loss": 1.2815,
1478
+ "step": 2170
1479
+ },
1480
+ {
1481
+ "epoch": 17.3,
1482
+ "learning_rate": 3e-06,
1483
+ "loss": 1.2756,
1484
+ "step": 2180
1485
+ },
1486
+ {
1487
+ "epoch": 17.38,
1488
+ "learning_rate": 2.9099099099099105e-06,
1489
+ "loss": 1.33,
1490
+ "step": 2190
1491
+ },
1492
+ {
1493
+ "epoch": 17.46,
1494
+ "learning_rate": 2.81981981981982e-06,
1495
+ "loss": 1.31,
1496
+ "step": 2200
1497
+ },
1498
+ {
1499
+ "epoch": 17.46,
1500
+ "eval_loss": 1.4959148168563843,
1501
+ "eval_runtime": 13.2926,
1502
+ "eval_samples_per_second": 93.21,
1503
+ "eval_steps_per_second": 18.657,
1504
+ "step": 2200
1505
+ },
1506
+ {
1507
+ "epoch": 17.54,
1508
+ "learning_rate": 2.72972972972973e-06,
1509
+ "loss": 1.3306,
1510
+ "step": 2210
1511
+ },
1512
+ {
1513
+ "epoch": 17.62,
1514
+ "learning_rate": 2.63963963963964e-06,
1515
+ "loss": 1.331,
1516
+ "step": 2220
1517
+ },
1518
+ {
1519
+ "epoch": 17.69,
1520
+ "learning_rate": 2.54954954954955e-06,
1521
+ "loss": 1.3347,
1522
+ "step": 2230
1523
+ },
1524
+ {
1525
+ "epoch": 17.77,
1526
+ "learning_rate": 2.45945945945946e-06,
1527
+ "loss": 1.3291,
1528
+ "step": 2240
1529
+ },
1530
+ {
1531
+ "epoch": 17.85,
1532
+ "learning_rate": 2.3693693693693693e-06,
1533
+ "loss": 1.2774,
1534
+ "step": 2250
1535
+ },
1536
+ {
1537
+ "epoch": 17.93,
1538
+ "learning_rate": 2.2792792792792793e-06,
1539
+ "loss": 1.3102,
1540
+ "step": 2260
1541
+ },
1542
+ {
1543
+ "epoch": 18.02,
1544
+ "learning_rate": 2.1801801801801804e-06,
1545
+ "loss": 1.4393,
1546
+ "step": 2270
1547
+ },
1548
+ {
1549
+ "epoch": 18.09,
1550
+ "learning_rate": 2.0900900900900904e-06,
1551
+ "loss": 1.2779,
1552
+ "step": 2280
1553
+ },
1554
+ {
1555
+ "epoch": 18.17,
1556
+ "learning_rate": 2.0000000000000003e-06,
1557
+ "loss": 1.2708,
1558
+ "step": 2290
1559
+ },
1560
+ {
1561
+ "epoch": 18.25,
1562
+ "learning_rate": 1.90990990990991e-06,
1563
+ "loss": 1.3565,
1564
+ "step": 2300
1565
+ },
1566
+ {
1567
+ "epoch": 18.25,
1568
+ "eval_loss": 1.494253396987915,
1569
+ "eval_runtime": 13.2974,
1570
+ "eval_samples_per_second": 93.176,
1571
+ "eval_steps_per_second": 18.65,
1572
+ "step": 2300
1573
+ },
1574
+ {
1575
+ "epoch": 18.33,
1576
+ "learning_rate": 1.81981981981982e-06,
1577
+ "loss": 1.2744,
1578
+ "step": 2310
1579
+ },
1580
+ {
1581
+ "epoch": 18.41,
1582
+ "learning_rate": 1.72972972972973e-06,
1583
+ "loss": 1.2697,
1584
+ "step": 2320
1585
+ },
1586
+ {
1587
+ "epoch": 18.49,
1588
+ "learning_rate": 1.6396396396396397e-06,
1589
+ "loss": 1.2921,
1590
+ "step": 2330
1591
+ },
1592
+ {
1593
+ "epoch": 18.57,
1594
+ "learning_rate": 1.5495495495495497e-06,
1595
+ "loss": 1.3096,
1596
+ "step": 2340
1597
+ },
1598
+ {
1599
+ "epoch": 18.65,
1600
+ "learning_rate": 1.4594594594594596e-06,
1601
+ "loss": 1.2971,
1602
+ "step": 2350
1603
+ },
1604
+ {
1605
+ "epoch": 18.73,
1606
+ "learning_rate": 1.3693693693693694e-06,
1607
+ "loss": 1.3132,
1608
+ "step": 2360
1609
+ },
1610
+ {
1611
+ "epoch": 18.8,
1612
+ "learning_rate": 1.2792792792792793e-06,
1613
+ "loss": 1.3367,
1614
+ "step": 2370
1615
+ },
1616
+ {
1617
+ "epoch": 18.88,
1618
+ "learning_rate": 1.1891891891891893e-06,
1619
+ "loss": 1.2648,
1620
+ "step": 2380
1621
+ },
1622
+ {
1623
+ "epoch": 18.96,
1624
+ "learning_rate": 1.0990990990990993e-06,
1625
+ "loss": 1.3025,
1626
+ "step": 2390
1627
+ },
1628
+ {
1629
+ "epoch": 19.05,
1630
+ "learning_rate": 1.0000000000000002e-06,
1631
+ "loss": 1.3732,
1632
+ "step": 2400
1633
+ },
1634
+ {
1635
+ "epoch": 19.05,
1636
+ "eval_loss": 1.4953521490097046,
1637
+ "eval_runtime": 13.2927,
1638
+ "eval_samples_per_second": 93.209,
1639
+ "eval_steps_per_second": 18.657,
1640
+ "step": 2400
1641
+ },
1642
+ {
1643
+ "epoch": 19.13,
1644
+ "learning_rate": 9.0990990990991e-07,
1645
+ "loss": 1.2938,
1646
+ "step": 2410
1647
+ },
1648
+ {
1649
+ "epoch": 19.21,
1650
+ "learning_rate": 8.198198198198199e-07,
1651
+ "loss": 1.3374,
1652
+ "step": 2420
1653
+ },
1654
+ {
1655
+ "epoch": 19.28,
1656
+ "learning_rate": 7.297297297297298e-07,
1657
+ "loss": 1.297,
1658
+ "step": 2430
1659
+ },
1660
+ {
1661
+ "epoch": 19.36,
1662
+ "learning_rate": 6.396396396396397e-07,
1663
+ "loss": 1.2748,
1664
+ "step": 2440
1665
+ },
1666
+ {
1667
+ "epoch": 19.44,
1668
+ "learning_rate": 5.495495495495496e-07,
1669
+ "loss": 1.3259,
1670
+ "step": 2450
1671
+ },
1672
+ {
1673
+ "epoch": 19.52,
1674
+ "learning_rate": 4.5945945945945953e-07,
1675
+ "loss": 1.3099,
1676
+ "step": 2460
1677
+ },
1678
+ {
1679
+ "epoch": 19.6,
1680
+ "learning_rate": 3.693693693693694e-07,
1681
+ "loss": 1.3277,
1682
+ "step": 2470
1683
+ },
1684
+ {
1685
+ "epoch": 19.68,
1686
+ "learning_rate": 2.792792792792793e-07,
1687
+ "loss": 1.2708,
1688
+ "step": 2480
1689
+ },
1690
+ {
1691
+ "epoch": 19.76,
1692
+ "learning_rate": 1.8918918918918921e-07,
1693
+ "loss": 1.2546,
1694
+ "step": 2490
1695
+ },
1696
+ {
1697
+ "epoch": 19.84,
1698
+ "learning_rate": 9.90990990990991e-08,
1699
+ "loss": 1.2705,
1700
+ "step": 2500
1701
+ },
1702
+ {
1703
+ "epoch": 19.84,
1704
+ "eval_loss": 1.494584560394287,
1705
+ "eval_runtime": 13.2864,
1706
+ "eval_samples_per_second": 93.253,
1707
+ "eval_steps_per_second": 18.666,
1708
+ "step": 2500
1709
+ }
1710
+ ],
1711
+ "max_steps": 2520,
1712
+ "num_train_epochs": 20,
1713
+ "total_flos": 5.993865079244718e+17,
1714
+ "trial_name": null,
1715
+ "trial_params": null
1716
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61115c5cf7ceefe0d455cb65f5dca284bb6e55117a5791159dac2fe0548e9c12
3
+ size 4591