EvgeniyZh committed on
Commit
4602b89
1 Parent(s): 7d7e8cd

Model save

Browse files
README.md CHANGED
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 1.0922
22
 
23
  ## Model description
24
 
@@ -54,7 +54,7 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.3681 | 1.0 | 272 | 1.0922 |
58
 
59
 
60
  ### Framework versions
 
18
 
19
  This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 11.6801
22
 
23
  ## Model description
24
 
 
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 27787.675 | 1.0 | 272 | 11.6801 |
58
 
59
 
60
  ### Framework versions
adapter_config.json CHANGED
@@ -17,9 +17,9 @@
17
  "revision": null,
18
  "target_modules": [
19
  "q_proj",
 
20
  "v_proj",
21
- "o_proj",
22
- "k_proj"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
 
17
  "revision": null,
18
  "target_modules": [
19
  "q_proj",
20
+ "k_proj",
21
  "v_proj",
22
+ "o_proj"
 
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46933e34b2e227ff195e628e2c8b61bf212c9ef7309bda18a4978b0225e175c2
3
  size 109086672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5e96ad2271a638d3a39cdfed716fd42448e7931922a043f53eae00d68e04d6
3
  size 109086672
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.0922234058380127,
4
- "eval_runtime": 2385.5237,
5
  "eval_samples": 23110,
6
- "eval_samples_per_second": 6.469,
7
- "eval_steps_per_second": 0.101,
8
- "train_loss": 1.4444872824584736,
9
- "train_runtime": 72477.8005,
10
  "train_samples": 207865,
11
- "train_samples_per_second": 1.924,
12
  "train_steps_per_second": 0.004
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 11.680082321166992,
4
+ "eval_runtime": 2370.9651,
5
  "eval_samples": 23110,
6
+ "eval_samples_per_second": 6.508,
7
+ "eval_steps_per_second": 0.102,
8
+ "train_loss": 27788.47707232307,
9
+ "train_runtime": 72625.1448,
10
  "train_samples": 207865,
11
+ "train_samples_per_second": 1.92,
12
  "train_steps_per_second": 0.004
13
  }
config.json CHANGED
@@ -3,6 +3,7 @@
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
 
6
  "bos_token_id": 1,
7
  "eos_token_id": 2,
8
  "hidden_act": "silu",
@@ -19,7 +20,7 @@
19
  "sliding_window": 4096,
20
  "tie_word_embeddings": false,
21
  "torch_dtype": "bfloat16",
22
- "transformers_version": "4.35.0",
23
  "use_cache": true,
24
  "vocab_size": 32000
25
  }
 
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
6
+ "attention_dropout": 0.0,
7
  "bos_token_id": 1,
8
  "eos_token_id": 2,
9
  "hidden_act": "silu",
 
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.36.0",
24
  "use_cache": true,
25
  "vocab_size": 32000
26
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.0922234058380127,
4
- "eval_runtime": 2385.5237,
5
  "eval_samples": 23110,
6
- "eval_samples_per_second": 6.469,
7
- "eval_steps_per_second": 0.101
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 11.680082321166992,
4
+ "eval_runtime": 2370.9651,
5
  "eval_samples": 23110,
6
+ "eval_samples_per_second": 6.508,
7
+ "eval_steps_per_second": 0.102
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 1.4444872824584736,
4
- "train_runtime": 72477.8005,
5
  "train_samples": 207865,
6
- "train_samples_per_second": 1.924,
7
  "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 27788.47707232307,
4
+ "train_runtime": 72625.1448,
5
  "train_samples": 207865,
6
+ "train_samples_per_second": 1.92,
7
  "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -11,348 +11,348 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.9999332998034515e-05,
14
- "loss": 1.9596,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.02,
19
  "learning_rate": 1.99833293993636e-05,
20
- "loss": 1.9397,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.04,
25
  "learning_rate": 1.99333731792395e-05,
26
- "loss": 1.8704,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.06,
31
  "learning_rate": 1.985029789966671e-05,
32
- "loss": 1.7903,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.07,
37
  "learning_rate": 1.9734380543606932e-05,
38
- "loss": 1.6992,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.09,
43
  "learning_rate": 1.9586007593450098e-05,
44
- "loss": 1.6678,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.11,
49
  "learning_rate": 1.9405673742435677e-05,
50
- "loss": 1.6189,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.13,
55
  "learning_rate": 1.9193980245285967e-05,
56
- "loss": 1.6122,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.15,
61
  "learning_rate": 1.8951632913550625e-05,
62
- "loss": 1.5687,
63
  "step": 40
64
  },
65
  {
66
  "epoch": 0.17,
67
  "learning_rate": 1.8679439762346186e-05,
68
- "loss": 1.5472,
69
  "step": 45
70
  },
71
  {
72
  "epoch": 0.18,
73
  "learning_rate": 1.8378308316336585e-05,
74
- "loss": 1.5302,
75
  "step": 50
76
  },
77
  {
78
  "epoch": 0.2,
79
  "learning_rate": 1.8049242583936923e-05,
80
- "loss": 1.5045,
81
  "step": 55
82
  },
83
  {
84
  "epoch": 0.22,
85
  "learning_rate": 1.769333970982879e-05,
86
- "loss": 1.49,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.24,
91
  "learning_rate": 1.7311786316948112e-05,
92
- "loss": 1.4782,
93
  "step": 65
94
  },
95
  {
96
  "epoch": 0.26,
97
  "learning_rate": 1.6905854550141717e-05,
98
- "loss": 1.4596,
99
  "step": 70
100
  },
101
  {
102
  "epoch": 0.28,
103
  "learning_rate": 1.647689783468362e-05,
104
- "loss": 1.4509,
105
  "step": 75
106
  },
107
  {
108
  "epoch": 0.29,
109
  "learning_rate": 1.6026346363792565e-05,
110
- "loss": 1.4295,
111
  "step": 80
112
  },
113
  {
114
  "epoch": 0.31,
115
  "learning_rate": 1.5555702330196024e-05,
116
- "loss": 1.4301,
117
  "step": 85
118
  },
119
  {
120
  "epoch": 0.33,
121
  "learning_rate": 1.5066534917639195e-05,
122
- "loss": 1.4297,
123
  "step": 90
124
  },
125
  {
126
  "epoch": 0.35,
127
  "learning_rate": 1.4560475069037895e-05,
128
- "loss": 1.4161,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 0.37,
133
  "learning_rate": 1.403921004871895e-05,
134
- "loss": 1.4144,
135
  "step": 100
136
  },
137
  {
138
  "epoch": 0.39,
139
  "learning_rate": 1.350447781687826e-05,
140
- "loss": 1.4051,
141
  "step": 105
142
  },
143
  {
144
  "epoch": 0.4,
145
  "learning_rate": 1.2958061235012707e-05,
146
- "loss": 1.4061,
147
  "step": 110
148
  },
149
  {
150
  "epoch": 0.42,
151
  "learning_rate": 1.2401782121645767e-05,
152
- "loss": 1.3968,
153
  "step": 115
154
  },
155
  {
156
  "epoch": 0.44,
157
  "learning_rate": 1.1837495178165706e-05,
158
- "loss": 1.3946,
159
  "step": 120
160
  },
161
  {
162
  "epoch": 0.46,
163
  "learning_rate": 1.126708180502834e-05,
164
- "loss": 1.382,
165
  "step": 125
166
  },
167
  {
168
  "epoch": 0.48,
169
  "learning_rate": 1.0692443828941918e-05,
170
- "loss": 1.3844,
171
  "step": 130
172
  },
173
  {
174
  "epoch": 0.5,
175
  "learning_rate": 1.0115497161948409e-05,
176
- "loss": 1.3857,
177
  "step": 135
178
  },
179
  {
180
  "epoch": 0.51,
181
  "learning_rate": 9.538165413542607e-06,
182
- "loss": 1.3812,
183
  "step": 140
184
  },
185
  {
186
  "epoch": 0.53,
187
  "learning_rate": 8.962373477126983e-06,
188
- "loss": 1.3657,
189
  "step": 145
190
  },
191
  {
192
  "epoch": 0.55,
193
  "learning_rate": 8.39004111218587e-06,
194
- "loss": 1.3757,
195
  "step": 150
196
  },
197
  {
198
  "epoch": 0.57,
199
  "learning_rate": 7.823076543576718e-06,
200
- "loss": 1.3713,
201
  "step": 155
202
  },
203
  {
204
  "epoch": 0.59,
205
  "learning_rate": 7.263370099279173e-06,
206
- "loss": 1.3653,
207
  "step": 160
208
  },
209
  {
210
  "epoch": 0.61,
211
  "learning_rate": 6.712787907814542e-06,
212
- "loss": 1.3749,
213
  "step": 165
214
  },
215
  {
216
  "epoch": 0.62,
217
  "learning_rate": 6.173165676349103e-06,
218
- "loss": 1.3718,
219
  "step": 170
220
  },
221
  {
222
  "epoch": 0.64,
223
  "learning_rate": 5.646302570225919e-06,
224
- "loss": 1.3728,
225
  "step": 175
226
  },
227
  {
228
  "epoch": 0.66,
229
  "learning_rate": 5.133955214331439e-06,
230
- "loss": 1.3672,
231
  "step": 180
232
  },
233
  {
234
  "epoch": 0.68,
235
  "learning_rate": 4.637831836297103e-06,
236
- "loss": 1.366,
237
  "step": 185
238
  },
239
  {
240
  "epoch": 0.7,
241
  "learning_rate": 4.1595865710632366e-06,
242
- "loss": 1.3708,
243
  "step": 190
244
  },
245
  {
246
  "epoch": 0.72,
247
  "learning_rate": 3.700813945794425e-06,
248
- "loss": 1.37,
249
  "step": 195
250
  },
251
  {
252
  "epoch": 0.73,
253
  "learning_rate": 3.2630435635344283e-06,
254
- "loss": 1.3679,
255
  "step": 200
256
  },
257
  {
258
  "epoch": 0.75,
259
  "learning_rate": 2.847735003325868e-06,
260
- "loss": 1.3671,
261
  "step": 205
262
  },
263
  {
264
  "epoch": 0.77,
265
  "learning_rate": 2.456272953798361e-06,
266
- "loss": 1.3677,
267
  "step": 210
268
  },
269
  {
270
  "epoch": 0.79,
271
  "learning_rate": 2.0899625964503113e-06,
272
- "loss": 1.3675,
273
  "step": 215
274
  },
275
  {
276
  "epoch": 0.81,
277
  "learning_rate": 1.7500252540169782e-06,
278
- "loss": 1.3647,
279
  "step": 220
280
  },
281
  {
282
  "epoch": 0.83,
283
  "learning_rate": 1.4375943184337871e-06,
284
- "loss": 1.371,
285
  "step": 225
286
  },
287
  {
288
  "epoch": 0.84,
289
  "learning_rate": 1.1537114719714482e-06,
290
- "loss": 1.3622,
291
  "step": 230
292
  },
293
  {
294
  "epoch": 0.86,
295
  "learning_rate": 8.993232141421415e-07,
296
- "loss": 1.3596,
297
  "step": 235
298
  },
299
  {
300
  "epoch": 0.88,
301
  "learning_rate": 6.752777059564431e-07,
302
- "loss": 1.369,
303
  "step": 240
304
  },
305
  {
306
  "epoch": 0.9,
307
  "learning_rate": 4.823219420526182e-07,
308
- "loss": 1.3621,
309
  "step": 245
310
  },
311
  {
312
  "epoch": 0.92,
313
  "learning_rate": 3.2109926012677484e-07,
314
- "loss": 1.3649,
315
  "step": 250
316
  },
317
  {
318
  "epoch": 0.94,
319
  "learning_rate": 1.921471959676957e-07,
320
- "loss": 1.3646,
321
  "step": 255
322
  },
323
  {
324
  "epoch": 0.95,
325
  "learning_rate": 9.589569124794918e-08,
326
- "loss": 1.3552,
327
  "step": 260
328
  },
329
  {
330
  "epoch": 0.97,
331
  "learning_rate": 3.266566004670013e-08,
332
- "loss": 1.3669,
333
  "step": 265
334
  },
335
  {
336
  "epoch": 0.99,
337
  "learning_rate": 2.667918883627607e-09,
338
- "loss": 1.3681,
339
  "step": 270
340
  },
341
  {
342
  "epoch": 1.0,
343
- "eval_loss": 1.0922234058380127,
344
- "eval_runtime": 2386.6161,
345
- "eval_samples_per_second": 6.466,
346
- "eval_steps_per_second": 0.101,
347
  "step": 272
348
  },
349
  {
350
  "epoch": 1.0,
351
  "step": 272,
352
  "total_flos": 7631468079611904.0,
353
- "train_loss": 1.4444872824584736,
354
- "train_runtime": 72477.8005,
355
- "train_samples_per_second": 1.924,
356
  "train_steps_per_second": 0.004
357
  }
358
  ],
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 1.9999332998034515e-05,
14
+ "loss": 27792.8418,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.02,
19
  "learning_rate": 1.99833293993636e-05,
20
+ "loss": 27792.707,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.04,
25
  "learning_rate": 1.99333731792395e-05,
26
+ "loss": 27792.2687,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.06,
31
  "learning_rate": 1.985029789966671e-05,
32
+ "loss": 27791.6813,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.07,
37
  "learning_rate": 1.9734380543606932e-05,
38
+ "loss": 27790.8375,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.09,
43
  "learning_rate": 1.9586007593450098e-05,
44
+ "loss": 27790.5062,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.11,
49
  "learning_rate": 1.9405673742435677e-05,
50
+ "loss": 27790.025,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.13,
55
  "learning_rate": 1.9193980245285967e-05,
56
+ "loss": 27790.0,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.15,
61
  "learning_rate": 1.8951632913550625e-05,
62
+ "loss": 27789.6,
63
  "step": 40
64
  },
65
  {
66
  "epoch": 0.17,
67
  "learning_rate": 1.8679439762346186e-05,
68
+ "loss": 27789.4313,
69
  "step": 45
70
  },
71
  {
72
  "epoch": 0.18,
73
  "learning_rate": 1.8378308316336585e-05,
74
+ "loss": 27789.3187,
75
  "step": 50
76
  },
77
  {
78
  "epoch": 0.2,
79
  "learning_rate": 1.8049242583936923e-05,
80
+ "loss": 27789.1219,
81
  "step": 55
82
  },
83
  {
84
  "epoch": 0.22,
85
  "learning_rate": 1.769333970982879e-05,
86
+ "loss": 27789.0312,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.24,
91
  "learning_rate": 1.7311786316948112e-05,
92
+ "loss": 27788.9625,
93
  "step": 65
94
  },
95
  {
96
  "epoch": 0.26,
97
  "learning_rate": 1.6905854550141717e-05,
98
+ "loss": 27788.8125,
99
  "step": 70
100
  },
101
  {
102
  "epoch": 0.28,
103
  "learning_rate": 1.647689783468362e-05,
104
+ "loss": 27788.75,
105
  "step": 75
106
  },
107
  {
108
  "epoch": 0.29,
109
  "learning_rate": 1.6026346363792565e-05,
110
+ "loss": 27788.5375,
111
  "step": 80
112
  },
113
  {
114
  "epoch": 0.31,
115
  "learning_rate": 1.5555702330196024e-05,
116
+ "loss": 27788.5438,
117
  "step": 85
118
  },
119
  {
120
  "epoch": 0.33,
121
  "learning_rate": 1.5066534917639195e-05,
122
+ "loss": 27788.5469,
123
  "step": 90
124
  },
125
  {
126
  "epoch": 0.35,
127
  "learning_rate": 1.4560475069037895e-05,
128
+ "loss": 27788.3937,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 0.37,
133
  "learning_rate": 1.403921004871895e-05,
134
+ "loss": 27788.3688,
135
  "step": 100
136
  },
137
  {
138
  "epoch": 0.39,
139
  "learning_rate": 1.350447781687826e-05,
140
+ "loss": 27788.2625,
141
  "step": 105
142
  },
143
  {
144
  "epoch": 0.4,
145
  "learning_rate": 1.2958061235012707e-05,
146
+ "loss": 27788.2594,
147
  "step": 110
148
  },
149
  {
150
  "epoch": 0.42,
151
  "learning_rate": 1.2401782121645767e-05,
152
+ "loss": 27788.1562,
153
  "step": 115
154
  },
155
  {
156
  "epoch": 0.44,
157
  "learning_rate": 1.1837495178165706e-05,
158
+ "loss": 27788.1188,
159
  "step": 120
160
  },
161
  {
162
  "epoch": 0.46,
163
  "learning_rate": 1.126708180502834e-05,
164
+ "loss": 27787.9625,
165
  "step": 125
166
  },
167
  {
168
  "epoch": 0.48,
169
  "learning_rate": 1.0692443828941918e-05,
170
+ "loss": 27787.9813,
171
  "step": 130
172
  },
173
  {
174
  "epoch": 0.5,
175
  "learning_rate": 1.0115497161948409e-05,
176
+ "loss": 27787.9781,
177
  "step": 135
178
  },
179
  {
180
  "epoch": 0.51,
181
  "learning_rate": 9.538165413542607e-06,
182
+ "loss": 27787.9188,
183
  "step": 140
184
  },
185
  {
186
  "epoch": 0.53,
187
  "learning_rate": 8.962373477126983e-06,
188
+ "loss": 27787.7438,
189
  "step": 145
190
  },
191
  {
192
  "epoch": 0.55,
193
  "learning_rate": 8.39004111218587e-06,
194
+ "loss": 27787.8438,
195
  "step": 150
196
  },
197
  {
198
  "epoch": 0.57,
199
  "learning_rate": 7.823076543576718e-06,
200
+ "loss": 27787.7812,
201
  "step": 155
202
  },
203
  {
204
  "epoch": 0.59,
205
  "learning_rate": 7.263370099279173e-06,
206
+ "loss": 27787.7031,
207
  "step": 160
208
  },
209
  {
210
  "epoch": 0.61,
211
  "learning_rate": 6.712787907814542e-06,
212
+ "loss": 27787.7812,
213
  "step": 165
214
  },
215
  {
216
  "epoch": 0.62,
217
  "learning_rate": 6.173165676349103e-06,
218
+ "loss": 27787.7594,
219
  "step": 170
220
  },
221
  {
222
  "epoch": 0.64,
223
  "learning_rate": 5.646302570225919e-06,
224
+ "loss": 27787.7594,
225
  "step": 175
226
  },
227
  {
228
  "epoch": 0.66,
229
  "learning_rate": 5.133955214331439e-06,
230
+ "loss": 27787.7062,
231
  "step": 180
232
  },
233
  {
234
  "epoch": 0.68,
235
  "learning_rate": 4.637831836297103e-06,
236
+ "loss": 27787.6844,
237
  "step": 185
238
  },
239
  {
240
  "epoch": 0.7,
241
  "learning_rate": 4.1595865710632366e-06,
242
+ "loss": 27787.725,
243
  "step": 190
244
  },
245
  {
246
  "epoch": 0.72,
247
  "learning_rate": 3.700813945794425e-06,
248
+ "loss": 27787.7125,
249
  "step": 195
250
  },
251
  {
252
  "epoch": 0.73,
253
  "learning_rate": 3.2630435635344283e-06,
254
+ "loss": 27787.7,
255
  "step": 200
256
  },
257
  {
258
  "epoch": 0.75,
259
  "learning_rate": 2.847735003325868e-06,
260
+ "loss": 27787.6813,
261
  "step": 205
262
  },
263
  {
264
  "epoch": 0.77,
265
  "learning_rate": 2.456272953798361e-06,
266
+ "loss": 27787.6875,
267
  "step": 210
268
  },
269
  {
270
  "epoch": 0.79,
271
  "learning_rate": 2.0899625964503113e-06,
272
+ "loss": 27787.675,
273
  "step": 215
274
  },
275
  {
276
  "epoch": 0.81,
277
  "learning_rate": 1.7500252540169782e-06,
278
+ "loss": 27787.6437,
279
  "step": 220
280
  },
281
  {
282
  "epoch": 0.83,
283
  "learning_rate": 1.4375943184337871e-06,
284
+ "loss": 27787.7188,
285
  "step": 225
286
  },
287
  {
288
  "epoch": 0.84,
289
  "learning_rate": 1.1537114719714482e-06,
290
+ "loss": 27787.6188,
291
  "step": 230
292
  },
293
  {
294
  "epoch": 0.86,
295
  "learning_rate": 8.993232141421415e-07,
296
+ "loss": 27787.5938,
297
  "step": 235
298
  },
299
  {
300
  "epoch": 0.88,
301
  "learning_rate": 6.752777059564431e-07,
302
+ "loss": 27787.6937,
303
  "step": 240
304
  },
305
  {
306
  "epoch": 0.9,
307
  "learning_rate": 4.823219420526182e-07,
308
+ "loss": 27787.6219,
309
  "step": 245
310
  },
311
  {
312
  "epoch": 0.92,
313
  "learning_rate": 3.2109926012677484e-07,
314
+ "loss": 27787.6437,
315
  "step": 250
316
  },
317
  {
318
  "epoch": 0.94,
319
  "learning_rate": 1.921471959676957e-07,
320
+ "loss": 27787.6469,
321
  "step": 255
322
  },
323
  {
324
  "epoch": 0.95,
325
  "learning_rate": 9.589569124794918e-08,
326
+ "loss": 27787.55,
327
  "step": 260
328
  },
329
  {
330
  "epoch": 0.97,
331
  "learning_rate": 3.266566004670013e-08,
332
+ "loss": 27787.6688,
333
  "step": 265
334
  },
335
  {
336
  "epoch": 0.99,
337
  "learning_rate": 2.667918883627607e-09,
338
+ "loss": 27787.675,
339
  "step": 270
340
  },
341
  {
342
  "epoch": 1.0,
343
+ "eval_loss": 11.680082321166992,
344
+ "eval_runtime": 2372.6436,
345
+ "eval_samples_per_second": 6.504,
346
+ "eval_steps_per_second": 0.102,
347
  "step": 272
348
  },
349
  {
350
  "epoch": 1.0,
351
  "step": 272,
352
  "total_flos": 7631468079611904.0,
353
+ "train_loss": 27788.47707232307,
354
+ "train_runtime": 72625.1448,
355
+ "train_samples_per_second": 1.92,
356
  "train_steps_per_second": 0.004
357
  }
358
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c87469285d4d700c58756e9373aa90df9e87013e22d4b14969e26b47131d2b4
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:994b25acdd78712b39ad725d07d1f583224f5b77b9fb320ef04258ede959fd42
3
  size 5752