pritamdeka committed on
Commit
f62d69e
1 Parent(s): c5d34a6

Updated Commit

Browse files
README.md CHANGED
@@ -5,18 +5,18 @@ tags:
5
  datasets:
6
  - pritamdeka/cord-19-abstract
7
  model-index:
8
- - name: PubMedBert-abstract-cord19
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # PubMedBert-abstract-cord19
16
 
17
- This model is a fine-tuned version of [microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext](https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext) on the [pritamdeka/cord-19-abstract](https://huggingface.co/datasets/pritamdeka/cord-19-abstract) dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 1.3279
20
 
21
  ## Model description
22
 
@@ -42,16 +42,38 @@ The following hyperparameters were used during training:
42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
  - lr_scheduler_type: linear
44
  - lr_scheduler_warmup_steps: 10000
45
- - num_epochs: 1.0
46
  - mixed_precision_training: Native AMP
47
 
48
  ### Training results
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  ### Framework versions
53
 
54
  - Transformers 4.17.0.dev0
55
  - Pytorch 1.10.0+cu111
56
- - Datasets 1.18.2
57
  - Tokenizers 0.11.0
 
5
  datasets:
6
  - pritamdeka/cord-19-abstract
7
  model-index:
8
+ - name: pubmedbert-abstract-cord19
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
+ # pubmedbert-abstract-cord19
16
 
17
+ This model is a fine-tuned version of [microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext](https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext) on the pritamdeka/cord-19-abstract dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 1.3005
20
 
21
  ## Model description
22
 
 
42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
43
  - lr_scheduler_type: linear
44
  - lr_scheduler_warmup_steps: 10000
45
+ - num_epochs: 3.0
46
  - mixed_precision_training: Native AMP
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss |
51
+ |:-------------:|:-----:|:------:|:---------------:|
52
+ | 1.3774 | 0.15 | 5000 | 1.3212 |
53
+ | 1.3937 | 0.29 | 10000 | 1.4059 |
54
+ | 1.6812 | 0.44 | 15000 | 1.6174 |
55
+ | 1.4712 | 0.59 | 20000 | 1.4383 |
56
+ | 1.4293 | 0.73 | 25000 | 1.4356 |
57
+ | 1.4155 | 0.88 | 30000 | 1.4283 |
58
+ | 1.3963 | 1.03 | 35000 | 1.4135 |
59
+ | 1.3718 | 1.18 | 40000 | 1.3948 |
60
+ | 1.369 | 1.32 | 45000 | 1.3961 |
61
+ | 1.354 | 1.47 | 50000 | 1.3788 |
62
+ | 1.3399 | 1.62 | 55000 | 1.3866 |
63
+ | 1.3289 | 1.76 | 60000 | 1.3630 |
64
+ | 1.3155 | 1.91 | 65000 | 1.3609 |
65
+ | 1.2976 | 2.06 | 70000 | 1.3489 |
66
+ | 1.2783 | 2.2 | 75000 | 1.3333 |
67
+ | 1.2696 | 2.35 | 80000 | 1.3260 |
68
+ | 1.2607 | 2.5 | 85000 | 1.3232 |
69
+ | 1.2547 | 2.64 | 90000 | 1.3034 |
70
+ | 1.2495 | 2.79 | 95000 | 1.3035 |
71
+ | 1.2404 | 2.94 | 100000 | 1.3029 |
72
 
73
 
74
  ### Framework versions
75
 
76
  - Transformers 4.17.0.dev0
77
  - Pytorch 1.10.0+cu111
78
+ - Datasets 1.18.3
79
  - Tokenizers 0.11.0
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 1.3278663158416748,
4
- "eval_runtime": 170.2878,
5
  "eval_samples": 14605,
6
- "eval_samples_per_second": 85.767,
7
- "eval_steps_per_second": 10.723,
8
- "perplexity": 3.772984435877001,
9
- "train_loss": 1.4148471605151356,
10
- "train_runtime": 9789.6511,
11
  "train_samples": 272217,
12
- "train_samples_per_second": 27.807,
13
- "train_steps_per_second": 3.476
14
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_loss": 1.3005212545394897,
4
+ "eval_runtime": 168.9513,
5
  "eval_samples": 14605,
6
+ "eval_samples_per_second": 86.445,
7
+ "eval_steps_per_second": 10.808,
8
+ "perplexity": 3.6712098037361005,
9
+ "train_loss": 1.35220570928797,
10
+ "train_runtime": 32542.0864,
11
  "train_samples": 272217,
12
+ "train_samples_per_second": 25.095,
13
+ "train_steps_per_second": 3.137
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 1.3278663158416748,
4
- "eval_runtime": 170.2878,
5
  "eval_samples": 14605,
6
- "eval_samples_per_second": 85.767,
7
- "eval_steps_per_second": 10.723,
8
- "perplexity": 3.772984435877001
9
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_loss": 1.3005212545394897,
4
+ "eval_runtime": 168.9513,
5
  "eval_samples": 14605,
6
+ "eval_samples_per_second": 86.445,
7
+ "eval_steps_per_second": 10.808,
8
+ "perplexity": 3.6712098037361005
9
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8de0f4ad2b8d82d79a0e3e3c2e024d030152204c35a51c29e79016a48349585b
3
  size 438141995
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce02810425330e4b0a62825f9aa6bb5dfd3394c5a5ae82a04b7169a7bdb17e60
3
  size 438141995
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 1.4148471605151356,
4
- "train_runtime": 9789.6511,
5
  "train_samples": 272217,
6
- "train_samples_per_second": 27.807,
7
- "train_steps_per_second": 3.476
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "train_loss": 1.35220570928797,
4
+ "train_runtime": 32542.0864,
5
  "train_samples": 272217,
6
+ "train_samples_per_second": 25.095,
7
+ "train_steps_per_second": 3.137
8
  }
trainer_state.json CHANGED
@@ -1,433 +1,305 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "global_step": 34028,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
- {
11
- "epoch": 0.01,
12
- "learning_rate": 2.46e-06,
13
- "loss": 1.4943,
14
- "step": 500
15
- },
16
- {
17
- "epoch": 0.03,
18
- "learning_rate": 4.96e-06,
19
- "loss": 1.4172,
20
- "step": 1000
21
- },
22
- {
23
- "epoch": 0.04,
24
- "learning_rate": 7.4600000000000006e-06,
25
- "loss": 1.3694,
26
- "step": 1500
27
- },
28
- {
29
- "epoch": 0.06,
30
- "learning_rate": 9.96e-06,
31
- "loss": 1.3653,
32
- "step": 2000
33
- },
34
- {
35
- "epoch": 0.07,
36
- "learning_rate": 1.2460000000000001e-05,
37
- "loss": 1.3572,
38
- "step": 2500
39
- },
40
- {
41
- "epoch": 0.09,
42
- "learning_rate": 1.4960000000000002e-05,
43
- "loss": 1.3499,
44
- "step": 3000
45
- },
46
- {
47
- "epoch": 0.1,
48
- "learning_rate": 1.7460000000000002e-05,
49
- "loss": 1.347,
50
- "step": 3500
51
- },
52
- {
53
- "epoch": 0.12,
54
- "learning_rate": 1.9960000000000002e-05,
55
- "loss": 1.3345,
56
- "step": 4000
57
- },
58
- {
59
- "epoch": 0.13,
60
- "learning_rate": 2.2460000000000002e-05,
61
- "loss": 1.3558,
62
- "step": 4500
63
- },
64
  {
65
  "epoch": 0.15,
66
  "learning_rate": 2.496e-05,
67
- "loss": 1.3837,
68
  "step": 5000
69
  },
70
  {
71
- "epoch": 0.16,
72
- "learning_rate": 2.746e-05,
73
- "loss": 1.3784,
74
- "step": 5500
75
- },
76
- {
77
- "epoch": 0.18,
78
- "learning_rate": 2.9959999999999998e-05,
79
- "loss": 1.3708,
80
- "step": 6000
81
- },
82
- {
83
- "epoch": 0.19,
84
- "learning_rate": 3.2460000000000004e-05,
85
- "loss": 1.3762,
86
- "step": 6500
87
- },
88
- {
89
- "epoch": 0.21,
90
- "learning_rate": 3.4960000000000004e-05,
91
- "loss": 1.3771,
92
- "step": 7000
93
- },
94
- {
95
- "epoch": 0.22,
96
- "learning_rate": 3.7460000000000004e-05,
97
- "loss": 1.391,
98
- "step": 7500
99
- },
100
- {
101
- "epoch": 0.24,
102
- "learning_rate": 3.9960000000000004e-05,
103
- "loss": 1.3875,
104
- "step": 8000
105
- },
106
- {
107
- "epoch": 0.25,
108
- "learning_rate": 4.246e-05,
109
- "loss": 1.3885,
110
- "step": 8500
111
- },
112
- {
113
- "epoch": 0.26,
114
- "learning_rate": 4.496e-05,
115
- "loss": 1.4058,
116
- "step": 9000
117
- },
118
- {
119
- "epoch": 0.28,
120
- "learning_rate": 4.746e-05,
121
- "loss": 1.4011,
122
- "step": 9500
123
  },
124
  {
125
  "epoch": 0.29,
126
- "learning_rate": 4.996e-05,
127
- "loss": 1.4199,
128
  "step": 10000
129
  },
130
  {
131
- "epoch": 0.31,
132
- "learning_rate": 4.897619443982021e-05,
133
- "loss": 1.427,
134
- "step": 10500
135
- },
136
- {
137
- "epoch": 0.32,
138
- "learning_rate": 4.793782254036957e-05,
139
- "loss": 1.4302,
140
- "step": 11000
141
- },
142
- {
143
- "epoch": 0.34,
144
- "learning_rate": 4.689736973530881e-05,
145
- "loss": 1.4189,
146
- "step": 11500
147
- },
148
- {
149
- "epoch": 0.35,
150
- "learning_rate": 4.585899783585817e-05,
151
- "loss": 1.4264,
152
- "step": 12000
153
- },
154
- {
155
- "epoch": 0.37,
156
- "learning_rate": 4.482062593640753e-05,
157
- "loss": 1.4302,
158
- "step": 12500
159
- },
160
- {
161
- "epoch": 0.38,
162
- "learning_rate": 4.378017313134676e-05,
163
- "loss": 1.4285,
164
- "step": 13000
165
- },
166
- {
167
- "epoch": 0.4,
168
- "learning_rate": 4.2739720326286004e-05,
169
- "loss": 1.4212,
170
- "step": 13500
171
- },
172
- {
173
- "epoch": 0.41,
174
- "learning_rate": 4.169926752122524e-05,
175
- "loss": 1.4084,
176
- "step": 14000
177
- },
178
- {
179
- "epoch": 0.43,
180
- "learning_rate": 4.0662976527384723e-05,
181
- "loss": 1.4326,
182
- "step": 14500
183
  },
184
  {
185
  "epoch": 0.44,
186
- "learning_rate": 3.962252372232396e-05,
187
- "loss": 1.4097,
188
  "step": 15000
189
  },
190
  {
191
- "epoch": 0.46,
192
- "learning_rate": 3.8582070917263196e-05,
193
- "loss": 1.4176,
194
- "step": 15500
195
- },
196
- {
197
- "epoch": 0.47,
198
- "learning_rate": 3.754161811220243e-05,
199
- "loss": 1.5707,
200
- "step": 16000
201
- },
202
- {
203
- "epoch": 0.48,
204
- "learning_rate": 3.650324621275179e-05,
205
- "loss": 2.1501,
206
- "step": 16500
207
- },
208
- {
209
- "epoch": 0.5,
210
- "learning_rate": 3.546279340769103e-05,
211
- "loss": 1.8142,
212
- "step": 17000
213
  },
214
  {
215
- "epoch": 0.51,
216
- "learning_rate": 3.4422340602630264e-05,
217
- "loss": 1.744,
218
- "step": 17500
219
  },
220
  {
221
- "epoch": 0.53,
222
- "learning_rate": 3.338188779756951e-05,
223
- "loss": 1.6689,
224
- "step": 18000
 
 
225
  },
226
  {
227
- "epoch": 0.54,
228
- "learning_rate": 3.234143499250874e-05,
229
- "loss": 1.5661,
230
- "step": 18500
231
  },
232
  {
233
- "epoch": 0.56,
234
- "learning_rate": 3.130098218744798e-05,
235
- "loss": 1.4991,
236
- "step": 19000
 
 
237
  },
238
  {
239
- "epoch": 0.57,
240
- "learning_rate": 3.0260529382387215e-05,
241
- "loss": 1.4968,
242
- "step": 19500
243
  },
244
  {
245
- "epoch": 0.59,
246
- "learning_rate": 2.9220076577326455e-05,
247
- "loss": 1.4586,
248
- "step": 20000
 
 
249
  },
250
  {
251
- "epoch": 0.6,
252
- "learning_rate": 2.817962377226569e-05,
253
- "loss": 1.4327,
254
- "step": 20500
255
  },
256
  {
257
- "epoch": 0.62,
258
- "learning_rate": 2.713917096720493e-05,
259
- "loss": 1.4172,
260
- "step": 21000
 
 
261
  },
262
  {
263
- "epoch": 0.63,
264
- "learning_rate": 2.6098718162144163e-05,
265
- "loss": 1.4046,
266
- "step": 21500
267
  },
268
  {
269
- "epoch": 0.65,
270
- "learning_rate": 2.5058265357083406e-05,
271
- "loss": 1.3867,
272
- "step": 22000
 
 
273
  },
274
  {
275
- "epoch": 0.66,
276
- "learning_rate": 2.4017812552022643e-05,
277
- "loss": 1.3841,
278
- "step": 22500
279
  },
280
  {
281
- "epoch": 0.68,
282
- "learning_rate": 2.297735974696188e-05,
283
- "loss": 1.4225,
284
- "step": 23000
 
 
285
  },
286
  {
287
- "epoch": 0.69,
288
- "learning_rate": 2.1936906941901118e-05,
289
- "loss": 1.3852,
290
- "step": 23500
291
  },
292
  {
293
- "epoch": 0.71,
294
- "learning_rate": 2.0896454136840354e-05,
295
- "loss": 1.3934,
296
- "step": 24000
 
 
297
  },
298
  {
299
- "epoch": 0.72,
300
- "learning_rate": 1.985600133177959e-05,
301
- "loss": 1.3885,
302
- "step": 24500
303
  },
304
  {
305
- "epoch": 0.73,
306
- "learning_rate": 1.881554852671883e-05,
307
- "loss": 1.3587,
308
- "step": 25000
 
 
309
  },
310
  {
311
- "epoch": 0.75,
312
- "learning_rate": 1.7775095721658066e-05,
313
- "loss": 1.3546,
314
- "step": 25500
315
  },
316
  {
317
- "epoch": 0.76,
318
- "learning_rate": 1.6734642916597302e-05,
319
- "loss": 1.3435,
320
- "step": 26000
 
 
321
  },
322
  {
323
- "epoch": 0.78,
324
- "learning_rate": 1.5694190111536542e-05,
325
- "loss": 1.3399,
326
- "step": 26500
327
  },
328
  {
329
- "epoch": 0.79,
330
- "learning_rate": 1.4657899117696022e-05,
331
- "loss": 1.3495,
332
- "step": 27000
 
 
333
  },
334
  {
335
- "epoch": 0.81,
336
- "learning_rate": 1.361744631263526e-05,
337
- "loss": 1.3294,
338
- "step": 27500
339
  },
340
  {
341
- "epoch": 0.82,
342
- "learning_rate": 1.2576993507574498e-05,
343
- "loss": 1.3455,
344
- "step": 28000
 
 
345
  },
346
  {
347
- "epoch": 0.84,
348
- "learning_rate": 1.1536540702513734e-05,
349
- "loss": 1.3348,
350
- "step": 28500
351
  },
352
  {
353
- "epoch": 0.85,
354
- "learning_rate": 1.0496087897452972e-05,
355
- "loss": 1.3474,
356
- "step": 29000
 
 
357
  },
358
  {
359
- "epoch": 0.87,
360
- "learning_rate": 9.45563509239221e-06,
361
- "loss": 1.3358,
362
- "step": 29500
363
  },
364
  {
365
- "epoch": 0.88,
366
- "learning_rate": 8.415182287331448e-06,
367
- "loss": 1.3323,
368
- "step": 30000
 
 
369
  },
370
  {
371
- "epoch": 0.9,
372
- "learning_rate": 7.3747294822706845e-06,
373
- "loss": 1.3319,
374
- "step": 30500
375
  },
376
  {
377
- "epoch": 0.91,
378
- "learning_rate": 6.336357582820044e-06,
379
- "loss": 1.3203,
380
- "step": 31000
 
 
381
  },
382
  {
383
- "epoch": 0.93,
384
- "learning_rate": 5.297985683369403e-06,
385
- "loss": 1.3158,
386
- "step": 31500
387
  },
388
  {
389
- "epoch": 0.94,
390
- "learning_rate": 4.259613783918762e-06,
391
- "loss": 1.3217,
392
- "step": 32000
 
 
393
  },
394
  {
395
- "epoch": 0.96,
396
- "learning_rate": 3.219160978857999e-06,
397
- "loss": 1.3124,
398
- "step": 32500
399
  },
400
  {
401
- "epoch": 0.97,
402
- "learning_rate": 2.1787081737972366e-06,
403
- "loss": 1.3074,
404
- "step": 33000
 
 
405
  },
406
  {
407
- "epoch": 0.98,
408
- "learning_rate": 1.1382553687364742e-06,
409
- "loss": 1.3234,
410
- "step": 33500
411
  },
412
  {
413
- "epoch": 1.0,
414
- "learning_rate": 9.780256367571167e-08,
415
- "loss": 1.3078,
416
- "step": 34000
 
 
417
  },
418
  {
419
- "epoch": 1.0,
420
- "step": 34028,
421
- "total_flos": 3.4984777600575e+16,
422
- "train_loss": 1.4148471605151356,
423
- "train_runtime": 9789.6511,
424
- "train_samples_per_second": 27.807,
425
- "train_steps_per_second": 3.476
426
  }
427
  ],
428
- "max_steps": 34028,
429
- "num_train_epochs": 1,
430
- "total_flos": 3.4984777600575e+16,
431
  "trial_name": null,
432
  "trial_params": null
433
  }
 
1
  {
2
+ "best_metric": 1.3029065132141113,
3
+ "best_model_checkpoint": "/content/pubmedbert-abstract-cord19/checkpoint-100000",
4
+ "epoch": 3.0,
5
+ "global_step": 102084,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  {
11
  "epoch": 0.15,
12
  "learning_rate": 2.496e-05,
13
+ "loss": 1.3774,
14
  "step": 5000
15
  },
16
  {
17
+ "epoch": 0.15,
18
+ "eval_loss": 1.3211560249328613,
19
+ "eval_runtime": 169.3169,
20
+ "eval_samples_per_second": 86.258,
21
+ "eval_steps_per_second": 10.785,
22
+ "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  },
24
  {
25
  "epoch": 0.29,
26
+ "learning_rate": 4.9945000000000004e-05,
27
+ "loss": 1.3937,
28
  "step": 10000
29
  },
30
  {
31
+ "epoch": 0.29,
32
+ "eval_loss": 1.4059454202651978,
33
+ "eval_runtime": 169.3069,
34
+ "eval_samples_per_second": 86.263,
35
+ "eval_steps_per_second": 10.785,
36
+ "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  },
38
  {
39
  "epoch": 0.44,
40
+ "learning_rate": 4.7291603318709e-05,
41
+ "loss": 1.6812,
42
  "step": 15000
43
  },
44
  {
45
+ "epoch": 0.44,
46
+ "eval_loss": 1.6174367666244507,
47
+ "eval_runtime": 168.9751,
48
+ "eval_samples_per_second": 86.433,
49
+ "eval_steps_per_second": 10.806,
50
+ "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  },
52
  {
53
+ "epoch": 0.59,
54
+ "learning_rate": 4.457723382998132e-05,
55
+ "loss": 1.4712,
56
+ "step": 20000
57
  },
58
  {
59
+ "epoch": 0.59,
60
+ "eval_loss": 1.4382678270339966,
61
+ "eval_runtime": 168.9508,
62
+ "eval_samples_per_second": 86.445,
63
+ "eval_steps_per_second": 10.808,
64
+ "step": 20000
65
  },
66
  {
67
+ "epoch": 0.73,
68
+ "learning_rate": 4.186340732374788e-05,
69
+ "loss": 1.4293,
70
+ "step": 25000
71
  },
72
  {
73
+ "epoch": 0.73,
74
+ "eval_loss": 1.4356446266174316,
75
+ "eval_runtime": 168.8828,
76
+ "eval_samples_per_second": 86.48,
77
+ "eval_steps_per_second": 10.812,
78
+ "step": 25000
79
  },
80
  {
81
+ "epoch": 0.88,
82
+ "learning_rate": 3.914849485252596e-05,
83
+ "loss": 1.4155,
84
+ "step": 30000
85
  },
86
  {
87
+ "epoch": 0.88,
88
+ "eval_loss": 1.4283361434936523,
89
+ "eval_runtime": 169.0831,
90
+ "eval_samples_per_second": 86.378,
91
+ "eval_steps_per_second": 10.799,
92
+ "step": 30000
93
  },
94
  {
95
+ "epoch": 1.03,
96
+ "learning_rate": 3.6435754311281005e-05,
97
+ "loss": 1.3963,
98
+ "step": 35000
99
  },
100
  {
101
+ "epoch": 1.03,
102
+ "eval_loss": 1.4134665727615356,
103
+ "eval_runtime": 169.088,
104
+ "eval_samples_per_second": 86.375,
105
+ "eval_steps_per_second": 10.799,
106
+ "step": 35000
107
  },
108
  {
109
+ "epoch": 1.18,
110
+ "learning_rate": 3.372138482255332e-05,
111
+ "loss": 1.3718,
112
+ "step": 40000
113
  },
114
  {
115
+ "epoch": 1.18,
116
+ "eval_loss": 1.3948187828063965,
117
+ "eval_runtime": 169.0271,
118
+ "eval_samples_per_second": 86.406,
119
+ "eval_steps_per_second": 10.803,
120
+ "step": 40000
121
  },
122
  {
123
+ "epoch": 1.32,
124
+ "learning_rate": 3.1006472351331396e-05,
125
+ "loss": 1.369,
126
+ "step": 45000
127
  },
128
  {
129
+ "epoch": 1.32,
130
+ "eval_loss": 1.3961154222488403,
131
+ "eval_runtime": 169.0308,
132
+ "eval_samples_per_second": 86.404,
133
+ "eval_steps_per_second": 10.803,
134
+ "step": 45000
135
  },
136
  {
137
+ "epoch": 1.47,
138
+ "learning_rate": 2.8292645845097955e-05,
139
+ "loss": 1.354,
140
+ "step": 50000
141
  },
142
  {
143
+ "epoch": 1.47,
144
+ "eval_loss": 1.378829836845398,
145
+ "eval_runtime": 169.1643,
146
+ "eval_samples_per_second": 86.336,
147
+ "eval_steps_per_second": 10.794,
148
+ "step": 50000
149
  },
150
  {
151
+ "epoch": 1.62,
152
+ "learning_rate": 2.5579362321358764e-05,
153
+ "loss": 1.3399,
154
+ "step": 55000
155
  },
156
  {
157
+ "epoch": 1.62,
158
+ "eval_loss": 1.3865987062454224,
159
+ "eval_runtime": 169.0688,
160
+ "eval_samples_per_second": 86.385,
161
+ "eval_steps_per_second": 10.8,
162
+ "step": 55000
163
  },
164
  {
165
+ "epoch": 1.76,
166
+ "learning_rate": 2.2866078797619567e-05,
167
+ "loss": 1.3289,
168
+ "step": 60000
169
  },
170
  {
171
+ "epoch": 1.76,
172
+ "eval_loss": 1.362976312637329,
173
+ "eval_runtime": 169.1136,
174
+ "eval_samples_per_second": 86.362,
175
+ "eval_steps_per_second": 10.797,
176
+ "step": 60000
177
  },
178
  {
179
+ "epoch": 1.91,
180
+ "learning_rate": 2.0151709308891882e-05,
181
+ "loss": 1.3155,
182
+ "step": 65000
183
  },
184
  {
185
+ "epoch": 1.91,
186
+ "eval_loss": 1.3609150648117065,
187
+ "eval_runtime": 169.0556,
188
+ "eval_samples_per_second": 86.392,
189
+ "eval_steps_per_second": 10.801,
190
+ "step": 65000
191
  },
192
  {
193
+ "epoch": 2.06,
194
+ "learning_rate": 1.743788280265844e-05,
195
+ "loss": 1.2976,
196
+ "step": 70000
197
  },
198
  {
199
+ "epoch": 2.06,
200
+ "eval_loss": 1.3489034175872803,
201
+ "eval_runtime": 169.1114,
202
+ "eval_samples_per_second": 86.363,
203
+ "eval_steps_per_second": 10.798,
204
+ "step": 70000
205
  },
206
  {
207
+ "epoch": 2.2,
208
+ "learning_rate": 1.4724599278919249e-05,
209
+ "loss": 1.2783,
210
+ "step": 75000
211
  },
212
  {
213
+ "epoch": 2.2,
214
+ "eval_loss": 1.3333380222320557,
215
+ "eval_runtime": 169.0576,
216
+ "eval_samples_per_second": 86.391,
217
+ "eval_steps_per_second": 10.801,
218
+ "step": 75000
219
  },
220
  {
221
+ "epoch": 2.35,
222
+ "learning_rate": 1.2009686807697321e-05,
223
+ "loss": 1.2696,
224
+ "step": 80000
225
  },
226
  {
227
+ "epoch": 2.35,
228
+ "eval_loss": 1.3259785175323486,
229
+ "eval_runtime": 168.9798,
230
+ "eval_samples_per_second": 86.43,
231
+ "eval_steps_per_second": 10.806,
232
+ "step": 80000
233
  },
234
  {
235
+ "epoch": 2.5,
236
+ "learning_rate": 9.29586030146388e-06,
237
+ "loss": 1.2607,
238
+ "step": 85000
239
  },
240
  {
241
+ "epoch": 2.5,
242
+ "eval_loss": 1.3232313394546509,
243
+ "eval_runtime": 168.7484,
244
+ "eval_samples_per_second": 86.549,
245
+ "eval_steps_per_second": 10.821,
246
+ "step": 85000
247
  },
248
  {
249
+ "epoch": 2.64,
250
+ "learning_rate": 6.582033795230443e-06,
251
+ "loss": 1.2547,
252
+ "step": 90000
253
  },
254
  {
255
+ "epoch": 2.64,
256
+ "eval_loss": 1.3033902645111084,
257
+ "eval_runtime": 168.7446,
258
+ "eval_samples_per_second": 86.551,
259
+ "eval_steps_per_second": 10.821,
260
+ "step": 90000
261
  },
262
  {
263
+ "epoch": 2.79,
264
+ "learning_rate": 3.867664306502758e-06,
265
+ "loss": 1.2495,
266
+ "step": 95000
267
  },
268
  {
269
+ "epoch": 2.79,
270
+ "eval_loss": 1.3035175800323486,
271
+ "eval_runtime": 168.7573,
272
+ "eval_samples_per_second": 86.544,
273
+ "eval_steps_per_second": 10.82,
274
+ "step": 95000
275
  },
276
  {
277
+ "epoch": 2.94,
278
+ "learning_rate": 1.1538378002693194e-06,
279
+ "loss": 1.2404,
280
+ "step": 100000
281
  },
282
  {
283
+ "epoch": 2.94,
284
+ "eval_loss": 1.3029065132141113,
285
+ "eval_runtime": 168.8016,
286
+ "eval_samples_per_second": 86.522,
287
+ "eval_steps_per_second": 10.817,
288
+ "step": 100000
289
  },
290
  {
291
+ "epoch": 3.0,
292
+ "step": 102084,
293
+ "total_flos": 1.04954332801725e+17,
294
+ "train_loss": 1.35220570928797,
295
+ "train_runtime": 32542.0864,
296
+ "train_samples_per_second": 25.095,
297
+ "train_steps_per_second": 3.137
298
  }
299
  ],
300
+ "max_steps": 102084,
301
+ "num_train_epochs": 3,
302
+ "total_flos": 1.04954332801725e+17,
303
  "trial_name": null,
304
  "trial_params": null
305
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9540fa7f3fcb1a3c3b8bf3110f690d8b114909915991500a100ecd4931cb6dd5
3
  size 3055
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2b18b7fbbb222cc58a7279e9f20f061015602730e5f2e94504e9367dc4156be
3
  size 3055