real-jiakai committed on
Commit
d3e2aa7
1 Parent(s): 155b245

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ eval_nbest_predictions.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ checkpoint-*/
2
+
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: google-bert/bert-base-uncased
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - squad
9
+ model-index:
10
+ - name: bert-base-uncased-finetuned-squad
11
+ results: []
12
+ ---
13
+
14
+ # bert-base-uncased-finetuned-squad
15
+
16
+ This model is a fine-tuned version of [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) on the SQuAD v1.1 dataset.
17
+
18
+ ## Model description
19
+
20
+ - **Model Type:** BERT for Question Answering
21
+ - **Base Model:** bert-base-uncased
22
+ - **Language:** English
23
+ - **Task:** Question Answering
24
+ - **Dataset:** [SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/)
25
+
26
+ ## Training Procedure
27
+
28
+ ### Training hyperparameters
29
+
30
+ The following hyperparameters were used during training:
31
+ - learning_rate: 3e-05
32
+ - train_batch_size: 12
33
+ - eval_batch_size: 8
34
+ - seed: 42
35
+ - optimizer: AdamW_TORCH with beta=(0.9,0.999) and epsilon=1e-08
36
+ - lr_scheduler_type: linear
37
+ - num_epochs: 5
38
+
39
+ ### Training results
40
+
41
+ - training_loss: 0.6077
42
+ - eval_exact_match: 79.508
43
+ - eval_f1: 87.7293
44
+ - train_runtime: 1:09:34.90
45
+ - train_samples_per_second: 106.019
46
+ - train_steps_per_second: 8.835
47
+
48
+ ## Intended uses & limitations
49
+
50
+ This model is intended for English extractive question answering. It performs best on factual questions whose answer is explicitly stated in the provided context. Because it was trained on SQuAD v1.1, which contains no unanswerable questions, the model always predicts an answer span from the context and cannot indicate that a question has no answer.
51
+
52
+ ### Usage Example
53
+
54
+ ```python
55
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer
56
+
57
+ model = AutoModelForQuestionAnswering.from_pretrained("real-jiakai/bert-base-uncased-finetuned-squad")
58
+ tokenizer = AutoTokenizer.from_pretrained("real-jiakai/bert-base-uncased-finetuned-squad")
59
+
60
+ # Example usage
61
+ context = "BERT was developed by Google in 2018."
62
+ question = "Who developed BERT?"
63
+
64
+ inputs = tokenizer(question, context, return_tensors="pt")
65
+ outputs = model(**inputs)
66
+
67
+ answer_start = outputs.start_logits.argmax()
68
+ answer_end = outputs.end_logits.argmax()
69
+
70
+ answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end+1])
71
+ print(f"Answer: {answer}") # Expected output: "google"
72
+ ```
73
+
74
+ ## Training Infrastructure
75
+
76
+ - Training Device: Single GPU (NVIDIA Tesla V100 16GB)
77
+ - Training Time: ~70 minutes
78
+ - Framework: PyTorch
79
+ - Training Script: Hugging Face Transformers' `run_qa.py`
80
+
81
+ ### Framework versions
82
+
83
+ - Transformers 4.47.0.dev0
84
+ - PyTorch 2.5.1+cu124
85
+ - Datasets 3.1.0
86
+ - Tokenizers 0.20.3
.ipynb_checkpoints/special_tokens_map-checkpoint.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
.ipynb_checkpoints/tokenizer-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
.ipynb_checkpoints/train_results-checkpoint.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 8.674137784986624e+16,
4
+ "train_loss": 0.6077316944008034,
5
+ "train_runtime": 4174.9009,
6
+ "train_samples": 88524,
7
+ "train_samples_per_second": 106.019,
8
+ "train_steps_per_second": 8.835
9
+ }
.ipynb_checkpoints/trainer_state-checkpoint.json ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 36885,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06777822963264199,
13
+ "grad_norm": 27.22840690612793,
14
+ "learning_rate": 2.959333062220415e-05,
15
+ "loss": 2.3635,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.13555645926528398,
20
+ "grad_norm": 13.622244834899902,
21
+ "learning_rate": 2.9186661244408297e-05,
22
+ "loss": 1.5111,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.203334688897926,
27
+ "grad_norm": 14.186929702758789,
28
+ "learning_rate": 2.8779991866612443e-05,
29
+ "loss": 1.3571,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.27111291853056796,
34
+ "grad_norm": 23.26664161682129,
35
+ "learning_rate": 2.8373322488816593e-05,
36
+ "loss": 1.3122,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.33889114816321,
41
+ "grad_norm": 16.56296157836914,
42
+ "learning_rate": 2.796665311102074e-05,
43
+ "loss": 1.2847,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.406669377795852,
48
+ "grad_norm": 8.389954566955566,
49
+ "learning_rate": 2.755998373322489e-05,
50
+ "loss": 1.2343,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.47444760742849396,
55
+ "grad_norm": 10.328352928161621,
56
+ "learning_rate": 2.7153314355429036e-05,
57
+ "loss": 1.1899,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.5422258370611359,
62
+ "grad_norm": 14.220015525817871,
63
+ "learning_rate": 2.6746644977633186e-05,
64
+ "loss": 1.1337,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.6100040666937779,
69
+ "grad_norm": 23.481950759887695,
70
+ "learning_rate": 2.6339975599837332e-05,
71
+ "loss": 1.1337,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.67778229632642,
76
+ "grad_norm": 26.090124130249023,
77
+ "learning_rate": 2.593330622204148e-05,
78
+ "loss": 1.1607,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.745560525959062,
83
+ "grad_norm": 16.809215545654297,
84
+ "learning_rate": 2.552663684424563e-05,
85
+ "loss": 1.1114,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.813338755591704,
90
+ "grad_norm": 15.458964347839355,
91
+ "learning_rate": 2.5119967466449778e-05,
92
+ "loss": 1.0703,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.8811169852243459,
97
+ "grad_norm": 21.379941940307617,
98
+ "learning_rate": 2.4713298088653925e-05,
99
+ "loss": 1.0782,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.9488952148569879,
104
+ "grad_norm": 10.91637134552002,
105
+ "learning_rate": 2.4306628710858075e-05,
106
+ "loss": 1.0842,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 1.01667344448963,
111
+ "grad_norm": 13.034053802490234,
112
+ "learning_rate": 2.389995933306222e-05,
113
+ "loss": 0.9463,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 1.0844516741222718,
118
+ "grad_norm": 15.298956871032715,
119
+ "learning_rate": 2.3493289955266367e-05,
120
+ "loss": 0.7336,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 1.152229903754914,
125
+ "grad_norm": 10.17520523071289,
126
+ "learning_rate": 2.3086620577470517e-05,
127
+ "loss": 0.7455,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 1.2200081333875559,
132
+ "grad_norm": 15.160359382629395,
133
+ "learning_rate": 2.2679951199674667e-05,
134
+ "loss": 0.7516,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 1.287786363020198,
139
+ "grad_norm": 29.048612594604492,
140
+ "learning_rate": 2.2273281821878813e-05,
141
+ "loss": 0.7434,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 1.35556459265284,
146
+ "grad_norm": 24.640453338623047,
147
+ "learning_rate": 2.1866612444082963e-05,
148
+ "loss": 0.7673,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 1.4233428222854818,
153
+ "grad_norm": 16.07855224609375,
154
+ "learning_rate": 2.1459943066287106e-05,
155
+ "loss": 0.775,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 1.491121051918124,
160
+ "grad_norm": 12.662883758544922,
161
+ "learning_rate": 2.1053273688491256e-05,
162
+ "loss": 0.7561,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 1.5588992815507658,
167
+ "grad_norm": 13.590459823608398,
168
+ "learning_rate": 2.0646604310695406e-05,
169
+ "loss": 0.7537,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 1.626677511183408,
174
+ "grad_norm": 12.973872184753418,
175
+ "learning_rate": 2.0239934932899552e-05,
176
+ "loss": 0.7513,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 1.6944557408160499,
181
+ "grad_norm": 17.74153709411621,
182
+ "learning_rate": 1.9833265555103702e-05,
183
+ "loss": 0.7666,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 1.7622339704486918,
188
+ "grad_norm": 14.367899894714355,
189
+ "learning_rate": 1.9426596177307852e-05,
190
+ "loss": 0.7787,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 1.830012200081334,
195
+ "grad_norm": 30.40443992614746,
196
+ "learning_rate": 1.9019926799511995e-05,
197
+ "loss": 0.7373,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 1.8977904297139758,
202
+ "grad_norm": 8.429509162902832,
203
+ "learning_rate": 1.8613257421716145e-05,
204
+ "loss": 0.7559,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 1.965568659346618,
209
+ "grad_norm": 12.091238975524902,
210
+ "learning_rate": 1.8206588043920295e-05,
211
+ "loss": 0.7537,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 2.03334688897926,
216
+ "grad_norm": 7.968406677246094,
217
+ "learning_rate": 1.779991866612444e-05,
218
+ "loss": 0.6185,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 2.1011251186119018,
223
+ "grad_norm": 19.485437393188477,
224
+ "learning_rate": 1.739324928832859e-05,
225
+ "loss": 0.4977,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 2.1689033482445437,
230
+ "grad_norm": 19.35041046142578,
231
+ "learning_rate": 1.6986579910532734e-05,
232
+ "loss": 0.4627,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 2.236681577877186,
237
+ "grad_norm": 11.437379837036133,
238
+ "learning_rate": 1.6579910532736884e-05,
239
+ "loss": 0.4954,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 2.304459807509828,
244
+ "grad_norm": 6.330296039581299,
245
+ "learning_rate": 1.6173241154941034e-05,
246
+ "loss": 0.4885,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 2.37223803714247,
251
+ "grad_norm": 26.632694244384766,
252
+ "learning_rate": 1.576657177714518e-05,
253
+ "loss": 0.5022,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 2.4400162667751117,
258
+ "grad_norm": 10.865219116210938,
259
+ "learning_rate": 1.535990239934933e-05,
260
+ "loss": 0.484,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 2.5077944964077536,
265
+ "grad_norm": 15.621503829956055,
266
+ "learning_rate": 1.4953233021553476e-05,
267
+ "loss": 0.4992,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 2.575572726040396,
272
+ "grad_norm": 4.8670973777771,
273
+ "learning_rate": 1.4546563643757626e-05,
274
+ "loss": 0.5067,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 2.643350955673038,
279
+ "grad_norm": 22.809894561767578,
280
+ "learning_rate": 1.4139894265961774e-05,
281
+ "loss": 0.4827,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 2.71112918530568,
286
+ "grad_norm": 10.14173698425293,
287
+ "learning_rate": 1.373322488816592e-05,
288
+ "loss": 0.5003,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 2.7789074149383217,
293
+ "grad_norm": 19.475053787231445,
294
+ "learning_rate": 1.3326555510370069e-05,
295
+ "loss": 0.5022,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 2.8466856445709636,
300
+ "grad_norm": 22.009613037109375,
301
+ "learning_rate": 1.2919886132574219e-05,
302
+ "loss": 0.4949,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 2.914463874203606,
307
+ "grad_norm": 7.058548927307129,
308
+ "learning_rate": 1.2513216754778365e-05,
309
+ "loss": 0.4963,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 2.982242103836248,
314
+ "grad_norm": 45.82003402709961,
315
+ "learning_rate": 1.2106547376982513e-05,
316
+ "loss": 0.4881,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 3.05002033346889,
321
+ "grad_norm": 7.270058631896973,
322
+ "learning_rate": 1.1699877999186661e-05,
323
+ "loss": 0.3598,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 3.1177985631015317,
328
+ "grad_norm": 10.434412956237793,
329
+ "learning_rate": 1.129320862139081e-05,
330
+ "loss": 0.3055,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 3.1855767927341736,
335
+ "grad_norm": 2.4307031631469727,
336
+ "learning_rate": 1.0886539243594958e-05,
337
+ "loss": 0.3264,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 3.253355022366816,
342
+ "grad_norm": 47.332122802734375,
343
+ "learning_rate": 1.0479869865799104e-05,
344
+ "loss": 0.315,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 3.321133251999458,
349
+ "grad_norm": 16.34264373779297,
350
+ "learning_rate": 1.0073200488003254e-05,
351
+ "loss": 0.3251,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 3.3889114816320998,
356
+ "grad_norm": 13.301547050476074,
357
+ "learning_rate": 9.666531110207402e-06,
358
+ "loss": 0.3266,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 3.4566897112647417,
363
+ "grad_norm": 31.915794372558594,
364
+ "learning_rate": 9.259861732411548e-06,
365
+ "loss": 0.3122,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 3.5244679408973836,
370
+ "grad_norm": 4.690245151519775,
371
+ "learning_rate": 8.853192354615698e-06,
372
+ "loss": 0.3051,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 3.592246170530026,
377
+ "grad_norm": 22.70941734313965,
378
+ "learning_rate": 8.446522976819846e-06,
379
+ "loss": 0.3013,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 3.660024400162668,
384
+ "grad_norm": 19.372514724731445,
385
+ "learning_rate": 8.039853599023993e-06,
386
+ "loss": 0.3157,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 3.7278026297953097,
391
+ "grad_norm": 13.642813682556152,
392
+ "learning_rate": 7.633184221228141e-06,
393
+ "loss": 0.3301,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 3.7955808594279516,
398
+ "grad_norm": 31.518564224243164,
399
+ "learning_rate": 7.226514843432289e-06,
400
+ "loss": 0.3157,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 3.8633590890605936,
405
+ "grad_norm": 56.66007614135742,
406
+ "learning_rate": 6.819845465636438e-06,
407
+ "loss": 0.308,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 3.931137318693236,
412
+ "grad_norm": 11.975343704223633,
413
+ "learning_rate": 6.413176087840585e-06,
414
+ "loss": 0.2989,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 3.998915548325878,
419
+ "grad_norm": 17.2923641204834,
420
+ "learning_rate": 6.0065067100447335e-06,
421
+ "loss": 0.3123,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 4.06669377795852,
426
+ "grad_norm": 23.894207000732422,
427
+ "learning_rate": 5.5998373322488825e-06,
428
+ "loss": 0.2082,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 4.134472007591162,
433
+ "grad_norm": 9.123438835144043,
434
+ "learning_rate": 5.19316795445303e-06,
435
+ "loss": 0.2065,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 4.2022502372238035,
440
+ "grad_norm": 19.249637603759766,
441
+ "learning_rate": 4.786498576657178e-06,
442
+ "loss": 0.1926,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 4.270028466856446,
447
+ "grad_norm": 41.71086883544922,
448
+ "learning_rate": 4.379829198861326e-06,
449
+ "loss": 0.2034,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 4.337806696489087,
454
+ "grad_norm": 1.5960214138031006,
455
+ "learning_rate": 3.973159821065474e-06,
456
+ "loss": 0.2139,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 4.40558492612173,
461
+ "grad_norm": 10.51189136505127,
462
+ "learning_rate": 3.566490443269622e-06,
463
+ "loss": 0.195,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 4.473363155754372,
468
+ "grad_norm": 6.574166297912598,
469
+ "learning_rate": 3.15982106547377e-06,
470
+ "loss": 0.2121,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 4.5411413853870135,
475
+ "grad_norm": 49.01862716674805,
476
+ "learning_rate": 2.753151687677918e-06,
477
+ "loss": 0.2121,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 4.608919615019656,
482
+ "grad_norm": 24.359651565551758,
483
+ "learning_rate": 2.346482309882066e-06,
484
+ "loss": 0.2126,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 4.676697844652297,
489
+ "grad_norm": 11.81959056854248,
490
+ "learning_rate": 1.939812932086214e-06,
491
+ "loss": 0.2153,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 4.74447607428494,
496
+ "grad_norm": 20.48014259338379,
497
+ "learning_rate": 1.533143554290362e-06,
498
+ "loss": 0.1941,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 4.812254303917582,
503
+ "grad_norm": 10.205302238464355,
504
+ "learning_rate": 1.12647417649451e-06,
505
+ "loss": 0.1981,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 4.8800325335502235,
510
+ "grad_norm": 4.869925498962402,
511
+ "learning_rate": 7.19804798698658e-07,
512
+ "loss": 0.1984,
513
+ "step": 36000
514
+ },
515
+ {
516
+ "epoch": 4.947810763182866,
517
+ "grad_norm": 15.551806449890137,
518
+ "learning_rate": 3.13135420902806e-07,
519
+ "loss": 0.196,
520
+ "step": 36500
521
+ },
522
+ {
523
+ "epoch": 5.0,
524
+ "step": 36885,
525
+ "total_flos": 8.674137784986624e+16,
526
+ "train_loss": 0.6077316944008034,
527
+ "train_runtime": 4174.9009,
528
+ "train_samples_per_second": 106.019,
529
+ "train_steps_per_second": 8.835
530
+ }
531
+ ],
532
+ "logging_steps": 500,
533
+ "max_steps": 36885,
534
+ "num_input_tokens_seen": 0,
535
+ "num_train_epochs": 5,
536
+ "save_steps": 500,
537
+ "stateful_callbacks": {
538
+ "TrainerControl": {
539
+ "args": {
540
+ "should_epoch_stop": false,
541
+ "should_evaluate": false,
542
+ "should_log": false,
543
+ "should_save": true,
544
+ "should_training_stop": true
545
+ },
546
+ "attributes": {}
547
+ }
548
+ },
549
+ "total_flos": 8.674137784986624e+16,
550
+ "train_batch_size": 12,
551
+ "trial_name": null,
552
+ "trial_params": null
553
+ }
README.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: google-bert/bert-base-uncased
5
+ tags:
6
+ - generated_from_trainer
7
+ datasets:
8
+ - squad
9
+ model-index:
10
+ - name: bert-base-uncased-finetuned-squad
11
+ results: []
12
+ ---
13
+
14
+ # bert-base-uncased-finetuned-squad
15
+
16
+ This model is a fine-tuned version of [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) on the SQuAD v1.1 dataset.
17
+
18
+ ## Model description
19
+
20
+ - **Model Type:** BERT for Question Answering
21
+ - **Base Model:** bert-base-uncased
22
+ - **Language:** English
23
+ - **Task:** Question Answering
24
+ - **Dataset:** [SQuAD v1.1](https://rajpurkar.github.io/SQuAD-explorer/)
25
+
26
+ ## Training Procedure
27
+
28
+ ### Training hyperparameters
29
+
30
+ The following hyperparameters were used during training:
31
+ - learning_rate: 3e-05
32
+ - train_batch_size: 12
33
+ - eval_batch_size: 8
34
+ - seed: 42
35
+ - optimizer: AdamW_TORCH with beta=(0.9,0.999) and epsilon=1e-08
36
+ - lr_scheduler_type: linear
37
+ - num_epochs: 5
38
+
39
+ ### Training results
40
+
41
+ - training_loss: 0.6077
42
+ - eval_exact_match: 79.508
43
+ - eval_f1: 87.7293
44
+ - train_runtime: 1:09:34.90
45
+ - train_samples_per_second: 106.019
46
+ - train_steps_per_second: 8.835
47
+
48
+ ## Intended uses & limitations
49
+
50
+ This model is intended for English extractive question answering. It performs best on factual questions whose answer is explicitly stated in the provided context. Because it was trained on SQuAD v1.1, which contains no unanswerable questions, the model always predicts an answer span from the context and cannot indicate that a question has no answer.
51
+
52
+ ### Usage Example
53
+
54
+ ```python
55
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer
56
+
57
+ model = AutoModelForQuestionAnswering.from_pretrained("real-jiakai/bert-base-uncased-finetuned-squad")
58
+ tokenizer = AutoTokenizer.from_pretrained("real-jiakai/bert-base-uncased-finetuned-squad")
59
+
60
+ # Example usage
61
+ context = "BERT was developed by Google in 2018."
62
+ question = "Who developed BERT?"
63
+
64
+ inputs = tokenizer(question, context, return_tensors="pt")
65
+ outputs = model(**inputs)
66
+
67
+ answer_start = outputs.start_logits.argmax()
68
+ answer_end = outputs.end_logits.argmax()
69
+
70
+ answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end+1])
71
+ print(f"Answer: {answer}") # Expected output: "google"
72
+ ```
73
+
74
+ ## Training Infrastructure
75
+
76
+ - Training Device: Single GPU (NVIDIA Tesla V100 16GB)
77
+ - Training Time: ~70 minutes
78
+ - Framework: PyTorch
79
+ - Training Script: Hugging Face Transformers' `run_qa.py`
80
+
81
+ ### Framework versions
82
+
83
+ - Transformers 4.47.0.dev0
84
+ - PyTorch 2.5.1+cu124
85
+ - Datasets 3.1.0
86
+ - Tokenizers 0.20.3
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_exact_match": 79.50804162724693,
4
+ "eval_f1": 87.72929499724285,
5
+ "eval_runtime": 37.9794,
6
+ "eval_samples": 10784,
7
+ "eval_samples_per_second": 283.944,
8
+ "eval_steps_per_second": 35.493,
9
+ "total_flos": 8.674137784986624e+16,
10
+ "train_loss": 0.6077316944008034,
11
+ "train_runtime": 4174.9009,
12
+ "train_samples": 88524,
13
+ "train_samples_per_second": 106.019,
14
+ "train_steps_per_second": 8.835
15
+ }
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google-bert/bert-base-uncased",
3
+ "architectures": [
4
+ "BertForQuestionAnswering"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.47.0.dev0",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 30522
26
+ }
eval_nbest_predictions.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd23ae26442aa490ccf5ba110358c152327bb4173330536e6a75b52268c7a61d
3
+ size 49623255
eval_predictions.json ADDED
The diff for this file is too large to render. See raw diff
 
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_exact_match": 79.50804162724693,
4
+ "eval_f1": 87.72929499724285,
5
+ "eval_runtime": 37.9794,
6
+ "eval_samples": 10784,
7
+ "eval_samples_per_second": 283.944,
8
+ "eval_steps_per_second": 35.493
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce896c126d5592f15d16e130c207405576ec4e190565096669edfea7d130bbf0
3
+ size 435596088
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 8.674137784986624e+16,
4
+ "train_loss": 0.6077316944008034,
5
+ "train_runtime": 4174.9009,
6
+ "train_samples": 88524,
7
+ "train_samples_per_second": 106.019,
8
+ "train_steps_per_second": 8.835
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,553 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 36885,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06777822963264199,
13
+ "grad_norm": 27.22840690612793,
14
+ "learning_rate": 2.959333062220415e-05,
15
+ "loss": 2.3635,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.13555645926528398,
20
+ "grad_norm": 13.622244834899902,
21
+ "learning_rate": 2.9186661244408297e-05,
22
+ "loss": 1.5111,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.203334688897926,
27
+ "grad_norm": 14.186929702758789,
28
+ "learning_rate": 2.8779991866612443e-05,
29
+ "loss": 1.3571,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.27111291853056796,
34
+ "grad_norm": 23.26664161682129,
35
+ "learning_rate": 2.8373322488816593e-05,
36
+ "loss": 1.3122,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.33889114816321,
41
+ "grad_norm": 16.56296157836914,
42
+ "learning_rate": 2.796665311102074e-05,
43
+ "loss": 1.2847,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.406669377795852,
48
+ "grad_norm": 8.389954566955566,
49
+ "learning_rate": 2.755998373322489e-05,
50
+ "loss": 1.2343,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.47444760742849396,
55
+ "grad_norm": 10.328352928161621,
56
+ "learning_rate": 2.7153314355429036e-05,
57
+ "loss": 1.1899,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.5422258370611359,
62
+ "grad_norm": 14.220015525817871,
63
+ "learning_rate": 2.6746644977633186e-05,
64
+ "loss": 1.1337,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.6100040666937779,
69
+ "grad_norm": 23.481950759887695,
70
+ "learning_rate": 2.6339975599837332e-05,
71
+ "loss": 1.1337,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.67778229632642,
76
+ "grad_norm": 26.090124130249023,
77
+ "learning_rate": 2.593330622204148e-05,
78
+ "loss": 1.1607,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.745560525959062,
83
+ "grad_norm": 16.809215545654297,
84
+ "learning_rate": 2.552663684424563e-05,
85
+ "loss": 1.1114,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.813338755591704,
90
+ "grad_norm": 15.458964347839355,
91
+ "learning_rate": 2.5119967466449778e-05,
92
+ "loss": 1.0703,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.8811169852243459,
97
+ "grad_norm": 21.379941940307617,
98
+ "learning_rate": 2.4713298088653925e-05,
99
+ "loss": 1.0782,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.9488952148569879,
104
+ "grad_norm": 10.91637134552002,
105
+ "learning_rate": 2.4306628710858075e-05,
106
+ "loss": 1.0842,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 1.01667344448963,
111
+ "grad_norm": 13.034053802490234,
112
+ "learning_rate": 2.389995933306222e-05,
113
+ "loss": 0.9463,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 1.0844516741222718,
118
+ "grad_norm": 15.298956871032715,
119
+ "learning_rate": 2.3493289955266367e-05,
120
+ "loss": 0.7336,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 1.152229903754914,
125
+ "grad_norm": 10.17520523071289,
126
+ "learning_rate": 2.3086620577470517e-05,
127
+ "loss": 0.7455,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 1.2200081333875559,
132
+ "grad_norm": 15.160359382629395,
133
+ "learning_rate": 2.2679951199674667e-05,
134
+ "loss": 0.7516,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 1.287786363020198,
139
+ "grad_norm": 29.048612594604492,
140
+ "learning_rate": 2.2273281821878813e-05,
141
+ "loss": 0.7434,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 1.35556459265284,
146
+ "grad_norm": 24.640453338623047,
147
+ "learning_rate": 2.1866612444082963e-05,
148
+ "loss": 0.7673,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 1.4233428222854818,
153
+ "grad_norm": 16.07855224609375,
154
+ "learning_rate": 2.1459943066287106e-05,
155
+ "loss": 0.775,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 1.491121051918124,
160
+ "grad_norm": 12.662883758544922,
161
+ "learning_rate": 2.1053273688491256e-05,
162
+ "loss": 0.7561,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 1.5588992815507658,
167
+ "grad_norm": 13.590459823608398,
168
+ "learning_rate": 2.0646604310695406e-05,
169
+ "loss": 0.7537,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 1.626677511183408,
174
+ "grad_norm": 12.973872184753418,
175
+ "learning_rate": 2.0239934932899552e-05,
176
+ "loss": 0.7513,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 1.6944557408160499,
181
+ "grad_norm": 17.74153709411621,
182
+ "learning_rate": 1.9833265555103702e-05,
183
+ "loss": 0.7666,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 1.7622339704486918,
188
+ "grad_norm": 14.367899894714355,
189
+ "learning_rate": 1.9426596177307852e-05,
190
+ "loss": 0.7787,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 1.830012200081334,
195
+ "grad_norm": 30.40443992614746,
196
+ "learning_rate": 1.9019926799511995e-05,
197
+ "loss": 0.7373,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 1.8977904297139758,
202
+ "grad_norm": 8.429509162902832,
203
+ "learning_rate": 1.8613257421716145e-05,
204
+ "loss": 0.7559,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 1.965568659346618,
209
+ "grad_norm": 12.091238975524902,
210
+ "learning_rate": 1.8206588043920295e-05,
211
+ "loss": 0.7537,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 2.03334688897926,
216
+ "grad_norm": 7.968406677246094,
217
+ "learning_rate": 1.779991866612444e-05,
218
+ "loss": 0.6185,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 2.1011251186119018,
223
+ "grad_norm": 19.485437393188477,
224
+ "learning_rate": 1.739324928832859e-05,
225
+ "loss": 0.4977,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 2.1689033482445437,
230
+ "grad_norm": 19.35041046142578,
231
+ "learning_rate": 1.6986579910532734e-05,
232
+ "loss": 0.4627,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 2.236681577877186,
237
+ "grad_norm": 11.437379837036133,
238
+ "learning_rate": 1.6579910532736884e-05,
239
+ "loss": 0.4954,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 2.304459807509828,
244
+ "grad_norm": 6.330296039581299,
245
+ "learning_rate": 1.6173241154941034e-05,
246
+ "loss": 0.4885,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 2.37223803714247,
251
+ "grad_norm": 26.632694244384766,
252
+ "learning_rate": 1.576657177714518e-05,
253
+ "loss": 0.5022,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 2.4400162667751117,
258
+ "grad_norm": 10.865219116210938,
259
+ "learning_rate": 1.535990239934933e-05,
260
+ "loss": 0.484,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 2.5077944964077536,
265
+ "grad_norm": 15.621503829956055,
266
+ "learning_rate": 1.4953233021553476e-05,
267
+ "loss": 0.4992,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 2.575572726040396,
272
+ "grad_norm": 4.8670973777771,
273
+ "learning_rate": 1.4546563643757626e-05,
274
+ "loss": 0.5067,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 2.643350955673038,
279
+ "grad_norm": 22.809894561767578,
280
+ "learning_rate": 1.4139894265961774e-05,
281
+ "loss": 0.4827,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 2.71112918530568,
286
+ "grad_norm": 10.14173698425293,
287
+ "learning_rate": 1.373322488816592e-05,
288
+ "loss": 0.5003,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 2.7789074149383217,
293
+ "grad_norm": 19.475053787231445,
294
+ "learning_rate": 1.3326555510370069e-05,
295
+ "loss": 0.5022,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 2.8466856445709636,
300
+ "grad_norm": 22.009613037109375,
301
+ "learning_rate": 1.2919886132574219e-05,
302
+ "loss": 0.4949,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 2.914463874203606,
307
+ "grad_norm": 7.058548927307129,
308
+ "learning_rate": 1.2513216754778365e-05,
309
+ "loss": 0.4963,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 2.982242103836248,
314
+ "grad_norm": 45.82003402709961,
315
+ "learning_rate": 1.2106547376982513e-05,
316
+ "loss": 0.4881,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 3.05002033346889,
321
+ "grad_norm": 7.270058631896973,
322
+ "learning_rate": 1.1699877999186661e-05,
323
+ "loss": 0.3598,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 3.1177985631015317,
328
+ "grad_norm": 10.434412956237793,
329
+ "learning_rate": 1.129320862139081e-05,
330
+ "loss": 0.3055,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 3.1855767927341736,
335
+ "grad_norm": 2.4307031631469727,
336
+ "learning_rate": 1.0886539243594958e-05,
337
+ "loss": 0.3264,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 3.253355022366816,
342
+ "grad_norm": 47.332122802734375,
343
+ "learning_rate": 1.0479869865799104e-05,
344
+ "loss": 0.315,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 3.321133251999458,
349
+ "grad_norm": 16.34264373779297,
350
+ "learning_rate": 1.0073200488003254e-05,
351
+ "loss": 0.3251,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 3.3889114816320998,
356
+ "grad_norm": 13.301547050476074,
357
+ "learning_rate": 9.666531110207402e-06,
358
+ "loss": 0.3266,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 3.4566897112647417,
363
+ "grad_norm": 31.915794372558594,
364
+ "learning_rate": 9.259861732411548e-06,
365
+ "loss": 0.3122,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 3.5244679408973836,
370
+ "grad_norm": 4.690245151519775,
371
+ "learning_rate": 8.853192354615698e-06,
372
+ "loss": 0.3051,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 3.592246170530026,
377
+ "grad_norm": 22.70941734313965,
378
+ "learning_rate": 8.446522976819846e-06,
379
+ "loss": 0.3013,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 3.660024400162668,
384
+ "grad_norm": 19.372514724731445,
385
+ "learning_rate": 8.039853599023993e-06,
386
+ "loss": 0.3157,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 3.7278026297953097,
391
+ "grad_norm": 13.642813682556152,
392
+ "learning_rate": 7.633184221228141e-06,
393
+ "loss": 0.3301,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 3.7955808594279516,
398
+ "grad_norm": 31.518564224243164,
399
+ "learning_rate": 7.226514843432289e-06,
400
+ "loss": 0.3157,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 3.8633590890605936,
405
+ "grad_norm": 56.66007614135742,
406
+ "learning_rate": 6.819845465636438e-06,
407
+ "loss": 0.308,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 3.931137318693236,
412
+ "grad_norm": 11.975343704223633,
413
+ "learning_rate": 6.413176087840585e-06,
414
+ "loss": 0.2989,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 3.998915548325878,
419
+ "grad_norm": 17.2923641204834,
420
+ "learning_rate": 6.0065067100447335e-06,
421
+ "loss": 0.3123,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 4.06669377795852,
426
+ "grad_norm": 23.894207000732422,
427
+ "learning_rate": 5.5998373322488825e-06,
428
+ "loss": 0.2082,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 4.134472007591162,
433
+ "grad_norm": 9.123438835144043,
434
+ "learning_rate": 5.19316795445303e-06,
435
+ "loss": 0.2065,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 4.2022502372238035,
440
+ "grad_norm": 19.249637603759766,
441
+ "learning_rate": 4.786498576657178e-06,
442
+ "loss": 0.1926,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 4.270028466856446,
447
+ "grad_norm": 41.71086883544922,
448
+ "learning_rate": 4.379829198861326e-06,
449
+ "loss": 0.2034,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 4.337806696489087,
454
+ "grad_norm": 1.5960214138031006,
455
+ "learning_rate": 3.973159821065474e-06,
456
+ "loss": 0.2139,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 4.40558492612173,
461
+ "grad_norm": 10.51189136505127,
462
+ "learning_rate": 3.566490443269622e-06,
463
+ "loss": 0.195,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 4.473363155754372,
468
+ "grad_norm": 6.574166297912598,
469
+ "learning_rate": 3.15982106547377e-06,
470
+ "loss": 0.2121,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 4.5411413853870135,
475
+ "grad_norm": 49.01862716674805,
476
+ "learning_rate": 2.753151687677918e-06,
477
+ "loss": 0.2121,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 4.608919615019656,
482
+ "grad_norm": 24.359651565551758,
483
+ "learning_rate": 2.346482309882066e-06,
484
+ "loss": 0.2126,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 4.676697844652297,
489
+ "grad_norm": 11.81959056854248,
490
+ "learning_rate": 1.939812932086214e-06,
491
+ "loss": 0.2153,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 4.74447607428494,
496
+ "grad_norm": 20.48014259338379,
497
+ "learning_rate": 1.533143554290362e-06,
498
+ "loss": 0.1941,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 4.812254303917582,
503
+ "grad_norm": 10.205302238464355,
504
+ "learning_rate": 1.12647417649451e-06,
505
+ "loss": 0.1981,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 4.8800325335502235,
510
+ "grad_norm": 4.869925498962402,
511
+ "learning_rate": 7.19804798698658e-07,
512
+ "loss": 0.1984,
513
+ "step": 36000
514
+ },
515
+ {
516
+ "epoch": 4.947810763182866,
517
+ "grad_norm": 15.551806449890137,
518
+ "learning_rate": 3.13135420902806e-07,
519
+ "loss": 0.196,
520
+ "step": 36500
521
+ },
522
+ {
523
+ "epoch": 5.0,
524
+ "step": 36885,
525
+ "total_flos": 8.674137784986624e+16,
526
+ "train_loss": 0.6077316944008034,
527
+ "train_runtime": 4174.9009,
528
+ "train_samples_per_second": 106.019,
529
+ "train_steps_per_second": 8.835
530
+ }
531
+ ],
532
+ "logging_steps": 500,
533
+ "max_steps": 36885,
534
+ "num_input_tokens_seen": 0,
535
+ "num_train_epochs": 5,
536
+ "save_steps": 500,
537
+ "stateful_callbacks": {
538
+ "TrainerControl": {
539
+ "args": {
540
+ "should_epoch_stop": false,
541
+ "should_evaluate": false,
542
+ "should_log": false,
543
+ "should_save": true,
544
+ "should_training_stop": true
545
+ },
546
+ "attributes": {}
547
+ }
548
+ },
549
+ "total_flos": 8.674137784986624e+16,
550
+ "train_batch_size": 12,
551
+ "trial_name": null,
552
+ "trial_params": null
553
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde90f6aad9ad0797f44f8353b152203daf510ae41a77550fbd06ecf2ecd736f
3
+ size 5368
vocab.txt ADDED
The diff for this file is too large to render. See raw diff