TTian committed on
Commit
e819850
•
1 Parent(s): 2e9c4f6

End of training

Browse files
Files changed (42) hide show
  1. checkpoint-10/trainer_state.json +0 -30
  2. {checkpoint-10 → checkpoint-170}/added_tokens.json +0 -0
  3. {checkpoint-10 → checkpoint-170}/config.json +0 -0
  4. {checkpoint-10 → checkpoint-170}/optimizer.pt +1 -1
  5. {checkpoint-10 → checkpoint-170}/pytorch_model.bin +1 -1
  6. {checkpoint-10 → checkpoint-170}/rng_state.pth +1 -1
  7. {checkpoint-10 → checkpoint-170}/scaler.pt +1 -1
  8. {checkpoint-10 → checkpoint-170}/scheduler.pt +1 -1
  9. {checkpoint-10 → checkpoint-170}/special_tokens_map.json +0 -0
  10. {checkpoint-10 → checkpoint-170}/spm.model +0 -0
  11. {checkpoint-10 → checkpoint-170}/tokenizer.json +0 -0
  12. {checkpoint-10 → checkpoint-170}/tokenizer_config.json +0 -0
  13. checkpoint-170/trainer_state.json +254 -0
  14. {checkpoint-10 → checkpoint-170}/training_args.bin +0 -0
  15. checkpoint-460/added_tokens.json +17 -0
  16. checkpoint-460/config.json +45 -0
  17. checkpoint-460/optimizer.pt +3 -0
  18. checkpoint-460/pytorch_model.bin +3 -0
  19. checkpoint-460/rng_state.pth +3 -0
  20. checkpoint-460/scaler.pt +3 -0
  21. checkpoint-460/scheduler.pt +3 -0
  22. checkpoint-460/special_tokens_map.json +25 -0
  23. checkpoint-460/spm.model +3 -0
  24. checkpoint-460/tokenizer.json +0 -0
  25. checkpoint-460/tokenizer_config.json +32 -0
  26. checkpoint-460/trainer_state.json +660 -0
  27. checkpoint-460/training_args.bin +3 -0
  28. checkpoint-470/added_tokens.json +17 -0
  29. checkpoint-470/config.json +45 -0
  30. checkpoint-470/optimizer.pt +3 -0
  31. checkpoint-470/pytorch_model.bin +3 -0
  32. checkpoint-470/rng_state.pth +3 -0
  33. checkpoint-470/scaler.pt +3 -0
  34. checkpoint-470/scheduler.pt +3 -0
  35. checkpoint-470/special_tokens_map.json +25 -0
  36. checkpoint-470/spm.model +3 -0
  37. checkpoint-470/tokenizer.json +0 -0
  38. checkpoint-470/tokenizer_config.json +32 -0
  39. checkpoint-470/trainer_state.json +674 -0
  40. checkpoint-470/training_args.bin +3 -0
  41. pytorch_model.bin +1 -1
  42. runs/Nov26_03-30-10_f19dc631087e/events.out.tfevents.1669433434.f19dc631087e.2463.0 +2 -2
checkpoint-10/trainer_state.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "best_metric": 0.5888153910636902,
3
- "best_model_checkpoint": "deberta-classifier-feedback-1024-pseudo-final/checkpoint-10",
4
- "epoch": 0.0423728813559322,
5
- "global_step": 10,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.04,
12
- "learning_rate": 1.9576271186440678e-05,
13
- "loss": 0.5814,
14
- "step": 10
15
- },
16
- {
17
- "epoch": 0.04,
18
- "eval_loss": 0.5888153910636902,
19
- "eval_runtime": 18.2433,
20
- "eval_samples_per_second": 23.022,
21
- "eval_steps_per_second": 2.905,
22
- "step": 10
23
- }
24
- ],
25
- "max_steps": 472,
26
- "num_train_epochs": 2,
27
- "total_flos": 295709116548768.0,
28
- "trial_name": null,
29
- "trial_params": null
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{checkpoint-10 → checkpoint-170}/added_tokens.json RENAMED
File without changes
{checkpoint-10 → checkpoint-170}/config.json RENAMED
File without changes
{checkpoint-10 → checkpoint-170}/optimizer.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9058d120f116c9f075760a3cecfadde6c48be5f6c723f4bbe6e465cb2d3b617
3
  size 3472349601
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baa6e22d909cf5d0453aed3face68b24f933494a43a99118bc98e4945ac76620
3
  size 3472349601
{checkpoint-10 → checkpoint-170}/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c67ed9579fbf92ad582171655d9a63cf47be3ee9a1a837842b2683db66360847
3
  size 1736202543
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ea0f96260b582f68582a71bd74d29b120583100cf5dfbd08940813b056f101d
3
  size 1736202543
{checkpoint-10 → checkpoint-170}/rng_state.pth RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32a20052bcd699bfa424ab9ab607dae575a69b96010be796c6e0d6996bd6ea4a
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:830cc2e0d9a7824a57ce1607b8211bbebfe61efc88850bec43b3cbb5ea3677f3
3
  size 14503
{checkpoint-10 → checkpoint-170}/scaler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20e4b012b4b9ab1daa876390beea4afff370d4f83e20e939f16cdf1855daf52f
3
  size 559
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:069dde2efc39ed81f1532fefa2daeb200e0a266e36ab86be682c859d85385e70
3
  size 559
{checkpoint-10 → checkpoint-170}/scheduler.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:156d8d59fa6abdae5154be64292e80cfd2e7110c86b2201d4b65c651360c1fa9
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4765c243067f8d7892f1ed5820e11257f0de2a359365fdb2baab54814957e45
3
  size 623
{checkpoint-10 → checkpoint-170}/special_tokens_map.json RENAMED
File without changes
{checkpoint-10 → checkpoint-170}/spm.model RENAMED
File without changes
{checkpoint-10 → checkpoint-170}/tokenizer.json RENAMED
File without changes
{checkpoint-10 → checkpoint-170}/tokenizer_config.json RENAMED
File without changes
checkpoint-170/trainer_state.json ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5201743841171265,
3
+ "best_model_checkpoint": "deberta-classifier-feedback-1024-pseudo-final/checkpoint-170",
4
+ "epoch": 0.7203389830508474,
5
+ "global_step": 170,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 1.9576271186440678e-05,
13
+ "loss": 0.5814,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "eval_loss": 0.5888153910636902,
19
+ "eval_runtime": 18.2433,
20
+ "eval_samples_per_second": 23.022,
21
+ "eval_steps_per_second": 2.905,
22
+ "step": 10
23
+ },
24
+ {
25
+ "epoch": 0.08,
26
+ "learning_rate": 1.9152542372881357e-05,
27
+ "loss": 0.5521,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.08,
32
+ "eval_loss": 0.5736112594604492,
33
+ "eval_runtime": 18.7271,
34
+ "eval_samples_per_second": 22.427,
35
+ "eval_steps_per_second": 2.83,
36
+ "step": 20
37
+ },
38
+ {
39
+ "epoch": 0.13,
40
+ "learning_rate": 1.8728813559322033e-05,
41
+ "loss": 0.5685,
42
+ "step": 30
43
+ },
44
+ {
45
+ "epoch": 0.13,
46
+ "eval_loss": 0.5809019804000854,
47
+ "eval_runtime": 17.2407,
48
+ "eval_samples_per_second": 24.361,
49
+ "eval_steps_per_second": 3.074,
50
+ "step": 30
51
+ },
52
+ {
53
+ "epoch": 0.17,
54
+ "learning_rate": 1.8305084745762713e-05,
55
+ "loss": 0.6052,
56
+ "step": 40
57
+ },
58
+ {
59
+ "epoch": 0.17,
60
+ "eval_loss": 0.5701586008071899,
61
+ "eval_runtime": 18.6909,
62
+ "eval_samples_per_second": 22.471,
63
+ "eval_steps_per_second": 2.836,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 0.21,
68
+ "learning_rate": 1.788135593220339e-05,
69
+ "loss": 0.5532,
70
+ "step": 50
71
+ },
72
+ {
73
+ "epoch": 0.21,
74
+ "eval_loss": 0.5571172833442688,
75
+ "eval_runtime": 18.369,
76
+ "eval_samples_per_second": 22.865,
77
+ "eval_steps_per_second": 2.885,
78
+ "step": 50
79
+ },
80
+ {
81
+ "epoch": 0.25,
82
+ "learning_rate": 1.745762711864407e-05,
83
+ "loss": 0.6177,
84
+ "step": 60
85
+ },
86
+ {
87
+ "epoch": 0.25,
88
+ "eval_loss": 0.5848062634468079,
89
+ "eval_runtime": 18.5061,
90
+ "eval_samples_per_second": 22.695,
91
+ "eval_steps_per_second": 2.864,
92
+ "step": 60
93
+ },
94
+ {
95
+ "epoch": 0.3,
96
+ "learning_rate": 1.7033898305084745e-05,
97
+ "loss": 0.6196,
98
+ "step": 70
99
+ },
100
+ {
101
+ "epoch": 0.3,
102
+ "eval_loss": 0.5464363098144531,
103
+ "eval_runtime": 18.5102,
104
+ "eval_samples_per_second": 22.69,
105
+ "eval_steps_per_second": 2.863,
106
+ "step": 70
107
+ },
108
+ {
109
+ "epoch": 0.34,
110
+ "learning_rate": 1.6610169491525424e-05,
111
+ "loss": 0.5772,
112
+ "step": 80
113
+ },
114
+ {
115
+ "epoch": 0.34,
116
+ "eval_loss": 0.5307226777076721,
117
+ "eval_runtime": 18.3662,
118
+ "eval_samples_per_second": 22.868,
119
+ "eval_steps_per_second": 2.886,
120
+ "step": 80
121
+ },
122
+ {
123
+ "epoch": 0.38,
124
+ "learning_rate": 1.6186440677966104e-05,
125
+ "loss": 0.5805,
126
+ "step": 90
127
+ },
128
+ {
129
+ "epoch": 0.38,
130
+ "eval_loss": 0.554991602897644,
131
+ "eval_runtime": 17.9687,
132
+ "eval_samples_per_second": 23.374,
133
+ "eval_steps_per_second": 2.95,
134
+ "step": 90
135
+ },
136
+ {
137
+ "epoch": 0.42,
138
+ "learning_rate": 1.576271186440678e-05,
139
+ "loss": 0.6453,
140
+ "step": 100
141
+ },
142
+ {
143
+ "epoch": 0.42,
144
+ "eval_loss": 0.5466664433479309,
145
+ "eval_runtime": 18.0919,
146
+ "eval_samples_per_second": 23.215,
147
+ "eval_steps_per_second": 2.929,
148
+ "step": 100
149
+ },
150
+ {
151
+ "epoch": 0.47,
152
+ "learning_rate": 1.533898305084746e-05,
153
+ "loss": 0.5756,
154
+ "step": 110
155
+ },
156
+ {
157
+ "epoch": 0.47,
158
+ "eval_loss": 0.5586597919464111,
159
+ "eval_runtime": 18.6353,
160
+ "eval_samples_per_second": 22.538,
161
+ "eval_steps_per_second": 2.844,
162
+ "step": 110
163
+ },
164
+ {
165
+ "epoch": 0.51,
166
+ "learning_rate": 1.4915254237288137e-05,
167
+ "loss": 0.5901,
168
+ "step": 120
169
+ },
170
+ {
171
+ "epoch": 0.51,
172
+ "eval_loss": 0.5481747388839722,
173
+ "eval_runtime": 18.326,
174
+ "eval_samples_per_second": 22.918,
175
+ "eval_steps_per_second": 2.892,
176
+ "step": 120
177
+ },
178
+ {
179
+ "epoch": 0.55,
180
+ "learning_rate": 1.4491525423728813e-05,
181
+ "loss": 0.568,
182
+ "step": 130
183
+ },
184
+ {
185
+ "epoch": 0.55,
186
+ "eval_loss": 0.5262647867202759,
187
+ "eval_runtime": 18.2108,
188
+ "eval_samples_per_second": 23.063,
189
+ "eval_steps_per_second": 2.91,
190
+ "step": 130
191
+ },
192
+ {
193
+ "epoch": 0.59,
194
+ "learning_rate": 1.4067796610169493e-05,
195
+ "loss": 0.5452,
196
+ "step": 140
197
+ },
198
+ {
199
+ "epoch": 0.59,
200
+ "eval_loss": 0.5698090195655823,
201
+ "eval_runtime": 18.2551,
202
+ "eval_samples_per_second": 23.007,
203
+ "eval_steps_per_second": 2.903,
204
+ "step": 140
205
+ },
206
+ {
207
+ "epoch": 0.64,
208
+ "learning_rate": 1.364406779661017e-05,
209
+ "loss": 0.5949,
210
+ "step": 150
211
+ },
212
+ {
213
+ "epoch": 0.64,
214
+ "eval_loss": 0.5483840107917786,
215
+ "eval_runtime": 18.0824,
216
+ "eval_samples_per_second": 23.227,
217
+ "eval_steps_per_second": 2.931,
218
+ "step": 150
219
+ },
220
+ {
221
+ "epoch": 0.68,
222
+ "learning_rate": 1.3220338983050848e-05,
223
+ "loss": 0.5537,
224
+ "step": 160
225
+ },
226
+ {
227
+ "epoch": 0.68,
228
+ "eval_loss": 0.578332781791687,
229
+ "eval_runtime": 18.2057,
230
+ "eval_samples_per_second": 23.07,
231
+ "eval_steps_per_second": 2.911,
232
+ "step": 160
233
+ },
234
+ {
235
+ "epoch": 0.72,
236
+ "learning_rate": 1.2796610169491526e-05,
237
+ "loss": 0.5327,
238
+ "step": 170
239
+ },
240
+ {
241
+ "epoch": 0.72,
242
+ "eval_loss": 0.5201743841171265,
243
+ "eval_runtime": 18.1319,
244
+ "eval_samples_per_second": 23.164,
245
+ "eval_steps_per_second": 2.923,
246
+ "step": 170
247
+ }
248
+ ],
249
+ "max_steps": 472,
250
+ "num_train_epochs": 2,
251
+ "total_flos": 5037619130747424.0,
252
+ "trial_name": null,
253
+ "trial_params": null
254
+ }
{checkpoint-10 → checkpoint-170}/training_args.bin RENAMED
File without changes
checkpoint-460/added_tokens.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[CLAIM_END]": 128006,
3
+ "[CLAIM_START]": 128005,
4
+ "[CONCLUDING STATEMENT_END]": 128014,
5
+ "[CONCLUDING STATEMENT_START]": 128013,
6
+ "[COUNTERCLAIM_END]": 128008,
7
+ "[COUNTERCLAIM_START]": 128007,
8
+ "[EVIDENCE_END]": 128012,
9
+ "[EVIDENCE_START]": 128011,
10
+ "[LEAD_END]": 128002,
11
+ "[LEAD_START]": 128001,
12
+ "[MASK]": 128000,
13
+ "[POSITION_END]": 128004,
14
+ "[POSITION_START]": 128003,
15
+ "[REBUTTAL_END]": 128010,
16
+ "[REBUTTAL_START]": 128009
17
+ }
checkpoint-460/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "TTian/deberta-classifier-feedback-1024-pseudo",
3
+ "architectures": [
4
+ "DebertaV2ForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 1024,
10
+ "id2label": {
11
+ "0": "LABEL_0",
12
+ "1": "LABEL_1",
13
+ "2": "LABEL_2"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 4096,
17
+ "label2id": {
18
+ "LABEL_0": 0,
19
+ "LABEL_1": 1,
20
+ "LABEL_2": 2
21
+ },
22
+ "layer_norm_eps": 1e-07,
23
+ "max_position_embeddings": 1024,
24
+ "max_relative_positions": 128,
25
+ "model_type": "deberta-v2",
26
+ "norm_rel_ebd": "layer_norm",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 0,
30
+ "pooler_dropout": 0,
31
+ "pooler_hidden_act": "gelu",
32
+ "pooler_hidden_size": 1024,
33
+ "pos_att_type": [
34
+ "p2c",
35
+ "c2p"
36
+ ],
37
+ "position_biased_input": false,
38
+ "position_buckets": 256,
39
+ "relative_attention": true,
40
+ "share_att_key": true,
41
+ "torch_dtype": "float32",
42
+ "transformers_version": "4.24.0",
43
+ "type_vocab_size": 0,
44
+ "vocab_size": 128100
45
+ }
checkpoint-460/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae4cd9ae51745929f0da171a8f49f3dfcea7e0463e54a178c8a7ab370ff78219
3
+ size 3472349985
checkpoint-460/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40d408ef894b4d94bd894f9d7ea0b2b3182b1ce4882f2f6e97094d38b886d5b
3
+ size 1736202543
checkpoint-460/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:914efd757fd9b9805363c6c356f593ac5c732f0874ffac17f8cbc6209ec8ce66
3
+ size 14503
checkpoint-460/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e8a863a9becce90fdc872120c273538d6f7b6b8e69c0f99731af8003375805
3
+ size 559
checkpoint-460/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a995b69aa8c9ee937f3e6ff6a927576563844156b137bbb3e8e4e086877fd195
3
+ size 623
checkpoint-460/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[LEAD_START]",
4
+ "[LEAD_END]",
5
+ "[POSITION_START]",
6
+ "[POSITION_END]",
7
+ "[CLAIM_START]",
8
+ "[CLAIM_END]",
9
+ "[COUNTERCLAIM_START]",
10
+ "[COUNTERCLAIM_END]",
11
+ "[REBUTTAL_START]",
12
+ "[REBUTTAL_END]",
13
+ "[EVIDENCE_START]",
14
+ "[EVIDENCE_END]",
15
+ "[CONCLUDING STATEMENT_START]",
16
+ "[CONCLUDING STATEMENT_END]"
17
+ ],
18
+ "bos_token": "[CLS]",
19
+ "cls_token": "[CLS]",
20
+ "eos_token": "[SEP]",
21
+ "mask_token": "[MASK]",
22
+ "pad_token": "[PAD]",
23
+ "sep_token": "[SEP]",
24
+ "unk_token": "[UNK]"
25
+ }
checkpoint-460/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-460/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-460/tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[LEAD_START]",
4
+ "[LEAD_END]",
5
+ "[POSITION_START]",
6
+ "[POSITION_END]",
7
+ "[CLAIM_START]",
8
+ "[CLAIM_END]",
9
+ "[COUNTERCLAIM_START]",
10
+ "[COUNTERCLAIM_END]",
11
+ "[REBUTTAL_START]",
12
+ "[REBUTTAL_END]",
13
+ "[EVIDENCE_START]",
14
+ "[EVIDENCE_END]",
15
+ "[CONCLUDING STATEMENT_START]",
16
+ "[CONCLUDING STATEMENT_END]"
17
+ ],
18
+ "bos_token": "[CLS]",
19
+ "cls_token": "[CLS]",
20
+ "do_lower_case": false,
21
+ "eos_token": "[SEP]",
22
+ "mask_token": "[MASK]",
23
+ "name_or_path": "TTian/deberta-classifier-feedback-1024-pseudo",
24
+ "pad_token": "[PAD]",
25
+ "sep_token": "[SEP]",
26
+ "sp_model_kwargs": {},
27
+ "special_tokens_map_file": null,
28
+ "split_by_punct": false,
29
+ "tokenizer_class": "DebertaV2Tokenizer",
30
+ "unk_token": "[UNK]",
31
+ "vocab_type": "spm"
32
+ }
checkpoint-460/trainer_state.json ADDED
@@ -0,0 +1,660 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5201743841171265,
3
+ "best_model_checkpoint": "deberta-classifier-feedback-1024-pseudo-final/checkpoint-170",
4
+ "epoch": 1.9491525423728815,
5
+ "global_step": 460,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 1.9576271186440678e-05,
13
+ "loss": 0.5814,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "eval_loss": 0.5888153910636902,
19
+ "eval_runtime": 18.2433,
20
+ "eval_samples_per_second": 23.022,
21
+ "eval_steps_per_second": 2.905,
22
+ "step": 10
23
+ },
24
+ {
25
+ "epoch": 0.08,
26
+ "learning_rate": 1.9152542372881357e-05,
27
+ "loss": 0.5521,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.08,
32
+ "eval_loss": 0.5736112594604492,
33
+ "eval_runtime": 18.7271,
34
+ "eval_samples_per_second": 22.427,
35
+ "eval_steps_per_second": 2.83,
36
+ "step": 20
37
+ },
38
+ {
39
+ "epoch": 0.13,
40
+ "learning_rate": 1.8728813559322033e-05,
41
+ "loss": 0.5685,
42
+ "step": 30
43
+ },
44
+ {
45
+ "epoch": 0.13,
46
+ "eval_loss": 0.5809019804000854,
47
+ "eval_runtime": 17.2407,
48
+ "eval_samples_per_second": 24.361,
49
+ "eval_steps_per_second": 3.074,
50
+ "step": 30
51
+ },
52
+ {
53
+ "epoch": 0.17,
54
+ "learning_rate": 1.8305084745762713e-05,
55
+ "loss": 0.6052,
56
+ "step": 40
57
+ },
58
+ {
59
+ "epoch": 0.17,
60
+ "eval_loss": 0.5701586008071899,
61
+ "eval_runtime": 18.6909,
62
+ "eval_samples_per_second": 22.471,
63
+ "eval_steps_per_second": 2.836,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 0.21,
68
+ "learning_rate": 1.788135593220339e-05,
69
+ "loss": 0.5532,
70
+ "step": 50
71
+ },
72
+ {
73
+ "epoch": 0.21,
74
+ "eval_loss": 0.5571172833442688,
75
+ "eval_runtime": 18.369,
76
+ "eval_samples_per_second": 22.865,
77
+ "eval_steps_per_second": 2.885,
78
+ "step": 50
79
+ },
80
+ {
81
+ "epoch": 0.25,
82
+ "learning_rate": 1.745762711864407e-05,
83
+ "loss": 0.6177,
84
+ "step": 60
85
+ },
86
+ {
87
+ "epoch": 0.25,
88
+ "eval_loss": 0.5848062634468079,
89
+ "eval_runtime": 18.5061,
90
+ "eval_samples_per_second": 22.695,
91
+ "eval_steps_per_second": 2.864,
92
+ "step": 60
93
+ },
94
+ {
95
+ "epoch": 0.3,
96
+ "learning_rate": 1.7033898305084745e-05,
97
+ "loss": 0.6196,
98
+ "step": 70
99
+ },
100
+ {
101
+ "epoch": 0.3,
102
+ "eval_loss": 0.5464363098144531,
103
+ "eval_runtime": 18.5102,
104
+ "eval_samples_per_second": 22.69,
105
+ "eval_steps_per_second": 2.863,
106
+ "step": 70
107
+ },
108
+ {
109
+ "epoch": 0.34,
110
+ "learning_rate": 1.6610169491525424e-05,
111
+ "loss": 0.5772,
112
+ "step": 80
113
+ },
114
+ {
115
+ "epoch": 0.34,
116
+ "eval_loss": 0.5307226777076721,
117
+ "eval_runtime": 18.3662,
118
+ "eval_samples_per_second": 22.868,
119
+ "eval_steps_per_second": 2.886,
120
+ "step": 80
121
+ },
122
+ {
123
+ "epoch": 0.38,
124
+ "learning_rate": 1.6186440677966104e-05,
125
+ "loss": 0.5805,
126
+ "step": 90
127
+ },
128
+ {
129
+ "epoch": 0.38,
130
+ "eval_loss": 0.554991602897644,
131
+ "eval_runtime": 17.9687,
132
+ "eval_samples_per_second": 23.374,
133
+ "eval_steps_per_second": 2.95,
134
+ "step": 90
135
+ },
136
+ {
137
+ "epoch": 0.42,
138
+ "learning_rate": 1.576271186440678e-05,
139
+ "loss": 0.6453,
140
+ "step": 100
141
+ },
142
+ {
143
+ "epoch": 0.42,
144
+ "eval_loss": 0.5466664433479309,
145
+ "eval_runtime": 18.0919,
146
+ "eval_samples_per_second": 23.215,
147
+ "eval_steps_per_second": 2.929,
148
+ "step": 100
149
+ },
150
+ {
151
+ "epoch": 0.47,
152
+ "learning_rate": 1.533898305084746e-05,
153
+ "loss": 0.5756,
154
+ "step": 110
155
+ },
156
+ {
157
+ "epoch": 0.47,
158
+ "eval_loss": 0.5586597919464111,
159
+ "eval_runtime": 18.6353,
160
+ "eval_samples_per_second": 22.538,
161
+ "eval_steps_per_second": 2.844,
162
+ "step": 110
163
+ },
164
+ {
165
+ "epoch": 0.51,
166
+ "learning_rate": 1.4915254237288137e-05,
167
+ "loss": 0.5901,
168
+ "step": 120
169
+ },
170
+ {
171
+ "epoch": 0.51,
172
+ "eval_loss": 0.5481747388839722,
173
+ "eval_runtime": 18.326,
174
+ "eval_samples_per_second": 22.918,
175
+ "eval_steps_per_second": 2.892,
176
+ "step": 120
177
+ },
178
+ {
179
+ "epoch": 0.55,
180
+ "learning_rate": 1.4491525423728813e-05,
181
+ "loss": 0.568,
182
+ "step": 130
183
+ },
184
+ {
185
+ "epoch": 0.55,
186
+ "eval_loss": 0.5262647867202759,
187
+ "eval_runtime": 18.2108,
188
+ "eval_samples_per_second": 23.063,
189
+ "eval_steps_per_second": 2.91,
190
+ "step": 130
191
+ },
192
+ {
193
+ "epoch": 0.59,
194
+ "learning_rate": 1.4067796610169493e-05,
195
+ "loss": 0.5452,
196
+ "step": 140
197
+ },
198
+ {
199
+ "epoch": 0.59,
200
+ "eval_loss": 0.5698090195655823,
201
+ "eval_runtime": 18.2551,
202
+ "eval_samples_per_second": 23.007,
203
+ "eval_steps_per_second": 2.903,
204
+ "step": 140
205
+ },
206
+ {
207
+ "epoch": 0.64,
208
+ "learning_rate": 1.364406779661017e-05,
209
+ "loss": 0.5949,
210
+ "step": 150
211
+ },
212
+ {
213
+ "epoch": 0.64,
214
+ "eval_loss": 0.5483840107917786,
215
+ "eval_runtime": 18.0824,
216
+ "eval_samples_per_second": 23.227,
217
+ "eval_steps_per_second": 2.931,
218
+ "step": 150
219
+ },
220
+ {
221
+ "epoch": 0.68,
222
+ "learning_rate": 1.3220338983050848e-05,
223
+ "loss": 0.5537,
224
+ "step": 160
225
+ },
226
+ {
227
+ "epoch": 0.68,
228
+ "eval_loss": 0.578332781791687,
229
+ "eval_runtime": 18.2057,
230
+ "eval_samples_per_second": 23.07,
231
+ "eval_steps_per_second": 2.911,
232
+ "step": 160
233
+ },
234
+ {
235
+ "epoch": 0.72,
236
+ "learning_rate": 1.2796610169491526e-05,
237
+ "loss": 0.5327,
238
+ "step": 170
239
+ },
240
+ {
241
+ "epoch": 0.72,
242
+ "eval_loss": 0.5201743841171265,
243
+ "eval_runtime": 18.1319,
244
+ "eval_samples_per_second": 23.164,
245
+ "eval_steps_per_second": 2.923,
246
+ "step": 170
247
+ },
248
+ {
249
+ "epoch": 0.76,
250
+ "learning_rate": 1.2372881355932205e-05,
251
+ "loss": 0.5449,
252
+ "step": 180
253
+ },
254
+ {
255
+ "epoch": 0.76,
256
+ "eval_loss": 0.5272189378738403,
257
+ "eval_runtime": 18.125,
258
+ "eval_samples_per_second": 23.172,
259
+ "eval_steps_per_second": 2.924,
260
+ "step": 180
261
+ },
262
+ {
263
+ "epoch": 0.81,
264
+ "learning_rate": 1.1949152542372882e-05,
265
+ "loss": 0.5345,
266
+ "step": 190
267
+ },
268
+ {
269
+ "epoch": 0.81,
270
+ "eval_loss": 0.5621271133422852,
271
+ "eval_runtime": 18.0129,
272
+ "eval_samples_per_second": 23.317,
273
+ "eval_steps_per_second": 2.942,
274
+ "step": 190
275
+ },
276
+ {
277
+ "epoch": 0.85,
278
+ "learning_rate": 1.1525423728813561e-05,
279
+ "loss": 0.5837,
280
+ "step": 200
281
+ },
282
+ {
283
+ "epoch": 0.85,
284
+ "eval_loss": 0.55014967918396,
285
+ "eval_runtime": 18.0302,
286
+ "eval_samples_per_second": 23.294,
287
+ "eval_steps_per_second": 2.94,
288
+ "step": 200
289
+ },
290
+ {
291
+ "epoch": 0.89,
292
+ "learning_rate": 1.1101694915254237e-05,
293
+ "loss": 0.5969,
294
+ "step": 210
295
+ },
296
+ {
297
+ "epoch": 0.89,
298
+ "eval_loss": 0.5470077395439148,
299
+ "eval_runtime": 17.9721,
300
+ "eval_samples_per_second": 23.37,
301
+ "eval_steps_per_second": 2.949,
302
+ "step": 210
303
+ },
304
+ {
305
+ "epoch": 0.93,
306
+ "learning_rate": 1.0677966101694917e-05,
307
+ "loss": 0.5905,
308
+ "step": 220
309
+ },
310
+ {
311
+ "epoch": 0.93,
312
+ "eval_loss": 0.5924287438392639,
313
+ "eval_runtime": 18.0444,
314
+ "eval_samples_per_second": 23.276,
315
+ "eval_steps_per_second": 2.937,
316
+ "step": 220
317
+ },
318
+ {
319
+ "epoch": 0.97,
320
+ "learning_rate": 1.0254237288135593e-05,
321
+ "loss": 0.5481,
322
+ "step": 230
323
+ },
324
+ {
325
+ "epoch": 0.97,
326
+ "eval_loss": 0.5415045022964478,
327
+ "eval_runtime": 17.9371,
328
+ "eval_samples_per_second": 23.415,
329
+ "eval_steps_per_second": 2.955,
330
+ "step": 230
331
+ },
332
+ {
333
+ "epoch": 1.02,
334
+ "learning_rate": 9.830508474576272e-06,
335
+ "loss": 0.5035,
336
+ "step": 240
337
+ },
338
+ {
339
+ "epoch": 1.02,
340
+ "eval_loss": 0.5320823788642883,
341
+ "eval_runtime": 17.9879,
342
+ "eval_samples_per_second": 23.349,
343
+ "eval_steps_per_second": 2.946,
344
+ "step": 240
345
+ },
346
+ {
347
+ "epoch": 1.06,
348
+ "learning_rate": 9.40677966101695e-06,
349
+ "loss": 0.4508,
350
+ "step": 250
351
+ },
352
+ {
353
+ "epoch": 1.06,
354
+ "eval_loss": 0.5371343493461609,
355
+ "eval_runtime": 17.4824,
356
+ "eval_samples_per_second": 24.024,
357
+ "eval_steps_per_second": 3.032,
358
+ "step": 250
359
+ },
360
+ {
361
+ "epoch": 1.1,
362
+ "learning_rate": 8.983050847457628e-06,
363
+ "loss": 0.4227,
364
+ "step": 260
365
+ },
366
+ {
367
+ "epoch": 1.1,
368
+ "eval_loss": 0.5276100635528564,
369
+ "eval_runtime": 18.1362,
370
+ "eval_samples_per_second": 23.158,
371
+ "eval_steps_per_second": 2.922,
372
+ "step": 260
373
+ },
374
+ {
375
+ "epoch": 1.14,
376
+ "learning_rate": 8.559322033898306e-06,
377
+ "loss": 0.4423,
378
+ "step": 270
379
+ },
380
+ {
381
+ "epoch": 1.14,
382
+ "eval_loss": 0.532426118850708,
383
+ "eval_runtime": 17.9907,
384
+ "eval_samples_per_second": 23.345,
385
+ "eval_steps_per_second": 2.946,
386
+ "step": 270
387
+ },
388
+ {
389
+ "epoch": 1.19,
390
+ "learning_rate": 8.135593220338983e-06,
391
+ "loss": 0.432,
392
+ "step": 280
393
+ },
394
+ {
395
+ "epoch": 1.19,
396
+ "eval_loss": 0.5377896428108215,
397
+ "eval_runtime": 17.4953,
398
+ "eval_samples_per_second": 24.006,
399
+ "eval_steps_per_second": 3.029,
400
+ "step": 280
401
+ },
402
+ {
403
+ "epoch": 1.23,
404
+ "learning_rate": 7.711864406779663e-06,
405
+ "loss": 0.4317,
406
+ "step": 290
407
+ },
408
+ {
409
+ "epoch": 1.23,
410
+ "eval_loss": 0.5301514863967896,
411
+ "eval_runtime": 18.2183,
412
+ "eval_samples_per_second": 23.054,
413
+ "eval_steps_per_second": 2.909,
414
+ "step": 290
415
+ },
416
+ {
417
+ "epoch": 1.27,
418
+ "learning_rate": 7.288135593220339e-06,
419
+ "loss": 0.46,
420
+ "step": 300
421
+ },
422
+ {
423
+ "epoch": 1.27,
424
+ "eval_loss": 0.5301567316055298,
425
+ "eval_runtime": 18.4315,
426
+ "eval_samples_per_second": 22.787,
427
+ "eval_steps_per_second": 2.876,
428
+ "step": 300
429
+ },
430
+ {
431
+ "epoch": 1.31,
432
+ "learning_rate": 6.864406779661017e-06,
433
+ "loss": 0.435,
434
+ "step": 310
435
+ },
436
+ {
437
+ "epoch": 1.31,
438
+ "eval_loss": 0.5325623750686646,
439
+ "eval_runtime": 17.6821,
440
+ "eval_samples_per_second": 23.753,
441
+ "eval_steps_per_second": 2.997,
442
+ "step": 310
443
+ },
444
+ {
445
+ "epoch": 1.36,
446
+ "learning_rate": 6.440677966101695e-06,
447
+ "loss": 0.3813,
448
+ "step": 320
449
+ },
450
+ {
451
+ "epoch": 1.36,
452
+ "eval_loss": 0.5431253910064697,
453
+ "eval_runtime": 18.5006,
454
+ "eval_samples_per_second": 22.702,
455
+ "eval_steps_per_second": 2.865,
456
+ "step": 320
457
+ },
458
+ {
459
+ "epoch": 1.4,
460
+ "learning_rate": 6.0169491525423725e-06,
461
+ "loss": 0.4422,
462
+ "step": 330
463
+ },
464
+ {
465
+ "epoch": 1.4,
466
+ "eval_loss": 0.5322949290275574,
467
+ "eval_runtime": 18.4759,
468
+ "eval_samples_per_second": 22.732,
469
+ "eval_steps_per_second": 2.869,
470
+ "step": 330
471
+ },
472
+ {
473
+ "epoch": 1.44,
474
+ "learning_rate": 5.593220338983051e-06,
475
+ "loss": 0.4298,
476
+ "step": 340
477
+ },
478
+ {
479
+ "epoch": 1.44,
480
+ "eval_loss": 0.5574814677238464,
481
+ "eval_runtime": 17.6896,
482
+ "eval_samples_per_second": 23.743,
483
+ "eval_steps_per_second": 2.996,
484
+ "step": 340
485
+ },
486
+ {
487
+ "epoch": 1.48,
488
+ "learning_rate": 5.169491525423729e-06,
489
+ "loss": 0.5068,
490
+ "step": 350
491
+ },
492
+ {
493
+ "epoch": 1.48,
494
+ "eval_loss": 0.5528993606567383,
495
+ "eval_runtime": 18.3232,
496
+ "eval_samples_per_second": 22.922,
497
+ "eval_steps_per_second": 2.893,
498
+ "step": 350
499
+ },
500
+ {
501
+ "epoch": 1.53,
502
+ "learning_rate": 4.745762711864408e-06,
503
+ "loss": 0.4619,
504
+ "step": 360
505
+ },
506
+ {
507
+ "epoch": 1.53,
508
+ "eval_loss": 0.5589260458946228,
509
+ "eval_runtime": 18.3038,
510
+ "eval_samples_per_second": 22.946,
511
+ "eval_steps_per_second": 2.896,
512
+ "step": 360
513
+ },
514
+ {
515
+ "epoch": 1.57,
516
+ "learning_rate": 4.322033898305085e-06,
517
+ "loss": 0.4852,
518
+ "step": 370
519
+ },
520
+ {
521
+ "epoch": 1.57,
522
+ "eval_loss": 0.5255549550056458,
523
+ "eval_runtime": 18.4908,
524
+ "eval_samples_per_second": 22.714,
525
+ "eval_steps_per_second": 2.866,
526
+ "step": 370
527
+ },
528
+ {
529
+ "epoch": 1.61,
530
+ "learning_rate": 3.898305084745763e-06,
531
+ "loss": 0.3888,
532
+ "step": 380
533
+ },
534
+ {
535
+ "epoch": 1.61,
536
+ "eval_loss": 0.5730893611907959,
537
+ "eval_runtime": 18.3761,
538
+ "eval_samples_per_second": 22.856,
539
+ "eval_steps_per_second": 2.884,
540
+ "step": 380
541
+ },
542
+ {
543
+ "epoch": 1.65,
544
+ "learning_rate": 3.474576271186441e-06,
545
+ "loss": 0.4319,
546
+ "step": 390
547
+ },
548
+ {
549
+ "epoch": 1.65,
550
+ "eval_loss": 0.5334990620613098,
551
+ "eval_runtime": 18.4738,
552
+ "eval_samples_per_second": 22.735,
553
+ "eval_steps_per_second": 2.869,
554
+ "step": 390
555
+ },
556
+ {
557
+ "epoch": 1.69,
558
+ "learning_rate": 3.0508474576271192e-06,
559
+ "loss": 0.4422,
560
+ "step": 400
561
+ },
562
+ {
563
+ "epoch": 1.69,
564
+ "eval_loss": 0.5419171452522278,
565
+ "eval_runtime": 18.5281,
566
+ "eval_samples_per_second": 22.668,
567
+ "eval_steps_per_second": 2.861,
568
+ "step": 400
569
+ },
570
+ {
571
+ "epoch": 1.74,
572
+ "learning_rate": 2.627118644067797e-06,
573
+ "loss": 0.4522,
574
+ "step": 410
575
+ },
576
+ {
577
+ "epoch": 1.74,
578
+ "eval_loss": 0.5547201037406921,
579
+ "eval_runtime": 18.4837,
580
+ "eval_samples_per_second": 22.723,
581
+ "eval_steps_per_second": 2.867,
582
+ "step": 410
583
+ },
584
+ {
585
+ "epoch": 1.78,
586
+ "learning_rate": 2.203389830508475e-06,
587
+ "loss": 0.4276,
588
+ "step": 420
589
+ },
590
+ {
591
+ "epoch": 1.78,
592
+ "eval_loss": 0.5263144373893738,
593
+ "eval_runtime": 18.6045,
594
+ "eval_samples_per_second": 22.575,
595
+ "eval_steps_per_second": 2.849,
596
+ "step": 420
597
+ },
598
+ {
599
+ "epoch": 1.82,
600
+ "learning_rate": 1.7796610169491526e-06,
601
+ "loss": 0.3988,
602
+ "step": 430
603
+ },
604
+ {
605
+ "epoch": 1.82,
606
+ "eval_loss": 0.5480612516403198,
607
+ "eval_runtime": 18.5789,
608
+ "eval_samples_per_second": 22.606,
609
+ "eval_steps_per_second": 2.853,
610
+ "step": 430
611
+ },
612
+ {
613
+ "epoch": 1.86,
614
+ "learning_rate": 1.3559322033898307e-06,
615
+ "loss": 0.4063,
616
+ "step": 440
617
+ },
618
+ {
619
+ "epoch": 1.86,
620
+ "eval_loss": 0.5404064655303955,
621
+ "eval_runtime": 18.6623,
622
+ "eval_samples_per_second": 22.505,
623
+ "eval_steps_per_second": 2.84,
624
+ "step": 440
625
+ },
626
+ {
627
+ "epoch": 1.91,
628
+ "learning_rate": 9.322033898305086e-07,
629
+ "loss": 0.4141,
630
+ "step": 450
631
+ },
632
+ {
633
+ "epoch": 1.91,
634
+ "eval_loss": 0.5292345881462097,
635
+ "eval_runtime": 18.5061,
636
+ "eval_samples_per_second": 22.695,
637
+ "eval_steps_per_second": 2.864,
638
+ "step": 450
639
+ },
640
+ {
641
+ "epoch": 1.95,
642
+ "learning_rate": 5.084745762711865e-07,
643
+ "loss": 0.4149,
644
+ "step": 460
645
+ },
646
+ {
647
+ "epoch": 1.95,
648
+ "eval_loss": 0.5240865349769592,
649
+ "eval_runtime": 18.7251,
650
+ "eval_samples_per_second": 22.43,
651
+ "eval_steps_per_second": 2.83,
652
+ "step": 460
653
+ }
654
+ ],
655
+ "max_steps": 472,
656
+ "num_train_epochs": 2,
657
+ "total_flos": 1.3624153973519232e+16,
658
+ "trial_name": null,
659
+ "trial_params": null
660
+ }
checkpoint-460/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48b5fec3efb2c118a302d92fce79c45311f12529781bf0db59fab4b9060a4ffc
3
+ size 3439
checkpoint-470/added_tokens.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[CLAIM_END]": 128006,
3
+ "[CLAIM_START]": 128005,
4
+ "[CONCLUDING STATEMENT_END]": 128014,
5
+ "[CONCLUDING STATEMENT_START]": 128013,
6
+ "[COUNTERCLAIM_END]": 128008,
7
+ "[COUNTERCLAIM_START]": 128007,
8
+ "[EVIDENCE_END]": 128012,
9
+ "[EVIDENCE_START]": 128011,
10
+ "[LEAD_END]": 128002,
11
+ "[LEAD_START]": 128001,
12
+ "[MASK]": 128000,
13
+ "[POSITION_END]": 128004,
14
+ "[POSITION_START]": 128003,
15
+ "[REBUTTAL_END]": 128010,
16
+ "[REBUTTAL_START]": 128009
17
+ }
checkpoint-470/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "TTian/deberta-classifier-feedback-1024-pseudo",
3
+ "architectures": [
4
+ "DebertaV2ForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 1024,
10
+ "id2label": {
11
+ "0": "LABEL_0",
12
+ "1": "LABEL_1",
13
+ "2": "LABEL_2"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 4096,
17
+ "label2id": {
18
+ "LABEL_0": 0,
19
+ "LABEL_1": 1,
20
+ "LABEL_2": 2
21
+ },
22
+ "layer_norm_eps": 1e-07,
23
+ "max_position_embeddings": 1024,
24
+ "max_relative_positions": 128,
25
+ "model_type": "deberta-v2",
26
+ "norm_rel_ebd": "layer_norm",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 0,
30
+ "pooler_dropout": 0,
31
+ "pooler_hidden_act": "gelu",
32
+ "pooler_hidden_size": 1024,
33
+ "pos_att_type": [
34
+ "p2c",
35
+ "c2p"
36
+ ],
37
+ "position_biased_input": false,
38
+ "position_buckets": 256,
39
+ "relative_attention": true,
40
+ "share_att_key": true,
41
+ "torch_dtype": "float32",
42
+ "transformers_version": "4.24.0",
43
+ "type_vocab_size": 0,
44
+ "vocab_size": 128100
45
+ }
checkpoint-470/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e710207000a905882be89e864b0e7bd6e6c5e117db1ae88fed9238edc81c5863
3
+ size 3472349985
checkpoint-470/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5156a78b24e5ec2e687d939de2fc28f3c2ab423db3c2550047e2bd0553972703
3
+ size 1736202543
checkpoint-470/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71d0726f7f19ab288def8a1d0a35ca529db667c9b2b072cf5525b1c24b1de328
3
+ size 14503
checkpoint-470/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34356eaa271de388a24b2f5b1cfc2ad5f344c0b7ca336f603906ce9b0eee12df
3
+ size 559
checkpoint-470/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cde9ccd310f396b601a3397b93b0454b9791f79a5af0e6e4087719f99ed5eb9
3
+ size 623
checkpoint-470/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[LEAD_START]",
4
+ "[LEAD_END]",
5
+ "[POSITION_START]",
6
+ "[POSITION_END]",
7
+ "[CLAIM_START]",
8
+ "[CLAIM_END]",
9
+ "[COUNTERCLAIM_START]",
10
+ "[COUNTERCLAIM_END]",
11
+ "[REBUTTAL_START]",
12
+ "[REBUTTAL_END]",
13
+ "[EVIDENCE_START]",
14
+ "[EVIDENCE_END]",
15
+ "[CONCLUDING STATEMENT_START]",
16
+ "[CONCLUDING STATEMENT_END]"
17
+ ],
18
+ "bos_token": "[CLS]",
19
+ "cls_token": "[CLS]",
20
+ "eos_token": "[SEP]",
21
+ "mask_token": "[MASK]",
22
+ "pad_token": "[PAD]",
23
+ "sep_token": "[SEP]",
24
+ "unk_token": "[UNK]"
25
+ }
checkpoint-470/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
checkpoint-470/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-470/tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[LEAD_START]",
4
+ "[LEAD_END]",
5
+ "[POSITION_START]",
6
+ "[POSITION_END]",
7
+ "[CLAIM_START]",
8
+ "[CLAIM_END]",
9
+ "[COUNTERCLAIM_START]",
10
+ "[COUNTERCLAIM_END]",
11
+ "[REBUTTAL_START]",
12
+ "[REBUTTAL_END]",
13
+ "[EVIDENCE_START]",
14
+ "[EVIDENCE_END]",
15
+ "[CONCLUDING STATEMENT_START]",
16
+ "[CONCLUDING STATEMENT_END]"
17
+ ],
18
+ "bos_token": "[CLS]",
19
+ "cls_token": "[CLS]",
20
+ "do_lower_case": false,
21
+ "eos_token": "[SEP]",
22
+ "mask_token": "[MASK]",
23
+ "name_or_path": "TTian/deberta-classifier-feedback-1024-pseudo",
24
+ "pad_token": "[PAD]",
25
+ "sep_token": "[SEP]",
26
+ "sp_model_kwargs": {},
27
+ "special_tokens_map_file": null,
28
+ "split_by_punct": false,
29
+ "tokenizer_class": "DebertaV2Tokenizer",
30
+ "unk_token": "[UNK]",
31
+ "vocab_type": "spm"
32
+ }
checkpoint-470/trainer_state.json ADDED
@@ -0,0 +1,674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5201743841171265,
3
+ "best_model_checkpoint": "deberta-classifier-feedback-1024-pseudo-final/checkpoint-170",
4
+ "epoch": 1.9915254237288136,
5
+ "global_step": 470,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.04,
12
+ "learning_rate": 1.9576271186440678e-05,
13
+ "loss": 0.5814,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "eval_loss": 0.5888153910636902,
19
+ "eval_runtime": 18.2433,
20
+ "eval_samples_per_second": 23.022,
21
+ "eval_steps_per_second": 2.905,
22
+ "step": 10
23
+ },
24
+ {
25
+ "epoch": 0.08,
26
+ "learning_rate": 1.9152542372881357e-05,
27
+ "loss": 0.5521,
28
+ "step": 20
29
+ },
30
+ {
31
+ "epoch": 0.08,
32
+ "eval_loss": 0.5736112594604492,
33
+ "eval_runtime": 18.7271,
34
+ "eval_samples_per_second": 22.427,
35
+ "eval_steps_per_second": 2.83,
36
+ "step": 20
37
+ },
38
+ {
39
+ "epoch": 0.13,
40
+ "learning_rate": 1.8728813559322033e-05,
41
+ "loss": 0.5685,
42
+ "step": 30
43
+ },
44
+ {
45
+ "epoch": 0.13,
46
+ "eval_loss": 0.5809019804000854,
47
+ "eval_runtime": 17.2407,
48
+ "eval_samples_per_second": 24.361,
49
+ "eval_steps_per_second": 3.074,
50
+ "step": 30
51
+ },
52
+ {
53
+ "epoch": 0.17,
54
+ "learning_rate": 1.8305084745762713e-05,
55
+ "loss": 0.6052,
56
+ "step": 40
57
+ },
58
+ {
59
+ "epoch": 0.17,
60
+ "eval_loss": 0.5701586008071899,
61
+ "eval_runtime": 18.6909,
62
+ "eval_samples_per_second": 22.471,
63
+ "eval_steps_per_second": 2.836,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 0.21,
68
+ "learning_rate": 1.788135593220339e-05,
69
+ "loss": 0.5532,
70
+ "step": 50
71
+ },
72
+ {
73
+ "epoch": 0.21,
74
+ "eval_loss": 0.5571172833442688,
75
+ "eval_runtime": 18.369,
76
+ "eval_samples_per_second": 22.865,
77
+ "eval_steps_per_second": 2.885,
78
+ "step": 50
79
+ },
80
+ {
81
+ "epoch": 0.25,
82
+ "learning_rate": 1.745762711864407e-05,
83
+ "loss": 0.6177,
84
+ "step": 60
85
+ },
86
+ {
87
+ "epoch": 0.25,
88
+ "eval_loss": 0.5848062634468079,
89
+ "eval_runtime": 18.5061,
90
+ "eval_samples_per_second": 22.695,
91
+ "eval_steps_per_second": 2.864,
92
+ "step": 60
93
+ },
94
+ {
95
+ "epoch": 0.3,
96
+ "learning_rate": 1.7033898305084745e-05,
97
+ "loss": 0.6196,
98
+ "step": 70
99
+ },
100
+ {
101
+ "epoch": 0.3,
102
+ "eval_loss": 0.5464363098144531,
103
+ "eval_runtime": 18.5102,
104
+ "eval_samples_per_second": 22.69,
105
+ "eval_steps_per_second": 2.863,
106
+ "step": 70
107
+ },
108
+ {
109
+ "epoch": 0.34,
110
+ "learning_rate": 1.6610169491525424e-05,
111
+ "loss": 0.5772,
112
+ "step": 80
113
+ },
114
+ {
115
+ "epoch": 0.34,
116
+ "eval_loss": 0.5307226777076721,
117
+ "eval_runtime": 18.3662,
118
+ "eval_samples_per_second": 22.868,
119
+ "eval_steps_per_second": 2.886,
120
+ "step": 80
121
+ },
122
+ {
123
+ "epoch": 0.38,
124
+ "learning_rate": 1.6186440677966104e-05,
125
+ "loss": 0.5805,
126
+ "step": 90
127
+ },
128
+ {
129
+ "epoch": 0.38,
130
+ "eval_loss": 0.554991602897644,
131
+ "eval_runtime": 17.9687,
132
+ "eval_samples_per_second": 23.374,
133
+ "eval_steps_per_second": 2.95,
134
+ "step": 90
135
+ },
136
+ {
137
+ "epoch": 0.42,
138
+ "learning_rate": 1.576271186440678e-05,
139
+ "loss": 0.6453,
140
+ "step": 100
141
+ },
142
+ {
143
+ "epoch": 0.42,
144
+ "eval_loss": 0.5466664433479309,
145
+ "eval_runtime": 18.0919,
146
+ "eval_samples_per_second": 23.215,
147
+ "eval_steps_per_second": 2.929,
148
+ "step": 100
149
+ },
150
+ {
151
+ "epoch": 0.47,
152
+ "learning_rate": 1.533898305084746e-05,
153
+ "loss": 0.5756,
154
+ "step": 110
155
+ },
156
+ {
157
+ "epoch": 0.47,
158
+ "eval_loss": 0.5586597919464111,
159
+ "eval_runtime": 18.6353,
160
+ "eval_samples_per_second": 22.538,
161
+ "eval_steps_per_second": 2.844,
162
+ "step": 110
163
+ },
164
+ {
165
+ "epoch": 0.51,
166
+ "learning_rate": 1.4915254237288137e-05,
167
+ "loss": 0.5901,
168
+ "step": 120
169
+ },
170
+ {
171
+ "epoch": 0.51,
172
+ "eval_loss": 0.5481747388839722,
173
+ "eval_runtime": 18.326,
174
+ "eval_samples_per_second": 22.918,
175
+ "eval_steps_per_second": 2.892,
176
+ "step": 120
177
+ },
178
+ {
179
+ "epoch": 0.55,
180
+ "learning_rate": 1.4491525423728813e-05,
181
+ "loss": 0.568,
182
+ "step": 130
183
+ },
184
+ {
185
+ "epoch": 0.55,
186
+ "eval_loss": 0.5262647867202759,
187
+ "eval_runtime": 18.2108,
188
+ "eval_samples_per_second": 23.063,
189
+ "eval_steps_per_second": 2.91,
190
+ "step": 130
191
+ },
192
+ {
193
+ "epoch": 0.59,
194
+ "learning_rate": 1.4067796610169493e-05,
195
+ "loss": 0.5452,
196
+ "step": 140
197
+ },
198
+ {
199
+ "epoch": 0.59,
200
+ "eval_loss": 0.5698090195655823,
201
+ "eval_runtime": 18.2551,
202
+ "eval_samples_per_second": 23.007,
203
+ "eval_steps_per_second": 2.903,
204
+ "step": 140
205
+ },
206
+ {
207
+ "epoch": 0.64,
208
+ "learning_rate": 1.364406779661017e-05,
209
+ "loss": 0.5949,
210
+ "step": 150
211
+ },
212
+ {
213
+ "epoch": 0.64,
214
+ "eval_loss": 0.5483840107917786,
215
+ "eval_runtime": 18.0824,
216
+ "eval_samples_per_second": 23.227,
217
+ "eval_steps_per_second": 2.931,
218
+ "step": 150
219
+ },
220
+ {
221
+ "epoch": 0.68,
222
+ "learning_rate": 1.3220338983050848e-05,
223
+ "loss": 0.5537,
224
+ "step": 160
225
+ },
226
+ {
227
+ "epoch": 0.68,
228
+ "eval_loss": 0.578332781791687,
229
+ "eval_runtime": 18.2057,
230
+ "eval_samples_per_second": 23.07,
231
+ "eval_steps_per_second": 2.911,
232
+ "step": 160
233
+ },
234
+ {
235
+ "epoch": 0.72,
236
+ "learning_rate": 1.2796610169491526e-05,
237
+ "loss": 0.5327,
238
+ "step": 170
239
+ },
240
+ {
241
+ "epoch": 0.72,
242
+ "eval_loss": 0.5201743841171265,
243
+ "eval_runtime": 18.1319,
244
+ "eval_samples_per_second": 23.164,
245
+ "eval_steps_per_second": 2.923,
246
+ "step": 170
247
+ },
248
+ {
249
+ "epoch": 0.76,
250
+ "learning_rate": 1.2372881355932205e-05,
251
+ "loss": 0.5449,
252
+ "step": 180
253
+ },
254
+ {
255
+ "epoch": 0.76,
256
+ "eval_loss": 0.5272189378738403,
257
+ "eval_runtime": 18.125,
258
+ "eval_samples_per_second": 23.172,
259
+ "eval_steps_per_second": 2.924,
260
+ "step": 180
261
+ },
262
+ {
263
+ "epoch": 0.81,
264
+ "learning_rate": 1.1949152542372882e-05,
265
+ "loss": 0.5345,
266
+ "step": 190
267
+ },
268
+ {
269
+ "epoch": 0.81,
270
+ "eval_loss": 0.5621271133422852,
271
+ "eval_runtime": 18.0129,
272
+ "eval_samples_per_second": 23.317,
273
+ "eval_steps_per_second": 2.942,
274
+ "step": 190
275
+ },
276
+ {
277
+ "epoch": 0.85,
278
+ "learning_rate": 1.1525423728813561e-05,
279
+ "loss": 0.5837,
280
+ "step": 200
281
+ },
282
+ {
283
+ "epoch": 0.85,
284
+ "eval_loss": 0.55014967918396,
285
+ "eval_runtime": 18.0302,
286
+ "eval_samples_per_second": 23.294,
287
+ "eval_steps_per_second": 2.94,
288
+ "step": 200
289
+ },
290
+ {
291
+ "epoch": 0.89,
292
+ "learning_rate": 1.1101694915254237e-05,
293
+ "loss": 0.5969,
294
+ "step": 210
295
+ },
296
+ {
297
+ "epoch": 0.89,
298
+ "eval_loss": 0.5470077395439148,
299
+ "eval_runtime": 17.9721,
300
+ "eval_samples_per_second": 23.37,
301
+ "eval_steps_per_second": 2.949,
302
+ "step": 210
303
+ },
304
+ {
305
+ "epoch": 0.93,
306
+ "learning_rate": 1.0677966101694917e-05,
307
+ "loss": 0.5905,
308
+ "step": 220
309
+ },
310
+ {
311
+ "epoch": 0.93,
312
+ "eval_loss": 0.5924287438392639,
313
+ "eval_runtime": 18.0444,
314
+ "eval_samples_per_second": 23.276,
315
+ "eval_steps_per_second": 2.937,
316
+ "step": 220
317
+ },
318
+ {
319
+ "epoch": 0.97,
320
+ "learning_rate": 1.0254237288135593e-05,
321
+ "loss": 0.5481,
322
+ "step": 230
323
+ },
324
+ {
325
+ "epoch": 0.97,
326
+ "eval_loss": 0.5415045022964478,
327
+ "eval_runtime": 17.9371,
328
+ "eval_samples_per_second": 23.415,
329
+ "eval_steps_per_second": 2.955,
330
+ "step": 230
331
+ },
332
+ {
333
+ "epoch": 1.02,
334
+ "learning_rate": 9.830508474576272e-06,
335
+ "loss": 0.5035,
336
+ "step": 240
337
+ },
338
+ {
339
+ "epoch": 1.02,
340
+ "eval_loss": 0.5320823788642883,
341
+ "eval_runtime": 17.9879,
342
+ "eval_samples_per_second": 23.349,
343
+ "eval_steps_per_second": 2.946,
344
+ "step": 240
345
+ },
346
+ {
347
+ "epoch": 1.06,
348
+ "learning_rate": 9.40677966101695e-06,
349
+ "loss": 0.4508,
350
+ "step": 250
351
+ },
352
+ {
353
+ "epoch": 1.06,
354
+ "eval_loss": 0.5371343493461609,
355
+ "eval_runtime": 17.4824,
356
+ "eval_samples_per_second": 24.024,
357
+ "eval_steps_per_second": 3.032,
358
+ "step": 250
359
+ },
360
+ {
361
+ "epoch": 1.1,
362
+ "learning_rate": 8.983050847457628e-06,
363
+ "loss": 0.4227,
364
+ "step": 260
365
+ },
366
+ {
367
+ "epoch": 1.1,
368
+ "eval_loss": 0.5276100635528564,
369
+ "eval_runtime": 18.1362,
370
+ "eval_samples_per_second": 23.158,
371
+ "eval_steps_per_second": 2.922,
372
+ "step": 260
373
+ },
374
+ {
375
+ "epoch": 1.14,
376
+ "learning_rate": 8.559322033898306e-06,
377
+ "loss": 0.4423,
378
+ "step": 270
379
+ },
380
+ {
381
+ "epoch": 1.14,
382
+ "eval_loss": 0.532426118850708,
383
+ "eval_runtime": 17.9907,
384
+ "eval_samples_per_second": 23.345,
385
+ "eval_steps_per_second": 2.946,
386
+ "step": 270
387
+ },
388
+ {
389
+ "epoch": 1.19,
390
+ "learning_rate": 8.135593220338983e-06,
391
+ "loss": 0.432,
392
+ "step": 280
393
+ },
394
+ {
395
+ "epoch": 1.19,
396
+ "eval_loss": 0.5377896428108215,
397
+ "eval_runtime": 17.4953,
398
+ "eval_samples_per_second": 24.006,
399
+ "eval_steps_per_second": 3.029,
400
+ "step": 280
401
+ },
402
+ {
403
+ "epoch": 1.23,
404
+ "learning_rate": 7.711864406779663e-06,
405
+ "loss": 0.4317,
406
+ "step": 290
407
+ },
408
+ {
409
+ "epoch": 1.23,
410
+ "eval_loss": 0.5301514863967896,
411
+ "eval_runtime": 18.2183,
412
+ "eval_samples_per_second": 23.054,
413
+ "eval_steps_per_second": 2.909,
414
+ "step": 290
415
+ },
416
+ {
417
+ "epoch": 1.27,
418
+ "learning_rate": 7.288135593220339e-06,
419
+ "loss": 0.46,
420
+ "step": 300
421
+ },
422
+ {
423
+ "epoch": 1.27,
424
+ "eval_loss": 0.5301567316055298,
425
+ "eval_runtime": 18.4315,
426
+ "eval_samples_per_second": 22.787,
427
+ "eval_steps_per_second": 2.876,
428
+ "step": 300
429
+ },
430
+ {
431
+ "epoch": 1.31,
432
+ "learning_rate": 6.864406779661017e-06,
433
+ "loss": 0.435,
434
+ "step": 310
435
+ },
436
+ {
437
+ "epoch": 1.31,
438
+ "eval_loss": 0.5325623750686646,
439
+ "eval_runtime": 17.6821,
440
+ "eval_samples_per_second": 23.753,
441
+ "eval_steps_per_second": 2.997,
442
+ "step": 310
443
+ },
444
+ {
445
+ "epoch": 1.36,
446
+ "learning_rate": 6.440677966101695e-06,
447
+ "loss": 0.3813,
448
+ "step": 320
449
+ },
450
+ {
451
+ "epoch": 1.36,
452
+ "eval_loss": 0.5431253910064697,
453
+ "eval_runtime": 18.5006,
454
+ "eval_samples_per_second": 22.702,
455
+ "eval_steps_per_second": 2.865,
456
+ "step": 320
457
+ },
458
+ {
459
+ "epoch": 1.4,
460
+ "learning_rate": 6.0169491525423725e-06,
461
+ "loss": 0.4422,
462
+ "step": 330
463
+ },
464
+ {
465
+ "epoch": 1.4,
466
+ "eval_loss": 0.5322949290275574,
467
+ "eval_runtime": 18.4759,
468
+ "eval_samples_per_second": 22.732,
469
+ "eval_steps_per_second": 2.869,
470
+ "step": 330
471
+ },
472
+ {
473
+ "epoch": 1.44,
474
+ "learning_rate": 5.593220338983051e-06,
475
+ "loss": 0.4298,
476
+ "step": 340
477
+ },
478
+ {
479
+ "epoch": 1.44,
480
+ "eval_loss": 0.5574814677238464,
481
+ "eval_runtime": 17.6896,
482
+ "eval_samples_per_second": 23.743,
483
+ "eval_steps_per_second": 2.996,
484
+ "step": 340
485
+ },
486
+ {
487
+ "epoch": 1.48,
488
+ "learning_rate": 5.169491525423729e-06,
489
+ "loss": 0.5068,
490
+ "step": 350
491
+ },
492
+ {
493
+ "epoch": 1.48,
494
+ "eval_loss": 0.5528993606567383,
495
+ "eval_runtime": 18.3232,
496
+ "eval_samples_per_second": 22.922,
497
+ "eval_steps_per_second": 2.893,
498
+ "step": 350
499
+ },
500
+ {
501
+ "epoch": 1.53,
502
+ "learning_rate": 4.745762711864408e-06,
503
+ "loss": 0.4619,
504
+ "step": 360
505
+ },
506
+ {
507
+ "epoch": 1.53,
508
+ "eval_loss": 0.5589260458946228,
509
+ "eval_runtime": 18.3038,
510
+ "eval_samples_per_second": 22.946,
511
+ "eval_steps_per_second": 2.896,
512
+ "step": 360
513
+ },
514
+ {
515
+ "epoch": 1.57,
516
+ "learning_rate": 4.322033898305085e-06,
517
+ "loss": 0.4852,
518
+ "step": 370
519
+ },
520
+ {
521
+ "epoch": 1.57,
522
+ "eval_loss": 0.5255549550056458,
523
+ "eval_runtime": 18.4908,
524
+ "eval_samples_per_second": 22.714,
525
+ "eval_steps_per_second": 2.866,
526
+ "step": 370
527
+ },
528
+ {
529
+ "epoch": 1.61,
530
+ "learning_rate": 3.898305084745763e-06,
531
+ "loss": 0.3888,
532
+ "step": 380
533
+ },
534
+ {
535
+ "epoch": 1.61,
536
+ "eval_loss": 0.5730893611907959,
537
+ "eval_runtime": 18.3761,
538
+ "eval_samples_per_second": 22.856,
539
+ "eval_steps_per_second": 2.884,
540
+ "step": 380
541
+ },
542
+ {
543
+ "epoch": 1.65,
544
+ "learning_rate": 3.474576271186441e-06,
545
+ "loss": 0.4319,
546
+ "step": 390
547
+ },
548
+ {
549
+ "epoch": 1.65,
550
+ "eval_loss": 0.5334990620613098,
551
+ "eval_runtime": 18.4738,
552
+ "eval_samples_per_second": 22.735,
553
+ "eval_steps_per_second": 2.869,
554
+ "step": 390
555
+ },
556
+ {
557
+ "epoch": 1.69,
558
+ "learning_rate": 3.0508474576271192e-06,
559
+ "loss": 0.4422,
560
+ "step": 400
561
+ },
562
+ {
563
+ "epoch": 1.69,
564
+ "eval_loss": 0.5419171452522278,
565
+ "eval_runtime": 18.5281,
566
+ "eval_samples_per_second": 22.668,
567
+ "eval_steps_per_second": 2.861,
568
+ "step": 400
569
+ },
570
+ {
571
+ "epoch": 1.74,
572
+ "learning_rate": 2.627118644067797e-06,
573
+ "loss": 0.4522,
574
+ "step": 410
575
+ },
576
+ {
577
+ "epoch": 1.74,
578
+ "eval_loss": 0.5547201037406921,
579
+ "eval_runtime": 18.4837,
580
+ "eval_samples_per_second": 22.723,
581
+ "eval_steps_per_second": 2.867,
582
+ "step": 410
583
+ },
584
+ {
585
+ "epoch": 1.78,
586
+ "learning_rate": 2.203389830508475e-06,
587
+ "loss": 0.4276,
588
+ "step": 420
589
+ },
590
+ {
591
+ "epoch": 1.78,
592
+ "eval_loss": 0.5263144373893738,
593
+ "eval_runtime": 18.6045,
594
+ "eval_samples_per_second": 22.575,
595
+ "eval_steps_per_second": 2.849,
596
+ "step": 420
597
+ },
598
+ {
599
+ "epoch": 1.82,
600
+ "learning_rate": 1.7796610169491526e-06,
601
+ "loss": 0.3988,
602
+ "step": 430
603
+ },
604
+ {
605
+ "epoch": 1.82,
606
+ "eval_loss": 0.5480612516403198,
607
+ "eval_runtime": 18.5789,
608
+ "eval_samples_per_second": 22.606,
609
+ "eval_steps_per_second": 2.853,
610
+ "step": 430
611
+ },
612
+ {
613
+ "epoch": 1.86,
614
+ "learning_rate": 1.3559322033898307e-06,
615
+ "loss": 0.4063,
616
+ "step": 440
617
+ },
618
+ {
619
+ "epoch": 1.86,
620
+ "eval_loss": 0.5404064655303955,
621
+ "eval_runtime": 18.6623,
622
+ "eval_samples_per_second": 22.505,
623
+ "eval_steps_per_second": 2.84,
624
+ "step": 440
625
+ },
626
+ {
627
+ "epoch": 1.91,
628
+ "learning_rate": 9.322033898305086e-07,
629
+ "loss": 0.4141,
630
+ "step": 450
631
+ },
632
+ {
633
+ "epoch": 1.91,
634
+ "eval_loss": 0.5292345881462097,
635
+ "eval_runtime": 18.5061,
636
+ "eval_samples_per_second": 22.695,
637
+ "eval_steps_per_second": 2.864,
638
+ "step": 450
639
+ },
640
+ {
641
+ "epoch": 1.95,
642
+ "learning_rate": 5.084745762711865e-07,
643
+ "loss": 0.4149,
644
+ "step": 460
645
+ },
646
+ {
647
+ "epoch": 1.95,
648
+ "eval_loss": 0.5240865349769592,
649
+ "eval_runtime": 18.7251,
650
+ "eval_samples_per_second": 22.43,
651
+ "eval_steps_per_second": 2.83,
652
+ "step": 460
653
+ },
654
+ {
655
+ "epoch": 1.99,
656
+ "learning_rate": 8.474576271186442e-08,
657
+ "loss": 0.4104,
658
+ "step": 470
659
+ },
660
+ {
661
+ "epoch": 1.99,
662
+ "eval_loss": 0.5263239145278931,
663
+ "eval_runtime": 18.5702,
664
+ "eval_samples_per_second": 22.617,
665
+ "eval_steps_per_second": 2.854,
666
+ "step": 470
667
+ }
668
+ ],
669
+ "max_steps": 472,
670
+ "num_train_epochs": 2,
671
+ "total_flos": 1.3918818284081568e+16,
672
+ "trial_name": null,
673
+ "trial_params": null
674
+ }
checkpoint-470/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48b5fec3efb2c118a302d92fce79c45311f12529781bf0db59fab4b9060a4ffc
3
+ size 3439
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c67ed9579fbf92ad582171655d9a63cf47be3ee9a1a837842b2683db66360847
3
  size 1736202543
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ea0f96260b582f68582a71bd74d29b120583100cf5dfbd08940813b056f101d
3
  size 1736202543
runs/Nov26_03-30-10_f19dc631087e/events.out.tfevents.1669433434.f19dc631087e.2463.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c1407a8b3196f5e133af9fa532c7bd1a2d308e1954075e2b209034e57c624a5
3
- size 4616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aa50eea59dea3df3c7b79be08fbcda6a3720dbfa611b3d2c57c7dc07beeedf9
3
+ size 24570