nvan15 committed on
Commit
d4b473b
·
verified ·
1 Parent(s): adea12b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d17h39m52,sd43/ft/tokenizer.json +0 -0
  2. reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft/special_tokens_map.json +15 -0
  3. reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft/tokenizer.json +0 -0
  4. reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft/tokenizer_config.json +60 -0
  5. reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft2/README.md +205 -0
  6. reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft2/adapter_config.json +34 -0
  7. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/QQP.tsv +0 -0
  8. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/all_results.json +11 -0
  9. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/eval_results.json +11 -0
  10. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/added_tokens.json +3 -0
  11. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/special_tokens_map.json +15 -0
  12. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/tokenizer.json +0 -0
  13. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/tokenizer_config.json +60 -0
  14. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft2/README.md +205 -0
  15. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft2/adapter_config.json +34 -0
  16. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/trainer_state.json +2077 -0
  17. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/QQP.tsv +0 -0
  18. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/all_results.json +11 -0
  19. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/eval_results.json +11 -0
  20. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/added_tokens.json +3 -0
  21. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/special_tokens_map.json +15 -0
  22. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/tokenizer.json +0 -0
  23. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/tokenizer_config.json +60 -0
  24. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft2/README.md +205 -0
  25. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft2/adapter_config.json +34 -0
  26. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/trainer_state.json +2077 -0
  27. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/QQP.tsv +0 -0
  28. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/all_results.json +11 -0
  29. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/eval_results.json +11 -0
  30. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/added_tokens.json +3 -0
  31. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/special_tokens_map.json +15 -0
  32. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/tokenizer.json +0 -0
  33. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/tokenizer_config.json +60 -0
  34. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft2/README.md +205 -0
  35. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft2/adapter_config.json +34 -0
  36. reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/trainer_state.json +2077 -0
  37. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/RTE.tsv +3001 -0
  38. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/all_results.json +9 -0
  39. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/eval_results.json +9 -0
  40. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/added_tokens.json +3 -0
  41. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/special_tokens_map.json +15 -0
  42. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/tokenizer.json +0 -0
  43. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/tokenizer_config.json +60 -0
  44. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft2/README.md +205 -0
  45. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft2/adapter_config.json +34 -0
  46. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/trainer_state.json +107 -0
  47. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/all_results.json +9 -0
  48. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/eval_results.json +9 -0
  49. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/ft/added_tokens.json +3 -0
  50. reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/ft/special_tokens_map.json +15 -0
reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d17h39m52,sd43/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_expBOFT/qnli/dr0.05,mlr2e-04,clr2e-04,ep=4.0t=25d19h50m11,sd44/ft2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": {
3
+ "base_model_class": "DebertaV2ForSequenceClassification",
4
+ "parent_library": "transformers.models.deberta_v2.modeling_deberta_v2"
5
+ },
6
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
7
+ "bias": "none",
8
+ "boft_block_num": 0,
9
+ "boft_block_size": 4,
10
+ "boft_dropout": 0.05,
11
+ "boft_n_butterfly_factor": 2,
12
+ "exclude_modules": null,
13
+ "fan_in_fan_out": false,
14
+ "inference_mode": true,
15
+ "init_weights": true,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "modules_to_save": [
19
+ "classifier",
20
+ "pooler"
21
+ ],
22
+ "peft_type": "BOFT",
23
+ "peft_version": "0.18.0",
24
+ "revision": null,
25
+ "target_modules": [
26
+ "query_proj",
27
+ "intermediate.dense",
28
+ "key_proj",
29
+ "attention.output.dense",
30
+ "value_proj",
31
+ "output.dense"
32
+ ],
33
+ "task_type": null
34
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/QQP.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9167944595597328,
4
+ "eval_combined_score": 0.9034945009640941,
5
+ "eval_f1": 0.8901945423684554,
6
+ "eval_loss": 0.2233511060476303,
7
+ "eval_runtime": 47.8389,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 845.128,
10
+ "eval_steps_per_second": 1.651
11
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9167944595597328,
4
+ "eval_combined_score": 0.9034945009640941,
5
+ "eval_f1": 0.8901945423684554,
6
+ "eval_loss": 0.2233511060476303,
7
+ "eval_runtime": 47.8389,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 845.128,
10
+ "eval_steps_per_second": 1.651
11
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/ft2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": {
3
+ "base_model_class": "DebertaV2ForSequenceClassification",
4
+ "parent_library": "transformers.models.deberta_v2.modeling_deberta_v2"
5
+ },
6
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
7
+ "bias": "none",
8
+ "boft_block_num": 0,
9
+ "boft_block_size": 4,
10
+ "boft_dropout": 0.05,
11
+ "boft_n_butterfly_factor": 2,
12
+ "exclude_modules": null,
13
+ "fan_in_fan_out": false,
14
+ "inference_mode": true,
15
+ "init_weights": true,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "modules_to_save": [
19
+ "classifier",
20
+ "pooler"
21
+ ],
22
+ "peft_type": "BOFT",
23
+ "peft_version": "0.18.0",
24
+ "revision": null,
25
+ "target_modules": [
26
+ "output.dense",
27
+ "intermediate.dense",
28
+ "attention.output.dense",
29
+ "value_proj",
30
+ "query_proj",
31
+ "key_proj"
32
+ ],
33
+ "task_type": null
34
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/trainer_state.json ADDED
@@ -0,0 +1,2077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 103000,
3
+ "best_metric": 0.9167944595597328,
4
+ "best_model_checkpoint": "./glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=22d00h06m39/checkpoint-103000",
5
+ "epoch": 10.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 113710,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0879430129276229,
14
+ "grad_norm": 2.398847818374634,
15
+ "learning_rate": 0.00029969999999999997,
16
+ "loss": 0.442,
17
+ "step": 1000
18
+ },
19
+ {
20
+ "epoch": 0.0879430129276229,
21
+ "eval_accuracy": 0.8523126391293594,
22
+ "eval_combined_score": 0.8349542422773726,
23
+ "eval_f1": 0.8175958454253857,
24
+ "eval_loss": 0.33211618661880493,
25
+ "eval_runtime": 85.5276,
26
+ "eval_samples_per_second": 472.713,
27
+ "eval_steps_per_second": 0.924,
28
+ "step": 1000
29
+ },
30
+ {
31
+ "epoch": 0.1758860258552458,
32
+ "grad_norm": 0.9574108719825745,
33
+ "learning_rate": 0.00029994184111301736,
34
+ "loss": 0.3219,
35
+ "step": 2000
36
+ },
37
+ {
38
+ "epoch": 0.1758860258552458,
39
+ "eval_accuracy": 0.8604748948800396,
40
+ "eval_combined_score": 0.845059420683157,
41
+ "eval_f1": 0.8296439464862743,
42
+ "eval_loss": 0.3096776604652405,
43
+ "eval_runtime": 85.1462,
44
+ "eval_samples_per_second": 474.83,
45
+ "eval_steps_per_second": 0.928,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 0.2638290387828687,
50
+ "grad_norm": 0.7193896174430847,
51
+ "learning_rate": 0.00029976717673021227,
52
+ "loss": 0.2917,
53
+ "step": 3000
54
+ },
55
+ {
56
+ "epoch": 0.2638290387828687,
57
+ "eval_accuracy": 0.8810784071234232,
58
+ "eval_combined_score": 0.8629407529484128,
59
+ "eval_f1": 0.8448030987734022,
60
+ "eval_loss": 0.27602431178092957,
61
+ "eval_runtime": 84.7536,
62
+ "eval_samples_per_second": 477.03,
63
+ "eval_steps_per_second": 0.932,
64
+ "step": 3000
65
+ },
66
+ {
67
+ "epoch": 0.3517720517104916,
68
+ "grad_norm": 0.9506718516349792,
69
+ "learning_rate": 0.0002994761425083971,
70
+ "loss": 0.2892,
71
+ "step": 4000
72
+ },
73
+ {
74
+ "epoch": 0.3517720517104916,
75
+ "eval_accuracy": 0.8832797427652733,
76
+ "eval_combined_score": 0.8606881439321272,
77
+ "eval_f1": 0.838096545098981,
78
+ "eval_loss": 0.2720984220504761,
79
+ "eval_runtime": 83.6647,
80
+ "eval_samples_per_second": 483.238,
81
+ "eval_steps_per_second": 0.944,
82
+ "step": 4000
83
+ },
84
+ {
85
+ "epoch": 0.4397150646381145,
86
+ "grad_norm": 1.3692947626113892,
87
+ "learning_rate": 0.0002990689645826054,
88
+ "loss": 0.2735,
89
+ "step": 5000
90
+ },
91
+ {
92
+ "epoch": 0.4397150646381145,
93
+ "eval_accuracy": 0.8903042295325253,
94
+ "eval_combined_score": 0.8715568142452046,
95
+ "eval_f1": 0.8528093989578839,
96
+ "eval_loss": 0.25894808769226074,
97
+ "eval_runtime": 55.3727,
98
+ "eval_samples_per_second": 730.143,
99
+ "eval_steps_per_second": 1.427,
100
+ "step": 5000
101
+ },
102
+ {
103
+ "epoch": 0.5276580775657373,
104
+ "grad_norm": 0.8675212860107422,
105
+ "learning_rate": 0.00029854595933210474,
106
+ "loss": 0.2723,
107
+ "step": 6000
108
+ },
109
+ {
110
+ "epoch": 0.5276580775657373,
111
+ "eval_accuracy": 0.8893643334157804,
112
+ "eval_combined_score": 0.8712073482184524,
113
+ "eval_f1": 0.8530503630211242,
114
+ "eval_loss": 0.26774412393569946,
115
+ "eval_runtime": 58.3831,
116
+ "eval_samples_per_second": 692.495,
117
+ "eval_steps_per_second": 1.353,
118
+ "step": 6000
119
+ },
120
+ {
121
+ "epoch": 0.6156010904933603,
122
+ "grad_norm": 1.075390100479126,
123
+ "learning_rate": 0.0002979075331345683,
124
+ "loss": 0.2648,
125
+ "step": 7000
126
+ },
127
+ {
128
+ "epoch": 0.6156010904933603,
129
+ "eval_accuracy": 0.886272569873856,
130
+ "eval_combined_score": 0.8708315925227879,
131
+ "eval_f1": 0.8553906151717197,
132
+ "eval_loss": 0.26676592230796814,
133
+ "eval_runtime": 54.0926,
134
+ "eval_samples_per_second": 747.422,
135
+ "eval_steps_per_second": 1.46,
136
+ "step": 7000
137
+ },
138
+ {
139
+ "epoch": 0.7035441034209832,
140
+ "grad_norm": 1.3225692510604858,
141
+ "learning_rate": 0.0002971541820503175,
142
+ "loss": 0.2642,
143
+ "step": 8000
144
+ },
145
+ {
146
+ "epoch": 0.7035441034209832,
147
+ "eval_accuracy": 0.8935196636161267,
148
+ "eval_combined_score": 0.8764051986980781,
149
+ "eval_f1": 0.8592907337800294,
150
+ "eval_loss": 0.25021693110466003,
151
+ "eval_runtime": 54.6586,
152
+ "eval_samples_per_second": 739.682,
153
+ "eval_steps_per_second": 1.445,
154
+ "step": 8000
155
+ },
156
+ {
157
+ "epoch": 0.7914871163486061,
158
+ "grad_norm": 0.7988959550857544,
159
+ "learning_rate": 0.00029628649143688076,
160
+ "loss": 0.2578,
161
+ "step": 9000
162
+ },
163
+ {
164
+ "epoch": 0.7914871163486061,
165
+ "eval_accuracy": 0.8921840217660153,
166
+ "eval_combined_score": 0.8776726104749084,
167
+ "eval_f1": 0.8631611991838016,
168
+ "eval_loss": 0.25787219405174255,
169
+ "eval_runtime": 87.2076,
170
+ "eval_samples_per_second": 463.606,
171
+ "eval_steps_per_second": 0.906,
172
+ "step": 9000
173
+ },
174
+ {
175
+ "epoch": 0.879430129276229,
176
+ "grad_norm": 0.9499006271362305,
177
+ "learning_rate": 0.0002953051354941674,
178
+ "loss": 0.2592,
179
+ "step": 10000
180
+ },
181
+ {
182
+ "epoch": 0.879430129276229,
183
+ "eval_accuracy": 0.8905021023992085,
184
+ "eval_combined_score": 0.8760424091084933,
185
+ "eval_f1": 0.8615827158177782,
186
+ "eval_loss": 0.2585276961326599,
187
+ "eval_runtime": 87.1178,
188
+ "eval_samples_per_second": 464.084,
189
+ "eval_steps_per_second": 0.907,
190
+ "step": 10000
191
+ },
192
+ {
193
+ "epoch": 0.967373142203852,
194
+ "grad_norm": 1.2539165019989014,
195
+ "learning_rate": 0.0002942108767406115,
196
+ "loss": 0.2539,
197
+ "step": 11000
198
+ },
199
+ {
200
+ "epoch": 0.967373142203852,
201
+ "eval_accuracy": 0.8973287162997774,
202
+ "eval_combined_score": 0.8796948303527266,
203
+ "eval_f1": 0.8620609444056757,
204
+ "eval_loss": 0.23903892934322357,
205
+ "eval_runtime": 87.3443,
206
+ "eval_samples_per_second": 462.881,
207
+ "eval_steps_per_second": 0.904,
208
+ "step": 11000
209
+ },
210
+ {
211
+ "epoch": 1.0553161551314747,
212
+ "grad_norm": 0.8417719602584839,
213
+ "learning_rate": 0.00029300456542069104,
214
+ "loss": 0.2403,
215
+ "step": 12000
216
+ },
217
+ {
218
+ "epoch": 1.0553161551314747,
219
+ "eval_accuracy": 0.8982933465248578,
220
+ "eval_combined_score": 0.8817324165898602,
221
+ "eval_f1": 0.8651714866548627,
222
+ "eval_loss": 0.24867697060108185,
223
+ "eval_runtime": 54.2947,
224
+ "eval_samples_per_second": 744.641,
225
+ "eval_steps_per_second": 1.455,
226
+ "step": 12000
227
+ },
228
+ {
229
+ "epoch": 1.1432591680590978,
230
+ "grad_norm": 0.9629989862442017,
231
+ "learning_rate": 0.0002916871388442835,
232
+ "loss": 0.2366,
233
+ "step": 13000
234
+ },
235
+ {
236
+ "epoch": 1.1432591680590978,
237
+ "eval_accuracy": 0.895176848874598,
238
+ "eval_combined_score": 0.8794971647971962,
239
+ "eval_f1": 0.8638174807197944,
240
+ "eval_loss": 0.25021758675575256,
241
+ "eval_runtime": 54.1759,
242
+ "eval_samples_per_second": 746.273,
243
+ "eval_steps_per_second": 1.458,
244
+ "step": 13000
245
+ },
246
+ {
247
+ "epoch": 1.2312021809867206,
248
+ "grad_norm": 1.174015760421753,
249
+ "learning_rate": 0.00029025962065837193,
250
+ "loss": 0.2331,
251
+ "step": 14000
252
+ },
253
+ {
254
+ "epoch": 1.2312021809867206,
255
+ "eval_accuracy": 0.8964630225080386,
256
+ "eval_combined_score": 0.88263667610296,
257
+ "eval_f1": 0.8688103296978814,
258
+ "eval_loss": 0.25016075372695923,
259
+ "eval_runtime": 54.4818,
260
+ "eval_samples_per_second": 742.083,
261
+ "eval_steps_per_second": 1.45,
262
+ "step": 14000
263
+ },
264
+ {
265
+ "epoch": 1.3191451939143435,
266
+ "grad_norm": 0.7278306484222412,
267
+ "learning_rate": 0.00028872312005166577,
268
+ "loss": 0.2299,
269
+ "step": 15000
270
+ },
271
+ {
272
+ "epoch": 1.3191451939143435,
273
+ "eval_accuracy": 0.8990353697749196,
274
+ "eval_combined_score": 0.8810690967863128,
275
+ "eval_f1": 0.8631028237977061,
276
+ "eval_loss": 0.23498821258544922,
277
+ "eval_runtime": 54.6828,
278
+ "eval_samples_per_second": 739.355,
279
+ "eval_steps_per_second": 1.445,
280
+ "step": 15000
281
+ },
282
+ {
283
+ "epoch": 1.4070882068419663,
284
+ "grad_norm": 1.6794105768203735,
285
+ "learning_rate": 0.00028707883089275593,
286
+ "loss": 0.2353,
287
+ "step": 16000
288
+ },
289
+ {
290
+ "epoch": 1.4070882068419663,
291
+ "eval_accuracy": 0.9013851100667821,
292
+ "eval_combined_score": 0.8853597460667211,
293
+ "eval_f1": 0.8693343820666601,
294
+ "eval_loss": 0.23489055037498474,
295
+ "eval_runtime": 54.6884,
296
+ "eval_samples_per_second": 739.279,
297
+ "eval_steps_per_second": 1.445,
298
+ "step": 16000
299
+ },
300
+ {
301
+ "epoch": 1.4950312197695892,
302
+ "grad_norm": 1.0810128450393677,
303
+ "learning_rate": 0.0002853280308024728,
304
+ "loss": 0.2313,
305
+ "step": 17000
306
+ },
307
+ {
308
+ "epoch": 1.4950312197695892,
309
+ "eval_accuracy": 0.9034380410586198,
310
+ "eval_combined_score": 0.8877316082169779,
311
+ "eval_f1": 0.872025175375336,
312
+ "eval_loss": 0.22801174223423004,
313
+ "eval_runtime": 89.5061,
314
+ "eval_samples_per_second": 451.701,
315
+ "eval_steps_per_second": 0.883,
316
+ "step": 17000
317
+ },
318
+ {
319
+ "epoch": 1.5829742326972123,
320
+ "grad_norm": 1.0413625240325928,
321
+ "learning_rate": 0.0002834720801611687,
322
+ "loss": 0.227,
323
+ "step": 18000
324
+ },
325
+ {
326
+ "epoch": 1.5829742326972123,
327
+ "eval_accuracy": 0.902819688350235,
328
+ "eval_combined_score": 0.8874883265881588,
329
+ "eval_f1": 0.8721569648260827,
330
+ "eval_loss": 0.2316894680261612,
331
+ "eval_runtime": 86.5521,
332
+ "eval_samples_per_second": 467.118,
333
+ "eval_steps_per_second": 0.913,
334
+ "step": 18000
335
+ },
336
+ {
337
+ "epoch": 1.6709172456248351,
338
+ "grad_norm": 1.8623915910720825,
339
+ "learning_rate": 0.0002815124210516956,
340
+ "loss": 0.2311,
341
+ "step": 19000
342
+ },
343
+ {
344
+ "epoch": 1.6709172456248351,
345
+ "eval_accuracy": 0.90375958446698,
346
+ "eval_combined_score": 0.8866361556259579,
347
+ "eval_f1": 0.8695127267849357,
348
+ "eval_loss": 0.23079416155815125,
349
+ "eval_runtime": 87.1377,
350
+ "eval_samples_per_second": 463.978,
351
+ "eval_steps_per_second": 0.907,
352
+ "step": 19000
353
+ },
354
+ {
355
+ "epoch": 1.758860258552458,
356
+ "grad_norm": 0.6980274319648743,
357
+ "learning_rate": 0.0002794505761388994,
358
+ "loss": 0.2277,
359
+ "step": 20000
360
+ },
361
+ {
362
+ "epoch": 1.758860258552458,
363
+ "eval_accuracy": 0.9012367054167697,
364
+ "eval_combined_score": 0.8872635467552861,
365
+ "eval_f1": 0.8732903880938025,
366
+ "eval_loss": 0.240788072347641,
367
+ "eval_runtime": 54.3707,
368
+ "eval_samples_per_second": 743.6,
369
+ "eval_steps_per_second": 1.453,
370
+ "step": 20000
371
+ },
372
+ {
373
+ "epoch": 1.846803271480081,
374
+ "grad_norm": 0.8938764333724976,
375
+ "learning_rate": 0.0002772881474865019,
376
+ "loss": 0.225,
377
+ "step": 21000
378
+ },
379
+ {
380
+ "epoch": 1.846803271480081,
381
+ "eval_accuracy": 0.9029680930002474,
382
+ "eval_combined_score": 0.8856906006878942,
383
+ "eval_f1": 0.8684131083755409,
384
+ "eval_loss": 0.23772455751895905,
385
+ "eval_runtime": 54.6804,
386
+ "eval_samples_per_second": 739.387,
387
+ "eval_steps_per_second": 1.445,
388
+ "step": 21000
389
+ },
390
+ {
391
+ "epoch": 1.9347462844077037,
392
+ "grad_norm": 0.7940501570701599,
393
+ "learning_rate": 0.00027502681531228946,
394
+ "loss": 0.2292,
395
+ "step": 22000
396
+ },
397
+ {
398
+ "epoch": 1.9347462844077037,
399
+ "eval_accuracy": 0.9016077170418006,
400
+ "eval_combined_score": 0.8875243980984009,
401
+ "eval_f1": 0.8734410791550012,
402
+ "eval_loss": 0.23382480442523956,
403
+ "eval_runtime": 53.9134,
404
+ "eval_samples_per_second": 749.906,
405
+ "eval_steps_per_second": 1.465,
406
+ "step": 22000
407
+ },
408
+ {
409
+ "epoch": 2.0226892973353268,
410
+ "grad_norm": 0.7027292251586914,
411
+ "learning_rate": 0.00027266833668257537,
412
+ "loss": 0.2212,
413
+ "step": 23000
414
+ },
415
+ {
416
+ "epoch": 2.0226892973353268,
417
+ "eval_accuracy": 0.9046500123670541,
418
+ "eval_combined_score": 0.889394450174516,
419
+ "eval_f1": 0.8741388879819779,
420
+ "eval_loss": 0.2338305413722992,
421
+ "eval_runtime": 54.6808,
422
+ "eval_samples_per_second": 739.382,
423
+ "eval_steps_per_second": 1.445,
424
+ "step": 23000
425
+ },
426
+ {
427
+ "epoch": 2.1106323102629494,
428
+ "grad_norm": 0.9791672825813293,
429
+ "learning_rate": 0.0002702145441469506,
430
+ "loss": 0.2102,
431
+ "step": 24000
432
+ },
433
+ {
434
+ "epoch": 2.1106323102629494,
435
+ "eval_accuracy": 0.9050457581004204,
436
+ "eval_combined_score": 0.8886693811625817,
437
+ "eval_f1": 0.872293004224743,
438
+ "eval_loss": 0.2333918660879135,
439
+ "eval_runtime": 54.0058,
440
+ "eval_samples_per_second": 748.623,
441
+ "eval_steps_per_second": 1.463,
442
+ "step": 24000
443
+ },
444
+ {
445
+ "epoch": 2.1985753231905725,
446
+ "grad_norm": 0.8743547201156616,
447
+ "learning_rate": 0.00026766734431438345,
448
+ "loss": 0.2123,
449
+ "step": 25000
450
+ },
451
+ {
452
+ "epoch": 2.1985753231905725,
453
+ "eval_accuracy": 0.9072223596339352,
454
+ "eval_combined_score": 0.8913415161495151,
455
+ "eval_f1": 0.8754606726650951,
456
+ "eval_loss": 0.2295483946800232,
457
+ "eval_runtime": 87.2432,
458
+ "eval_samples_per_second": 463.417,
459
+ "eval_steps_per_second": 0.906,
460
+ "step": 25000
461
+ },
462
+ {
463
+ "epoch": 2.2865183361181955,
464
+ "grad_norm": 1.2709019184112549,
465
+ "learning_rate": 0.0002650287163717754,
466
+ "loss": 0.2118,
467
+ "step": 26000
468
+ },
469
+ {
470
+ "epoch": 2.2865183361181955,
471
+ "eval_accuracy": 0.9080385852090033,
472
+ "eval_combined_score": 0.8922502867565485,
473
+ "eval_f1": 0.8764619883040936,
474
+ "eval_loss": 0.22616761922836304,
475
+ "eval_runtime": 87.1794,
476
+ "eval_samples_per_second": 463.756,
477
+ "eval_steps_per_second": 0.906,
478
+ "step": 26000
479
+ },
480
+ {
481
+ "epoch": 2.374461349045818,
482
+ "grad_norm": 1.1367487907409668,
483
+ "learning_rate": 0.0002623007105461227,
484
+ "loss": 0.2137,
485
+ "step": 27000
486
+ },
487
+ {
488
+ "epoch": 2.374461349045818,
489
+ "eval_accuracy": 0.9050952263170913,
490
+ "eval_combined_score": 0.8901675123619273,
491
+ "eval_f1": 0.8752397984067631,
492
+ "eval_loss": 0.2204943299293518,
493
+ "eval_runtime": 94.4103,
494
+ "eval_samples_per_second": 428.237,
495
+ "eval_steps_per_second": 0.837,
496
+ "step": 27000
497
+ },
498
+ {
499
+ "epoch": 2.4624043619734413,
500
+ "grad_norm": 1.0608514547348022,
501
+ "learning_rate": 0.00025948544651147997,
502
+ "loss": 0.2097,
503
+ "step": 28000
504
+ },
505
+ {
506
+ "epoch": 2.4624043619734413,
507
+ "eval_accuracy": 0.905144694533762,
508
+ "eval_combined_score": 0.8909421605282187,
509
+ "eval_f1": 0.8767396265226755,
510
+ "eval_loss": 0.2264743149280548,
511
+ "eval_runtime": 54.8838,
512
+ "eval_samples_per_second": 736.647,
513
+ "eval_steps_per_second": 1.439,
514
+ "step": 28000
515
+ },
516
+ {
517
+ "epoch": 2.5503473749010643,
518
+ "grad_norm": 0.9547954201698303,
519
+ "learning_rate": 0.00025658511174196294,
520
+ "loss": 0.2076,
521
+ "step": 29000
522
+ },
523
+ {
524
+ "epoch": 2.5503473749010643,
525
+ "eval_accuracy": 0.9054415038337867,
526
+ "eval_combined_score": 0.8911845190997154,
527
+ "eval_f1": 0.876927534365644,
528
+ "eval_loss": 0.22710390388965607,
529
+ "eval_runtime": 54.7864,
530
+ "eval_samples_per_second": 737.956,
531
+ "eval_steps_per_second": 1.442,
532
+ "step": 29000
533
+ },
534
+ {
535
+ "epoch": 2.638290387828687,
536
+ "grad_norm": 0.9954330325126648,
537
+ "learning_rate": 0.00025360195981207026,
538
+ "loss": 0.2129,
539
+ "step": 30000
540
+ },
541
+ {
542
+ "epoch": 2.638290387828687,
543
+ "eval_accuracy": 0.9066534751422212,
544
+ "eval_combined_score": 0.891764686155216,
545
+ "eval_f1": 0.8768758971682109,
546
+ "eval_loss": 0.23385684192180634,
547
+ "eval_runtime": 54.1927,
548
+ "eval_samples_per_second": 746.041,
549
+ "eval_steps_per_second": 1.458,
550
+ "step": 30000
551
+ },
552
+ {
553
+ "epoch": 2.72623340075631,
554
+ "grad_norm": 0.7042277455329895,
555
+ "learning_rate": 0.0002505383086456447,
556
+ "loss": 0.2125,
557
+ "step": 31000
558
+ },
559
+ {
560
+ "epoch": 2.72623340075631,
561
+ "eval_accuracy": 0.9085580014840465,
562
+ "eval_combined_score": 0.894393038387405,
563
+ "eval_f1": 0.8802280752907636,
564
+ "eval_loss": 0.22638675570487976,
565
+ "eval_runtime": 54.7177,
566
+ "eval_samples_per_second": 738.883,
567
+ "eval_steps_per_second": 1.444,
568
+ "step": 31000
569
+ },
570
+ {
571
+ "epoch": 2.8141764136839327,
572
+ "grad_norm": 1.7481952905654907,
573
+ "learning_rate": 0.0002473965387148352,
574
+ "loss": 0.2121,
575
+ "step": 32000
576
+ },
577
+ {
578
+ "epoch": 2.8141764136839327,
579
+ "eval_accuracy": 0.9079891169923324,
580
+ "eval_combined_score": 0.8917747150493207,
581
+ "eval_f1": 0.875560313106309,
582
+ "eval_loss": 0.2314489334821701,
583
+ "eval_runtime": 86.9488,
584
+ "eval_samples_per_second": 464.986,
585
+ "eval_steps_per_second": 0.909,
586
+ "step": 32000
587
+ },
588
+ {
589
+ "epoch": 2.9021194266115558,
590
+ "grad_norm": 0.9269095659255981,
591
+ "learning_rate": 0.000244179091190458,
592
+ "loss": 0.2061,
593
+ "step": 33000
594
+ },
595
+ {
596
+ "epoch": 2.9021194266115558,
597
+ "eval_accuracy": 0.9060351224338362,
598
+ "eval_combined_score": 0.8897072255838148,
599
+ "eval_f1": 0.8733793287337933,
600
+ "eval_loss": 0.23508088290691376,
601
+ "eval_runtime": 87.376,
602
+ "eval_samples_per_second": 462.713,
603
+ "eval_steps_per_second": 0.904,
604
+ "step": 33000
605
+ },
606
+ {
607
+ "epoch": 2.9900624395391784,
608
+ "grad_norm": 0.6865495443344116,
609
+ "learning_rate": 0.00024088846604519457,
610
+ "loss": 0.2073,
611
+ "step": 34000
612
+ },
613
+ {
614
+ "epoch": 2.9900624395391784,
615
+ "eval_accuracy": 0.9079149146673262,
616
+ "eval_combined_score": 0.8936073195479604,
617
+ "eval_f1": 0.8792997244285946,
618
+ "eval_loss": 0.22693374752998352,
619
+ "eval_runtime": 86.8693,
620
+ "eval_samples_per_second": 465.412,
621
+ "eval_steps_per_second": 0.909,
622
+ "step": 34000
623
+ },
624
+ {
625
+ "epoch": 3.0780054524668015,
626
+ "grad_norm": 1.1589127779006958,
627
+ "learning_rate": 0.00023752722011110102,
628
+ "loss": 0.1922,
629
+ "step": 35000
630
+ },
631
+ {
632
+ "epoch": 3.0780054524668015,
633
+ "eval_accuracy": 0.9080138511006678,
634
+ "eval_combined_score": 0.8938269844519164,
635
+ "eval_f1": 0.8796401178031652,
636
+ "eval_loss": 0.22861303389072418,
637
+ "eval_runtime": 87.5134,
638
+ "eval_samples_per_second": 461.986,
639
+ "eval_steps_per_second": 0.903,
640
+ "step": 35000
641
+ },
642
+ {
643
+ "epoch": 3.1659484653944245,
644
+ "grad_norm": 0.8257580995559692,
645
+ "learning_rate": 0.00023409796509293643,
646
+ "loss": 0.1965,
647
+ "step": 36000
648
+ },
649
+ {
650
+ "epoch": 3.1659484653944245,
651
+ "eval_accuracy": 0.9082364580756864,
652
+ "eval_combined_score": 0.894167353205641,
653
+ "eval_f1": 0.8800982483355956,
654
+ "eval_loss": 0.2238505333662033,
655
+ "eval_runtime": 55.5705,
656
+ "eval_samples_per_second": 727.545,
657
+ "eval_steps_per_second": 1.422,
658
+ "step": 36000
659
+ },
660
+ {
661
+ "epoch": 3.253891478322047,
662
+ "grad_norm": 1.379067063331604,
663
+ "learning_rate": 0.0002306033655388555,
664
+ "loss": 0.1937,
665
+ "step": 37000
666
+ },
667
+ {
668
+ "epoch": 3.253891478322047,
669
+ "eval_accuracy": 0.9101409844175118,
670
+ "eval_combined_score": 0.8959338577717992,
671
+ "eval_f1": 0.8817267311260866,
672
+ "eval_loss": 0.23340874910354614,
673
+ "eval_runtime": 53.9515,
674
+ "eval_samples_per_second": 749.376,
675
+ "eval_steps_per_second": 1.464,
676
+ "step": 37000
677
+ },
678
+ {
679
+ "epoch": 3.3418344912496702,
680
+ "grad_norm": 0.6008528470993042,
681
+ "learning_rate": 0.0002270461367700413,
682
+ "loss": 0.1993,
683
+ "step": 38000
684
+ },
685
+ {
686
+ "epoch": 3.3418344912496702,
687
+ "eval_accuracy": 0.9077417759089785,
688
+ "eval_combined_score": 0.8944759834958905,
689
+ "eval_f1": 0.8812101910828025,
690
+ "eval_loss": 0.22766782343387604,
691
+ "eval_runtime": 58.3838,
692
+ "eval_samples_per_second": 692.487,
693
+ "eval_steps_per_second": 1.353,
694
+ "step": 38000
695
+ },
696
+ {
697
+ "epoch": 3.4297775041772933,
698
+ "grad_norm": 0.3488192558288574,
699
+ "learning_rate": 0.00022342904277088745,
700
+ "loss": 0.1966,
701
+ "step": 39000
702
+ },
703
+ {
704
+ "epoch": 3.4297775041772933,
705
+ "eval_accuracy": 0.9076923076923077,
706
+ "eval_combined_score": 0.8940499608063538,
707
+ "eval_f1": 0.8804076139203999,
708
+ "eval_loss": 0.23169130086898804,
709
+ "eval_runtime": 54.7179,
710
+ "eval_samples_per_second": 738.881,
711
+ "eval_steps_per_second": 1.444,
712
+ "step": 39000
713
+ },
714
+ {
715
+ "epoch": 3.517720517104916,
716
+ "grad_norm": 1.225816011428833,
717
+ "learning_rate": 0.00021975489404136827,
718
+ "loss": 0.1947,
719
+ "step": 40000
720
+ },
721
+ {
722
+ "epoch": 3.517720517104916,
723
+ "eval_accuracy": 0.9097699727924808,
724
+ "eval_combined_score": 0.8946949124786058,
725
+ "eval_f1": 0.8796198521647307,
726
+ "eval_loss": 0.22107689082622528,
727
+ "eval_runtime": 87.0865,
728
+ "eval_samples_per_second": 464.251,
729
+ "eval_steps_per_second": 0.907,
730
+ "step": 40000
731
+ },
732
+ {
733
+ "epoch": 3.605663530032539,
734
+ "grad_norm": 1.0007948875427246,
735
+ "learning_rate": 0.00021602654541326668,
736
+ "loss": 0.192,
737
+ "step": 41000
738
+ },
739
+ {
740
+ "epoch": 3.605663530032539,
741
+ "eval_accuracy": 0.9105614642592135,
742
+ "eval_combined_score": 0.8974543392427647,
743
+ "eval_f1": 0.8843472142263161,
744
+ "eval_loss": 0.22493700683116913,
745
+ "eval_runtime": 86.9435,
746
+ "eval_samples_per_second": 465.015,
747
+ "eval_steps_per_second": 0.909,
748
+ "step": 41000
749
+ },
750
+ {
751
+ "epoch": 3.6936065429601617,
752
+ "grad_norm": 1.137495756149292,
753
+ "learning_rate": 0.00021224689383195542,
754
+ "loss": 0.1946,
755
+ "step": 42000
756
+ },
757
+ {
758
+ "epoch": 3.6936065429601617,
759
+ "eval_accuracy": 0.9088053425674004,
760
+ "eval_combined_score": 0.893506823498545,
761
+ "eval_f1": 0.8782083044296898,
762
+ "eval_loss": 0.22202743589878082,
763
+ "eval_runtime": 87.3222,
764
+ "eval_samples_per_second": 462.998,
765
+ "eval_steps_per_second": 0.905,
766
+ "step": 42000
767
+ },
768
+ {
769
+ "epoch": 3.7815495558877847,
770
+ "grad_norm": 0.5520714521408081,
771
+ "learning_rate": 0.00020841887610545634,
772
+ "loss": 0.195,
773
+ "step": 43000
774
+ },
775
+ {
776
+ "epoch": 3.7815495558877847,
777
+ "eval_accuracy": 0.9101409844175118,
778
+ "eval_combined_score": 0.8936004911935618,
779
+ "eval_f1": 0.8770599979696119,
780
+ "eval_loss": 0.22466640174388885,
781
+ "eval_runtime": 54.3269,
782
+ "eval_samples_per_second": 744.199,
783
+ "eval_steps_per_second": 1.454,
784
+ "step": 43000
785
+ },
786
+ {
787
+ "epoch": 3.8694925688154074,
788
+ "grad_norm": 1.8173424005508423,
789
+ "learning_rate": 0.00020454546662252592,
790
+ "loss": 0.1974,
791
+ "step": 44000
792
+ },
793
+ {
794
+ "epoch": 3.8694925688154074,
795
+ "eval_accuracy": 0.9104872619342073,
796
+ "eval_combined_score": 0.8968858407906919,
797
+ "eval_f1": 0.8832844196471764,
798
+ "eval_loss": 0.2287817895412445,
799
+ "eval_runtime": 54.2695,
800
+ "eval_samples_per_second": 744.986,
801
+ "eval_steps_per_second": 1.456,
802
+ "step": 44000
803
+ },
804
+ {
805
+ "epoch": 3.9574355817430305,
806
+ "grad_norm": 0.8394154906272888,
807
+ "learning_rate": 0.00020062967504154062,
808
+ "loss": 0.1973,
809
+ "step": 45000
810
+ },
811
+ {
812
+ "epoch": 3.9574355817430305,
813
+ "eval_accuracy": 0.9072470937422706,
814
+ "eval_combined_score": 0.8945386536966222,
815
+ "eval_f1": 0.8818302136509737,
816
+ "eval_loss": 0.22817550599575043,
817
+ "eval_runtime": 54.4452,
818
+ "eval_samples_per_second": 742.582,
819
+ "eval_steps_per_second": 1.451,
820
+ "step": 45000
821
+ },
822
+ {
823
+ "epoch": 4.0453785946706535,
824
+ "grad_norm": 1.104277491569519,
825
+ "learning_rate": 0.00019667454395197706,
826
+ "loss": 0.1892,
827
+ "step": 46000
828
+ },
829
+ {
830
+ "epoch": 4.0453785946706535,
831
+ "eval_accuracy": 0.9135048231511254,
832
+ "eval_combined_score": 0.8986376111634246,
833
+ "eval_f1": 0.8837703991757238,
834
+ "eval_loss": 0.22852258384227753,
835
+ "eval_runtime": 54.9374,
836
+ "eval_samples_per_second": 735.929,
837
+ "eval_steps_per_second": 1.438,
838
+ "step": 46000
839
+ },
840
+ {
841
+ "epoch": 4.133321607598276,
842
+ "grad_norm": 1.2995333671569824,
843
+ "learning_rate": 0.00019268314651030522,
844
+ "loss": 0.1833,
845
+ "step": 47000
846
+ },
847
+ {
848
+ "epoch": 4.133321607598276,
849
+ "eval_accuracy": 0.9120207766510018,
850
+ "eval_combined_score": 0.8978153030861435,
851
+ "eval_f1": 0.8836098295212853,
852
+ "eval_loss": 0.21560963988304138,
853
+ "eval_runtime": 54.9981,
854
+ "eval_samples_per_second": 735.116,
855
+ "eval_steps_per_second": 1.436,
856
+ "step": 47000
857
+ },
858
+ {
859
+ "epoch": 4.221264620525899,
860
+ "grad_norm": 1.6505780220031738,
861
+ "learning_rate": 0.00018865858405213055,
862
+ "loss": 0.1804,
863
+ "step": 48000
864
+ },
865
+ {
866
+ "epoch": 4.221264620525899,
867
+ "eval_accuracy": 0.9126391293593866,
868
+ "eval_combined_score": 0.8979239559236587,
869
+ "eval_f1": 0.8832087824879307,
870
+ "eval_loss": 0.2259899377822876,
871
+ "eval_runtime": 87.2335,
872
+ "eval_samples_per_second": 463.469,
873
+ "eval_steps_per_second": 0.906,
874
+ "step": 48000
875
+ },
876
+ {
877
+ "epoch": 4.309207633453522,
878
+ "grad_norm": 1.3793467283248901,
879
+ "learning_rate": 0.0001846039836824406,
880
+ "loss": 0.1844,
881
+ "step": 49000
882
+ },
883
+ {
884
+ "epoch": 4.309207633453522,
885
+ "eval_accuracy": 0.9140737076428395,
886
+ "eval_combined_score": 0.9003240420152293,
887
+ "eval_f1": 0.8865743763876192,
888
+ "eval_loss": 0.22719497978687286,
889
+ "eval_runtime": 94.4623,
890
+ "eval_samples_per_second": 428.001,
891
+ "eval_steps_per_second": 0.836,
892
+ "step": 49000
893
+ },
894
+ {
895
+ "epoch": 4.397150646381145,
896
+ "grad_norm": 1.4247913360595703,
897
+ "learning_rate": 0.00018052249584582937,
898
+ "loss": 0.1814,
899
+ "step": 50000
900
+ },
901
+ {
902
+ "epoch": 4.397150646381145,
903
+ "eval_accuracy": 0.9132822161761068,
904
+ "eval_combined_score": 0.8994954567645432,
905
+ "eval_f1": 0.8857086973529795,
906
+ "eval_loss": 0.2213682383298874,
907
+ "eval_runtime": 87.3662,
908
+ "eval_samples_per_second": 462.765,
909
+ "eval_steps_per_second": 0.904,
910
+ "step": 50000
911
+ },
912
+ {
913
+ "epoch": 4.485093659308768,
914
+ "grad_norm": 0.8198834657669067,
915
+ "learning_rate": 0.0001764172918785858,
916
+ "loss": 0.1823,
917
+ "step": 51000
918
+ },
919
+ {
920
+ "epoch": 4.485093659308768,
921
+ "eval_accuracy": 0.9134306208261193,
922
+ "eval_combined_score": 0.8997714082914925,
923
+ "eval_f1": 0.8861121957568658,
924
+ "eval_loss": 0.21433314681053162,
925
+ "eval_runtime": 54.737,
926
+ "eval_samples_per_second": 738.623,
927
+ "eval_steps_per_second": 1.443,
928
+ "step": 51000
929
+ },
930
+ {
931
+ "epoch": 4.573036672236391,
932
+ "grad_norm": 1.7346959114074707,
933
+ "learning_rate": 0.0001722915615445501,
934
+ "loss": 0.1861,
935
+ "step": 52000
936
+ },
937
+ {
938
+ "epoch": 4.573036672236391,
939
+ "eval_accuracy": 0.912292851842691,
940
+ "eval_combined_score": 0.8985777526308721,
941
+ "eval_f1": 0.8848626534190532,
942
+ "eval_loss": 0.21990624070167542,
943
+ "eval_runtime": 54.6796,
944
+ "eval_samples_per_second": 739.399,
945
+ "eval_steps_per_second": 1.445,
946
+ "step": 52000
947
+ },
948
+ {
949
+ "epoch": 4.660979685164014,
950
+ "grad_norm": 0.8727362751960754,
951
+ "learning_rate": 0.0001681485105566511,
952
+ "loss": 0.1885,
953
+ "step": 53000
954
+ },
955
+ {
956
+ "epoch": 4.660979685164014,
957
+ "eval_accuracy": 0.9137521642344794,
958
+ "eval_combined_score": 0.9000197312286045,
959
+ "eval_f1": 0.8862872982227294,
960
+ "eval_loss": 0.21856088936328888,
961
+ "eval_runtime": 54.0603,
962
+ "eval_samples_per_second": 747.868,
963
+ "eval_steps_per_second": 1.461,
964
+ "step": 53000
965
+ },
966
+ {
967
+ "epoch": 4.748922698091636,
968
+ "grad_norm": 0.7080467343330383,
969
+ "learning_rate": 0.00016399135808605172,
970
+ "loss": 0.1835,
971
+ "step": 54000
972
+ },
973
+ {
974
+ "epoch": 4.748922698091636,
975
+ "eval_accuracy": 0.9138758347761563,
976
+ "eval_combined_score": 0.8999045687672304,
977
+ "eval_f1": 0.8859333027583044,
978
+ "eval_loss": 0.22292451560497284,
979
+ "eval_runtime": 54.607,
980
+ "eval_samples_per_second": 740.382,
981
+ "eval_steps_per_second": 1.447,
982
+ "step": 54000
983
+ },
984
+ {
985
+ "epoch": 4.83686571101926,
986
+ "grad_norm": 0.5516489148139954,
987
+ "learning_rate": 0.00015982333426083677,
988
+ "loss": 0.186,
989
+ "step": 55000
990
+ },
991
+ {
992
+ "epoch": 4.83686571101926,
993
+ "eval_accuracy": 0.9153351471679446,
994
+ "eval_combined_score": 0.9019310965591897,
995
+ "eval_f1": 0.8885270459504347,
996
+ "eval_loss": 0.21966171264648438,
997
+ "eval_runtime": 53.8107,
998
+ "eval_samples_per_second": 751.337,
999
+ "eval_steps_per_second": 1.468,
1000
+ "step": 55000
1001
+ },
1002
+ {
1003
+ "epoch": 4.9248087239468825,
1004
+ "grad_norm": 1.1122982501983643,
1005
+ "learning_rate": 0.00015564767765618756,
1006
+ "loss": 0.1846,
1007
+ "step": 56000
1008
+ },
1009
+ {
1010
+ "epoch": 4.9248087239468825,
1011
+ "eval_accuracy": 0.9129854068760821,
1012
+ "eval_combined_score": 0.8988205722905,
1013
+ "eval_f1": 0.884655737704918,
1014
+ "eval_loss": 0.21510818600654602,
1015
+ "eval_runtime": 87.5693,
1016
+ "eval_samples_per_second": 461.692,
1017
+ "eval_steps_per_second": 0.902,
1018
+ "step": 56000
1019
+ },
1020
+ {
1021
+ "epoch": 5.012751736874505,
1022
+ "grad_norm": 0.8173431158065796,
1023
+ "learning_rate": 0.0001514676327779928,
1024
+ "loss": 0.1791,
1025
+ "step": 57000
1026
+ },
1027
+ {
1028
+ "epoch": 5.012751736874505,
1029
+ "eval_accuracy": 0.9143952510511996,
1030
+ "eval_combined_score": 0.9002526783407632,
1031
+ "eval_f1": 0.8861101056303268,
1032
+ "eval_loss": 0.21763020753860474,
1033
+ "eval_runtime": 87.3646,
1034
+ "eval_samples_per_second": 462.774,
1035
+ "eval_steps_per_second": 0.904,
1036
+ "step": 57000
1037
+ },
1038
+ {
1039
+ "epoch": 5.100694749802129,
1040
+ "grad_norm": 1.5404053926467896,
1041
+ "learning_rate": 0.00014728644754185164,
1042
+ "loss": 0.1735,
1043
+ "step": 58000
1044
+ },
1045
+ {
1046
+ "epoch": 5.100694749802129,
1047
+ "eval_accuracy": 0.9145931239178827,
1048
+ "eval_combined_score": 0.9004645398383976,
1049
+ "eval_f1": 0.8863359557589124,
1050
+ "eval_loss": 0.21669216454029083,
1051
+ "eval_runtime": 87.4837,
1052
+ "eval_samples_per_second": 462.143,
1053
+ "eval_steps_per_second": 0.903,
1054
+ "step": 58000
1055
+ },
1056
+ {
1057
+ "epoch": 5.188637762729751,
1058
+ "grad_norm": 1.2535247802734375,
1059
+ "learning_rate": 0.00014310737074942683,
1060
+ "loss": 0.1733,
1061
+ "step": 59000
1062
+ },
1063
+ {
1064
+ "epoch": 5.188637762729751,
1065
+ "eval_accuracy": 0.9115013603759584,
1066
+ "eval_combined_score": 0.8982043600644589,
1067
+ "eval_f1": 0.8849073597529593,
1068
+ "eval_loss": 0.22689573466777802,
1069
+ "eval_runtime": 55.208,
1070
+ "eval_samples_per_second": 732.321,
1071
+ "eval_steps_per_second": 1.431,
1072
+ "step": 59000
1073
+ },
1074
+ {
1075
+ "epoch": 5.276580775657374,
1076
+ "grad_norm": 1.0199129581451416,
1077
+ "learning_rate": 0.00013893364956411012,
1078
+ "loss": 0.1684,
1079
+ "step": 60000
1080
+ },
1081
+ {
1082
+ "epoch": 5.276580775657374,
1083
+ "eval_accuracy": 0.9096463022508039,
1084
+ "eval_combined_score": 0.8967376049336386,
1085
+ "eval_f1": 0.8838289076164733,
1086
+ "eval_loss": 0.23635230958461761,
1087
+ "eval_runtime": 58.8389,
1088
+ "eval_samples_per_second": 687.13,
1089
+ "eval_steps_per_second": 1.343,
1090
+ "step": 60000
1091
+ },
1092
+ {
1093
+ "epoch": 5.364523788584997,
1094
+ "grad_norm": 1.3774573802947998,
1095
+ "learning_rate": 0.0001347685269879597,
1096
+ "loss": 0.1776,
1097
+ "step": 61000
1098
+ },
1099
+ {
1100
+ "epoch": 5.364523788584997,
1101
+ "eval_accuracy": 0.9106356665842197,
1102
+ "eval_combined_score": 0.8973484465193344,
1103
+ "eval_f1": 0.8840612264544492,
1104
+ "eval_loss": 0.22598478198051453,
1105
+ "eval_runtime": 54.8948,
1106
+ "eval_samples_per_second": 736.5,
1107
+ "eval_steps_per_second": 1.439,
1108
+ "step": 61000
1109
+ },
1110
+ {
1111
+ "epoch": 5.45246680151262,
1112
+ "grad_norm": 0.5923727750778198,
1113
+ "learning_rate": 0.00013061523934187208,
1114
+ "loss": 0.1746,
1115
+ "step": 62000
1116
+ },
1117
+ {
1118
+ "epoch": 5.45246680151262,
1119
+ "eval_accuracy": 0.9127627999010636,
1120
+ "eval_combined_score": 0.8995621197365935,
1121
+ "eval_f1": 0.8863614395721235,
1122
+ "eval_loss": 0.22661614418029785,
1123
+ "eval_runtime": 55.9826,
1124
+ "eval_samples_per_second": 722.188,
1125
+ "eval_steps_per_second": 1.411,
1126
+ "step": 62000
1127
+ },
1128
+ {
1129
+ "epoch": 5.540409814440243,
1130
+ "grad_norm": 0.7008156776428223,
1131
+ "learning_rate": 0.0001264770137509442,
1132
+ "loss": 0.1711,
1133
+ "step": 63000
1134
+ },
1135
+ {
1136
+ "epoch": 5.540409814440243,
1137
+ "eval_accuracy": 0.9127627999010636,
1138
+ "eval_combined_score": 0.8985408672994117,
1139
+ "eval_f1": 0.8843189346977598,
1140
+ "eval_loss": 0.22105169296264648,
1141
+ "eval_runtime": 86.859,
1142
+ "eval_samples_per_second": 465.467,
1143
+ "eval_steps_per_second": 0.91,
1144
+ "step": 63000
1145
+ },
1146
+ {
1147
+ "epoch": 5.628352827367865,
1148
+ "grad_norm": 1.5121339559555054,
1149
+ "learning_rate": 0.00012235706563698158,
1150
+ "loss": 0.1723,
1151
+ "step": 64000
1152
+ },
1153
+ {
1154
+ "epoch": 5.628352827367865,
1155
+ "eval_accuracy": 0.9145683898095474,
1156
+ "eval_combined_score": 0.9011690402388002,
1157
+ "eval_f1": 0.887769690668053,
1158
+ "eval_loss": 0.22686441242694855,
1159
+ "eval_runtime": 87.1861,
1160
+ "eval_samples_per_second": 463.721,
1161
+ "eval_steps_per_second": 0.906,
1162
+ "step": 64000
1163
+ },
1164
+ {
1165
+ "epoch": 5.716295840295489,
1166
+ "grad_norm": 1.6048673391342163,
1167
+ "learning_rate": 0.00011825859622009953,
1168
+ "loss": 0.1751,
1169
+ "step": 65000
1170
+ },
1171
+ {
1172
+ "epoch": 5.716295840295489,
1173
+ "eval_accuracy": 0.9146673262428889,
1174
+ "eval_combined_score": 0.9006424576527972,
1175
+ "eval_f1": 0.8866175890627054,
1176
+ "eval_loss": 0.22858625650405884,
1177
+ "eval_runtime": 87.0073,
1178
+ "eval_samples_per_second": 464.674,
1179
+ "eval_steps_per_second": 0.908,
1180
+ "step": 65000
1181
+ },
1182
+ {
1183
+ "epoch": 5.8042388532231115,
1184
+ "grad_norm": 1.0805526971817017,
1185
+ "learning_rate": 0.00011418479003135898,
1186
+ "loss": 0.1696,
1187
+ "step": 66000
1188
+ },
1189
+ {
1190
+ "epoch": 5.8042388532231115,
1191
+ "eval_accuracy": 0.9142468464011873,
1192
+ "eval_combined_score": 0.9009813109487557,
1193
+ "eval_f1": 0.8877157754963241,
1194
+ "eval_loss": 0.2191147804260254,
1195
+ "eval_runtime": 55.1326,
1196
+ "eval_samples_per_second": 733.323,
1197
+ "eval_steps_per_second": 1.433,
1198
+ "step": 66000
1199
+ },
1200
+ {
1201
+ "epoch": 5.892181866150734,
1202
+ "grad_norm": 0.6021662950515747,
1203
+ "learning_rate": 0.00011013881243837068,
1204
+ "loss": 0.1726,
1205
+ "step": 67000
1206
+ },
1207
+ {
1208
+ "epoch": 5.892181866150734,
1209
+ "eval_accuracy": 0.914988869651249,
1210
+ "eval_combined_score": 0.9015627830762347,
1211
+ "eval_f1": 0.8881366965012205,
1212
+ "eval_loss": 0.2130921483039856,
1213
+ "eval_runtime": 56.0344,
1214
+ "eval_samples_per_second": 721.52,
1215
+ "eval_steps_per_second": 1.41,
1216
+ "step": 67000
1217
+ },
1218
+ {
1219
+ "epoch": 5.980124879078357,
1220
+ "grad_norm": 1.1541500091552734,
1221
+ "learning_rate": 0.00010612380718578806,
1222
+ "loss": 0.1737,
1223
+ "step": 68000
1224
+ },
1225
+ {
1226
+ "epoch": 5.980124879078357,
1227
+ "eval_accuracy": 0.9167449913430621,
1228
+ "eval_combined_score": 0.9030797694740148,
1229
+ "eval_f1": 0.8894145476049675,
1230
+ "eval_loss": 0.22315308451652527,
1231
+ "eval_runtime": 54.8044,
1232
+ "eval_samples_per_second": 737.714,
1233
+ "eval_steps_per_second": 1.441,
1234
+ "step": 68000
1235
+ },
1236
+ {
1237
+ "epoch": 6.06806789200598,
1238
+ "grad_norm": 0.5831199288368225,
1239
+ "learning_rate": 0.00010214289395260275,
1240
+ "loss": 0.164,
1241
+ "step": 69000
1242
+ },
1243
+ {
1244
+ "epoch": 6.06806789200598,
1245
+ "eval_accuracy": 0.914543655701212,
1246
+ "eval_combined_score": 0.9012041896694369,
1247
+ "eval_f1": 0.8878647236376619,
1248
+ "eval_loss": 0.22724460065364838,
1249
+ "eval_runtime": 55.5028,
1250
+ "eval_samples_per_second": 728.432,
1251
+ "eval_steps_per_second": 1.423,
1252
+ "step": 69000
1253
+ },
1254
+ {
1255
+ "epoch": 6.156010904933603,
1256
+ "grad_norm": 1.3472540378570557,
1257
+ "learning_rate": 9.819916592813812e-05,
1258
+ "loss": 0.1605,
1259
+ "step": 70000
1260
+ },
1261
+ {
1262
+ "epoch": 6.156010904933603,
1263
+ "eval_accuracy": 0.9143210487261935,
1264
+ "eval_combined_score": 0.9014692060351224,
1265
+ "eval_f1": 0.8886173633440514,
1266
+ "eval_loss": 0.23116746544837952,
1267
+ "eval_runtime": 55.3566,
1268
+ "eval_samples_per_second": 730.355,
1269
+ "eval_steps_per_second": 1.427,
1270
+ "step": 70000
1271
+ },
1272
+ {
1273
+ "epoch": 6.2439539178612256,
1274
+ "grad_norm": 1.1969749927520752,
1275
+ "learning_rate": 9.429568740862609e-05,
1276
+ "loss": 0.1619,
1277
+ "step": 71000
1278
+ },
1279
+ {
1280
+ "epoch": 6.2439539178612256,
1281
+ "eval_accuracy": 0.9136284936928024,
1282
+ "eval_combined_score": 0.9008383555922335,
1283
+ "eval_f1": 0.8880482174916645,
1284
+ "eval_loss": 0.2302049845457077,
1285
+ "eval_runtime": 53.2162,
1286
+ "eval_samples_per_second": 759.732,
1287
+ "eval_steps_per_second": 1.485,
1288
+ "step": 71000
1289
+ },
1290
+ {
1291
+ "epoch": 6.331896930788849,
1292
+ "grad_norm": 0.6474857926368713,
1293
+ "learning_rate": 9.043549141623341e-05,
1294
+ "loss": 0.1549,
1295
+ "step": 72000
1296
+ },
1297
+ {
1298
+ "epoch": 6.331896930788849,
1299
+ "eval_accuracy": 0.9153846153846154,
1300
+ "eval_combined_score": 0.901999278681791,
1301
+ "eval_f1": 0.8886139419789666,
1302
+ "eval_loss": 0.23562392592430115,
1303
+ "eval_runtime": 53.9629,
1304
+ "eval_samples_per_second": 749.218,
1305
+ "eval_steps_per_second": 1.464,
1306
+ "step": 72000
1307
+ },
1308
+ {
1309
+ "epoch": 6.419839943716472,
1310
+ "grad_norm": 1.9965488910675049,
1311
+ "learning_rate": 8.662157734238882e-05,
1312
+ "loss": 0.1658,
1313
+ "step": 73000
1314
+ },
1315
+ {
1316
+ "epoch": 6.419839943716472,
1317
+ "eval_accuracy": 0.9152114766262677,
1318
+ "eval_combined_score": 0.9019202282416589,
1319
+ "eval_f1": 0.88862897985705,
1320
+ "eval_loss": 0.22398078441619873,
1321
+ "eval_runtime": 53.8509,
1322
+ "eval_samples_per_second": 750.777,
1323
+ "eval_steps_per_second": 1.467,
1324
+ "step": 73000
1325
+ },
1326
+ {
1327
+ "epoch": 6.507782956644094,
1328
+ "grad_norm": 1.2385900020599365,
1329
+ "learning_rate": 8.285690861724085e-05,
1330
+ "loss": 0.1656,
1331
+ "step": 74000
1332
+ },
1333
+ {
1334
+ "epoch": 6.507782956644094,
1335
+ "eval_accuracy": 0.9112045510759337,
1336
+ "eval_combined_score": 0.8983637551298036,
1337
+ "eval_f1": 0.8855229591836735,
1338
+ "eval_loss": 0.2247052639722824,
1339
+ "eval_runtime": 53.3174,
1340
+ "eval_samples_per_second": 758.289,
1341
+ "eval_steps_per_second": 1.482,
1342
+ "step": 74000
1343
+ },
1344
+ {
1345
+ "epoch": 6.595725969571718,
1346
+ "grad_norm": 1.513914942741394,
1347
+ "learning_rate": 7.914441040705777e-05,
1348
+ "loss": 0.1647,
1349
+ "step": 75000
1350
+ },
1351
+ {
1352
+ "epoch": 6.595725969571718,
1353
+ "eval_accuracy": 0.9162997773930249,
1354
+ "eval_combined_score": 0.9027180973192356,
1355
+ "eval_f1": 0.8891364172454462,
1356
+ "eval_loss": 0.22348518669605255,
1357
+ "eval_runtime": 53.4091,
1358
+ "eval_samples_per_second": 756.988,
1359
+ "eval_steps_per_second": 1.479,
1360
+ "step": 75000
1361
+ },
1362
+ {
1363
+ "epoch": 6.6836689824993405,
1364
+ "grad_norm": 1.1074211597442627,
1365
+ "learning_rate": 7.5486967341359e-05,
1366
+ "loss": 0.1611,
1367
+ "step": 76000
1368
+ },
1369
+ {
1370
+ "epoch": 6.6836689824993405,
1371
+ "eval_accuracy": 0.913727430126144,
1372
+ "eval_combined_score": 0.8999817842085385,
1373
+ "eval_f1": 0.8862361382909328,
1374
+ "eval_loss": 0.21794870495796204,
1375
+ "eval_runtime": 53.6652,
1376
+ "eval_samples_per_second": 753.375,
1377
+ "eval_steps_per_second": 1.472,
1378
+ "step": 76000
1379
+ },
1380
+ {
1381
+ "epoch": 6.771611995426963,
1382
+ "grad_norm": 1.8691868782043457,
1383
+ "learning_rate": 7.188742127154373e-05,
1384
+ "loss": 0.1698,
1385
+ "step": 77000
1386
+ },
1387
+ {
1388
+ "epoch": 6.771611995426963,
1389
+ "eval_accuracy": 0.9149146673262429,
1390
+ "eval_combined_score": 0.9020163847203675,
1391
+ "eval_f1": 0.889118102114492,
1392
+ "eval_loss": 0.2198248952627182,
1393
+ "eval_runtime": 53.3414,
1394
+ "eval_samples_per_second": 757.947,
1395
+ "eval_steps_per_second": 1.481,
1396
+ "step": 77000
1397
+ },
1398
+ {
1399
+ "epoch": 6.859555008354587,
1400
+ "grad_norm": 1.2905817031860352,
1401
+ "learning_rate": 6.834856906275834e-05,
1402
+ "loss": 0.1676,
1403
+ "step": 78000
1404
+ },
1405
+ {
1406
+ "epoch": 6.859555008354587,
1407
+ "eval_accuracy": 0.912663863467722,
1408
+ "eval_combined_score": 0.9001433954046545,
1409
+ "eval_f1": 0.8876229273415869,
1410
+ "eval_loss": 0.2259044647216797,
1411
+ "eval_runtime": 53.1863,
1412
+ "eval_samples_per_second": 760.159,
1413
+ "eval_steps_per_second": 1.485,
1414
+ "step": 78000
1415
+ },
1416
+ {
1417
+ "epoch": 6.947498021282209,
1418
+ "grad_norm": 1.9013108015060425,
1419
+ "learning_rate": 6.487316042071804e-05,
1420
+ "loss": 0.1659,
1421
+ "step": 79000
1422
+ },
1423
+ {
1424
+ "epoch": 6.947498021282209,
1425
+ "eval_accuracy": 0.9139253029928271,
1426
+ "eval_combined_score": 0.8987180240611856,
1427
+ "eval_f1": 0.8835107451295441,
1428
+ "eval_loss": 0.2250695824623108,
1429
+ "eval_runtime": 53.3885,
1430
+ "eval_samples_per_second": 757.279,
1431
+ "eval_steps_per_second": 1.48,
1432
+ "step": 79000
1433
+ },
1434
+ {
1435
+ "epoch": 7.035441034209832,
1436
+ "grad_norm": 0.7197607755661011,
1437
+ "learning_rate": 6.146389575517211e-05,
1438
+ "loss": 0.1592,
1439
+ "step": 80000
1440
+ },
1441
+ {
1442
+ "epoch": 7.035441034209832,
1443
+ "eval_accuracy": 0.9150630719762552,
1444
+ "eval_combined_score": 0.9013947413762137,
1445
+ "eval_f1": 0.8877264107761721,
1446
+ "eval_loss": 0.2220790535211563,
1447
+ "eval_runtime": 53.4285,
1448
+ "eval_samples_per_second": 756.713,
1449
+ "eval_steps_per_second": 1.479,
1450
+ "step": 80000
1451
+ },
1452
+ {
1453
+ "epoch": 7.1233840471374545,
1454
+ "grad_norm": 1.0090577602386475,
1455
+ "learning_rate": 5.81234240816722e-05,
1456
+ "loss": 0.1585,
1457
+ "step": 81000
1458
+ },
1459
+ {
1460
+ "epoch": 7.1233840471374545,
1461
+ "eval_accuracy": 0.9149394014345783,
1462
+ "eval_combined_score": 0.9015200700271515,
1463
+ "eval_f1": 0.8881007386197247,
1464
+ "eval_loss": 0.22591271996498108,
1465
+ "eval_runtime": 66.5471,
1466
+ "eval_samples_per_second": 607.539,
1467
+ "eval_steps_per_second": 1.187,
1468
+ "step": 81000
1469
+ },
1470
+ {
1471
+ "epoch": 7.211327060065078,
1472
+ "grad_norm": 0.8818415403366089,
1473
+ "learning_rate": 5.485434096327387e-05,
1474
+ "loss": 0.1556,
1475
+ "step": 82000
1476
+ },
1477
+ {
1478
+ "epoch": 7.211327060065078,
1479
+ "eval_accuracy": 0.9145931239178827,
1480
+ "eval_combined_score": 0.9012395471207972,
1481
+ "eval_f1": 0.8878859703237119,
1482
+ "eval_loss": 0.22191596031188965,
1483
+ "eval_runtime": 66.6411,
1484
+ "eval_samples_per_second": 606.683,
1485
+ "eval_steps_per_second": 1.185,
1486
+ "step": 82000
1487
+ },
1488
+ {
1489
+ "epoch": 7.299270072992701,
1490
+ "grad_norm": 1.0010974407196045,
1491
+ "learning_rate": 5.165918649377139e-05,
1492
+ "loss": 0.1529,
1493
+ "step": 83000
1494
+ },
1495
+ {
1496
+ "epoch": 7.299270072992701,
1497
+ "eval_accuracy": 0.9133069502844422,
1498
+ "eval_combined_score": 0.9001303337134192,
1499
+ "eval_f1": 0.8869537171423963,
1500
+ "eval_loss": 0.22674131393432617,
1501
+ "eval_runtime": 65.3133,
1502
+ "eval_samples_per_second": 619.016,
1503
+ "eval_steps_per_second": 1.21,
1504
+ "step": 83000
1505
+ },
1506
+ {
1507
+ "epoch": 7.387213085920323,
1508
+ "grad_norm": 1.4556635618209839,
1509
+ "learning_rate": 4.854044332403218e-05,
1510
+ "loss": 0.1524,
1511
+ "step": 84000
1512
+ },
1513
+ {
1514
+ "epoch": 7.387213085920323,
1515
+ "eval_accuracy": 0.9162255750680188,
1516
+ "eval_combined_score": 0.9033477259293785,
1517
+ "eval_f1": 0.8904698767907383,
1518
+ "eval_loss": 0.22501328587532043,
1519
+ "eval_runtime": 65.3744,
1520
+ "eval_samples_per_second": 618.438,
1521
+ "eval_steps_per_second": 1.208,
1522
+ "step": 84000
1523
+ },
1524
+ {
1525
+ "epoch": 7.475156098847947,
1526
+ "grad_norm": 1.5750316381454468,
1527
+ "learning_rate": 4.550053473296499e-05,
1528
+ "loss": 0.1565,
1529
+ "step": 85000
1530
+ },
1531
+ {
1532
+ "epoch": 7.475156098847947,
1533
+ "eval_accuracy": 0.9158792975513232,
1534
+ "eval_combined_score": 0.9019671659739106,
1535
+ "eval_f1": 0.8880550343964978,
1536
+ "eval_loss": 0.22253504395484924,
1537
+ "eval_runtime": 65.3176,
1538
+ "eval_samples_per_second": 618.975,
1539
+ "eval_steps_per_second": 1.209,
1540
+ "step": 85000
1541
+ },
1542
+ {
1543
+ "epoch": 7.5630991117755695,
1544
+ "grad_norm": 0.5022881627082825,
1545
+ "learning_rate": 4.254182274461983e-05,
1546
+ "loss": 0.1579,
1547
+ "step": 86000
1548
+ },
1549
+ {
1550
+ "epoch": 7.5630991117755695,
1551
+ "eval_accuracy": 0.9155330200346278,
1552
+ "eval_combined_score": 0.9024413634016252,
1553
+ "eval_f1": 0.8893497067686226,
1554
+ "eval_loss": 0.22446686029434204,
1555
+ "eval_runtime": 47.9666,
1556
+ "eval_samples_per_second": 842.878,
1557
+ "eval_steps_per_second": 1.647,
1558
+ "step": 86000
1559
+ },
1560
+ {
1561
+ "epoch": 7.651042124703192,
1562
+ "grad_norm": 0.5855485200881958,
1563
+ "learning_rate": 3.966660629288376e-05,
1564
+ "loss": 0.1593,
1565
+ "step": 87000
1566
+ },
1567
+ {
1568
+ "epoch": 7.651042124703192,
1569
+ "eval_accuracy": 0.9165223843680436,
1570
+ "eval_combined_score": 0.9034811240132703,
1571
+ "eval_f1": 0.890439863658497,
1572
+ "eval_loss": 0.22174741327762604,
1573
+ "eval_runtime": 47.9873,
1574
+ "eval_samples_per_second": 842.514,
1575
+ "eval_steps_per_second": 1.646,
1576
+ "step": 87000
1577
+ },
1578
+ {
1579
+ "epoch": 7.738985137630815,
1580
+ "grad_norm": 2.386737108230591,
1581
+ "learning_rate": 3.687711943519798e-05,
1582
+ "loss": 0.1557,
1583
+ "step": 88000
1584
+ },
1585
+ {
1586
+ "epoch": 7.738985137630815,
1587
+ "eval_accuracy": 0.9156319564679694,
1588
+ "eval_combined_score": 0.9024408037153315,
1589
+ "eval_f1": 0.8892496509626936,
1590
+ "eval_loss": 0.22780916094779968,
1591
+ "eval_runtime": 47.9697,
1592
+ "eval_samples_per_second": 842.824,
1593
+ "eval_steps_per_second": 1.647,
1594
+ "step": 88000
1595
+ },
1596
+ {
1597
+ "epoch": 7.826928150558438,
1598
+ "grad_norm": 1.339406967163086,
1599
+ "learning_rate": 3.4175529616683805e-05,
1600
+ "loss": 0.1586,
1601
+ "step": 89000
1602
+ },
1603
+ {
1604
+ "epoch": 7.826928150558438,
1605
+ "eval_accuracy": 0.9154588177096216,
1606
+ "eval_combined_score": 0.9022315846033333,
1607
+ "eval_f1": 0.8890043514970449,
1608
+ "eval_loss": 0.2238481193780899,
1609
+ "eval_runtime": 48.005,
1610
+ "eval_samples_per_second": 842.204,
1611
+ "eval_steps_per_second": 1.646,
1612
+ "step": 89000
1613
+ },
1614
+ {
1615
+ "epoch": 7.914871163486061,
1616
+ "grad_norm": 0.8423387408256531,
1617
+ "learning_rate": 3.156393598602742e-05,
1618
+ "loss": 0.1592,
1619
+ "step": 90000
1620
+ },
1621
+ {
1622
+ "epoch": 7.914871163486061,
1623
+ "eval_accuracy": 0.9166707890180559,
1624
+ "eval_combined_score": 0.9032521729008398,
1625
+ "eval_f1": 0.8898335567836239,
1626
+ "eval_loss": 0.21675018966197968,
1627
+ "eval_runtime": 47.9354,
1628
+ "eval_samples_per_second": 843.428,
1629
+ "eval_steps_per_second": 1.648,
1630
+ "step": 90000
1631
+ },
1632
+ {
1633
+ "epoch": 8.002814176413684,
1634
+ "grad_norm": 0.36737295985221863,
1635
+ "learning_rate": 2.9044367764430513e-05,
1636
+ "loss": 0.1539,
1637
+ "step": 91000
1638
+ },
1639
+ {
1640
+ "epoch": 8.002814176413684,
1641
+ "eval_accuracy": 0.9154835518179569,
1642
+ "eval_combined_score": 0.9016278730614073,
1643
+ "eval_f1": 0.8877721943048577,
1644
+ "eval_loss": 0.2243315726518631,
1645
+ "eval_runtime": 47.9598,
1646
+ "eval_samples_per_second": 842.998,
1647
+ "eval_steps_per_second": 1.647,
1648
+ "step": 91000
1649
+ },
1650
+ {
1651
+ "epoch": 8.090757189341307,
1652
+ "grad_norm": 1.1826375722885132,
1653
+ "learning_rate": 2.661878266889586e-05,
1654
+ "loss": 0.1486,
1655
+ "step": 92000
1656
+ },
1657
+ {
1658
+ "epoch": 8.090757189341307,
1659
+ "eval_accuracy": 0.9158792975513232,
1660
+ "eval_combined_score": 0.9024312433570227,
1661
+ "eval_f1": 0.8889831891627223,
1662
+ "eval_loss": 0.22577287256717682,
1663
+ "eval_runtime": 48.067,
1664
+ "eval_samples_per_second": 841.117,
1665
+ "eval_steps_per_second": 1.644,
1666
+ "step": 92000
1667
+ },
1668
+ {
1669
+ "epoch": 8.17870020226893,
1670
+ "grad_norm": 1.1861492395401,
1671
+ "learning_rate": 2.428906539107102e-05,
1672
+ "loss": 0.1505,
1673
+ "step": 93000
1674
+ },
1675
+ {
1676
+ "epoch": 8.17870020226893,
1677
+ "eval_accuracy": 0.9159287657679941,
1678
+ "eval_combined_score": 0.9030337128558772,
1679
+ "eval_f1": 0.8901386599437603,
1680
+ "eval_loss": 0.22480596601963043,
1681
+ "eval_runtime": 51.9003,
1682
+ "eval_samples_per_second": 778.994,
1683
+ "eval_steps_per_second": 1.522,
1684
+ "step": 93000
1685
+ },
1686
+ {
1687
+ "epoch": 8.266643215196552,
1688
+ "grad_norm": 1.6536142826080322,
1689
+ "learning_rate": 2.2057026132833862e-05,
1690
+ "loss": 0.1521,
1691
+ "step": 94000
1692
+ },
1693
+ {
1694
+ "epoch": 8.266643215196552,
1695
+ "eval_accuracy": 0.9164481820430374,
1696
+ "eval_combined_score": 0.9030352268207607,
1697
+ "eval_f1": 0.8896222715984838,
1698
+ "eval_loss": 0.2237117737531662,
1699
+ "eval_runtime": 47.9654,
1700
+ "eval_samples_per_second": 842.899,
1701
+ "eval_steps_per_second": 1.647,
1702
+ "step": 94000
1703
+ },
1704
+ {
1705
+ "epoch": 8.354586228124175,
1706
+ "grad_norm": 1.2048615217208862,
1707
+ "learning_rate": 1.992439919975663e-05,
1708
+ "loss": 0.1504,
1709
+ "step": 95000
1710
+ },
1711
+ {
1712
+ "epoch": 8.354586228124175,
1713
+ "eval_accuracy": 0.91535988127628,
1714
+ "eval_combined_score": 0.9023686326921905,
1715
+ "eval_f1": 0.8893773841081011,
1716
+ "eval_loss": 0.2246434986591339,
1717
+ "eval_runtime": 47.9948,
1718
+ "eval_samples_per_second": 842.384,
1719
+ "eval_steps_per_second": 1.646,
1720
+ "step": 95000
1721
+ },
1722
+ {
1723
+ "epoch": 8.442529241051798,
1724
+ "grad_norm": 1.1018518209457397,
1725
+ "learning_rate": 1.7892841653541984e-05,
1726
+ "loss": 0.1458,
1727
+ "step": 96000
1728
+ },
1729
+ {
1730
+ "epoch": 8.442529241051798,
1731
+ "eval_accuracy": 0.9163739797180311,
1732
+ "eval_combined_score": 0.9027698451286434,
1733
+ "eval_f1": 0.8891657105392559,
1734
+ "eval_loss": 0.22791457176208496,
1735
+ "eval_runtime": 47.9639,
1736
+ "eval_samples_per_second": 842.926,
1737
+ "eval_steps_per_second": 1.647,
1738
+ "step": 96000
1739
+ },
1740
+ {
1741
+ "epoch": 8.530472253979422,
1742
+ "grad_norm": 0.7790504097938538,
1743
+ "learning_rate": 1.596393202447782e-05,
1744
+ "loss": 0.153,
1745
+ "step": 97000
1746
+ },
1747
+ {
1748
+ "epoch": 8.530472253979422,
1749
+ "eval_accuracy": 0.9166460549097205,
1750
+ "eval_combined_score": 0.903004577028071,
1751
+ "eval_f1": 0.8893630991464215,
1752
+ "eval_loss": 0.22113507986068726,
1753
+ "eval_runtime": 47.9635,
1754
+ "eval_samples_per_second": 842.932,
1755
+ "eval_steps_per_second": 1.647,
1756
+ "step": 97000
1757
+ },
1758
+ {
1759
+ "epoch": 8.618415266907045,
1760
+ "grad_norm": 0.6206575036048889,
1761
+ "learning_rate": 1.4139169084911189e-05,
1762
+ "loss": 0.149,
1763
+ "step": 98000
1764
+ },
1765
+ {
1766
+ "epoch": 8.618415266907045,
1767
+ "eval_accuracy": 0.9167449913430621,
1768
+ "eval_combined_score": 0.9033760902078358,
1769
+ "eval_f1": 0.8900071890726097,
1770
+ "eval_loss": 0.2230953723192215,
1771
+ "eval_runtime": 47.918,
1772
+ "eval_samples_per_second": 843.732,
1773
+ "eval_steps_per_second": 1.649,
1774
+ "step": 98000
1775
+ },
1776
+ {
1777
+ "epoch": 8.706358279834667,
1778
+ "grad_norm": 0.9861566424369812,
1779
+ "learning_rate": 1.2419970684695196e-05,
1780
+ "loss": 0.1554,
1781
+ "step": 99000
1782
+ },
1783
+ {
1784
+ "epoch": 8.706358279834667,
1785
+ "eval_accuracy": 0.916423447934702,
1786
+ "eval_combined_score": 0.9031773580270923,
1787
+ "eval_f1": 0.8899312681194828,
1788
+ "eval_loss": 0.2201649397611618,
1789
+ "eval_runtime": 47.9763,
1790
+ "eval_samples_per_second": 842.708,
1791
+ "eval_steps_per_second": 1.647,
1792
+ "step": 99000
1793
+ },
1794
+ {
1795
+ "epoch": 8.79430129276229,
1796
+ "grad_norm": 1.2466603517532349,
1797
+ "learning_rate": 1.0807672649512177e-05,
1798
+ "loss": 0.1494,
1799
+ "step": 100000
1800
+ },
1801
+ {
1802
+ "epoch": 8.79430129276229,
1803
+ "eval_accuracy": 0.9156814246846401,
1804
+ "eval_combined_score": 0.9023141136518943,
1805
+ "eval_f1": 0.8889468026191485,
1806
+ "eval_loss": 0.22495532035827637,
1807
+ "eval_runtime": 47.9521,
1808
+ "eval_samples_per_second": 843.133,
1809
+ "eval_steps_per_second": 1.647,
1810
+ "step": 100000
1811
+ },
1812
+ {
1813
+ "epoch": 8.882244305689913,
1814
+ "grad_norm": 1.0836381912231445,
1815
+ "learning_rate": 9.3035277429309e-06,
1816
+ "loss": 0.1493,
1817
+ "step": 101000
1818
+ },
1819
+ {
1820
+ "epoch": 8.882244305689913,
1821
+ "eval_accuracy": 0.9164481820430374,
1822
+ "eval_combined_score": 0.9030712697049043,
1823
+ "eval_f1": 0.8896943573667712,
1824
+ "eval_loss": 0.2241670787334442,
1825
+ "eval_runtime": 47.9327,
1826
+ "eval_samples_per_second": 843.475,
1827
+ "eval_steps_per_second": 1.648,
1828
+ "step": 101000
1829
+ },
1830
+ {
1831
+ "epoch": 8.970187318617535,
1832
+ "grad_norm": 0.9767763614654541,
1833
+ "learning_rate": 7.908704693002666e-06,
1834
+ "loss": 0.1453,
1835
+ "step": 102000
1836
+ },
1837
+ {
1838
+ "epoch": 8.970187318617535,
1839
+ "eval_accuracy": 0.9160771704180064,
1840
+ "eval_combined_score": 0.9029090885723412,
1841
+ "eval_f1": 0.889741006726676,
1842
+ "eval_loss": 0.2252817302942276,
1843
+ "eval_runtime": 47.9718,
1844
+ "eval_samples_per_second": 842.787,
1845
+ "eval_steps_per_second": 1.647,
1846
+ "step": 102000
1847
+ },
1848
+ {
1849
+ "epoch": 9.05813033154516,
1850
+ "grad_norm": 1.684928297996521,
1851
+ "learning_rate": 6.624287284154212e-06,
1852
+ "loss": 0.1533,
1853
+ "step": 103000
1854
+ },
1855
+ {
1856
+ "epoch": 9.05813033154516,
1857
+ "eval_accuracy": 0.9167944595597328,
1858
+ "eval_combined_score": 0.9034945009640941,
1859
+ "eval_f1": 0.8901945423684554,
1860
+ "eval_loss": 0.2233511060476303,
1861
+ "eval_runtime": 47.9608,
1862
+ "eval_samples_per_second": 842.98,
1863
+ "eval_steps_per_second": 1.647,
1864
+ "step": 103000
1865
+ },
1866
+ {
1867
+ "epoch": 9.146073344472782,
1868
+ "grad_norm": 1.3675307035446167,
1869
+ "learning_rate": 5.451273515081639e-06,
1870
+ "loss": 0.1487,
1871
+ "step": 104000
1872
+ },
1873
+ {
1874
+ "epoch": 9.146073344472782,
1875
+ "eval_accuracy": 0.9162255750680188,
1876
+ "eval_combined_score": 0.9029876288477687,
1877
+ "eval_f1": 0.8897496826275186,
1878
+ "eval_loss": 0.22376590967178345,
1879
+ "eval_runtime": 47.9992,
1880
+ "eval_samples_per_second": 842.306,
1881
+ "eval_steps_per_second": 1.646,
1882
+ "step": 104000
1883
+ },
1884
+ {
1885
+ "epoch": 9.234016357400405,
1886
+ "grad_norm": 0.4817681312561035,
1887
+ "learning_rate": 4.3905748233003915e-06,
1888
+ "loss": 0.1466,
1889
+ "step": 105000
1890
+ },
1891
+ {
1892
+ "epoch": 9.234016357400405,
1893
+ "eval_accuracy": 0.9162750432846896,
1894
+ "eval_combined_score": 0.9027818388950022,
1895
+ "eval_f1": 0.8892886345053148,
1896
+ "eval_loss": 0.2244621366262436,
1897
+ "eval_runtime": 51.9298,
1898
+ "eval_samples_per_second": 778.551,
1899
+ "eval_steps_per_second": 1.521,
1900
+ "step": 105000
1901
+ },
1902
+ {
1903
+ "epoch": 9.321959370328027,
1904
+ "grad_norm": 2.7926080226898193,
1905
+ "learning_rate": 3.4430153769539838e-06,
1906
+ "loss": 0.1469,
1907
+ "step": 106000
1908
+ },
1909
+ {
1910
+ "epoch": 9.321959370328027,
1911
+ "eval_accuracy": 0.9162255750680188,
1912
+ "eval_combined_score": 0.9030378257466343,
1913
+ "eval_f1": 0.8898500764252496,
1914
+ "eval_loss": 0.2267247438430786,
1915
+ "eval_runtime": 47.9367,
1916
+ "eval_samples_per_second": 843.403,
1917
+ "eval_steps_per_second": 1.648,
1918
+ "step": 106000
1919
+ },
1920
+ {
1921
+ "epoch": 9.40990238325565,
1922
+ "grad_norm": 1.4503751993179321,
1923
+ "learning_rate": 2.609331434431139e-06,
1924
+ "loss": 0.1515,
1925
+ "step": 107000
1926
+ },
1927
+ {
1928
+ "epoch": 9.40990238325565,
1929
+ "eval_accuracy": 0.916176106851348,
1930
+ "eval_combined_score": 0.9029554684248606,
1931
+ "eval_f1": 0.8897348299983732,
1932
+ "eval_loss": 0.2243395447731018,
1933
+ "eval_runtime": 47.9531,
1934
+ "eval_samples_per_second": 843.116,
1935
+ "eval_steps_per_second": 1.647,
1936
+ "step": 107000
1937
+ },
1938
+ {
1939
+ "epoch": 9.497845396183273,
1940
+ "grad_norm": 1.1244585514068604,
1941
+ "learning_rate": 1.890170772289401e-06,
1942
+ "loss": 0.1493,
1943
+ "step": 108000
1944
+ },
1945
+ {
1946
+ "epoch": 9.497845396183273,
1947
+ "eval_accuracy": 0.9162750432846896,
1948
+ "eval_combined_score": 0.9030197960423187,
1949
+ "eval_f1": 0.8897645487999479,
1950
+ "eval_loss": 0.2248089611530304,
1951
+ "eval_runtime": 47.9657,
1952
+ "eval_samples_per_second": 842.895,
1953
+ "eval_steps_per_second": 1.647,
1954
+ "step": 108000
1955
+ },
1956
+ {
1957
+ "epoch": 9.585788409110895,
1958
+ "grad_norm": 1.1632763147354126,
1959
+ "learning_rate": 1.286092181929571e-06,
1960
+ "loss": 0.1505,
1961
+ "step": 109000
1962
+ },
1963
+ {
1964
+ "epoch": 9.585788409110895,
1965
+ "eval_accuracy": 0.9162008409596833,
1966
+ "eval_combined_score": 0.9029320494774318,
1967
+ "eval_f1": 0.8896632579951801,
1968
+ "eval_loss": 0.22427567839622498,
1969
+ "eval_runtime": 48.0054,
1970
+ "eval_samples_per_second": 842.197,
1971
+ "eval_steps_per_second": 1.646,
1972
+ "step": 109000
1973
+ },
1974
+ {
1975
+ "epoch": 9.67373142203852,
1976
+ "grad_norm": 3.00376296043396,
1977
+ "learning_rate": 7.975650354119345e-07,
1978
+ "loss": 0.1409,
1979
+ "step": 110000
1980
+ },
1981
+ {
1982
+ "epoch": 9.67373142203852,
1983
+ "eval_accuracy": 0.9162750432846896,
1984
+ "eval_combined_score": 0.9029407046316152,
1985
+ "eval_f1": 0.8896063659785409,
1986
+ "eval_loss": 0.2249860167503357,
1987
+ "eval_runtime": 48.0308,
1988
+ "eval_samples_per_second": 841.752,
1989
+ "eval_steps_per_second": 1.645,
1990
+ "step": 110000
1991
+ },
1992
+ {
1993
+ "epoch": 9.761674434966142,
1994
+ "grad_norm": 0.831674337387085,
1995
+ "learning_rate": 4.249689207519447e-07,
1996
+ "loss": 0.1489,
1997
+ "step": 111000
1998
+ },
1999
+ {
2000
+ "epoch": 9.761674434966142,
2001
+ "eval_accuracy": 0.9162255750680188,
2002
+ "eval_combined_score": 0.9029481241899265,
2003
+ "eval_f1": 0.8896706733118342,
2004
+ "eval_loss": 0.22530485689640045,
2005
+ "eval_runtime": 47.9605,
2006
+ "eval_samples_per_second": 842.985,
2007
+ "eval_steps_per_second": 1.647,
2008
+ "step": 111000
2009
+ },
2010
+ {
2011
+ "epoch": 9.849617447893765,
2012
+ "grad_norm": 0.972335159778595,
2013
+ "learning_rate": 1.6859334697840177e-07,
2014
+ "loss": 0.152,
2015
+ "step": 112000
2016
+ },
2017
+ {
2018
+ "epoch": 9.849617447893765,
2019
+ "eval_accuracy": 0.916176106851348,
2020
+ "eval_combined_score": 0.9029303437157037,
2021
+ "eval_f1": 0.8896845805800593,
2022
+ "eval_loss": 0.22533808648586273,
2023
+ "eval_runtime": 47.9968,
2024
+ "eval_samples_per_second": 842.348,
2025
+ "eval_steps_per_second": 1.646,
2026
+ "step": 112000
2027
+ },
2028
+ {
2029
+ "epoch": 9.937560460821388,
2030
+ "grad_norm": 0.8028485178947449,
2031
+ "learning_rate": 2.863751918346091e-08,
2032
+ "loss": 0.1449,
2033
+ "step": 113000
2034
+ },
2035
+ {
2036
+ "epoch": 9.937560460821388,
2037
+ "eval_accuracy": 0.9161513727430126,
2038
+ "eval_combined_score": 0.9028999051215063,
2039
+ "eval_f1": 0.8896484375,
2040
+ "eval_loss": 0.22538121044635773,
2041
+ "eval_runtime": 47.9803,
2042
+ "eval_samples_per_second": 842.637,
2043
+ "eval_steps_per_second": 1.647,
2044
+ "step": 113000
2045
+ },
2046
+ {
2047
+ "epoch": 10.0,
2048
+ "step": 113710,
2049
+ "total_flos": 6.076865681478144e+17,
2050
+ "train_loss": 0.18996371687816216,
2051
+ "train_runtime": 52014.9626,
2052
+ "train_samples_per_second": 69.95,
2053
+ "train_steps_per_second": 2.186
2054
+ }
2055
+ ],
2056
+ "logging_steps": 1000,
2057
+ "max_steps": 113710,
2058
+ "num_input_tokens_seen": 0,
2059
+ "num_train_epochs": 10,
2060
+ "save_steps": 1000,
2061
+ "stateful_callbacks": {
2062
+ "TrainerControl": {
2063
+ "args": {
2064
+ "should_epoch_stop": false,
2065
+ "should_evaluate": false,
2066
+ "should_log": false,
2067
+ "should_save": true,
2068
+ "should_training_stop": true
2069
+ },
2070
+ "attributes": {}
2071
+ }
2072
+ },
2073
+ "total_flos": 6.076865681478144e+17,
2074
+ "train_batch_size": 32,
2075
+ "trial_name": null,
2076
+ "trial_params": null
2077
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/QQP.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9170912688597576,
4
+ "eval_combined_score": 0.9035767921865203,
5
+ "eval_f1": 0.8900623155132831,
6
+ "eval_loss": 0.22255383431911469,
7
+ "eval_runtime": 55.1264,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 733.405,
10
+ "eval_steps_per_second": 1.433
11
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9170912688597576,
4
+ "eval_combined_score": 0.9035767921865203,
5
+ "eval_f1": 0.8900623155132831,
6
+ "eval_loss": 0.22255383431911469,
7
+ "eval_runtime": 55.1264,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 733.405,
10
+ "eval_steps_per_second": 1.433
11
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/ft2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": {
3
+ "base_model_class": "DebertaV2ForSequenceClassification",
4
+ "parent_library": "transformers.models.deberta_v2.modeling_deberta_v2"
5
+ },
6
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
7
+ "bias": "none",
8
+ "boft_block_num": 0,
9
+ "boft_block_size": 4,
10
+ "boft_dropout": 0.05,
11
+ "boft_n_butterfly_factor": 2,
12
+ "exclude_modules": null,
13
+ "fan_in_fan_out": false,
14
+ "inference_mode": true,
15
+ "init_weights": true,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "modules_to_save": [
19
+ "classifier",
20
+ "pooler"
21
+ ],
22
+ "peft_type": "BOFT",
23
+ "peft_version": "0.18.0",
24
+ "revision": null,
25
+ "target_modules": [
26
+ "value_proj",
27
+ "attention.output.dense",
28
+ "key_proj",
29
+ "output.dense",
30
+ "query_proj",
31
+ "intermediate.dense"
32
+ ],
33
+ "task_type": null
34
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/trainer_state.json ADDED
@@ -0,0 +1,2077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 94000,
3
+ "best_metric": 0.9170912688597576,
4
+ "best_model_checkpoint": "./glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d01h13m32,sd43/checkpoint-94000",
5
+ "epoch": 10.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 113710,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0879430129276229,
14
+ "grad_norm": 2.7297446727752686,
15
+ "learning_rate": 0.00029969999999999997,
16
+ "loss": 0.4413,
17
+ "step": 1000
18
+ },
19
+ {
20
+ "epoch": 0.0879430129276229,
21
+ "eval_accuracy": 0.8535988127627999,
22
+ "eval_combined_score": 0.8363529041351361,
23
+ "eval_f1": 0.8191069955074722,
24
+ "eval_loss": 0.3258603811264038,
25
+ "eval_runtime": 57.5406,
26
+ "eval_samples_per_second": 702.634,
27
+ "eval_steps_per_second": 1.373,
28
+ "step": 1000
29
+ },
30
+ {
31
+ "epoch": 0.1758860258552458,
32
+ "grad_norm": 0.6822599768638611,
33
+ "learning_rate": 0.00029994184111301736,
34
+ "loss": 0.3205,
35
+ "step": 2000
36
+ },
37
+ {
38
+ "epoch": 0.1758860258552458,
39
+ "eval_accuracy": 0.8633687855552807,
40
+ "eval_combined_score": 0.8474359789172207,
41
+ "eval_f1": 0.8315031722791606,
42
+ "eval_loss": 0.30383479595184326,
43
+ "eval_runtime": 55.7418,
44
+ "eval_samples_per_second": 725.309,
45
+ "eval_steps_per_second": 1.417,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 0.2638290387828687,
50
+ "grad_norm": 0.7207789421081543,
51
+ "learning_rate": 0.00029976717673021227,
52
+ "loss": 0.2897,
53
+ "step": 3000
54
+ },
55
+ {
56
+ "epoch": 0.2638290387828687,
57
+ "eval_accuracy": 0.881375216423448,
58
+ "eval_combined_score": 0.8639466967933932,
59
+ "eval_f1": 0.8465181771633384,
60
+ "eval_loss": 0.27747777104377747,
61
+ "eval_runtime": 55.436,
62
+ "eval_samples_per_second": 729.309,
63
+ "eval_steps_per_second": 1.425,
64
+ "step": 3000
65
+ },
66
+ {
67
+ "epoch": 0.3517720517104916,
68
+ "grad_norm": 0.9168917536735535,
69
+ "learning_rate": 0.0002994761425083971,
70
+ "loss": 0.2887,
71
+ "step": 4000
72
+ },
73
+ {
74
+ "epoch": 0.3517720517104916,
75
+ "eval_accuracy": 0.8850853326737571,
76
+ "eval_combined_score": 0.8636792768746298,
77
+ "eval_f1": 0.8422732210755024,
78
+ "eval_loss": 0.26767390966415405,
79
+ "eval_runtime": 62.1194,
80
+ "eval_samples_per_second": 650.843,
81
+ "eval_steps_per_second": 1.272,
82
+ "step": 4000
83
+ },
84
+ {
85
+ "epoch": 0.4397150646381145,
86
+ "grad_norm": 1.0872021913528442,
87
+ "learning_rate": 0.0002990689645826054,
88
+ "loss": 0.2715,
89
+ "step": 5000
90
+ },
91
+ {
92
+ "epoch": 0.4397150646381145,
93
+ "eval_accuracy": 0.889339599307445,
94
+ "eval_combined_score": 0.8715030566694875,
95
+ "eval_f1": 0.85366651403153,
96
+ "eval_loss": 0.2571745216846466,
97
+ "eval_runtime": 93.9863,
98
+ "eval_samples_per_second": 430.169,
99
+ "eval_steps_per_second": 0.841,
100
+ "step": 5000
101
+ },
102
+ {
103
+ "epoch": 0.5276580775657373,
104
+ "grad_norm": 0.8852691054344177,
105
+ "learning_rate": 0.00029854595933210474,
106
+ "loss": 0.2718,
107
+ "step": 6000
108
+ },
109
+ {
110
+ "epoch": 0.5276580775657373,
111
+ "eval_accuracy": 0.8892653969824388,
112
+ "eval_combined_score": 0.8717672318722942,
113
+ "eval_f1": 0.8542690667621496,
114
+ "eval_loss": 0.2696632146835327,
115
+ "eval_runtime": 87.155,
116
+ "eval_samples_per_second": 463.886,
117
+ "eval_steps_per_second": 0.906,
118
+ "step": 6000
119
+ },
120
+ {
121
+ "epoch": 0.6156010904933603,
122
+ "grad_norm": 0.9273120164871216,
123
+ "learning_rate": 0.0002979075331345683,
124
+ "loss": 0.2637,
125
+ "step": 7000
126
+ },
127
+ {
128
+ "epoch": 0.6156010904933603,
129
+ "eval_accuracy": 0.8877318822656444,
130
+ "eval_combined_score": 0.8720894776140518,
131
+ "eval_f1": 0.8564470729624593,
132
+ "eval_loss": 0.2592132091522217,
133
+ "eval_runtime": 87.1323,
134
+ "eval_samples_per_second": 464.007,
135
+ "eval_steps_per_second": 0.907,
136
+ "step": 7000
137
+ },
138
+ {
139
+ "epoch": 0.7035441034209832,
140
+ "grad_norm": 1.545639991760254,
141
+ "learning_rate": 0.0002971541820503175,
142
+ "loss": 0.2637,
143
+ "step": 8000
144
+ },
145
+ {
146
+ "epoch": 0.7035441034209832,
147
+ "eval_accuracy": 0.893099183774425,
148
+ "eval_combined_score": 0.8766595530773806,
149
+ "eval_f1": 0.8602199223803364,
150
+ "eval_loss": 0.2516544759273529,
151
+ "eval_runtime": 56.9694,
152
+ "eval_samples_per_second": 709.679,
153
+ "eval_steps_per_second": 1.387,
154
+ "step": 8000
155
+ },
156
+ {
157
+ "epoch": 0.7914871163486061,
158
+ "grad_norm": 0.8067322373390198,
159
+ "learning_rate": 0.00029628649143688076,
160
+ "loss": 0.2578,
161
+ "step": 9000
162
+ },
163
+ {
164
+ "epoch": 0.7914871163486061,
165
+ "eval_accuracy": 0.8898095473658174,
166
+ "eval_combined_score": 0.8757041150728245,
167
+ "eval_f1": 0.8615986827798316,
168
+ "eval_loss": 0.26392853260040283,
169
+ "eval_runtime": 55.9304,
170
+ "eval_samples_per_second": 722.863,
171
+ "eval_steps_per_second": 1.412,
172
+ "step": 9000
173
+ },
174
+ {
175
+ "epoch": 0.879430129276229,
176
+ "grad_norm": 1.1267387866973877,
177
+ "learning_rate": 0.0002953051354941674,
178
+ "loss": 0.2582,
179
+ "step": 10000
180
+ },
181
+ {
182
+ "epoch": 0.879430129276229,
183
+ "eval_accuracy": 0.8933465248577789,
184
+ "eval_combined_score": 0.8785575035510934,
185
+ "eval_f1": 0.8637684822444079,
186
+ "eval_loss": 0.25557711720466614,
187
+ "eval_runtime": 55.7874,
188
+ "eval_samples_per_second": 724.715,
189
+ "eval_steps_per_second": 1.416,
190
+ "step": 10000
191
+ },
192
+ {
193
+ "epoch": 0.967373142203852,
194
+ "grad_norm": 1.7442392110824585,
195
+ "learning_rate": 0.0002942108767406115,
196
+ "loss": 0.2531,
197
+ "step": 11000
198
+ },
199
+ {
200
+ "epoch": 0.967373142203852,
201
+ "eval_accuracy": 0.8981944100915162,
202
+ "eval_combined_score": 0.8805794263801568,
203
+ "eval_f1": 0.8629644426687975,
204
+ "eval_loss": 0.2394532710313797,
205
+ "eval_runtime": 79.3594,
206
+ "eval_samples_per_second": 509.454,
207
+ "eval_steps_per_second": 0.995,
208
+ "step": 11000
209
+ },
210
+ {
211
+ "epoch": 1.0553161551314747,
212
+ "grad_norm": 0.8848230838775635,
213
+ "learning_rate": 0.00029300456542069104,
214
+ "loss": 0.24,
215
+ "step": 12000
216
+ },
217
+ {
218
+ "epoch": 1.0553161551314747,
219
+ "eval_accuracy": 0.8990106356665842,
220
+ "eval_combined_score": 0.8819082183994985,
221
+ "eval_f1": 0.8648058011324128,
222
+ "eval_loss": 0.24989046156406403,
223
+ "eval_runtime": 87.0661,
224
+ "eval_samples_per_second": 464.36,
225
+ "eval_steps_per_second": 0.907,
226
+ "step": 12000
227
+ },
228
+ {
229
+ "epoch": 1.1432591680590978,
230
+ "grad_norm": 1.031847596168518,
231
+ "learning_rate": 0.0002916871388442835,
232
+ "loss": 0.2372,
233
+ "step": 13000
234
+ },
235
+ {
236
+ "epoch": 1.1432591680590978,
237
+ "eval_accuracy": 0.8948553054662379,
238
+ "eval_combined_score": 0.8795961544085618,
239
+ "eval_f1": 0.8643370033508856,
240
+ "eval_loss": 0.2568126618862152,
241
+ "eval_runtime": 86.9093,
242
+ "eval_samples_per_second": 465.198,
243
+ "eval_steps_per_second": 0.909,
244
+ "step": 13000
245
+ },
246
+ {
247
+ "epoch": 1.2312021809867206,
248
+ "grad_norm": 1.1493035554885864,
249
+ "learning_rate": 0.00029025962065837193,
250
+ "loss": 0.2316,
251
+ "step": 14000
252
+ },
253
+ {
254
+ "epoch": 1.2312021809867206,
255
+ "eval_accuracy": 0.897180311649765,
256
+ "eval_combined_score": 0.8832675498347509,
257
+ "eval_f1": 0.8693547880197366,
258
+ "eval_loss": 0.249246746301651,
259
+ "eval_runtime": 87.1761,
260
+ "eval_samples_per_second": 463.774,
261
+ "eval_steps_per_second": 0.906,
262
+ "step": 14000
263
+ },
264
+ {
265
+ "epoch": 1.3191451939143435,
266
+ "grad_norm": 0.6977606415748596,
267
+ "learning_rate": 0.00028872312005166577,
268
+ "loss": 0.2282,
269
+ "step": 15000
270
+ },
271
+ {
272
+ "epoch": 1.3191451939143435,
273
+ "eval_accuracy": 0.8985406876082117,
274
+ "eval_combined_score": 0.880514058185601,
275
+ "eval_f1": 0.8624874287629902,
276
+ "eval_loss": 0.23751328885555267,
277
+ "eval_runtime": 61.8847,
278
+ "eval_samples_per_second": 653.312,
279
+ "eval_steps_per_second": 1.277,
280
+ "step": 15000
281
+ },
282
+ {
283
+ "epoch": 1.4070882068419663,
284
+ "grad_norm": 1.7894372940063477,
285
+ "learning_rate": 0.00028707883089275593,
286
+ "loss": 0.2344,
287
+ "step": 16000
288
+ },
289
+ {
290
+ "epoch": 1.4070882068419663,
291
+ "eval_accuracy": 0.9016571852584714,
292
+ "eval_combined_score": 0.8850791548729187,
293
+ "eval_f1": 0.868501124487366,
294
+ "eval_loss": 0.2350623905658722,
295
+ "eval_runtime": 55.7055,
296
+ "eval_samples_per_second": 725.782,
297
+ "eval_steps_per_second": 1.418,
298
+ "step": 16000
299
+ },
300
+ {
301
+ "epoch": 1.4950312197695892,
302
+ "grad_norm": 0.8567989468574524,
303
+ "learning_rate": 0.0002853280308024728,
304
+ "loss": 0.231,
305
+ "step": 17000
306
+ },
307
+ {
308
+ "epoch": 1.4950312197695892,
309
+ "eval_accuracy": 0.9044768736087064,
310
+ "eval_combined_score": 0.8896452439356336,
311
+ "eval_f1": 0.8748136142625608,
312
+ "eval_loss": 0.22801724076271057,
313
+ "eval_runtime": 56.0896,
314
+ "eval_samples_per_second": 720.811,
315
+ "eval_steps_per_second": 1.408,
316
+ "step": 17000
317
+ },
318
+ {
319
+ "epoch": 1.5829742326972123,
320
+ "grad_norm": 0.8832383155822754,
321
+ "learning_rate": 0.0002834720801611687,
322
+ "loss": 0.2268,
323
+ "step": 18000
324
+ },
325
+ {
326
+ "epoch": 1.5829742326972123,
327
+ "eval_accuracy": 0.9038337867919861,
328
+ "eval_combined_score": 0.8878852728426334,
329
+ "eval_f1": 0.8719367588932806,
330
+ "eval_loss": 0.22873158752918243,
331
+ "eval_runtime": 87.1297,
332
+ "eval_samples_per_second": 464.021,
333
+ "eval_steps_per_second": 0.907,
334
+ "step": 18000
335
+ },
336
+ {
337
+ "epoch": 1.6709172456248351,
338
+ "grad_norm": 1.8913209438323975,
339
+ "learning_rate": 0.0002815124210516956,
340
+ "loss": 0.2296,
341
+ "step": 19000
342
+ },
343
+ {
344
+ "epoch": 1.6709172456248351,
345
+ "eval_accuracy": 0.9036606480336384,
346
+ "eval_combined_score": 0.8865546410940774,
347
+ "eval_f1": 0.8694486341545165,
348
+ "eval_loss": 0.23362140357494354,
349
+ "eval_runtime": 86.6867,
350
+ "eval_samples_per_second": 466.392,
351
+ "eval_steps_per_second": 0.911,
352
+ "step": 19000
353
+ },
354
+ {
355
+ "epoch": 1.758860258552458,
356
+ "grad_norm": 0.9055523872375488,
357
+ "learning_rate": 0.0002794505761388994,
358
+ "loss": 0.2267,
359
+ "step": 20000
360
+ },
361
+ {
362
+ "epoch": 1.758860258552458,
363
+ "eval_accuracy": 0.9018055899084838,
364
+ "eval_combined_score": 0.8878749166282116,
365
+ "eval_f1": 0.8739442433479393,
366
+ "eval_loss": 0.23946107923984528,
367
+ "eval_runtime": 87.1633,
368
+ "eval_samples_per_second": 463.842,
369
+ "eval_steps_per_second": 0.906,
370
+ "step": 20000
371
+ },
372
+ {
373
+ "epoch": 1.846803271480081,
374
+ "grad_norm": 0.7792288661003113,
375
+ "learning_rate": 0.0002772881474865019,
376
+ "loss": 0.2253,
377
+ "step": 21000
378
+ },
379
+ {
380
+ "epoch": 1.846803271480081,
381
+ "eval_accuracy": 0.9050457581004204,
382
+ "eval_combined_score": 0.888817724890043,
383
+ "eval_f1": 0.8725896916796655,
384
+ "eval_loss": 0.23969051241874695,
385
+ "eval_runtime": 86.906,
386
+ "eval_samples_per_second": 465.215,
387
+ "eval_steps_per_second": 0.909,
388
+ "step": 21000
389
+ },
390
+ {
391
+ "epoch": 1.9347462844077037,
392
+ "grad_norm": 0.8400819897651672,
393
+ "learning_rate": 0.00027502681531228946,
394
+ "loss": 0.2281,
395
+ "step": 22000
396
+ },
397
+ {
398
+ "epoch": 1.9347462844077037,
399
+ "eval_accuracy": 0.9032401681919366,
400
+ "eval_combined_score": 0.8893151850706769,
401
+ "eval_f1": 0.8753902019494171,
402
+ "eval_loss": 0.2290925830602646,
403
+ "eval_runtime": 55.9966,
404
+ "eval_samples_per_second": 722.008,
405
+ "eval_steps_per_second": 1.411,
406
+ "step": 22000
407
+ },
408
+ {
409
+ "epoch": 2.0226892973353268,
410
+ "grad_norm": 0.6936965584754944,
411
+ "learning_rate": 0.00027266833668257537,
412
+ "loss": 0.2211,
413
+ "step": 23000
414
+ },
415
+ {
416
+ "epoch": 2.0226892973353268,
417
+ "eval_accuracy": 0.9054415038337867,
418
+ "eval_combined_score": 0.8907576412812164,
419
+ "eval_f1": 0.876073778728646,
420
+ "eval_loss": 0.23454435169696808,
421
+ "eval_runtime": 54.9548,
422
+ "eval_samples_per_second": 735.695,
423
+ "eval_steps_per_second": 1.438,
424
+ "step": 23000
425
+ },
426
+ {
427
+ "epoch": 2.1106323102629494,
428
+ "grad_norm": 0.9834011793136597,
429
+ "learning_rate": 0.0002702145441469506,
430
+ "loss": 0.2105,
431
+ "step": 24000
432
+ },
433
+ {
434
+ "epoch": 2.1106323102629494,
435
+ "eval_accuracy": 0.906331931733861,
436
+ "eval_combined_score": 0.8905067117552788,
437
+ "eval_f1": 0.8746814917766967,
438
+ "eval_loss": 0.22863534092903137,
439
+ "eval_runtime": 55.5285,
440
+ "eval_samples_per_second": 728.095,
441
+ "eval_steps_per_second": 1.423,
442
+ "step": 24000
443
+ },
444
+ {
445
+ "epoch": 2.1985753231905725,
446
+ "grad_norm": 0.8303177952766418,
447
+ "learning_rate": 0.00026766734431438345,
448
+ "loss": 0.2116,
449
+ "step": 25000
450
+ },
451
+ {
452
+ "epoch": 2.1985753231905725,
453
+ "eval_accuracy": 0.907395498392283,
454
+ "eval_combined_score": 0.8916986765326984,
455
+ "eval_f1": 0.8760018546731139,
456
+ "eval_loss": 0.2305864840745926,
457
+ "eval_runtime": 87.1391,
458
+ "eval_samples_per_second": 463.971,
459
+ "eval_steps_per_second": 0.907,
460
+ "step": 25000
461
+ },
462
+ {
463
+ "epoch": 2.2865183361181955,
464
+ "grad_norm": 1.1610081195831299,
465
+ "learning_rate": 0.0002650287163717754,
466
+ "loss": 0.21,
467
+ "step": 26000
468
+ },
469
+ {
470
+ "epoch": 2.2865183361181955,
471
+ "eval_accuracy": 0.908780608459065,
472
+ "eval_combined_score": 0.8933953320146608,
473
+ "eval_f1": 0.8780100555702567,
474
+ "eval_loss": 0.22183135151863098,
475
+ "eval_runtime": 94.1343,
476
+ "eval_samples_per_second": 429.493,
477
+ "eval_steps_per_second": 0.839,
478
+ "step": 26000
479
+ },
480
+ {
481
+ "epoch": 2.374461349045818,
482
+ "grad_norm": 1.1182605028152466,
483
+ "learning_rate": 0.0002623007105461227,
484
+ "loss": 0.2133,
485
+ "step": 27000
486
+ },
487
+ {
488
+ "epoch": 2.374461349045818,
489
+ "eval_accuracy": 0.9062329953005194,
490
+ "eval_combined_score": 0.8922872559699542,
491
+ "eval_f1": 0.878341516639389,
492
+ "eval_loss": 0.21990598738193512,
493
+ "eval_runtime": 86.9528,
494
+ "eval_samples_per_second": 464.965,
495
+ "eval_steps_per_second": 0.909,
496
+ "step": 27000
497
+ },
498
+ {
499
+ "epoch": 2.4624043619734413,
500
+ "grad_norm": 1.0740619897842407,
501
+ "learning_rate": 0.00025948544651147997,
502
+ "loss": 0.2091,
503
+ "step": 28000
504
+ },
505
+ {
506
+ "epoch": 2.4624043619734413,
507
+ "eval_accuracy": 0.9060845906505071,
508
+ "eval_combined_score": 0.8920267079879565,
509
+ "eval_f1": 0.8779688253254058,
510
+ "eval_loss": 0.22643305361270905,
511
+ "eval_runtime": 85.7066,
512
+ "eval_samples_per_second": 471.725,
513
+ "eval_steps_per_second": 0.922,
514
+ "step": 28000
515
+ },
516
+ {
517
+ "epoch": 2.5503473749010643,
518
+ "grad_norm": 0.6771745085716248,
519
+ "learning_rate": 0.00025658511174196294,
520
+ "loss": 0.2056,
521
+ "step": 29000
522
+ },
523
+ {
524
+ "epoch": 2.5503473749010643,
525
+ "eval_accuracy": 0.9063566658421964,
526
+ "eval_combined_score": 0.8922513937514812,
527
+ "eval_f1": 0.878146121660766,
528
+ "eval_loss": 0.22508816421031952,
529
+ "eval_runtime": 55.4822,
530
+ "eval_samples_per_second": 728.702,
531
+ "eval_steps_per_second": 1.424,
532
+ "step": 29000
533
+ },
534
+ {
535
+ "epoch": 2.638290387828687,
536
+ "grad_norm": 0.9677979946136475,
537
+ "learning_rate": 0.00025360195981207026,
538
+ "loss": 0.2114,
539
+ "step": 30000
540
+ },
541
+ {
542
+ "epoch": 2.638290387828687,
543
+ "eval_accuracy": 0.907271827850606,
544
+ "eval_combined_score": 0.8924317476667729,
545
+ "eval_f1": 0.8775916674829399,
546
+ "eval_loss": 0.23263554275035858,
547
+ "eval_runtime": 55.2748,
548
+ "eval_samples_per_second": 731.437,
549
+ "eval_steps_per_second": 1.429,
550
+ "step": 30000
551
+ },
552
+ {
553
+ "epoch": 2.72623340075631,
554
+ "grad_norm": 0.6644540429115295,
555
+ "learning_rate": 0.0002505383086456447,
556
+ "loss": 0.2112,
557
+ "step": 31000
558
+ },
559
+ {
560
+ "epoch": 2.72623340075631,
561
+ "eval_accuracy": 0.9091021518674252,
562
+ "eval_combined_score": 0.895075432625734,
563
+ "eval_f1": 0.8810487133840427,
564
+ "eval_loss": 0.22498038411140442,
565
+ "eval_runtime": 56.1057,
566
+ "eval_samples_per_second": 720.604,
567
+ "eval_steps_per_second": 1.408,
568
+ "step": 31000
569
+ },
570
+ {
571
+ "epoch": 2.8141764136839327,
572
+ "grad_norm": 1.8015542030334473,
573
+ "learning_rate": 0.0002473965387148352,
574
+ "loss": 0.2111,
575
+ "step": 32000
576
+ },
577
+ {
578
+ "epoch": 2.8141764136839327,
579
+ "eval_accuracy": 0.9067771456838981,
580
+ "eval_combined_score": 0.8896950587986192,
581
+ "eval_f1": 0.8726129719133403,
582
+ "eval_loss": 0.23708254098892212,
583
+ "eval_runtime": 86.5803,
584
+ "eval_samples_per_second": 466.965,
585
+ "eval_steps_per_second": 0.912,
586
+ "step": 32000
587
+ },
588
+ {
589
+ "epoch": 2.9021194266115558,
590
+ "grad_norm": 0.9836463332176208,
591
+ "learning_rate": 0.000244179091190458,
592
+ "loss": 0.2053,
593
+ "step": 33000
594
+ },
595
+ {
596
+ "epoch": 2.9021194266115558,
597
+ "eval_accuracy": 0.907271827850606,
598
+ "eval_combined_score": 0.8913291165179571,
599
+ "eval_f1": 0.8753864051853083,
600
+ "eval_loss": 0.2347380369901657,
601
+ "eval_runtime": 87.041,
602
+ "eval_samples_per_second": 464.494,
603
+ "eval_steps_per_second": 0.908,
604
+ "step": 33000
605
+ },
606
+ {
607
+ "epoch": 2.9900624395391784,
608
+ "grad_norm": 0.8091123700141907,
609
+ "learning_rate": 0.00024088846604519457,
610
+ "loss": 0.2065,
611
+ "step": 34000
612
+ },
613
+ {
614
+ "epoch": 2.9900624395391784,
615
+ "eval_accuracy": 0.9093989611674499,
616
+ "eval_combined_score": 0.8952525657211847,
617
+ "eval_f1": 0.8811061702749197,
618
+ "eval_loss": 0.2226964384317398,
619
+ "eval_runtime": 87.4423,
620
+ "eval_samples_per_second": 462.362,
621
+ "eval_steps_per_second": 0.903,
622
+ "step": 34000
623
+ },
624
+ {
625
+ "epoch": 3.0780054524668015,
626
+ "grad_norm": 1.2436619997024536,
627
+ "learning_rate": 0.00023752722011110102,
628
+ "loss": 0.1908,
629
+ "step": 35000
630
+ },
631
+ {
632
+ "epoch": 3.0780054524668015,
633
+ "eval_accuracy": 0.9093989611674499,
634
+ "eval_combined_score": 0.8953527322605987,
635
+ "eval_f1": 0.8813065033537475,
636
+ "eval_loss": 0.2280348539352417,
637
+ "eval_runtime": 66.5104,
638
+ "eval_samples_per_second": 607.875,
639
+ "eval_steps_per_second": 1.188,
640
+ "step": 35000
641
+ },
642
+ {
643
+ "epoch": 3.1659484653944245,
644
+ "grad_norm": 0.7347617149353027,
645
+ "learning_rate": 0.00023409796509293643,
646
+ "loss": 0.1955,
647
+ "step": 36000
648
+ },
649
+ {
650
+ "epoch": 3.1659484653944245,
651
+ "eval_accuracy": 0.9095720999257977,
652
+ "eval_combined_score": 0.8955507939680836,
653
+ "eval_f1": 0.8815294880103695,
654
+ "eval_loss": 0.2231292426586151,
655
+ "eval_runtime": 56.1448,
656
+ "eval_samples_per_second": 720.102,
657
+ "eval_steps_per_second": 1.407,
658
+ "step": 36000
659
+ },
660
+ {
661
+ "epoch": 3.253891478322047,
662
+ "grad_norm": 1.5832217931747437,
663
+ "learning_rate": 0.0002306033655388555,
664
+ "loss": 0.1924,
665
+ "step": 37000
666
+ },
667
+ {
668
+ "epoch": 3.253891478322047,
669
+ "eval_accuracy": 0.9100915162008409,
670
+ "eval_combined_score": 0.896175547884374,
671
+ "eval_f1": 0.8822595795679072,
672
+ "eval_loss": 0.2340392768383026,
673
+ "eval_runtime": 59.6999,
674
+ "eval_samples_per_second": 677.221,
675
+ "eval_steps_per_second": 1.323,
676
+ "step": 37000
677
+ },
678
+ {
679
+ "epoch": 3.3418344912496702,
680
+ "grad_norm": 0.591686487197876,
681
+ "learning_rate": 0.0002270461367700413,
682
+ "loss": 0.1988,
683
+ "step": 38000
684
+ },
685
+ {
686
+ "epoch": 3.3418344912496702,
687
+ "eval_accuracy": 0.9090032154340836,
688
+ "eval_combined_score": 0.8956507903001432,
689
+ "eval_f1": 0.8822983651662027,
690
+ "eval_loss": 0.22800563275814056,
691
+ "eval_runtime": 56.1427,
692
+ "eval_samples_per_second": 720.129,
693
+ "eval_steps_per_second": 1.407,
694
+ "step": 38000
695
+ },
696
+ {
697
+ "epoch": 3.4297775041772933,
698
+ "grad_norm": 0.40456414222717285,
699
+ "learning_rate": 0.00022342904277088745,
700
+ "loss": 0.1956,
701
+ "step": 39000
702
+ },
703
+ {
704
+ "epoch": 3.4297775041772933,
705
+ "eval_accuracy": 0.9068266139005688,
706
+ "eval_combined_score": 0.8933508049572362,
707
+ "eval_f1": 0.8798749960139035,
708
+ "eval_loss": 0.23531748354434967,
709
+ "eval_runtime": 87.0769,
710
+ "eval_samples_per_second": 464.302,
711
+ "eval_steps_per_second": 0.907,
712
+ "step": 39000
713
+ },
714
+ {
715
+ "epoch": 3.517720517104916,
716
+ "grad_norm": 1.1434704065322876,
717
+ "learning_rate": 0.00021975489404136827,
718
+ "loss": 0.1938,
719
+ "step": 40000
720
+ },
721
+ {
722
+ "epoch": 3.517720517104916,
723
+ "eval_accuracy": 0.9121197130843434,
724
+ "eval_combined_score": 0.8974546376591842,
725
+ "eval_f1": 0.882789562234025,
726
+ "eval_loss": 0.21503011882305145,
727
+ "eval_runtime": 87.6754,
728
+ "eval_samples_per_second": 461.133,
729
+ "eval_steps_per_second": 0.901,
730
+ "step": 40000
731
+ },
732
+ {
733
+ "epoch": 3.605663530032539,
734
+ "grad_norm": 1.0183204412460327,
735
+ "learning_rate": 0.00021602654541326668,
736
+ "loss": 0.1924,
737
+ "step": 41000
738
+ },
739
+ {
740
+ "epoch": 3.605663530032539,
741
+ "eval_accuracy": 0.9107098689092258,
742
+ "eval_combined_score": 0.8974986616166105,
743
+ "eval_f1": 0.8842874543239951,
744
+ "eval_loss": 0.224385604262352,
745
+ "eval_runtime": 87.4339,
746
+ "eval_samples_per_second": 462.407,
747
+ "eval_steps_per_second": 0.904,
748
+ "step": 41000
749
+ },
750
+ {
751
+ "epoch": 3.6936065429601617,
752
+ "grad_norm": 0.9868887662887573,
753
+ "learning_rate": 0.00021224689383195542,
754
+ "loss": 0.195,
755
+ "step": 42000
756
+ },
757
+ {
758
+ "epoch": 3.6936065429601617,
759
+ "eval_accuracy": 0.9104377937175365,
760
+ "eval_combined_score": 0.8954959874080013,
761
+ "eval_f1": 0.8805541810984661,
762
+ "eval_loss": 0.21859273314476013,
763
+ "eval_runtime": 55.287,
764
+ "eval_samples_per_second": 731.275,
765
+ "eval_steps_per_second": 1.429,
766
+ "step": 42000
767
+ },
768
+ {
769
+ "epoch": 3.7815495558877847,
770
+ "grad_norm": 0.6419038772583008,
771
+ "learning_rate": 0.00020841887610545634,
772
+ "loss": 0.1958,
773
+ "step": 43000
774
+ },
775
+ {
776
+ "epoch": 3.7815495558877847,
777
+ "eval_accuracy": 0.9104625278258719,
778
+ "eval_combined_score": 0.8938753317095461,
779
+ "eval_f1": 0.8772881355932204,
780
+ "eval_loss": 0.22058646380901337,
781
+ "eval_runtime": 56.1494,
782
+ "eval_samples_per_second": 720.044,
783
+ "eval_steps_per_second": 1.407,
784
+ "step": 43000
785
+ },
786
+ {
787
+ "epoch": 3.8694925688154074,
788
+ "grad_norm": 1.719427227973938,
789
+ "learning_rate": 0.00020454546662252592,
790
+ "loss": 0.196,
791
+ "step": 44000
792
+ },
793
+ {
794
+ "epoch": 3.8694925688154074,
795
+ "eval_accuracy": 0.9113529557259461,
796
+ "eval_combined_score": 0.8976904566359474,
797
+ "eval_f1": 0.8840279575459488,
798
+ "eval_loss": 0.22054925560951233,
799
+ "eval_runtime": 55.0531,
800
+ "eval_samples_per_second": 734.382,
801
+ "eval_steps_per_second": 1.435,
802
+ "step": 44000
803
+ },
804
+ {
805
+ "epoch": 3.9574355817430305,
806
+ "grad_norm": 0.8242283463478088,
807
+ "learning_rate": 0.00020062967504154062,
808
+ "loss": 0.1959,
809
+ "step": 45000
810
+ },
811
+ {
812
+ "epoch": 3.9574355817430305,
813
+ "eval_accuracy": 0.908088053425674,
814
+ "eval_combined_score": 0.8953616945796805,
815
+ "eval_f1": 0.882635335733687,
816
+ "eval_loss": 0.22607550024986267,
817
+ "eval_runtime": 55.7392,
818
+ "eval_samples_per_second": 725.342,
819
+ "eval_steps_per_second": 1.417,
820
+ "step": 45000
821
+ },
822
+ {
823
+ "epoch": 4.0453785946706535,
824
+ "grad_norm": 1.3167576789855957,
825
+ "learning_rate": 0.00019667454395197706,
826
+ "loss": 0.1894,
827
+ "step": 46000
828
+ },
829
+ {
830
+ "epoch": 4.0453785946706535,
831
+ "eval_accuracy": 0.914296314617858,
832
+ "eval_combined_score": 0.8999982315296118,
833
+ "eval_f1": 0.8857001484413657,
834
+ "eval_loss": 0.2266334593296051,
835
+ "eval_runtime": 86.689,
836
+ "eval_samples_per_second": 466.38,
837
+ "eval_steps_per_second": 0.911,
838
+ "step": 46000
839
+ },
840
+ {
841
+ "epoch": 4.133321607598276,
842
+ "grad_norm": 1.0003962516784668,
843
+ "learning_rate": 0.00019268314651030522,
844
+ "loss": 0.1817,
845
+ "step": 47000
846
+ },
847
+ {
848
+ "epoch": 4.133321607598276,
849
+ "eval_accuracy": 0.913356418501113,
850
+ "eval_combined_score": 0.8990763412596334,
851
+ "eval_f1": 0.8847962640181537,
852
+ "eval_loss": 0.21194230020046234,
853
+ "eval_runtime": 87.412,
854
+ "eval_samples_per_second": 462.522,
855
+ "eval_steps_per_second": 0.904,
856
+ "step": 47000
857
+ },
858
+ {
859
+ "epoch": 4.221264620525899,
860
+ "grad_norm": 1.556255578994751,
861
+ "learning_rate": 0.00018865858405213055,
862
+ "loss": 0.1797,
863
+ "step": 48000
864
+ },
865
+ {
866
+ "epoch": 4.221264620525899,
867
+ "eval_accuracy": 0.9149394014345783,
868
+ "eval_combined_score": 0.9006436324400824,
869
+ "eval_f1": 0.8863478634455865,
870
+ "eval_loss": 0.22268745303153992,
871
+ "eval_runtime": 93.9761,
872
+ "eval_samples_per_second": 430.216,
873
+ "eval_steps_per_second": 0.841,
874
+ "step": 48000
875
+ },
876
+ {
877
+ "epoch": 4.309207633453522,
878
+ "grad_norm": 1.4525554180145264,
879
+ "learning_rate": 0.0001846039836824406,
880
+ "loss": 0.1837,
881
+ "step": 49000
882
+ },
883
+ {
884
+ "epoch": 4.309207633453522,
885
+ "eval_accuracy": 0.9146425921345536,
886
+ "eval_combined_score": 0.9013003876552123,
887
+ "eval_f1": 0.8879581831758709,
888
+ "eval_loss": 0.22220176458358765,
889
+ "eval_runtime": 55.5494,
890
+ "eval_samples_per_second": 727.821,
891
+ "eval_steps_per_second": 1.422,
892
+ "step": 49000
893
+ },
894
+ {
895
+ "epoch": 4.397150646381145,
896
+ "grad_norm": 1.4938559532165527,
897
+ "learning_rate": 0.00018052249584582937,
898
+ "loss": 0.1808,
899
+ "step": 50000
900
+ },
901
+ {
902
+ "epoch": 4.397150646381145,
903
+ "eval_accuracy": 0.914419985159535,
904
+ "eval_combined_score": 0.9005963398538037,
905
+ "eval_f1": 0.8867726945480725,
906
+ "eval_loss": 0.21833352744579315,
907
+ "eval_runtime": 56.2669,
908
+ "eval_samples_per_second": 718.539,
909
+ "eval_steps_per_second": 1.404,
910
+ "step": 50000
911
+ },
912
+ {
913
+ "epoch": 4.485093659308768,
914
+ "grad_norm": 0.8967867493629456,
915
+ "learning_rate": 0.0001764172918785858,
916
+ "loss": 0.1809,
917
+ "step": 51000
918
+ },
919
+ {
920
+ "epoch": 4.485093659308768,
921
+ "eval_accuracy": 0.9135790254761316,
922
+ "eval_combined_score": 0.8999912813836214,
923
+ "eval_f1": 0.8864035372911112,
924
+ "eval_loss": 0.2153746634721756,
925
+ "eval_runtime": 55.6547,
926
+ "eval_samples_per_second": 726.443,
927
+ "eval_steps_per_second": 1.419,
928
+ "step": 51000
929
+ },
930
+ {
931
+ "epoch": 4.573036672236391,
932
+ "grad_norm": 1.5915240049362183,
933
+ "learning_rate": 0.0001722915615445501,
934
+ "loss": 0.1844,
935
+ "step": 52000
936
+ },
937
+ {
938
+ "epoch": 4.573036672236391,
939
+ "eval_accuracy": 0.9124412564927035,
940
+ "eval_combined_score": 0.8987269622942956,
941
+ "eval_f1": 0.8850126680958877,
942
+ "eval_loss": 0.22099778056144714,
943
+ "eval_runtime": 56.0449,
944
+ "eval_samples_per_second": 721.386,
945
+ "eval_steps_per_second": 1.41,
946
+ "step": 52000
947
+ },
948
+ {
949
+ "epoch": 4.660979685164014,
950
+ "grad_norm": 1.076545000076294,
951
+ "learning_rate": 0.0001681485105566511,
952
+ "loss": 0.1882,
953
+ "step": 53000
954
+ },
955
+ {
956
+ "epoch": 4.660979685164014,
957
+ "eval_accuracy": 0.9136779619094731,
958
+ "eval_combined_score": 0.9001352700299299,
959
+ "eval_f1": 0.8865925781503867,
960
+ "eval_loss": 0.21928681433200836,
961
+ "eval_runtime": 87.0569,
962
+ "eval_samples_per_second": 464.409,
963
+ "eval_steps_per_second": 0.907,
964
+ "step": 53000
965
+ },
966
+ {
967
+ "epoch": 4.748922698091636,
968
+ "grad_norm": 0.7791244983673096,
969
+ "learning_rate": 0.00016399135808605172,
970
+ "loss": 0.1831,
971
+ "step": 54000
972
+ },
973
+ {
974
+ "epoch": 4.748922698091636,
975
+ "eval_accuracy": 0.9146425921345536,
976
+ "eval_combined_score": 0.9009379349588887,
977
+ "eval_f1": 0.8872332777832239,
978
+ "eval_loss": 0.22267895936965942,
979
+ "eval_runtime": 87.0469,
980
+ "eval_samples_per_second": 464.462,
981
+ "eval_steps_per_second": 0.908,
982
+ "step": 54000
983
+ },
984
+ {
985
+ "epoch": 4.83686571101926,
986
+ "grad_norm": 0.657599687576294,
987
+ "learning_rate": 0.00015982333426083677,
988
+ "loss": 0.1866,
989
+ "step": 55000
990
+ },
991
+ {
992
+ "epoch": 4.83686571101926,
993
+ "eval_accuracy": 0.9149394014345783,
994
+ "eval_combined_score": 0.901197815388232,
995
+ "eval_f1": 0.8874562293418856,
996
+ "eval_loss": 0.21784672141075134,
997
+ "eval_runtime": 87.5052,
998
+ "eval_samples_per_second": 462.03,
999
+ "eval_steps_per_second": 0.903,
1000
+ "step": 55000
1001
+ },
1002
+ {
1003
+ "epoch": 4.9248087239468825,
1004
+ "grad_norm": 1.3747438192367554,
1005
+ "learning_rate": 0.00015564767765618756,
1006
+ "loss": 0.1849,
1007
+ "step": 56000
1008
+ },
1009
+ {
1010
+ "epoch": 4.9248087239468825,
1011
+ "eval_accuracy": 0.9138016324511501,
1012
+ "eval_combined_score": 0.8987884266974777,
1013
+ "eval_f1": 0.8837752209438052,
1014
+ "eval_loss": 0.21690765023231506,
1015
+ "eval_runtime": 55.9393,
1016
+ "eval_samples_per_second": 722.747,
1017
+ "eval_steps_per_second": 1.412,
1018
+ "step": 56000
1019
+ },
1020
+ {
1021
+ "epoch": 5.012751736874505,
1022
+ "grad_norm": 0.8309698104858398,
1023
+ "learning_rate": 0.0001514676327779928,
1024
+ "loss": 0.1786,
1025
+ "step": 57000
1026
+ },
1027
+ {
1028
+ "epoch": 5.012751736874505,
1029
+ "eval_accuracy": 0.9144941874845411,
1030
+ "eval_combined_score": 0.9010631184456672,
1031
+ "eval_f1": 0.8876320494067934,
1032
+ "eval_loss": 0.2155766487121582,
1033
+ "eval_runtime": 55.5419,
1034
+ "eval_samples_per_second": 727.919,
1035
+ "eval_steps_per_second": 1.422,
1036
+ "step": 57000
1037
+ },
1038
+ {
1039
+ "epoch": 5.100694749802129,
1040
+ "grad_norm": 1.6811013221740723,
1041
+ "learning_rate": 0.00014728644754185164,
1042
+ "loss": 0.1717,
1043
+ "step": 58000
1044
+ },
1045
+ {
1046
+ "epoch": 5.100694749802129,
1047
+ "eval_accuracy": 0.9157061587929756,
1048
+ "eval_combined_score": 0.9017413828738323,
1049
+ "eval_f1": 0.8877766069546892,
1050
+ "eval_loss": 0.2196960747241974,
1051
+ "eval_runtime": 55.5821,
1052
+ "eval_samples_per_second": 727.392,
1053
+ "eval_steps_per_second": 1.421,
1054
+ "step": 58000
1055
+ },
1056
+ {
1057
+ "epoch": 5.188637762729751,
1058
+ "grad_norm": 1.4050065279006958,
1059
+ "learning_rate": 0.00014310737074942683,
1060
+ "loss": 0.1724,
1061
+ "step": 59000
1062
+ },
1063
+ {
1064
+ "epoch": 5.188637762729751,
1065
+ "eval_accuracy": 0.9139253029928271,
1066
+ "eval_combined_score": 0.9006920351067881,
1067
+ "eval_f1": 0.887458767220749,
1068
+ "eval_loss": 0.23073293268680573,
1069
+ "eval_runtime": 59.0316,
1070
+ "eval_samples_per_second": 684.887,
1071
+ "eval_steps_per_second": 1.338,
1072
+ "step": 59000
1073
+ },
1074
+ {
1075
+ "epoch": 5.276580775657374,
1076
+ "grad_norm": 0.8422167301177979,
1077
+ "learning_rate": 0.00013893364956411012,
1078
+ "loss": 0.1669,
1079
+ "step": 60000
1080
+ },
1081
+ {
1082
+ "epoch": 5.276580775657374,
1083
+ "eval_accuracy": 0.9110808805342567,
1084
+ "eval_combined_score": 0.8985330507139901,
1085
+ "eval_f1": 0.8859852208937237,
1086
+ "eval_loss": 0.23831692337989807,
1087
+ "eval_runtime": 87.0945,
1088
+ "eval_samples_per_second": 464.208,
1089
+ "eval_steps_per_second": 0.907,
1090
+ "step": 60000
1091
+ },
1092
+ {
1093
+ "epoch": 5.364523788584997,
1094
+ "grad_norm": 1.245219111442566,
1095
+ "learning_rate": 0.0001347685269879597,
1096
+ "loss": 0.1765,
1097
+ "step": 61000
1098
+ },
1099
+ {
1100
+ "epoch": 5.364523788584997,
1101
+ "eval_accuracy": 0.9122186495176848,
1102
+ "eval_combined_score": 0.89931260229301,
1103
+ "eval_f1": 0.8864065550683353,
1104
+ "eval_loss": 0.2277195006608963,
1105
+ "eval_runtime": 86.9443,
1106
+ "eval_samples_per_second": 465.01,
1107
+ "eval_steps_per_second": 0.909,
1108
+ "step": 61000
1109
+ },
1110
+ {
1111
+ "epoch": 5.45246680151262,
1112
+ "grad_norm": 0.5509847402572632,
1113
+ "learning_rate": 0.00013061523934187208,
1114
+ "loss": 0.174,
1115
+ "step": 62000
1116
+ },
1117
+ {
1118
+ "epoch": 5.45246680151262,
1119
+ "eval_accuracy": 0.9133316843927777,
1120
+ "eval_combined_score": 0.8998822809077391,
1121
+ "eval_f1": 0.8864328774227005,
1122
+ "eval_loss": 0.22177070379257202,
1123
+ "eval_runtime": 87.0562,
1124
+ "eval_samples_per_second": 464.413,
1125
+ "eval_steps_per_second": 0.907,
1126
+ "step": 62000
1127
+ },
1128
+ {
1129
+ "epoch": 5.540409814440243,
1130
+ "grad_norm": 0.7231882810592651,
1131
+ "learning_rate": 0.0001264770137509442,
1132
+ "loss": 0.1693,
1133
+ "step": 63000
1134
+ },
1135
+ {
1136
+ "epoch": 5.540409814440243,
1137
+ "eval_accuracy": 0.9148899332179075,
1138
+ "eval_combined_score": 0.9007357804129683,
1139
+ "eval_f1": 0.8865816276080293,
1140
+ "eval_loss": 0.21918977797031403,
1141
+ "eval_runtime": 55.6828,
1142
+ "eval_samples_per_second": 726.076,
1143
+ "eval_steps_per_second": 1.419,
1144
+ "step": 63000
1145
+ },
1146
+ {
1147
+ "epoch": 5.628352827367865,
1148
+ "grad_norm": 2.424527883529663,
1149
+ "learning_rate": 0.00012235706563698158,
1150
+ "loss": 0.1705,
1151
+ "step": 64000
1152
+ },
1153
+ {
1154
+ "epoch": 5.628352827367865,
1155
+ "eval_accuracy": 0.9151867425179322,
1156
+ "eval_combined_score": 0.9015731131312092,
1157
+ "eval_f1": 0.8879594837444862,
1158
+ "eval_loss": 0.22360049188137054,
1159
+ "eval_runtime": 55.7753,
1160
+ "eval_samples_per_second": 724.873,
1161
+ "eval_steps_per_second": 1.416,
1162
+ "step": 64000
1163
+ },
1164
+ {
1165
+ "epoch": 5.716295840295489,
1166
+ "grad_norm": 1.5083374977111816,
1167
+ "learning_rate": 0.00011825859622009953,
1168
+ "loss": 0.1744,
1169
+ "step": 65000
1170
+ },
1171
+ {
1172
+ "epoch": 5.716295840295489,
1173
+ "eval_accuracy": 0.91535988127628,
1174
+ "eval_combined_score": 0.901267148023567,
1175
+ "eval_f1": 0.8871744147708539,
1176
+ "eval_loss": 0.22406791150569916,
1177
+ "eval_runtime": 55.5444,
1178
+ "eval_samples_per_second": 727.886,
1179
+ "eval_steps_per_second": 1.422,
1180
+ "step": 65000
1181
+ },
1182
+ {
1183
+ "epoch": 5.8042388532231115,
1184
+ "grad_norm": 1.0378375053405762,
1185
+ "learning_rate": 0.00011418479003135898,
1186
+ "loss": 0.1684,
1187
+ "step": 66000
1188
+ },
1189
+ {
1190
+ "epoch": 5.8042388532231115,
1191
+ "eval_accuracy": 0.9145931239178827,
1192
+ "eval_combined_score": 0.9014245837058104,
1193
+ "eval_f1": 0.8882560434937381,
1194
+ "eval_loss": 0.2192625105381012,
1195
+ "eval_runtime": 55.9265,
1196
+ "eval_samples_per_second": 722.913,
1197
+ "eval_steps_per_second": 1.413,
1198
+ "step": 66000
1199
+ },
1200
+ {
1201
+ "epoch": 5.892181866150734,
1202
+ "grad_norm": 0.6121230125427246,
1203
+ "learning_rate": 0.00011013881243837068,
1204
+ "loss": 0.1712,
1205
+ "step": 67000
1206
+ },
1207
+ {
1208
+ "epoch": 5.892181866150734,
1209
+ "eval_accuracy": 0.9155577541429631,
1210
+ "eval_combined_score": 0.9020273583569999,
1211
+ "eval_f1": 0.8884969625710366,
1212
+ "eval_loss": 0.21181334555149078,
1213
+ "eval_runtime": 86.9155,
1214
+ "eval_samples_per_second": 465.164,
1215
+ "eval_steps_per_second": 0.909,
1216
+ "step": 67000
1217
+ },
1218
+ {
1219
+ "epoch": 5.980124879078357,
1220
+ "grad_norm": 1.4167020320892334,
1221
+ "learning_rate": 0.00010612380718578806,
1222
+ "loss": 0.1724,
1223
+ "step": 68000
1224
+ },
1225
+ {
1226
+ "epoch": 5.980124879078357,
1227
+ "eval_accuracy": 0.9161019045263418,
1228
+ "eval_combined_score": 0.9022137230539788,
1229
+ "eval_f1": 0.8883255415816158,
1230
+ "eval_loss": 0.22262895107269287,
1231
+ "eval_runtime": 87.1789,
1232
+ "eval_samples_per_second": 463.759,
1233
+ "eval_steps_per_second": 0.906,
1234
+ "step": 68000
1235
+ },
1236
+ {
1237
+ "epoch": 6.06806789200598,
1238
+ "grad_norm": 0.5954177379608154,
1239
+ "learning_rate": 0.00010214289395260275,
1240
+ "loss": 0.1618,
1241
+ "step": 69000
1242
+ },
1243
+ {
1244
+ "epoch": 6.06806789200598,
1245
+ "eval_accuracy": 0.9145683898095474,
1246
+ "eval_combined_score": 0.9011945554114317,
1247
+ "eval_f1": 0.887820721013316,
1248
+ "eval_loss": 0.2276068925857544,
1249
+ "eval_runtime": 92.9201,
1250
+ "eval_samples_per_second": 435.105,
1251
+ "eval_steps_per_second": 0.85,
1252
+ "step": 69000
1253
+ },
1254
+ {
1255
+ "epoch": 6.156010904933603,
1256
+ "grad_norm": 0.9870212078094482,
1257
+ "learning_rate": 9.819916592813812e-05,
1258
+ "loss": 0.1609,
1259
+ "step": 70000
1260
+ },
1261
+ {
1262
+ "epoch": 6.156010904933603,
1263
+ "eval_accuracy": 0.9152856789512738,
1264
+ "eval_combined_score": 0.9020295507411842,
1265
+ "eval_f1": 0.8887734225310947,
1266
+ "eval_loss": 0.2195996344089508,
1267
+ "eval_runtime": 55.6543,
1268
+ "eval_samples_per_second": 726.449,
1269
+ "eval_steps_per_second": 1.419,
1270
+ "step": 70000
1271
+ },
1272
+ {
1273
+ "epoch": 6.2439539178612256,
1274
+ "grad_norm": 1.3545230627059937,
1275
+ "learning_rate": 9.429568740862609e-05,
1276
+ "loss": 0.1602,
1277
+ "step": 71000
1278
+ },
1279
+ {
1280
+ "epoch": 6.2439539178612256,
1281
+ "eval_accuracy": 0.9127380657927282,
1282
+ "eval_combined_score": 0.900172792055332,
1283
+ "eval_f1": 0.8876075183179356,
1284
+ "eval_loss": 0.23225632309913635,
1285
+ "eval_runtime": 55.2108,
1286
+ "eval_samples_per_second": 732.285,
1287
+ "eval_steps_per_second": 1.431,
1288
+ "step": 71000
1289
+ },
1290
+ {
1291
+ "epoch": 6.331896930788849,
1292
+ "grad_norm": 0.6814817190170288,
1293
+ "learning_rate": 9.043549141623341e-05,
1294
+ "loss": 0.1535,
1295
+ "step": 72000
1296
+ },
1297
+ {
1298
+ "epoch": 6.331896930788849,
1299
+ "eval_accuracy": 0.9164481820430374,
1300
+ "eval_combined_score": 0.9029123278567296,
1301
+ "eval_f1": 0.8893764736704218,
1302
+ "eval_loss": 0.2333795428276062,
1303
+ "eval_runtime": 55.2621,
1304
+ "eval_samples_per_second": 731.604,
1305
+ "eval_steps_per_second": 1.43,
1306
+ "step": 72000
1307
+ },
1308
+ {
1309
+ "epoch": 6.419839943716472,
1310
+ "grad_norm": 1.3187615871429443,
1311
+ "learning_rate": 8.662157734238882e-05,
1312
+ "loss": 0.1646,
1313
+ "step": 73000
1314
+ },
1315
+ {
1316
+ "epoch": 6.419839943716472,
1317
+ "eval_accuracy": 0.9153846153846154,
1318
+ "eval_combined_score": 0.9020861830539669,
1319
+ "eval_f1": 0.8887877507233185,
1320
+ "eval_loss": 0.22094346582889557,
1321
+ "eval_runtime": 55.8826,
1322
+ "eval_samples_per_second": 723.482,
1323
+ "eval_steps_per_second": 1.414,
1324
+ "step": 73000
1325
+ },
1326
+ {
1327
+ "epoch": 6.507782956644094,
1328
+ "grad_norm": 1.0354348421096802,
1329
+ "learning_rate": 8.285690861724085e-05,
1330
+ "loss": 0.1644,
1331
+ "step": 74000
1332
+ },
1333
+ {
1334
+ "epoch": 6.507782956644094,
1335
+ "eval_accuracy": 0.9130101409844175,
1336
+ "eval_combined_score": 0.9001699751854652,
1337
+ "eval_f1": 0.8873298093865128,
1338
+ "eval_loss": 0.22492603957653046,
1339
+ "eval_runtime": 87.294,
1340
+ "eval_samples_per_second": 463.147,
1341
+ "eval_steps_per_second": 0.905,
1342
+ "step": 74000
1343
+ },
1344
+ {
1345
+ "epoch": 6.595725969571718,
1346
+ "grad_norm": 1.4580389261245728,
1347
+ "learning_rate": 7.914441040705777e-05,
1348
+ "loss": 0.1638,
1349
+ "step": 75000
1350
+ },
1351
+ {
1352
+ "epoch": 6.595725969571718,
1353
+ "eval_accuracy": 0.9161513727430126,
1354
+ "eval_combined_score": 0.9026364242533667,
1355
+ "eval_f1": 0.8891214757637208,
1356
+ "eval_loss": 0.22241519391536713,
1357
+ "eval_runtime": 87.259,
1358
+ "eval_samples_per_second": 463.333,
1359
+ "eval_steps_per_second": 0.905,
1360
+ "step": 75000
1361
+ },
1362
+ {
1363
+ "epoch": 6.6836689824993405,
1364
+ "grad_norm": 0.8584627509117126,
1365
+ "learning_rate": 7.5486967341359e-05,
1366
+ "loss": 0.1609,
1367
+ "step": 76000
1368
+ },
1369
+ {
1370
+ "epoch": 6.6836689824993405,
1371
+ "eval_accuracy": 0.9152114766262677,
1372
+ "eval_combined_score": 0.9017350818166217,
1373
+ "eval_f1": 0.8882586870069756,
1374
+ "eval_loss": 0.21768353879451752,
1375
+ "eval_runtime": 87.0299,
1376
+ "eval_samples_per_second": 464.553,
1377
+ "eval_steps_per_second": 0.908,
1378
+ "step": 76000
1379
+ },
1380
+ {
1381
+ "epoch": 6.771611995426963,
1382
+ "grad_norm": 2.1091301441192627,
1383
+ "learning_rate": 7.188742127154373e-05,
1384
+ "loss": 0.1688,
1385
+ "step": 77000
1386
+ },
1387
+ {
1388
+ "epoch": 6.771611995426963,
1389
+ "eval_accuracy": 0.9150878060845906,
1390
+ "eval_combined_score": 0.9021246669334263,
1391
+ "eval_f1": 0.889161527782262,
1392
+ "eval_loss": 0.21927987039089203,
1393
+ "eval_runtime": 55.2963,
1394
+ "eval_samples_per_second": 731.153,
1395
+ "eval_steps_per_second": 1.429,
1396
+ "step": 77000
1397
+ },
1398
+ {
1399
+ "epoch": 6.859555008354587,
1400
+ "grad_norm": 1.2470057010650635,
1401
+ "learning_rate": 6.834856906275834e-05,
1402
+ "loss": 0.1675,
1403
+ "step": 78000
1404
+ },
1405
+ {
1406
+ "epoch": 6.859555008354587,
1407
+ "eval_accuracy": 0.914419985159535,
1408
+ "eval_combined_score": 0.902044176253237,
1409
+ "eval_f1": 0.8896683673469388,
1410
+ "eval_loss": 0.22205699980258942,
1411
+ "eval_runtime": 55.5761,
1412
+ "eval_samples_per_second": 727.471,
1413
+ "eval_steps_per_second": 1.421,
1414
+ "step": 78000
1415
+ },
1416
+ {
1417
+ "epoch": 6.947498021282209,
1418
+ "grad_norm": 1.7795581817626953,
1419
+ "learning_rate": 6.487316042071804e-05,
1420
+ "loss": 0.1645,
1421
+ "step": 79000
1422
+ },
1423
+ {
1424
+ "epoch": 6.947498021282209,
1425
+ "eval_accuracy": 0.9150383378679199,
1426
+ "eval_combined_score": 0.9001869928148558,
1427
+ "eval_f1": 0.8853356477617919,
1428
+ "eval_loss": 0.2218712568283081,
1429
+ "eval_runtime": 55.3681,
1430
+ "eval_samples_per_second": 730.203,
1431
+ "eval_steps_per_second": 1.427,
1432
+ "step": 79000
1433
+ },
1434
+ {
1435
+ "epoch": 7.035441034209832,
1436
+ "grad_norm": 1.0313372611999512,
1437
+ "learning_rate": 6.146389575517211e-05,
1438
+ "loss": 0.1578,
1439
+ "step": 80000
1440
+ },
1441
+ {
1442
+ "epoch": 7.035441034209832,
1443
+ "eval_accuracy": 0.9153104130596093,
1444
+ "eval_combined_score": 0.9019514257007554,
1445
+ "eval_f1": 0.8885924383419015,
1446
+ "eval_loss": 0.22430460155010223,
1447
+ "eval_runtime": 60.6209,
1448
+ "eval_samples_per_second": 666.932,
1449
+ "eval_steps_per_second": 1.303,
1450
+ "step": 80000
1451
+ },
1452
+ {
1453
+ "epoch": 7.1233840471374545,
1454
+ "grad_norm": 1.0180041790008545,
1455
+ "learning_rate": 5.81234240816722e-05,
1456
+ "loss": 0.1584,
1457
+ "step": 81000
1458
+ },
1459
+ {
1460
+ "epoch": 7.1233840471374545,
1461
+ "eval_accuracy": 0.9158545634429879,
1462
+ "eval_combined_score": 0.9025670005264148,
1463
+ "eval_f1": 0.8892794376098418,
1464
+ "eval_loss": 0.2226150631904602,
1465
+ "eval_runtime": 87.3967,
1466
+ "eval_samples_per_second": 462.604,
1467
+ "eval_steps_per_second": 0.904,
1468
+ "step": 81000
1469
+ },
1470
+ {
1471
+ "epoch": 7.211327060065078,
1472
+ "grad_norm": 0.9006807804107666,
1473
+ "learning_rate": 5.485434096327387e-05,
1474
+ "loss": 0.1522,
1475
+ "step": 82000
1476
+ },
1477
+ {
1478
+ "epoch": 7.211327060065078,
1479
+ "eval_accuracy": 0.9151867425179322,
1480
+ "eval_combined_score": 0.9020881752344914,
1481
+ "eval_f1": 0.8889896079510505,
1482
+ "eval_loss": 0.22395850718021393,
1483
+ "eval_runtime": 87.4178,
1484
+ "eval_samples_per_second": 462.492,
1485
+ "eval_steps_per_second": 0.904,
1486
+ "step": 82000
1487
+ },
1488
+ {
1489
+ "epoch": 7.299270072992701,
1490
+ "grad_norm": 1.0538363456726074,
1491
+ "learning_rate": 5.165918649377139e-05,
1492
+ "loss": 0.1527,
1493
+ "step": 83000
1494
+ },
1495
+ {
1496
+ "epoch": 7.299270072992701,
1497
+ "eval_accuracy": 0.9146920603512243,
1498
+ "eval_combined_score": 0.901625328375877,
1499
+ "eval_f1": 0.8885585964005299,
1500
+ "eval_loss": 0.22327056527137756,
1501
+ "eval_runtime": 87.1691,
1502
+ "eval_samples_per_second": 463.811,
1503
+ "eval_steps_per_second": 0.906,
1504
+ "step": 83000
1505
+ },
1506
+ {
1507
+ "epoch": 7.387213085920323,
1508
+ "grad_norm": 1.3840506076812744,
1509
+ "learning_rate": 4.854044332403218e-05,
1510
+ "loss": 0.1507,
1511
+ "step": 84000
1512
+ },
1513
+ {
1514
+ "epoch": 7.387213085920323,
1515
+ "eval_accuracy": 0.9158792975513232,
1516
+ "eval_combined_score": 0.9028520204106352,
1517
+ "eval_f1": 0.8898247432699472,
1518
+ "eval_loss": 0.22502297163009644,
1519
+ "eval_runtime": 55.4268,
1520
+ "eval_samples_per_second": 729.43,
1521
+ "eval_steps_per_second": 1.425,
1522
+ "step": 84000
1523
+ },
1524
+ {
1525
+ "epoch": 7.475156098847947,
1526
+ "grad_norm": 1.4362976551055908,
1527
+ "learning_rate": 4.550053473296499e-05,
1528
+ "loss": 0.1549,
1529
+ "step": 85000
1530
+ },
1531
+ {
1532
+ "epoch": 7.475156098847947,
1533
+ "eval_accuracy": 0.916052436309671,
1534
+ "eval_combined_score": 0.9026338692842257,
1535
+ "eval_f1": 0.8892153022587805,
1536
+ "eval_loss": 0.2227443903684616,
1537
+ "eval_runtime": 55.1943,
1538
+ "eval_samples_per_second": 732.504,
1539
+ "eval_steps_per_second": 1.431,
1540
+ "step": 85000
1541
+ },
1542
+ {
1543
+ "epoch": 7.5630991117755695,
1544
+ "grad_norm": 0.39123019576072693,
1545
+ "learning_rate": 4.254182274461983e-05,
1546
+ "loss": 0.1565,
1547
+ "step": 86000
1548
+ },
1549
+ {
1550
+ "epoch": 7.5630991117755695,
1551
+ "eval_accuracy": 0.916052436309671,
1552
+ "eval_combined_score": 0.9031177861351629,
1553
+ "eval_f1": 0.8901831359606549,
1554
+ "eval_loss": 0.22492727637290955,
1555
+ "eval_runtime": 55.3134,
1556
+ "eval_samples_per_second": 730.926,
1557
+ "eval_steps_per_second": 1.428,
1558
+ "step": 86000
1559
+ },
1560
+ {
1561
+ "epoch": 7.651042124703192,
1562
+ "grad_norm": 1.4433910846710205,
1563
+ "learning_rate": 3.966660629288376e-05,
1564
+ "loss": 0.1579,
1565
+ "step": 87000
1566
+ },
1567
+ {
1568
+ "epoch": 7.651042124703192,
1569
+ "eval_accuracy": 0.9166707890180559,
1570
+ "eval_combined_score": 0.9037908230619536,
1571
+ "eval_f1": 0.8909108571058512,
1572
+ "eval_loss": 0.2216387242078781,
1573
+ "eval_runtime": 55.8191,
1574
+ "eval_samples_per_second": 724.304,
1575
+ "eval_steps_per_second": 1.415,
1576
+ "step": 87000
1577
+ },
1578
+ {
1579
+ "epoch": 7.738985137630815,
1580
+ "grad_norm": 2.134287118911743,
1581
+ "learning_rate": 3.687711943519798e-05,
1582
+ "loss": 0.1558,
1583
+ "step": 88000
1584
+ },
1585
+ {
1586
+ "epoch": 7.738985137630815,
1587
+ "eval_accuracy": 0.9162008409596833,
1588
+ "eval_combined_score": 0.9030432181917502,
1589
+ "eval_f1": 0.889885595423817,
1590
+ "eval_loss": 0.22495825588703156,
1591
+ "eval_runtime": 87.1663,
1592
+ "eval_samples_per_second": 463.826,
1593
+ "eval_steps_per_second": 0.906,
1594
+ "step": 88000
1595
+ },
1596
+ {
1597
+ "epoch": 7.826928150558438,
1598
+ "grad_norm": 0.8385611772537231,
1599
+ "learning_rate": 3.4175529616683805e-05,
1600
+ "loss": 0.1571,
1601
+ "step": 89000
1602
+ },
1603
+ {
1604
+ "epoch": 7.826928150558438,
1605
+ "eval_accuracy": 0.9150136037595845,
1606
+ "eval_combined_score": 0.9018612748991612,
1607
+ "eval_f1": 0.8887089460387381,
1608
+ "eval_loss": 0.2220984548330307,
1609
+ "eval_runtime": 87.3283,
1610
+ "eval_samples_per_second": 462.965,
1611
+ "eval_steps_per_second": 0.905,
1612
+ "step": 89000
1613
+ },
1614
+ {
1615
+ "epoch": 7.914871163486061,
1616
+ "grad_norm": 1.0239052772521973,
1617
+ "learning_rate": 3.156393598602742e-05,
1618
+ "loss": 0.1581,
1619
+ "step": 90000
1620
+ },
1621
+ {
1622
+ "epoch": 7.914871163486061,
1623
+ "eval_accuracy": 0.9166707890180559,
1624
+ "eval_combined_score": 0.9029806575631504,
1625
+ "eval_f1": 0.8892905261082449,
1626
+ "eval_loss": 0.21750399470329285,
1627
+ "eval_runtime": 87.1287,
1628
+ "eval_samples_per_second": 464.026,
1629
+ "eval_steps_per_second": 0.907,
1630
+ "step": 90000
1631
+ },
1632
+ {
1633
+ "epoch": 8.002814176413684,
1634
+ "grad_norm": 0.38908326625823975,
1635
+ "learning_rate": 2.9044367764430513e-05,
1636
+ "loss": 0.1531,
1637
+ "step": 91000
1638
+ },
1639
+ {
1640
+ "epoch": 8.002814176413684,
1641
+ "eval_accuracy": 0.9162750432846896,
1642
+ "eval_combined_score": 0.9024576848163421,
1643
+ "eval_f1": 0.8886403263479948,
1644
+ "eval_loss": 0.22433432936668396,
1645
+ "eval_runtime": 60.0446,
1646
+ "eval_samples_per_second": 673.333,
1647
+ "eval_steps_per_second": 1.316,
1648
+ "step": 91000
1649
+ },
1650
+ {
1651
+ "epoch": 8.090757189341307,
1652
+ "grad_norm": 1.5134445428848267,
1653
+ "learning_rate": 2.661878266889586e-05,
1654
+ "loss": 0.1484,
1655
+ "step": 92000
1656
+ },
1657
+ {
1658
+ "epoch": 8.090757189341307,
1659
+ "eval_accuracy": 0.9166460549097205,
1660
+ "eval_combined_score": 0.9034298817681259,
1661
+ "eval_f1": 0.8902137086265312,
1662
+ "eval_loss": 0.22620514035224915,
1663
+ "eval_runtime": 55.5114,
1664
+ "eval_samples_per_second": 728.318,
1665
+ "eval_steps_per_second": 1.423,
1666
+ "step": 92000
1667
+ },
1668
+ {
1669
+ "epoch": 8.17870020226893,
1670
+ "grad_norm": 1.3153518438339233,
1671
+ "learning_rate": 2.428906539107102e-05,
1672
+ "loss": 0.1496,
1673
+ "step": 93000
1674
+ },
1675
+ {
1676
+ "epoch": 8.17870020226893,
1677
+ "eval_accuracy": 0.9165965866930497,
1678
+ "eval_combined_score": 0.9037845840863076,
1679
+ "eval_f1": 0.8909725814795655,
1680
+ "eval_loss": 0.22277146577835083,
1681
+ "eval_runtime": 55.2882,
1682
+ "eval_samples_per_second": 731.259,
1683
+ "eval_steps_per_second": 1.429,
1684
+ "step": 93000
1685
+ },
1686
+ {
1687
+ "epoch": 8.266643215196552,
1688
+ "grad_norm": 1.276479721069336,
1689
+ "learning_rate": 2.2057026132833862e-05,
1690
+ "loss": 0.1506,
1691
+ "step": 94000
1692
+ },
1693
+ {
1694
+ "epoch": 8.266643215196552,
1695
+ "eval_accuracy": 0.9170912688597576,
1696
+ "eval_combined_score": 0.9035767921865203,
1697
+ "eval_f1": 0.8900623155132831,
1698
+ "eval_loss": 0.22255383431911469,
1699
+ "eval_runtime": 55.8223,
1700
+ "eval_samples_per_second": 724.263,
1701
+ "eval_steps_per_second": 1.415,
1702
+ "step": 94000
1703
+ },
1704
+ {
1705
+ "epoch": 8.354586228124175,
1706
+ "grad_norm": 1.188973307609558,
1707
+ "learning_rate": 1.992439919975663e-05,
1708
+ "loss": 0.1493,
1709
+ "step": 95000
1710
+ },
1711
+ {
1712
+ "epoch": 8.354586228124175,
1713
+ "eval_accuracy": 0.9159040316596587,
1714
+ "eval_combined_score": 0.9031239180423043,
1715
+ "eval_f1": 0.89034380442495,
1716
+ "eval_loss": 0.2243431955575943,
1717
+ "eval_runtime": 87.1287,
1718
+ "eval_samples_per_second": 464.026,
1719
+ "eval_steps_per_second": 0.907,
1720
+ "step": 95000
1721
+ },
1722
+ {
1723
+ "epoch": 8.442529241051798,
1724
+ "grad_norm": 1.3669207096099854,
1725
+ "learning_rate": 1.7892841653541984e-05,
1726
+ "loss": 0.1447,
1727
+ "step": 96000
1728
+ },
1729
+ {
1730
+ "epoch": 8.442529241051798,
1731
+ "eval_accuracy": 0.9167202572347267,
1732
+ "eval_combined_score": 0.903349181978145,
1733
+ "eval_f1": 0.8899781067215633,
1734
+ "eval_loss": 0.2281969040632248,
1735
+ "eval_runtime": 87.3222,
1736
+ "eval_samples_per_second": 462.998,
1737
+ "eval_steps_per_second": 0.905,
1738
+ "step": 96000
1739
+ },
1740
+ {
1741
+ "epoch": 8.530472253979422,
1742
+ "grad_norm": 0.6875282526016235,
1743
+ "learning_rate": 1.596393202447782e-05,
1744
+ "loss": 0.1508,
1745
+ "step": 97000
1746
+ },
1747
+ {
1748
+ "epoch": 8.530472253979422,
1749
+ "eval_accuracy": 0.9163492456096958,
1750
+ "eval_combined_score": 0.9029312382899835,
1751
+ "eval_f1": 0.8895132309702711,
1752
+ "eval_loss": 0.22148585319519043,
1753
+ "eval_runtime": 87.0802,
1754
+ "eval_samples_per_second": 464.285,
1755
+ "eval_steps_per_second": 0.907,
1756
+ "step": 97000
1757
+ },
1758
+ {
1759
+ "epoch": 8.618415266907045,
1760
+ "grad_norm": 0.5993226170539856,
1761
+ "learning_rate": 1.4139169084911189e-05,
1762
+ "loss": 0.1479,
1763
+ "step": 98000
1764
+ },
1765
+ {
1766
+ "epoch": 8.618415266907045,
1767
+ "eval_accuracy": 0.9169675983180806,
1768
+ "eval_combined_score": 0.9035788238556481,
1769
+ "eval_f1": 0.8901900493932158,
1770
+ "eval_loss": 0.2228359580039978,
1771
+ "eval_runtime": 55.3123,
1772
+ "eval_samples_per_second": 730.94,
1773
+ "eval_steps_per_second": 1.428,
1774
+ "step": 98000
1775
+ },
1776
+ {
1777
+ "epoch": 8.706358279834667,
1778
+ "grad_norm": 1.0595625638961792,
1779
+ "learning_rate": 1.2419970684695196e-05,
1780
+ "loss": 0.1548,
1781
+ "step": 99000
1782
+ },
1783
+ {
1784
+ "epoch": 8.706358279834667,
1785
+ "eval_accuracy": 0.916423447934702,
1786
+ "eval_combined_score": 0.9032096078752847,
1787
+ "eval_f1": 0.8899957678158674,
1788
+ "eval_loss": 0.2201036661863327,
1789
+ "eval_runtime": 55.2492,
1790
+ "eval_samples_per_second": 731.775,
1791
+ "eval_steps_per_second": 1.43,
1792
+ "step": 99000
1793
+ },
1794
+ {
1795
+ "epoch": 8.79430129276229,
1796
+ "grad_norm": 1.308894395828247,
1797
+ "learning_rate": 1.0807672649512177e-05,
1798
+ "loss": 0.1488,
1799
+ "step": 100000
1800
+ },
1801
+ {
1802
+ "epoch": 8.79430129276229,
1803
+ "eval_accuracy": 0.9162255750680188,
1804
+ "eval_combined_score": 0.9028761522882415,
1805
+ "eval_f1": 0.8895267295084641,
1806
+ "eval_loss": 0.2235955446958542,
1807
+ "eval_runtime": 55.191,
1808
+ "eval_samples_per_second": 732.547,
1809
+ "eval_steps_per_second": 1.431,
1810
+ "step": 100000
1811
+ },
1812
+ {
1813
+ "epoch": 8.882244305689913,
1814
+ "grad_norm": 0.6770079731941223,
1815
+ "learning_rate": 9.3035277429309e-06,
1816
+ "loss": 0.1478,
1817
+ "step": 101000
1818
+ },
1819
+ {
1820
+ "epoch": 8.882244305689913,
1821
+ "eval_accuracy": 0.9167202572347267,
1822
+ "eval_combined_score": 0.903277185189035,
1823
+ "eval_f1": 0.8898341131433433,
1824
+ "eval_loss": 0.22350822389125824,
1825
+ "eval_runtime": 60.3308,
1826
+ "eval_samples_per_second": 670.138,
1827
+ "eval_steps_per_second": 1.309,
1828
+ "step": 101000
1829
+ },
1830
+ {
1831
+ "epoch": 8.970187318617535,
1832
+ "grad_norm": 0.7858123779296875,
1833
+ "learning_rate": 7.908704693002666e-06,
1834
+ "loss": 0.1444,
1835
+ "step": 102000
1836
+ },
1837
+ {
1838
+ "epoch": 8.970187318617535,
1839
+ "eval_accuracy": 0.9165471184763789,
1840
+ "eval_combined_score": 0.9034474298914231,
1841
+ "eval_f1": 0.8903477413064673,
1842
+ "eval_loss": 0.2248321771621704,
1843
+ "eval_runtime": 87.05,
1844
+ "eval_samples_per_second": 464.446,
1845
+ "eval_steps_per_second": 0.908,
1846
+ "step": 102000
1847
+ },
1848
+ {
1849
+ "epoch": 9.05813033154516,
1850
+ "grad_norm": 1.6393145322799683,
1851
+ "learning_rate": 6.624287284154212e-06,
1852
+ "loss": 0.1528,
1853
+ "step": 103000
1854
+ },
1855
+ {
1856
+ "epoch": 9.05813033154516,
1857
+ "eval_accuracy": 0.9163245115013604,
1858
+ "eval_combined_score": 0.9029548403650606,
1859
+ "eval_f1": 0.8895851692287607,
1860
+ "eval_loss": 0.22281712293624878,
1861
+ "eval_runtime": 87.2239,
1862
+ "eval_samples_per_second": 463.52,
1863
+ "eval_steps_per_second": 0.906,
1864
+ "step": 103000
1865
+ },
1866
+ {
1867
+ "epoch": 9.146073344472782,
1868
+ "grad_norm": 1.7177001237869263,
1869
+ "learning_rate": 5.451273515081639e-06,
1870
+ "loss": 0.1472,
1871
+ "step": 104000
1872
+ },
1873
+ {
1874
+ "epoch": 9.146073344472782,
1875
+ "eval_accuracy": 0.916423447934702,
1876
+ "eval_combined_score": 0.903327534642074,
1877
+ "eval_f1": 0.8902316213494461,
1878
+ "eval_loss": 0.2237127125263214,
1879
+ "eval_runtime": 87.0582,
1880
+ "eval_samples_per_second": 464.402,
1881
+ "eval_steps_per_second": 0.907,
1882
+ "step": 104000
1883
+ },
1884
+ {
1885
+ "epoch": 9.234016357400405,
1886
+ "grad_norm": 0.5357476472854614,
1887
+ "learning_rate": 4.3905748233003915e-06,
1888
+ "loss": 0.1462,
1889
+ "step": 105000
1890
+ },
1891
+ {
1892
+ "epoch": 9.234016357400405,
1893
+ "eval_accuracy": 0.9163987138263665,
1894
+ "eval_combined_score": 0.90299224858965,
1895
+ "eval_f1": 0.8895857833529335,
1896
+ "eval_loss": 0.22355595231056213,
1897
+ "eval_runtime": 55.747,
1898
+ "eval_samples_per_second": 725.241,
1899
+ "eval_steps_per_second": 1.417,
1900
+ "step": 105000
1901
+ },
1902
+ {
1903
+ "epoch": 9.321959370328027,
1904
+ "grad_norm": 2.119542121887207,
1905
+ "learning_rate": 3.4430153769539838e-06,
1906
+ "loss": 0.1467,
1907
+ "step": 106000
1908
+ },
1909
+ {
1910
+ "epoch": 9.321959370328027,
1911
+ "eval_accuracy": 0.9161513727430126,
1912
+ "eval_combined_score": 0.9029967236107342,
1913
+ "eval_f1": 0.8898420744784559,
1914
+ "eval_loss": 0.22608432173728943,
1915
+ "eval_runtime": 55.7404,
1916
+ "eval_samples_per_second": 725.327,
1917
+ "eval_steps_per_second": 1.417,
1918
+ "step": 106000
1919
+ },
1920
+ {
1921
+ "epoch": 9.40990238325565,
1922
+ "grad_norm": 1.2164239883422852,
1923
+ "learning_rate": 2.609331434431139e-06,
1924
+ "loss": 0.1491,
1925
+ "step": 107000
1926
+ },
1927
+ {
1928
+ "epoch": 9.40990238325565,
1929
+ "eval_accuracy": 0.9165471184763789,
1930
+ "eval_combined_score": 0.9033724918078327,
1931
+ "eval_f1": 0.8901978651392867,
1932
+ "eval_loss": 0.2239413857460022,
1933
+ "eval_runtime": 55.5045,
1934
+ "eval_samples_per_second": 728.41,
1935
+ "eval_steps_per_second": 1.423,
1936
+ "step": 107000
1937
+ },
1938
+ {
1939
+ "epoch": 9.497845396183273,
1940
+ "grad_norm": 0.8952678442001343,
1941
+ "learning_rate": 1.890170772289401e-06,
1942
+ "loss": 0.1471,
1943
+ "step": 108000
1944
+ },
1945
+ {
1946
+ "epoch": 9.497845396183273,
1947
+ "eval_accuracy": 0.9166707890180559,
1948
+ "eval_combined_score": 0.9034531738072393,
1949
+ "eval_f1": 0.8902355585964227,
1950
+ "eval_loss": 0.2244831621646881,
1951
+ "eval_runtime": 55.9049,
1952
+ "eval_samples_per_second": 723.192,
1953
+ "eval_steps_per_second": 1.413,
1954
+ "step": 108000
1955
+ },
1956
+ {
1957
+ "epoch": 9.585788409110895,
1958
+ "grad_norm": 1.3746293783187866,
1959
+ "learning_rate": 1.286092181929571e-06,
1960
+ "loss": 0.1477,
1961
+ "step": 109000
1962
+ },
1963
+ {
1964
+ "epoch": 9.585788409110895,
1965
+ "eval_accuracy": 0.916423447934702,
1966
+ "eval_combined_score": 0.9031737723759903,
1967
+ "eval_f1": 0.8899240968172786,
1968
+ "eval_loss": 0.22419258952140808,
1969
+ "eval_runtime": 86.669,
1970
+ "eval_samples_per_second": 466.487,
1971
+ "eval_steps_per_second": 0.912,
1972
+ "step": 109000
1973
+ },
1974
+ {
1975
+ "epoch": 9.67373142203852,
1976
+ "grad_norm": 2.9468863010406494,
1977
+ "learning_rate": 7.975650354119345e-07,
1978
+ "loss": 0.1398,
1979
+ "step": 110000
1980
+ },
1981
+ {
1982
+ "epoch": 9.67373142203852,
1983
+ "eval_accuracy": 0.9164481820430374,
1984
+ "eval_combined_score": 0.9031072655418163,
1985
+ "eval_f1": 0.8897663490405953,
1986
+ "eval_loss": 0.22482836246490479,
1987
+ "eval_runtime": 86.9147,
1988
+ "eval_samples_per_second": 465.169,
1989
+ "eval_steps_per_second": 0.909,
1990
+ "step": 110000
1991
+ },
1992
+ {
1993
+ "epoch": 9.761674434966142,
1994
+ "grad_norm": 0.9196714162826538,
1995
+ "learning_rate": 4.249689207519447e-07,
1996
+ "loss": 0.1484,
1997
+ "step": 111000
1998
+ },
1999
+ {
2000
+ "epoch": 9.761674434966142,
2001
+ "eval_accuracy": 0.9164481820430374,
2002
+ "eval_combined_score": 0.9031898779227894,
2003
+ "eval_f1": 0.8899315738025415,
2004
+ "eval_loss": 0.22509995102882385,
2005
+ "eval_runtime": 87.3187,
2006
+ "eval_samples_per_second": 463.016,
2007
+ "eval_steps_per_second": 0.905,
2008
+ "step": 111000
2009
+ },
2010
+ {
2011
+ "epoch": 9.849617447893765,
2012
+ "grad_norm": 0.945548415184021,
2013
+ "learning_rate": 1.6859334697840177e-07,
2014
+ "loss": 0.15,
2015
+ "step": 112000
2016
+ },
2017
+ {
2018
+ "epoch": 9.849617447893765,
2019
+ "eval_accuracy": 0.9164729161513727,
2020
+ "eval_combined_score": 0.9032597381199673,
2021
+ "eval_f1": 0.8900465600885619,
2022
+ "eval_loss": 0.2251836657524109,
2023
+ "eval_runtime": 56.0654,
2024
+ "eval_samples_per_second": 721.122,
2025
+ "eval_steps_per_second": 1.409,
2026
+ "step": 112000
2027
+ },
2028
+ {
2029
+ "epoch": 9.937560460821388,
2030
+ "grad_norm": 0.8757835030555725,
2031
+ "learning_rate": 2.863751918346091e-08,
2032
+ "loss": 0.1434,
2033
+ "step": 113000
2034
+ },
2035
+ {
2036
+ "epoch": 9.937560460821388,
2037
+ "eval_accuracy": 0.9164976502597081,
2038
+ "eval_combined_score": 0.9032901743517072,
2039
+ "eval_f1": 0.8900826984437065,
2040
+ "eval_loss": 0.22522608935832977,
2041
+ "eval_runtime": 59.9448,
2042
+ "eval_samples_per_second": 674.454,
2043
+ "eval_steps_per_second": 1.318,
2044
+ "step": 113000
2045
+ },
2046
+ {
2047
+ "epoch": 10.0,
2048
+ "step": 113710,
2049
+ "total_flos": 6.076865681478144e+17,
2050
+ "train_loss": 0.18900428874628303,
2051
+ "train_runtime": 59418.549,
2052
+ "train_samples_per_second": 61.234,
2053
+ "train_steps_per_second": 1.914
2054
+ }
2055
+ ],
2056
+ "logging_steps": 1000,
2057
+ "max_steps": 113710,
2058
+ "num_input_tokens_seen": 0,
2059
+ "num_train_epochs": 10,
2060
+ "save_steps": 1000,
2061
+ "stateful_callbacks": {
2062
+ "TrainerControl": {
2063
+ "args": {
2064
+ "should_epoch_stop": false,
2065
+ "should_evaluate": false,
2066
+ "should_log": false,
2067
+ "should_save": true,
2068
+ "should_training_stop": true
2069
+ },
2070
+ "attributes": {}
2071
+ }
2072
+ },
2073
+ "total_flos": 6.076865681478144e+17,
2074
+ "train_batch_size": 32,
2075
+ "trial_name": null,
2076
+ "trial_params": null
2077
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/QQP.tsv ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9159782339846648,
4
+ "eval_combined_score": 0.9024553586140425,
5
+ "eval_f1": 0.88893248324342,
6
+ "eval_loss": 0.2253103107213974,
7
+ "eval_runtime": 51.8525,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 779.711,
10
+ "eval_steps_per_second": 1.524
11
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9159782339846648,
4
+ "eval_combined_score": 0.9024553586140425,
5
+ "eval_f1": 0.88893248324342,
6
+ "eval_loss": 0.2253103107213974,
7
+ "eval_runtime": 51.8525,
8
+ "eval_samples": 40430,
9
+ "eval_samples_per_second": 779.711,
10
+ "eval_steps_per_second": 1.524
11
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/ft2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": {
3
+ "base_model_class": "DebertaV2ForSequenceClassification",
4
+ "parent_library": "transformers.models.deberta_v2.modeling_deberta_v2"
5
+ },
6
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
7
+ "bias": "none",
8
+ "boft_block_num": 0,
9
+ "boft_block_size": 4,
10
+ "boft_dropout": 0.05,
11
+ "boft_n_butterfly_factor": 2,
12
+ "exclude_modules": null,
13
+ "fan_in_fan_out": false,
14
+ "inference_mode": true,
15
+ "init_weights": true,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "modules_to_save": [
19
+ "classifier",
20
+ "pooler"
21
+ ],
22
+ "peft_type": "BOFT",
23
+ "peft_version": "0.18.0",
24
+ "revision": null,
25
+ "target_modules": [
26
+ "output.dense",
27
+ "attention.output.dense",
28
+ "intermediate.dense",
29
+ "value_proj",
30
+ "key_proj",
31
+ "query_proj"
32
+ ],
33
+ "task_type": null
34
+ }
reproduction/glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/trainer_state.json ADDED
@@ -0,0 +1,2077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 105000,
3
+ "best_metric": 0.9159782339846648,
4
+ "best_model_checkpoint": "./glue_expBOFT/qqp/dr0.05,mlr3e-04,clr3e-04,ep=10.0t=26d17h54m17,sd44/checkpoint-105000",
5
+ "epoch": 10.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 113710,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0879430129276229,
14
+ "grad_norm": 2.4402430057525635,
15
+ "learning_rate": 0.00029969999999999997,
16
+ "loss": 0.4407,
17
+ "step": 1000
18
+ },
19
+ {
20
+ "epoch": 0.0879430129276229,
21
+ "eval_accuracy": 0.8518179569626515,
22
+ "eval_combined_score": 0.8343454755010415,
23
+ "eval_f1": 0.8168729940394315,
24
+ "eval_loss": 0.3301476538181305,
25
+ "eval_runtime": 87.2157,
26
+ "eval_samples_per_second": 463.563,
27
+ "eval_steps_per_second": 0.906,
28
+ "step": 1000
29
+ },
30
+ {
31
+ "epoch": 0.1758860258552458,
32
+ "grad_norm": 0.9877966046333313,
33
+ "learning_rate": 0.00029994184111301736,
34
+ "loss": 0.3209,
35
+ "step": 2000
36
+ },
37
+ {
38
+ "epoch": 0.1758860258552458,
39
+ "eval_accuracy": 0.8608706406134059,
40
+ "eval_combined_score": 0.8454013067012608,
41
+ "eval_f1": 0.8299319727891157,
42
+ "eval_loss": 0.30784711241722107,
43
+ "eval_runtime": 86.747,
44
+ "eval_samples_per_second": 466.068,
45
+ "eval_steps_per_second": 0.911,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 0.2638290387828687,
50
+ "grad_norm": 0.6972408294677734,
51
+ "learning_rate": 0.00029976717673021227,
52
+ "loss": 0.2912,
53
+ "step": 3000
54
+ },
55
+ {
56
+ "epoch": 0.2638290387828687,
57
+ "eval_accuracy": 0.881251545881771,
58
+ "eval_combined_score": 0.8630827293643115,
59
+ "eval_f1": 0.8449139128468521,
60
+ "eval_loss": 0.27525821328163147,
61
+ "eval_runtime": 89.5625,
62
+ "eval_samples_per_second": 451.417,
63
+ "eval_steps_per_second": 0.882,
64
+ "step": 3000
65
+ },
66
+ {
67
+ "epoch": 0.3517720517104916,
68
+ "grad_norm": 0.9264038801193237,
69
+ "learning_rate": 0.0002994761425083971,
70
+ "loss": 0.2889,
71
+ "step": 4000
72
+ },
73
+ {
74
+ "epoch": 0.3517720517104916,
75
+ "eval_accuracy": 0.8837496908236459,
76
+ "eval_combined_score": 0.8615823758204604,
77
+ "eval_f1": 0.8394150608172748,
78
+ "eval_loss": 0.2710655629634857,
79
+ "eval_runtime": 56.4509,
80
+ "eval_samples_per_second": 716.197,
81
+ "eval_steps_per_second": 1.399,
82
+ "step": 4000
83
+ },
84
+ {
85
+ "epoch": 0.4397150646381145,
86
+ "grad_norm": 1.364985466003418,
87
+ "learning_rate": 0.0002990689645826054,
88
+ "loss": 0.2733,
89
+ "step": 5000
90
+ },
91
+ {
92
+ "epoch": 0.4397150646381145,
93
+ "eval_accuracy": 0.8894632698491219,
94
+ "eval_combined_score": 0.8706655405893162,
95
+ "eval_f1": 0.8518678113295104,
96
+ "eval_loss": 0.2592947483062744,
97
+ "eval_runtime": 55.852,
98
+ "eval_samples_per_second": 723.877,
99
+ "eval_steps_per_second": 1.414,
100
+ "step": 5000
101
+ },
102
+ {
103
+ "epoch": 0.5276580775657373,
104
+ "grad_norm": 1.0107234716415405,
105
+ "learning_rate": 0.00029854595933210474,
106
+ "loss": 0.2713,
107
+ "step": 6000
108
+ },
109
+ {
110
+ "epoch": 0.5276580775657373,
111
+ "eval_accuracy": 0.8898342814741529,
112
+ "eval_combined_score": 0.8716219853921581,
113
+ "eval_f1": 0.8534096893101633,
114
+ "eval_loss": 0.26888197660446167,
115
+ "eval_runtime": 55.9812,
116
+ "eval_samples_per_second": 722.206,
117
+ "eval_steps_per_second": 1.411,
118
+ "step": 6000
119
+ },
120
+ {
121
+ "epoch": 0.6156010904933603,
122
+ "grad_norm": 1.0506044626235962,
123
+ "learning_rate": 0.0002979075331345683,
124
+ "loss": 0.2651,
125
+ "step": 7000
126
+ },
127
+ {
128
+ "epoch": 0.6156010904933603,
129
+ "eval_accuracy": 0.8876329458323028,
130
+ "eval_combined_score": 0.8718629742307322,
131
+ "eval_f1": 0.8560930026291615,
132
+ "eval_loss": 0.2643047571182251,
133
+ "eval_runtime": 55.2175,
134
+ "eval_samples_per_second": 732.195,
135
+ "eval_steps_per_second": 1.431,
136
+ "step": 7000
137
+ },
138
+ {
139
+ "epoch": 0.7035441034209832,
140
+ "grad_norm": 1.359427571296692,
141
+ "learning_rate": 0.0002971541820503175,
142
+ "loss": 0.2642,
143
+ "step": 8000
144
+ },
145
+ {
146
+ "epoch": 0.7035441034209832,
147
+ "eval_accuracy": 0.8933959930744496,
148
+ "eval_combined_score": 0.8754111953463548,
149
+ "eval_f1": 0.85742639761826,
150
+ "eval_loss": 0.25238949060440063,
151
+ "eval_runtime": 87.0135,
152
+ "eval_samples_per_second": 464.641,
153
+ "eval_steps_per_second": 0.908,
154
+ "step": 8000
155
+ },
156
+ {
157
+ "epoch": 0.7914871163486061,
158
+ "grad_norm": 0.7769826650619507,
159
+ "learning_rate": 0.00029628649143688076,
160
+ "loss": 0.2584,
161
+ "step": 9000
162
+ },
163
+ {
164
+ "epoch": 0.7914871163486061,
165
+ "eval_accuracy": 0.8923324264160277,
166
+ "eval_combined_score": 0.8774179899539629,
167
+ "eval_f1": 0.862503553491898,
168
+ "eval_loss": 0.25590795278549194,
169
+ "eval_runtime": 87.1955,
170
+ "eval_samples_per_second": 463.671,
171
+ "eval_steps_per_second": 0.906,
172
+ "step": 9000
173
+ },
174
+ {
175
+ "epoch": 0.879430129276229,
176
+ "grad_norm": 1.009511947631836,
177
+ "learning_rate": 0.0002953051354941674,
178
+ "loss": 0.2595,
179
+ "step": 10000
180
+ },
181
+ {
182
+ "epoch": 0.879430129276229,
183
+ "eval_accuracy": 0.8916151372743013,
184
+ "eval_combined_score": 0.8769688103111484,
185
+ "eval_f1": 0.8623224833479954,
186
+ "eval_loss": 0.254254013299942,
187
+ "eval_runtime": 87.0743,
188
+ "eval_samples_per_second": 464.316,
189
+ "eval_steps_per_second": 0.907,
190
+ "step": 10000
191
+ },
192
+ {
193
+ "epoch": 0.967373142203852,
194
+ "grad_norm": 1.7336089611053467,
195
+ "learning_rate": 0.0002942108767406115,
196
+ "loss": 0.2541,
197
+ "step": 11000
198
+ },
199
+ {
200
+ "epoch": 0.967373142203852,
201
+ "eval_accuracy": 0.897180311649765,
202
+ "eval_combined_score": 0.8794474127358303,
203
+ "eval_f1": 0.8617145138218955,
204
+ "eval_loss": 0.23982903361320496,
205
+ "eval_runtime": 56.7413,
206
+ "eval_samples_per_second": 712.532,
207
+ "eval_steps_per_second": 1.392,
208
+ "step": 11000
209
+ },
210
+ {
211
+ "epoch": 1.0553161551314747,
212
+ "grad_norm": 0.8252896666526794,
213
+ "learning_rate": 0.00029300456542069104,
214
+ "loss": 0.2407,
215
+ "step": 12000
216
+ },
217
+ {
218
+ "epoch": 1.0553161551314747,
219
+ "eval_accuracy": 0.8977244620331437,
220
+ "eval_combined_score": 0.881272574604136,
221
+ "eval_f1": 0.8648206871751283,
222
+ "eval_loss": 0.24860291182994843,
223
+ "eval_runtime": 55.8804,
224
+ "eval_samples_per_second": 723.51,
225
+ "eval_steps_per_second": 1.414,
226
+ "step": 12000
227
+ },
228
+ {
229
+ "epoch": 1.1432591680590978,
230
+ "grad_norm": 1.1061173677444458,
231
+ "learning_rate": 0.0002916871388442835,
232
+ "loss": 0.2371,
233
+ "step": 13000
234
+ },
235
+ {
236
+ "epoch": 1.1432591680590978,
237
+ "eval_accuracy": 0.8950779124412565,
238
+ "eval_combined_score": 0.8795756134879955,
239
+ "eval_f1": 0.8640733145347347,
240
+ "eval_loss": 0.25066396594047546,
241
+ "eval_runtime": 56.1438,
242
+ "eval_samples_per_second": 720.116,
243
+ "eval_steps_per_second": 1.407,
244
+ "step": 13000
245
+ },
246
+ {
247
+ "epoch": 1.2312021809867206,
248
+ "grad_norm": 1.1123906373977661,
249
+ "learning_rate": 0.00029025962065837193,
250
+ "loss": 0.2327,
251
+ "step": 14000
252
+ },
253
+ {
254
+ "epoch": 1.2312021809867206,
255
+ "eval_accuracy": 0.8974771209497897,
256
+ "eval_combined_score": 0.8832505205343002,
257
+ "eval_f1": 0.8690239201188106,
258
+ "eval_loss": 0.24921134114265442,
259
+ "eval_runtime": 94.1253,
260
+ "eval_samples_per_second": 429.534,
261
+ "eval_steps_per_second": 0.839,
262
+ "step": 14000
263
+ },
264
+ {
265
+ "epoch": 1.3191451939143435,
266
+ "grad_norm": 0.6919705271720886,
267
+ "learning_rate": 0.00028872312005166577,
268
+ "loss": 0.2305,
269
+ "step": 15000
270
+ },
271
+ {
272
+ "epoch": 1.3191451939143435,
273
+ "eval_accuracy": 0.8989116992332427,
274
+ "eval_combined_score": 0.8814094975137943,
275
+ "eval_f1": 0.8639072957943459,
276
+ "eval_loss": 0.2356240302324295,
277
+ "eval_runtime": 87.3232,
278
+ "eval_samples_per_second": 462.993,
279
+ "eval_steps_per_second": 0.905,
280
+ "step": 15000
281
+ },
282
+ {
283
+ "epoch": 1.4070882068419663,
284
+ "grad_norm": 1.8297632932662964,
285
+ "learning_rate": 0.00028707883089275593,
286
+ "loss": 0.2362,
287
+ "step": 16000
288
+ },
289
+ {
290
+ "epoch": 1.4070882068419663,
291
+ "eval_accuracy": 0.9004452139500371,
292
+ "eval_combined_score": 0.8845040294352609,
293
+ "eval_f1": 0.8685628449204846,
294
+ "eval_loss": 0.23727574944496155,
295
+ "eval_runtime": 86.7703,
296
+ "eval_samples_per_second": 465.943,
297
+ "eval_steps_per_second": 0.91,
298
+ "step": 16000
299
+ },
300
+ {
301
+ "epoch": 1.4950312197695892,
302
+ "grad_norm": 1.0713611841201782,
303
+ "learning_rate": 0.0002853280308024728,
304
+ "loss": 0.2321,
305
+ "step": 17000
306
+ },
307
+ {
308
+ "epoch": 1.4950312197695892,
309
+ "eval_accuracy": 0.9034875092752906,
310
+ "eval_combined_score": 0.8879104867407732,
311
+ "eval_f1": 0.8723334642062557,
312
+ "eval_loss": 0.22905229032039642,
313
+ "eval_runtime": 87.3498,
314
+ "eval_samples_per_second": 462.852,
315
+ "eval_steps_per_second": 0.904,
316
+ "step": 17000
317
+ },
318
+ {
319
+ "epoch": 1.5829742326972123,
320
+ "grad_norm": 1.0400513410568237,
321
+ "learning_rate": 0.0002834720801611687,
322
+ "loss": 0.2271,
323
+ "step": 18000
324
+ },
325
+ {
326
+ "epoch": 1.5829742326972123,
327
+ "eval_accuracy": 0.9022013356418501,
328
+ "eval_combined_score": 0.8872069311523193,
329
+ "eval_f1": 0.8722125266627885,
330
+ "eval_loss": 0.23319664597511292,
331
+ "eval_runtime": 55.8286,
332
+ "eval_samples_per_second": 724.18,
333
+ "eval_steps_per_second": 1.415,
334
+ "step": 18000
335
+ },
336
+ {
337
+ "epoch": 1.6709172456248351,
338
+ "grad_norm": 2.248392105102539,
339
+ "learning_rate": 0.0002815124210516956,
340
+ "loss": 0.2318,
341
+ "step": 19000
342
+ },
343
+ {
344
+ "epoch": 1.6709172456248351,
345
+ "eval_accuracy": 0.902448676725204,
346
+ "eval_combined_score": 0.8844990219925675,
347
+ "eval_f1": 0.8665493672599309,
348
+ "eval_loss": 0.2336564064025879,
349
+ "eval_runtime": 55.5382,
350
+ "eval_samples_per_second": 727.967,
351
+ "eval_steps_per_second": 1.422,
352
+ "step": 19000
353
+ },
354
+ {
355
+ "epoch": 1.758860258552458,
356
+ "grad_norm": 0.8035272359848022,
357
+ "learning_rate": 0.0002794505761388994,
358
+ "loss": 0.2287,
359
+ "step": 20000
360
+ },
361
+ {
362
+ "epoch": 1.758860258552458,
363
+ "eval_accuracy": 0.9015335147167944,
364
+ "eval_combined_score": 0.8876584221756192,
365
+ "eval_f1": 0.873783329634444,
366
+ "eval_loss": 0.24032531678676605,
367
+ "eval_runtime": 56.1973,
368
+ "eval_samples_per_second": 719.43,
369
+ "eval_steps_per_second": 1.406,
370
+ "step": 20000
371
+ },
372
+ {
373
+ "epoch": 1.846803271480081,
374
+ "grad_norm": 0.9019431471824646,
375
+ "learning_rate": 0.0002772881474865019,
376
+ "loss": 0.2253,
377
+ "step": 21000
378
+ },
379
+ {
380
+ "epoch": 1.846803271480081,
381
+ "eval_accuracy": 0.9032896364086075,
382
+ "eval_combined_score": 0.8860451940180075,
383
+ "eval_f1": 0.8688007516274076,
384
+ "eval_loss": 0.23804490268230438,
385
+ "eval_runtime": 87.2417,
386
+ "eval_samples_per_second": 463.425,
387
+ "eval_steps_per_second": 0.906,
388
+ "step": 21000
389
+ },
390
+ {
391
+ "epoch": 1.9347462844077037,
392
+ "grad_norm": 0.7785677313804626,
393
+ "learning_rate": 0.00027502681531228946,
394
+ "loss": 0.2311,
395
+ "step": 22000
396
+ },
397
+ {
398
+ "epoch": 1.9347462844077037,
399
+ "eval_accuracy": 0.9008656937917388,
400
+ "eval_combined_score": 0.8864890051600685,
401
+ "eval_f1": 0.8721123165283983,
402
+ "eval_loss": 0.23252907395362854,
403
+ "eval_runtime": 87.0149,
404
+ "eval_samples_per_second": 464.633,
405
+ "eval_steps_per_second": 0.908,
406
+ "step": 22000
407
+ },
408
+ {
409
+ "epoch": 2.0226892973353268,
410
+ "grad_norm": 0.6860449314117432,
411
+ "learning_rate": 0.00027266833668257537,
412
+ "loss": 0.2228,
413
+ "step": 23000
414
+ },
415
+ {
416
+ "epoch": 2.0226892973353268,
417
+ "eval_accuracy": 0.903635913925303,
418
+ "eval_combined_score": 0.8885753103812606,
419
+ "eval_f1": 0.8735147068372183,
420
+ "eval_loss": 0.235237717628479,
421
+ "eval_runtime": 86.8685,
422
+ "eval_samples_per_second": 465.416,
423
+ "eval_steps_per_second": 0.909,
424
+ "step": 23000
425
+ },
426
+ {
427
+ "epoch": 2.1106323102629494,
428
+ "grad_norm": 0.8559027910232544,
429
+ "learning_rate": 0.0002702145441469506,
430
+ "loss": 0.211,
431
+ "step": 24000
432
+ },
433
+ {
434
+ "epoch": 2.1106323102629494,
435
+ "eval_accuracy": 0.9057135790254761,
436
+ "eval_combined_score": 0.8897984953289229,
437
+ "eval_f1": 0.8738834116323695,
438
+ "eval_loss": 0.23580744862556458,
439
+ "eval_runtime": 55.7446,
440
+ "eval_samples_per_second": 725.272,
441
+ "eval_steps_per_second": 1.417,
442
+ "step": 24000
443
+ },
444
+ {
445
+ "epoch": 2.1985753231905725,
446
+ "grad_norm": 0.779170036315918,
447
+ "learning_rate": 0.00026766734431438345,
448
+ "loss": 0.2137,
449
+ "step": 25000
450
+ },
451
+ {
452
+ "epoch": 2.1985753231905725,
453
+ "eval_accuracy": 0.9070739549839228,
454
+ "eval_combined_score": 0.8914480665681472,
455
+ "eval_f1": 0.8758221781523715,
456
+ "eval_loss": 0.23073111474514008,
457
+ "eval_runtime": 60.3864,
458
+ "eval_samples_per_second": 669.522,
459
+ "eval_steps_per_second": 1.308,
460
+ "step": 25000
461
+ },
462
+ {
463
+ "epoch": 2.2865183361181955,
464
+ "grad_norm": 1.2027239799499512,
465
+ "learning_rate": 0.0002650287163717754,
466
+ "loss": 0.2126,
467
+ "step": 26000
468
+ },
469
+ {
470
+ "epoch": 2.2865183361181955,
471
+ "eval_accuracy": 0.9067524115755627,
472
+ "eval_combined_score": 0.8911649846656692,
473
+ "eval_f1": 0.8755775577557756,
474
+ "eval_loss": 0.22513732314109802,
475
+ "eval_runtime": 55.9295,
476
+ "eval_samples_per_second": 722.874,
477
+ "eval_steps_per_second": 1.412,
478
+ "step": 26000
479
+ },
480
+ {
481
+ "epoch": 2.374461349045818,
482
+ "grad_norm": 1.0992302894592285,
483
+ "learning_rate": 0.0002623007105461227,
484
+ "loss": 0.2151,
485
+ "step": 27000
486
+ },
487
+ {
488
+ "epoch": 2.374461349045818,
489
+ "eval_accuracy": 0.9044026712837002,
490
+ "eval_combined_score": 0.8888801259036543,
491
+ "eval_f1": 0.8733575805236082,
492
+ "eval_loss": 0.22250224649906158,
493
+ "eval_runtime": 56.2936,
494
+ "eval_samples_per_second": 718.199,
495
+ "eval_steps_per_second": 1.403,
496
+ "step": 27000
497
+ },
498
+ {
499
+ "epoch": 2.4624043619734413,
500
+ "grad_norm": 0.9432693719863892,
501
+ "learning_rate": 0.00025948544651147997,
502
+ "loss": 0.2105,
503
+ "step": 28000
504
+ },
505
+ {
506
+ "epoch": 2.4624043619734413,
507
+ "eval_accuracy": 0.9040316596586693,
508
+ "eval_combined_score": 0.8899398026950712,
509
+ "eval_f1": 0.8758479457314732,
510
+ "eval_loss": 0.2282586693763733,
511
+ "eval_runtime": 87.1287,
512
+ "eval_samples_per_second": 464.026,
513
+ "eval_steps_per_second": 0.907,
514
+ "step": 28000
515
+ },
516
+ {
517
+ "epoch": 2.5503473749010643,
518
+ "grad_norm": 0.8697973489761353,
519
+ "learning_rate": 0.00025658511174196294,
520
+ "loss": 0.2084,
521
+ "step": 29000
522
+ },
523
+ {
524
+ "epoch": 2.5503473749010643,
525
+ "eval_accuracy": 0.9056146425921345,
526
+ "eval_combined_score": 0.8904910367575605,
527
+ "eval_f1": 0.8753674309229865,
528
+ "eval_loss": 0.2263556867837906,
529
+ "eval_runtime": 87.0161,
530
+ "eval_samples_per_second": 464.627,
531
+ "eval_steps_per_second": 0.908,
532
+ "step": 29000
533
+ },
534
+ {
535
+ "epoch": 2.638290387828687,
536
+ "grad_norm": 0.9848024845123291,
537
+ "learning_rate": 0.00025360195981207026,
538
+ "loss": 0.214,
539
+ "step": 30000
540
+ },
541
+ {
542
+ "epoch": 2.638290387828687,
543
+ "eval_accuracy": 0.9048973534504081,
544
+ "eval_combined_score": 0.8893061667899067,
545
+ "eval_f1": 0.8737149801294052,
546
+ "eval_loss": 0.23553258180618286,
547
+ "eval_runtime": 86.9823,
548
+ "eval_samples_per_second": 464.807,
549
+ "eval_steps_per_second": 0.908,
550
+ "step": 30000
551
+ },
552
+ {
553
+ "epoch": 2.72623340075631,
554
+ "grad_norm": 0.7595154047012329,
555
+ "learning_rate": 0.0002505383086456447,
556
+ "loss": 0.213,
557
+ "step": 31000
558
+ },
559
+ {
560
+ "epoch": 2.72623340075631,
561
+ "eval_accuracy": 0.9091268859757605,
562
+ "eval_combined_score": 0.8950828057647227,
563
+ "eval_f1": 0.8810387255536848,
564
+ "eval_loss": 0.22583059966564178,
565
+ "eval_runtime": 56.2161,
566
+ "eval_samples_per_second": 719.189,
567
+ "eval_steps_per_second": 1.405,
568
+ "step": 31000
569
+ },
570
+ {
571
+ "epoch": 2.8141764136839327,
572
+ "grad_norm": 1.9866888523101807,
573
+ "learning_rate": 0.0002473965387148352,
574
+ "loss": 0.2137,
575
+ "step": 32000
576
+ },
577
+ {
578
+ "epoch": 2.8141764136839327,
579
+ "eval_accuracy": 0.9067771456838981,
580
+ "eval_combined_score": 0.8903090749339992,
581
+ "eval_f1": 0.8738410041841004,
582
+ "eval_loss": 0.2325998991727829,
583
+ "eval_runtime": 55.3837,
584
+ "eval_samples_per_second": 729.999,
585
+ "eval_steps_per_second": 1.426,
586
+ "step": 32000
587
+ },
588
+ {
589
+ "epoch": 2.9021194266115558,
590
+ "grad_norm": 0.944116473197937,
591
+ "learning_rate": 0.000244179091190458,
592
+ "loss": 0.207,
593
+ "step": 33000
594
+ },
595
+ {
596
+ "epoch": 2.9021194266115558,
597
+ "eval_accuracy": 0.9053425674004452,
598
+ "eval_combined_score": 0.888638221424012,
599
+ "eval_f1": 0.8719338754475789,
600
+ "eval_loss": 0.24054807424545288,
601
+ "eval_runtime": 56.3214,
602
+ "eval_samples_per_second": 717.844,
603
+ "eval_steps_per_second": 1.403,
604
+ "step": 33000
605
+ },
606
+ {
607
+ "epoch": 2.9900624395391784,
608
+ "grad_norm": 0.6180978417396545,
609
+ "learning_rate": 0.00024088846604519457,
610
+ "loss": 0.2087,
611
+ "step": 34000
612
+ },
613
+ {
614
+ "epoch": 2.9900624395391784,
615
+ "eval_accuracy": 0.9076923076923077,
616
+ "eval_combined_score": 0.8927819480735247,
617
+ "eval_f1": 0.8778715884547418,
618
+ "eval_loss": 0.22520828247070312,
619
+ "eval_runtime": 55.5236,
620
+ "eval_samples_per_second": 728.159,
621
+ "eval_steps_per_second": 1.423,
622
+ "step": 34000
623
+ },
624
+ {
625
+ "epoch": 3.0780054524668015,
626
+ "grad_norm": 1.2374540567398071,
627
+ "learning_rate": 0.00023752722011110102,
628
+ "loss": 0.1925,
629
+ "step": 35000
630
+ },
631
+ {
632
+ "epoch": 3.0780054524668015,
633
+ "eval_accuracy": 0.9074944348256245,
634
+ "eval_combined_score": 0.8931313341064914,
635
+ "eval_f1": 0.8787682333873582,
636
+ "eval_loss": 0.22987455129623413,
637
+ "eval_runtime": 87.0711,
638
+ "eval_samples_per_second": 464.333,
639
+ "eval_steps_per_second": 0.907,
640
+ "step": 35000
641
+ },
642
+ {
643
+ "epoch": 3.1659484653944245,
644
+ "grad_norm": 0.8898099660873413,
645
+ "learning_rate": 0.00023409796509293643,
646
+ "loss": 0.1967,
647
+ "step": 36000
648
+ },
649
+ {
650
+ "epoch": 3.1659484653944245,
651
+ "eval_accuracy": 0.9072223596339352,
652
+ "eval_combined_score": 0.8932183284242772,
653
+ "eval_f1": 0.8792142972146192,
654
+ "eval_loss": 0.22618883848190308,
655
+ "eval_runtime": 93.9738,
656
+ "eval_samples_per_second": 430.226,
657
+ "eval_steps_per_second": 0.841,
658
+ "step": 36000
659
+ },
660
+ {
661
+ "epoch": 3.253891478322047,
662
+ "grad_norm": 1.2053686380386353,
663
+ "learning_rate": 0.0002306033655388555,
664
+ "loss": 0.1941,
665
+ "step": 37000
666
+ },
667
+ {
668
+ "epoch": 3.253891478322047,
669
+ "eval_accuracy": 0.9096463022508039,
670
+ "eval_combined_score": 0.8952601313960689,
671
+ "eval_f1": 0.8808739605413338,
672
+ "eval_loss": 0.23468315601348877,
673
+ "eval_runtime": 86.9077,
674
+ "eval_samples_per_second": 465.206,
675
+ "eval_steps_per_second": 0.909,
676
+ "step": 37000
677
+ },
678
+ {
679
+ "epoch": 3.3418344912496702,
680
+ "grad_norm": 1.3579250574111938,
681
+ "learning_rate": 0.0002270461367700413,
682
+ "loss": 0.2011,
683
+ "step": 38000
684
+ },
685
+ {
686
+ "epoch": 3.3418344912496702,
687
+ "eval_accuracy": 0.9070739549839228,
688
+ "eval_combined_score": 0.8937216614594825,
689
+ "eval_f1": 0.8803693679350422,
690
+ "eval_loss": 0.22943687438964844,
691
+ "eval_runtime": 56.3453,
692
+ "eval_samples_per_second": 717.54,
693
+ "eval_steps_per_second": 1.402,
694
+ "step": 38000
695
+ },
696
+ {
697
+ "epoch": 3.4297775041772933,
698
+ "grad_norm": 0.38186776638031006,
699
+ "learning_rate": 0.00022342904277088745,
700
+ "loss": 0.1984,
701
+ "step": 39000
702
+ },
703
+ {
704
+ "epoch": 3.4297775041772933,
705
+ "eval_accuracy": 0.9076675735839723,
706
+ "eval_combined_score": 0.8940503077234105,
707
+ "eval_f1": 0.8804330418628488,
708
+ "eval_loss": 0.22878248989582062,
709
+ "eval_runtime": 56.1136,
710
+ "eval_samples_per_second": 720.503,
711
+ "eval_steps_per_second": 1.408,
712
+ "step": 39000
713
+ },
714
+ {
715
+ "epoch": 3.517720517104916,
716
+ "grad_norm": 1.1301718950271606,
717
+ "learning_rate": 0.00021975489404136827,
718
+ "loss": 0.1948,
719
+ "step": 40000
720
+ },
721
+ {
722
+ "epoch": 3.517720517104916,
723
+ "eval_accuracy": 0.9086569379173881,
724
+ "eval_combined_score": 0.8933979294437625,
725
+ "eval_f1": 0.8781389209701369,
726
+ "eval_loss": 0.2234371155500412,
727
+ "eval_runtime": 55.9217,
728
+ "eval_samples_per_second": 722.975,
729
+ "eval_steps_per_second": 1.413,
730
+ "step": 40000
731
+ },
732
+ {
733
+ "epoch": 3.605663530032539,
734
+ "grad_norm": 1.0810197591781616,
735
+ "learning_rate": 0.00021602654541326668,
736
+ "loss": 0.1929,
737
+ "step": 41000
738
+ },
739
+ {
740
+ "epoch": 3.605663530032539,
741
+ "eval_accuracy": 0.9084095968340341,
742
+ "eval_combined_score": 0.8952942381064759,
743
+ "eval_f1": 0.8821788793789176,
744
+ "eval_loss": 0.22578567266464233,
745
+ "eval_runtime": 86.5162,
746
+ "eval_samples_per_second": 467.311,
747
+ "eval_steps_per_second": 0.913,
748
+ "step": 41000
749
+ },
750
+ {
751
+ "epoch": 3.6936065429601617,
752
+ "grad_norm": 1.3687130212783813,
753
+ "learning_rate": 0.00021224689383195542,
754
+ "loss": 0.1946,
755
+ "step": 42000
756
+ },
757
+ {
758
+ "epoch": 3.6936065429601617,
759
+ "eval_accuracy": 0.9075933712589661,
760
+ "eval_combined_score": 0.8921709268540805,
761
+ "eval_f1": 0.876748482449195,
762
+ "eval_loss": 0.22414971888065338,
763
+ "eval_runtime": 87.1571,
764
+ "eval_samples_per_second": 463.875,
765
+ "eval_steps_per_second": 0.906,
766
+ "step": 42000
767
+ },
768
+ {
769
+ "epoch": 3.7815495558877847,
770
+ "grad_norm": 0.675777018070221,
771
+ "learning_rate": 0.00020841887610545634,
772
+ "loss": 0.1971,
773
+ "step": 43000
774
+ },
775
+ {
776
+ "epoch": 3.7815495558877847,
777
+ "eval_accuracy": 0.9100420479841702,
778
+ "eval_combined_score": 0.8939421941798391,
779
+ "eval_f1": 0.877842340375508,
780
+ "eval_loss": 0.22170396149158478,
781
+ "eval_runtime": 87.0916,
782
+ "eval_samples_per_second": 464.224,
783
+ "eval_steps_per_second": 0.907,
784
+ "step": 43000
785
+ },
786
+ {
787
+ "epoch": 3.8694925688154074,
788
+ "grad_norm": 1.8298357725143433,
789
+ "learning_rate": 0.00020454546662252592,
790
+ "loss": 0.1986,
791
+ "step": 44000
792
+ },
793
+ {
794
+ "epoch": 3.8694925688154074,
795
+ "eval_accuracy": 0.9101657185258472,
796
+ "eval_combined_score": 0.895972124284732,
797
+ "eval_f1": 0.881778530043617,
798
+ "eval_loss": 0.2266804426908493,
799
+ "eval_runtime": 87.1467,
800
+ "eval_samples_per_second": 463.931,
801
+ "eval_steps_per_second": 0.907,
802
+ "step": 44000
803
+ },
804
+ {
805
+ "epoch": 3.9574355817430305,
806
+ "grad_norm": 0.7815334796905518,
807
+ "learning_rate": 0.00020062967504154062,
808
+ "loss": 0.1985,
809
+ "step": 45000
810
+ },
811
+ {
812
+ "epoch": 3.9574355817430305,
813
+ "eval_accuracy": 0.9075439030422954,
814
+ "eval_combined_score": 0.8947048887564678,
815
+ "eval_f1": 0.8818658744706402,
816
+ "eval_loss": 0.22852376103401184,
817
+ "eval_runtime": 56.4004,
818
+ "eval_samples_per_second": 716.839,
819
+ "eval_steps_per_second": 1.401,
820
+ "step": 45000
821
+ },
822
+ {
823
+ "epoch": 4.0453785946706535,
824
+ "grad_norm": 0.8892888426780701,
825
+ "learning_rate": 0.00019667454395197706,
826
+ "loss": 0.1903,
827
+ "step": 46000
828
+ },
829
+ {
830
+ "epoch": 4.0453785946706535,
831
+ "eval_accuracy": 0.9117981696759832,
832
+ "eval_combined_score": 0.8963943458178341,
833
+ "eval_f1": 0.880990521959685,
834
+ "eval_loss": 0.23053193092346191,
835
+ "eval_runtime": 55.3909,
836
+ "eval_samples_per_second": 729.903,
837
+ "eval_steps_per_second": 1.426,
838
+ "step": 46000
839
+ },
840
+ {
841
+ "epoch": 4.133321607598276,
842
+ "grad_norm": 1.172505497932434,
843
+ "learning_rate": 0.00019268314651030522,
844
+ "loss": 0.1845,
845
+ "step": 47000
846
+ },
847
+ {
848
+ "epoch": 4.133321607598276,
849
+ "eval_accuracy": 0.9109077417759089,
850
+ "eval_combined_score": 0.8964897535490336,
851
+ "eval_f1": 0.8820717653221583,
852
+ "eval_loss": 0.2176717221736908,
853
+ "eval_runtime": 60.2724,
854
+ "eval_samples_per_second": 670.788,
855
+ "eval_steps_per_second": 1.311,
856
+ "step": 47000
857
+ },
858
+ {
859
+ "epoch": 4.221264620525899,
860
+ "grad_norm": 1.5994923114776611,
861
+ "learning_rate": 0.00018865858405213055,
862
+ "loss": 0.1819,
863
+ "step": 48000
864
+ },
865
+ {
866
+ "epoch": 4.221264620525899,
867
+ "eval_accuracy": 0.9123670541676973,
868
+ "eval_combined_score": 0.8978356465792539,
869
+ "eval_f1": 0.8833042389908106,
870
+ "eval_loss": 0.2258807122707367,
871
+ "eval_runtime": 86.9447,
872
+ "eval_samples_per_second": 465.008,
873
+ "eval_steps_per_second": 0.909,
874
+ "step": 48000
875
+ },
876
+ {
877
+ "epoch": 4.309207633453522,
878
+ "grad_norm": 1.8464044332504272,
879
+ "learning_rate": 0.0001846039836824406,
880
+ "loss": 0.1857,
881
+ "step": 49000
882
+ },
883
+ {
884
+ "epoch": 4.309207633453522,
885
+ "eval_accuracy": 0.9128122681177343,
886
+ "eval_combined_score": 0.8989938404988163,
887
+ "eval_f1": 0.8851754128798983,
888
+ "eval_loss": 0.224745512008667,
889
+ "eval_runtime": 86.7886,
890
+ "eval_samples_per_second": 465.845,
891
+ "eval_steps_per_second": 0.91,
892
+ "step": 49000
893
+ },
894
+ {
895
+ "epoch": 4.397150646381145,
896
+ "grad_norm": 1.784670114517212,
897
+ "learning_rate": 0.00018052249584582937,
898
+ "loss": 0.1816,
899
+ "step": 50000
900
+ },
901
+ {
902
+ "epoch": 4.397150646381145,
903
+ "eval_accuracy": 0.9119713084343309,
904
+ "eval_combined_score": 0.8984543749180804,
905
+ "eval_f1": 0.8849374414018298,
906
+ "eval_loss": 0.22437089681625366,
907
+ "eval_runtime": 87.3119,
908
+ "eval_samples_per_second": 463.052,
909
+ "eval_steps_per_second": 0.905,
910
+ "step": 50000
911
+ },
912
+ {
913
+ "epoch": 4.485093659308768,
914
+ "grad_norm": 0.8956874012947083,
915
+ "learning_rate": 0.0001764172918785858,
916
+ "loss": 0.1833,
917
+ "step": 51000
918
+ },
919
+ {
920
+ "epoch": 4.485093659308768,
921
+ "eval_accuracy": 0.9124412564927035,
922
+ "eval_combined_score": 0.8985396010678338,
923
+ "eval_f1": 0.8846379456429642,
924
+ "eval_loss": 0.21341578662395477,
925
+ "eval_runtime": 76.9889,
926
+ "eval_samples_per_second": 525.141,
927
+ "eval_steps_per_second": 1.026,
928
+ "step": 51000
929
+ },
930
+ {
931
+ "epoch": 4.573036672236391,
932
+ "grad_norm": 1.915276050567627,
933
+ "learning_rate": 0.0001722915615445501,
934
+ "loss": 0.1865,
935
+ "step": 52000
936
+ },
937
+ {
938
+ "epoch": 4.573036672236391,
939
+ "eval_accuracy": 0.9127627999010636,
940
+ "eval_combined_score": 0.8988202810420225,
941
+ "eval_f1": 0.8848777621829813,
942
+ "eval_loss": 0.2196654975414276,
943
+ "eval_runtime": 55.9179,
944
+ "eval_samples_per_second": 723.025,
945
+ "eval_steps_per_second": 1.413,
946
+ "step": 52000
947
+ },
948
+ {
949
+ "epoch": 4.660979685164014,
950
+ "grad_norm": 0.7660688757896423,
951
+ "learning_rate": 0.0001681485105566511,
952
+ "loss": 0.1891,
953
+ "step": 53000
954
+ },
955
+ {
956
+ "epoch": 4.660979685164014,
957
+ "eval_accuracy": 0.9142715805095226,
958
+ "eval_combined_score": 0.9003310177360777,
959
+ "eval_f1": 0.8863904549626328,
960
+ "eval_loss": 0.2194739133119583,
961
+ "eval_runtime": 55.3544,
962
+ "eval_samples_per_second": 730.385,
963
+ "eval_steps_per_second": 1.427,
964
+ "step": 53000
965
+ },
966
+ {
967
+ "epoch": 4.748922698091636,
968
+ "grad_norm": 0.5751957893371582,
969
+ "learning_rate": 0.00016399135808605172,
970
+ "loss": 0.185,
971
+ "step": 54000
972
+ },
973
+ {
974
+ "epoch": 4.748922698091636,
975
+ "eval_accuracy": 0.9136779619094731,
976
+ "eval_combined_score": 0.8991720939752257,
977
+ "eval_f1": 0.8846662260409782,
978
+ "eval_loss": 0.2241617739200592,
979
+ "eval_runtime": 55.9679,
980
+ "eval_samples_per_second": 722.379,
981
+ "eval_steps_per_second": 1.412,
982
+ "step": 54000
983
+ },
984
+ {
985
+ "epoch": 4.83686571101926,
986
+ "grad_norm": 0.588286817073822,
987
+ "learning_rate": 0.00015982333426083677,
988
+ "loss": 0.1868,
989
+ "step": 55000
990
+ },
991
+ {
992
+ "epoch": 4.83686571101926,
993
+ "eval_accuracy": 0.9145931239178827,
994
+ "eval_combined_score": 0.9007955965500334,
995
+ "eval_f1": 0.8869980691821842,
996
+ "eval_loss": 0.21942467987537384,
997
+ "eval_runtime": 87.1584,
998
+ "eval_samples_per_second": 463.868,
999
+ "eval_steps_per_second": 0.906,
1000
+ "step": 55000
1001
+ },
1002
+ {
1003
+ "epoch": 4.9248087239468825,
1004
+ "grad_norm": 1.0260065793991089,
1005
+ "learning_rate": 0.00015564767765618756,
1006
+ "loss": 0.1857,
1007
+ "step": 56000
1008
+ },
1009
+ {
1010
+ "epoch": 4.9248087239468825,
1011
+ "eval_accuracy": 0.912540192926045,
1012
+ "eval_combined_score": 0.8981389726384681,
1013
+ "eval_f1": 0.883737752350891,
1014
+ "eval_loss": 0.21640419960021973,
1015
+ "eval_runtime": 87.3018,
1016
+ "eval_samples_per_second": 463.106,
1017
+ "eval_steps_per_second": 0.905,
1018
+ "step": 56000
1019
+ },
1020
+ {
1021
+ "epoch": 5.012751736874505,
1022
+ "grad_norm": 0.8171947598457336,
1023
+ "learning_rate": 0.0001514676327779928,
1024
+ "loss": 0.18,
1025
+ "step": 57000
1026
+ },
1027
+ {
1028
+ "epoch": 5.012751736874505,
1029
+ "eval_accuracy": 0.9140737076428395,
1030
+ "eval_combined_score": 0.899864844078809,
1031
+ "eval_f1": 0.8856559805147785,
1032
+ "eval_loss": 0.21834981441497803,
1033
+ "eval_runtime": 86.7804,
1034
+ "eval_samples_per_second": 465.889,
1035
+ "eval_steps_per_second": 0.91,
1036
+ "step": 57000
1037
+ },
1038
+ {
1039
+ "epoch": 5.100694749802129,
1040
+ "grad_norm": 1.5669310092926025,
1041
+ "learning_rate": 0.00014728644754185164,
1042
+ "loss": 0.1749,
1043
+ "step": 58000
1044
+ },
1045
+ {
1046
+ "epoch": 5.100694749802129,
1047
+ "eval_accuracy": 0.9147909967845659,
1048
+ "eval_combined_score": 0.9004589962272084,
1049
+ "eval_f1": 0.8861269956698509,
1050
+ "eval_loss": 0.21679632365703583,
1051
+ "eval_runtime": 60.0987,
1052
+ "eval_samples_per_second": 672.726,
1053
+ "eval_steps_per_second": 1.315,
1054
+ "step": 58000
1055
+ },
1056
+ {
1057
+ "epoch": 5.188637762729751,
1058
+ "grad_norm": 1.1074833869934082,
1059
+ "learning_rate": 0.00014310737074942683,
1060
+ "loss": 0.1744,
1061
+ "step": 59000
1062
+ },
1063
+ {
1064
+ "epoch": 5.188637762729751,
1065
+ "eval_accuracy": 0.9115013603759584,
1066
+ "eval_combined_score": 0.8979029145538849,
1067
+ "eval_f1": 0.8843044687318115,
1068
+ "eval_loss": 0.2294030636548996,
1069
+ "eval_runtime": 56.2355,
1070
+ "eval_samples_per_second": 718.94,
1071
+ "eval_steps_per_second": 1.405,
1072
+ "step": 59000
1073
+ },
1074
+ {
1075
+ "epoch": 5.276580775657374,
1076
+ "grad_norm": 1.083426594734192,
1077
+ "learning_rate": 0.00013893364956411012,
1078
+ "loss": 0.1698,
1079
+ "step": 60000
1080
+ },
1081
+ {
1082
+ "epoch": 5.276580775657374,
1083
+ "eval_accuracy": 0.9093494929507792,
1084
+ "eval_combined_score": 0.8959915556469398,
1085
+ "eval_f1": 0.8826336183431005,
1086
+ "eval_loss": 0.2406802922487259,
1087
+ "eval_runtime": 55.9488,
1088
+ "eval_samples_per_second": 722.625,
1089
+ "eval_steps_per_second": 1.412,
1090
+ "step": 60000
1091
+ },
1092
+ {
1093
+ "epoch": 5.364523788584997,
1094
+ "grad_norm": 1.933881163597107,
1095
+ "learning_rate": 0.0001347685269879597,
1096
+ "loss": 0.1778,
1097
+ "step": 61000
1098
+ },
1099
+ {
1100
+ "epoch": 5.364523788584997,
1101
+ "eval_accuracy": 0.9120702448676725,
1102
+ "eval_combined_score": 0.8986871630855543,
1103
+ "eval_f1": 0.8853040813034361,
1104
+ "eval_loss": 0.2266305387020111,
1105
+ "eval_runtime": 56.3575,
1106
+ "eval_samples_per_second": 717.384,
1107
+ "eval_steps_per_second": 1.402,
1108
+ "step": 61000
1109
+ },
1110
+ {
1111
+ "epoch": 5.45246680151262,
1112
+ "grad_norm": 0.47942474484443665,
1113
+ "learning_rate": 0.00013061523934187208,
1114
+ "loss": 0.177,
1115
+ "step": 62000
1116
+ },
1117
+ {
1118
+ "epoch": 5.45246680151262,
1119
+ "eval_accuracy": 0.9131832797427653,
1120
+ "eval_combined_score": 0.8998285892212758,
1121
+ "eval_f1": 0.8864738986997865,
1122
+ "eval_loss": 0.22912418842315674,
1123
+ "eval_runtime": 87.1326,
1124
+ "eval_samples_per_second": 464.005,
1125
+ "eval_steps_per_second": 0.907,
1126
+ "step": 62000
1127
+ },
1128
+ {
1129
+ "epoch": 5.540409814440243,
1130
+ "grad_norm": 0.6661811470985413,
1131
+ "learning_rate": 0.0001264770137509442,
1132
+ "loss": 0.1729,
1133
+ "step": 63000
1134
+ },
1135
+ {
1136
+ "epoch": 5.540409814440243,
1137
+ "eval_accuracy": 0.9130101409844175,
1138
+ "eval_combined_score": 0.8983055007420846,
1139
+ "eval_f1": 0.8836008604997517,
1140
+ "eval_loss": 0.22111645340919495,
1141
+ "eval_runtime": 87.0569,
1142
+ "eval_samples_per_second": 464.409,
1143
+ "eval_steps_per_second": 0.907,
1144
+ "step": 63000
1145
+ },
1146
+ {
1147
+ "epoch": 5.628352827367865,
1148
+ "grad_norm": 1.7584110498428345,
1149
+ "learning_rate": 0.00012235706563698158,
1150
+ "loss": 0.1733,
1151
+ "step": 64000
1152
+ },
1153
+ {
1154
+ "epoch": 5.628352827367865,
1155
+ "eval_accuracy": 0.9143952510511996,
1156
+ "eval_combined_score": 0.901018032628804,
1157
+ "eval_f1": 0.8876408142064085,
1158
+ "eval_loss": 0.22673414647579193,
1159
+ "eval_runtime": 86.8749,
1160
+ "eval_samples_per_second": 465.382,
1161
+ "eval_steps_per_second": 0.909,
1162
+ "step": 64000
1163
+ },
1164
+ {
1165
+ "epoch": 5.716295840295489,
1166
+ "grad_norm": 1.4516685009002686,
1167
+ "learning_rate": 0.00011825859622009953,
1168
+ "loss": 0.1754,
1169
+ "step": 65000
1170
+ },
1171
+ {
1172
+ "epoch": 5.716295840295489,
1173
+ "eval_accuracy": 0.9142715805095226,
1174
+ "eval_combined_score": 0.9001629452287898,
1175
+ "eval_f1": 0.8860543099480571,
1176
+ "eval_loss": 0.2289634495973587,
1177
+ "eval_runtime": 55.5446,
1178
+ "eval_samples_per_second": 727.883,
1179
+ "eval_steps_per_second": 1.422,
1180
+ "step": 65000
1181
+ },
1182
+ {
1183
+ "epoch": 5.8042388532231115,
1184
+ "grad_norm": 1.0227175951004028,
1185
+ "learning_rate": 0.00011418479003135898,
1186
+ "loss": 0.1714,
1187
+ "step": 66000
1188
+ },
1189
+ {
1190
+ "epoch": 5.8042388532231115,
1191
+ "eval_accuracy": 0.9136037595844669,
1192
+ "eval_combined_score": 0.900322883575825,
1193
+ "eval_f1": 0.887042007567183,
1194
+ "eval_loss": 0.22240959107875824,
1195
+ "eval_runtime": 55.7791,
1196
+ "eval_samples_per_second": 724.824,
1197
+ "eval_steps_per_second": 1.416,
1198
+ "step": 66000
1199
+ },
1200
+ {
1201
+ "epoch": 5.892181866150734,
1202
+ "grad_norm": 0.7578181028366089,
1203
+ "learning_rate": 0.00011013881243837068,
1204
+ "loss": 0.1732,
1205
+ "step": 67000
1206
+ },
1207
+ {
1208
+ "epoch": 5.892181866150734,
1209
+ "eval_accuracy": 0.9146425921345536,
1210
+ "eval_combined_score": 0.9009637169445672,
1211
+ "eval_f1": 0.8872848417545808,
1212
+ "eval_loss": 0.21318596601486206,
1213
+ "eval_runtime": 55.5313,
1214
+ "eval_samples_per_second": 728.057,
1215
+ "eval_steps_per_second": 1.423,
1216
+ "step": 67000
1217
+ },
1218
+ {
1219
+ "epoch": 5.980124879078357,
1220
+ "grad_norm": 1.2776685953140259,
1221
+ "learning_rate": 0.00010612380718578806,
1222
+ "loss": 0.1741,
1223
+ "step": 68000
1224
+ },
1225
+ {
1226
+ "epoch": 5.980124879078357,
1227
+ "eval_accuracy": 0.9154835518179569,
1228
+ "eval_combined_score": 0.9013426077843292,
1229
+ "eval_f1": 0.8872016637507015,
1230
+ "eval_loss": 0.22081904113292694,
1231
+ "eval_runtime": 56.107,
1232
+ "eval_samples_per_second": 720.587,
1233
+ "eval_steps_per_second": 1.408,
1234
+ "step": 68000
1235
+ },
1236
+ {
1237
+ "epoch": 6.06806789200598,
1238
+ "grad_norm": 0.6756773591041565,
1239
+ "learning_rate": 0.00010214289395260275,
1240
+ "loss": 0.1655,
1241
+ "step": 69000
1242
+ },
1243
+ {
1244
+ "epoch": 6.06806789200598,
1245
+ "eval_accuracy": 0.9145683898095474,
1246
+ "eval_combined_score": 0.9011872677300838,
1247
+ "eval_f1": 0.8878061456506204,
1248
+ "eval_loss": 0.2249419391155243,
1249
+ "eval_runtime": 94.0711,
1250
+ "eval_samples_per_second": 429.781,
1251
+ "eval_steps_per_second": 0.84,
1252
+ "step": 69000
1253
+ },
1254
+ {
1255
+ "epoch": 6.156010904933603,
1256
+ "grad_norm": 1.0925005674362183,
1257
+ "learning_rate": 9.819916592813812e-05,
1258
+ "loss": 0.1622,
1259
+ "step": 70000
1260
+ },
1261
+ {
1262
+ "epoch": 6.156010904933603,
1263
+ "eval_accuracy": 0.9135790254761316,
1264
+ "eval_combined_score": 0.900321738479872,
1265
+ "eval_f1": 0.8870644514836123,
1266
+ "eval_loss": 0.23048754036426544,
1267
+ "eval_runtime": 87.1759,
1268
+ "eval_samples_per_second": 463.775,
1269
+ "eval_steps_per_second": 0.906,
1270
+ "step": 70000
1271
+ },
1272
+ {
1273
+ "epoch": 6.2439539178612256,
1274
+ "grad_norm": 1.463751196861267,
1275
+ "learning_rate": 9.429568740862609e-05,
1276
+ "loss": 0.1623,
1277
+ "step": 71000
1278
+ },
1279
+ {
1280
+ "epoch": 6.2439539178612256,
1281
+ "eval_accuracy": 0.9141231758595103,
1282
+ "eval_combined_score": 0.9012201056858025,
1283
+ "eval_f1": 0.8883170355120947,
1284
+ "eval_loss": 0.23018094897270203,
1285
+ "eval_runtime": 87.3145,
1286
+ "eval_samples_per_second": 463.039,
1287
+ "eval_steps_per_second": 0.905,
1288
+ "step": 71000
1289
+ },
1290
+ {
1291
+ "epoch": 6.331896930788849,
1292
+ "grad_norm": 1.191116452217102,
1293
+ "learning_rate": 9.043549141623341e-05,
1294
+ "loss": 0.1559,
1295
+ "step": 72000
1296
+ },
1297
+ {
1298
+ "epoch": 6.331896930788849,
1299
+ "eval_accuracy": 0.9152114766262677,
1300
+ "eval_combined_score": 0.9018985100240022,
1301
+ "eval_f1": 0.8885855434217369,
1302
+ "eval_loss": 0.2375902384519577,
1303
+ "eval_runtime": 56.6591,
1304
+ "eval_samples_per_second": 713.566,
1305
+ "eval_steps_per_second": 1.394,
1306
+ "step": 72000
1307
+ },
1308
+ {
1309
+ "epoch": 6.419839943716472,
1310
+ "grad_norm": 1.6958703994750977,
1311
+ "learning_rate": 8.662157734238882e-05,
1312
+ "loss": 0.1676,
1313
+ "step": 73000
1314
+ },
1315
+ {
1316
+ "epoch": 6.419839943716472,
1317
+ "eval_accuracy": 0.9138758347761563,
1318
+ "eval_combined_score": 0.9007802501408038,
1319
+ "eval_f1": 0.8876846655054512,
1320
+ "eval_loss": 0.2252551168203354,
1321
+ "eval_runtime": 55.6805,
1322
+ "eval_samples_per_second": 726.106,
1323
+ "eval_steps_per_second": 1.419,
1324
+ "step": 73000
1325
+ },
1326
+ {
1327
+ "epoch": 6.507782956644094,
1328
+ "grad_norm": 0.9404481649398804,
1329
+ "learning_rate": 8.285690861724085e-05,
1330
+ "loss": 0.1666,
1331
+ "step": 74000
1332
+ },
1333
+ {
1334
+ "epoch": 6.507782956644094,
1335
+ "eval_accuracy": 0.9117487014593124,
1336
+ "eval_combined_score": 0.8988775456178351,
1337
+ "eval_f1": 0.8860063897763578,
1338
+ "eval_loss": 0.22508053481578827,
1339
+ "eval_runtime": 56.0644,
1340
+ "eval_samples_per_second": 721.135,
1341
+ "eval_steps_per_second": 1.409,
1342
+ "step": 74000
1343
+ },
1344
+ {
1345
+ "epoch": 6.595725969571718,
1346
+ "grad_norm": 1.591848373413086,
1347
+ "learning_rate": 7.914441040705777e-05,
1348
+ "loss": 0.1656,
1349
+ "step": 75000
1350
+ },
1351
+ {
1352
+ "epoch": 6.595725969571718,
1353
+ "eval_accuracy": 0.915607222359634,
1354
+ "eval_combined_score": 0.901803085969733,
1355
+ "eval_f1": 0.8879989495798319,
1356
+ "eval_loss": 0.2246847152709961,
1357
+ "eval_runtime": 65.9155,
1358
+ "eval_samples_per_second": 613.361,
1359
+ "eval_steps_per_second": 1.199,
1360
+ "step": 75000
1361
+ },
1362
+ {
1363
+ "epoch": 6.6836689824993405,
1364
+ "grad_norm": 1.0205929279327393,
1365
+ "learning_rate": 7.5486967341359e-05,
1366
+ "loss": 0.1631,
1367
+ "step": 76000
1368
+ },
1369
+ {
1370
+ "epoch": 6.6836689824993405,
1371
+ "eval_accuracy": 0.9142715805095226,
1372
+ "eval_combined_score": 0.9005757380354402,
1373
+ "eval_f1": 0.8868798955613577,
1374
+ "eval_loss": 0.21938744187355042,
1375
+ "eval_runtime": 87.186,
1376
+ "eval_samples_per_second": 463.721,
1377
+ "eval_steps_per_second": 0.906,
1378
+ "step": 76000
1379
+ },
1380
+ {
1381
+ "epoch": 6.771611995426963,
1382
+ "grad_norm": 1.5870155096054077,
1383
+ "learning_rate": 7.188742127154373e-05,
1384
+ "loss": 0.1693,
1385
+ "step": 77000
1386
+ },
1387
+ {
1388
+ "epoch": 6.771611995426963,
1389
+ "eval_accuracy": 0.9148899332179075,
1390
+ "eval_combined_score": 0.9019252660700529,
1391
+ "eval_f1": 0.8889605989221983,
1392
+ "eval_loss": 0.22145743668079376,
1393
+ "eval_runtime": 87.1617,
1394
+ "eval_samples_per_second": 463.851,
1395
+ "eval_steps_per_second": 0.906,
1396
+ "step": 77000
1397
+ },
1398
+ {
1399
+ "epoch": 6.859555008354587,
1400
+ "grad_norm": 1.2106655836105347,
1401
+ "learning_rate": 6.834856906275834e-05,
1402
+ "loss": 0.1686,
1403
+ "step": 78000
1404
+ },
1405
+ {
1406
+ "epoch": 6.859555008354587,
1407
+ "eval_accuracy": 0.9131832797427653,
1408
+ "eval_combined_score": 0.9006927992547435,
1409
+ "eval_f1": 0.8882023187667218,
1410
+ "eval_loss": 0.22475597262382507,
1411
+ "eval_runtime": 86.9372,
1412
+ "eval_samples_per_second": 465.048,
1413
+ "eval_steps_per_second": 0.909,
1414
+ "step": 78000
1415
+ },
1416
+ {
1417
+ "epoch": 6.947498021282209,
1418
+ "grad_norm": 1.801979422569275,
1419
+ "learning_rate": 6.487316042071804e-05,
1420
+ "loss": 0.1656,
1421
+ "step": 79000
1422
+ },
1423
+ {
1424
+ "epoch": 6.947498021282209,
1425
+ "eval_accuracy": 0.9140489735345041,
1426
+ "eval_combined_score": 0.8988732768874033,
1427
+ "eval_f1": 0.8836975802403025,
1428
+ "eval_loss": 0.2249360829591751,
1429
+ "eval_runtime": 57.3526,
1430
+ "eval_samples_per_second": 704.937,
1431
+ "eval_steps_per_second": 1.377,
1432
+ "step": 79000
1433
+ },
1434
+ {
1435
+ "epoch": 7.035441034209832,
1436
+ "grad_norm": 0.7926831245422363,
1437
+ "learning_rate": 6.146389575517211e-05,
1438
+ "loss": 0.1592,
1439
+ "step": 80000
1440
+ },
1441
+ {
1442
+ "epoch": 7.035441034209832,
1443
+ "eval_accuracy": 0.9147662626762305,
1444
+ "eval_combined_score": 0.9012849350758245,
1445
+ "eval_f1": 0.8878036074754184,
1446
+ "eval_loss": 0.22441034018993378,
1447
+ "eval_runtime": 60.1965,
1448
+ "eval_samples_per_second": 671.634,
1449
+ "eval_steps_per_second": 1.312,
1450
+ "step": 80000
1451
+ },
1452
+ {
1453
+ "epoch": 7.1233840471374545,
1454
+ "grad_norm": 1.1260881423950195,
1455
+ "learning_rate": 5.81234240816722e-05,
1456
+ "loss": 0.1594,
1457
+ "step": 81000
1458
+ },
1459
+ {
1460
+ "epoch": 7.1233840471374545,
1461
+ "eval_accuracy": 0.9141231758595103,
1462
+ "eval_combined_score": 0.9006356632076593,
1463
+ "eval_f1": 0.8871481505558083,
1464
+ "eval_loss": 0.2256377786397934,
1465
+ "eval_runtime": 55.8904,
1466
+ "eval_samples_per_second": 723.379,
1467
+ "eval_steps_per_second": 1.413,
1468
+ "step": 81000
1469
+ },
1470
+ {
1471
+ "epoch": 7.211327060065078,
1472
+ "grad_norm": 1.023222804069519,
1473
+ "learning_rate": 5.485434096327387e-05,
1474
+ "loss": 0.1562,
1475
+ "step": 82000
1476
+ },
1477
+ {
1478
+ "epoch": 7.211327060065078,
1479
+ "eval_accuracy": 0.9144694533762058,
1480
+ "eval_combined_score": 0.901054586318984,
1481
+ "eval_f1": 0.8876397192617624,
1482
+ "eval_loss": 0.2249106466770172,
1483
+ "eval_runtime": 87.077,
1484
+ "eval_samples_per_second": 464.302,
1485
+ "eval_steps_per_second": 0.907,
1486
+ "step": 82000
1487
+ },
1488
+ {
1489
+ "epoch": 7.299270072992701,
1490
+ "grad_norm": 0.9676831364631653,
1491
+ "learning_rate": 5.165918649377139e-05,
1492
+ "loss": 0.154,
1493
+ "step": 83000
1494
+ },
1495
+ {
1496
+ "epoch": 7.299270072992701,
1497
+ "eval_accuracy": 0.9134553549344546,
1498
+ "eval_combined_score": 0.9002065001984649,
1499
+ "eval_f1": 0.8869576454624754,
1500
+ "eval_loss": 0.22661758959293365,
1501
+ "eval_runtime": 87.1572,
1502
+ "eval_samples_per_second": 463.874,
1503
+ "eval_steps_per_second": 0.906,
1504
+ "step": 83000
1505
+ },
1506
+ {
1507
+ "epoch": 7.387213085920323,
1508
+ "grad_norm": 1.3920458555221558,
1509
+ "learning_rate": 4.854044332403218e-05,
1510
+ "loss": 0.1525,
1511
+ "step": 84000
1512
+ },
1513
+ {
1514
+ "epoch": 7.387213085920323,
1515
+ "eval_accuracy": 0.914543655701212,
1516
+ "eval_combined_score": 0.9014253208839361,
1517
+ "eval_f1": 0.8883069860666603,
1518
+ "eval_loss": 0.22665317356586456,
1519
+ "eval_runtime": 86.9466,
1520
+ "eval_samples_per_second": 464.998,
1521
+ "eval_steps_per_second": 0.909,
1522
+ "step": 84000
1523
+ },
1524
+ {
1525
+ "epoch": 7.475156098847947,
1526
+ "grad_norm": 1.7799046039581299,
1527
+ "learning_rate": 4.550053473296499e-05,
1528
+ "loss": 0.1577,
1529
+ "step": 85000
1530
+ },
1531
+ {
1532
+ "epoch": 7.475156098847947,
1533
+ "eval_accuracy": 0.9155577541429631,
1534
+ "eval_combined_score": 0.9015757952856263,
1535
+ "eval_f1": 0.8875938364282893,
1536
+ "eval_loss": 0.2233825922012329,
1537
+ "eval_runtime": 86.6966,
1538
+ "eval_samples_per_second": 466.339,
1539
+ "eval_steps_per_second": 0.911,
1540
+ "step": 85000
1541
+ },
1542
+ {
1543
+ "epoch": 7.5630991117755695,
1544
+ "grad_norm": 0.5556538701057434,
1545
+ "learning_rate": 4.254182274461983e-05,
1546
+ "loss": 0.1583,
1547
+ "step": 86000
1548
+ },
1549
+ {
1550
+ "epoch": 7.5630991117755695,
1551
+ "eval_accuracy": 0.9146920603512243,
1552
+ "eval_combined_score": 0.9015676558184624,
1553
+ "eval_f1": 0.8884432512857005,
1554
+ "eval_loss": 0.22626982629299164,
1555
+ "eval_runtime": 55.8275,
1556
+ "eval_samples_per_second": 724.195,
1557
+ "eval_steps_per_second": 1.415,
1558
+ "step": 86000
1559
+ },
1560
+ {
1561
+ "epoch": 7.651042124703192,
1562
+ "grad_norm": 0.5572978854179382,
1563
+ "learning_rate": 3.966660629288376e-05,
1564
+ "loss": 0.1603,
1565
+ "step": 87000
1566
+ },
1567
+ {
1568
+ "epoch": 7.651042124703192,
1569
+ "eval_accuracy": 0.9152856789512738,
1570
+ "eval_combined_score": 0.9019354779714568,
1571
+ "eval_f1": 0.8885852769916398,
1572
+ "eval_loss": 0.22297683358192444,
1573
+ "eval_runtime": 55.3128,
1574
+ "eval_samples_per_second": 730.933,
1575
+ "eval_steps_per_second": 1.428,
1576
+ "step": 87000
1577
+ },
1578
+ {
1579
+ "epoch": 7.738985137630815,
1580
+ "grad_norm": 2.4749791622161865,
1581
+ "learning_rate": 3.687711943519798e-05,
1582
+ "loss": 0.1567,
1583
+ "step": 88000
1584
+ },
1585
+ {
1586
+ "epoch": 7.738985137630815,
1587
+ "eval_accuracy": 0.914296314617858,
1588
+ "eval_combined_score": 0.9009437777468852,
1589
+ "eval_f1": 0.8875912408759125,
1590
+ "eval_loss": 0.22879067063331604,
1591
+ "eval_runtime": 55.9851,
1592
+ "eval_samples_per_second": 722.156,
1593
+ "eval_steps_per_second": 1.411,
1594
+ "step": 88000
1595
+ },
1596
+ {
1597
+ "epoch": 7.826928150558438,
1598
+ "grad_norm": 1.4144024848937988,
1599
+ "learning_rate": 3.4175529616683805e-05,
1600
+ "loss": 0.1587,
1601
+ "step": 89000
1602
+ },
1603
+ {
1604
+ "epoch": 7.826928150558438,
1605
+ "eval_accuracy": 0.9151867425179322,
1606
+ "eval_combined_score": 0.9016827239411771,
1607
+ "eval_f1": 0.888178705364422,
1608
+ "eval_loss": 0.224980890750885,
1609
+ "eval_runtime": 87.2835,
1610
+ "eval_samples_per_second": 463.203,
1611
+ "eval_steps_per_second": 0.905,
1612
+ "step": 89000
1613
+ },
1614
+ {
1615
+ "epoch": 7.914871163486061,
1616
+ "grad_norm": 0.7645988464355469,
1617
+ "learning_rate": 3.156393598602742e-05,
1618
+ "loss": 0.1591,
1619
+ "step": 90000
1620
+ },
1621
+ {
1622
+ "epoch": 7.914871163486061,
1623
+ "eval_accuracy": 0.9158050952263171,
1624
+ "eval_combined_score": 0.9023723681678079,
1625
+ "eval_f1": 0.8889396411092986,
1626
+ "eval_loss": 0.21758203208446503,
1627
+ "eval_runtime": 86.9097,
1628
+ "eval_samples_per_second": 465.196,
1629
+ "eval_steps_per_second": 0.909,
1630
+ "step": 90000
1631
+ },
1632
+ {
1633
+ "epoch": 8.002814176413684,
1634
+ "grad_norm": 0.39321404695510864,
1635
+ "learning_rate": 2.9044367764430513e-05,
1636
+ "loss": 0.155,
1637
+ "step": 91000
1638
+ },
1639
+ {
1640
+ "epoch": 8.002814176413684,
1641
+ "eval_accuracy": 0.9147167944595598,
1642
+ "eval_combined_score": 0.9008115591295438,
1643
+ "eval_f1": 0.8869063237995277,
1644
+ "eval_loss": 0.22483354806900024,
1645
+ "eval_runtime": 88.9285,
1646
+ "eval_samples_per_second": 454.635,
1647
+ "eval_steps_per_second": 0.888,
1648
+ "step": 91000
1649
+ },
1650
+ {
1651
+ "epoch": 8.090757189341307,
1652
+ "grad_norm": 1.1553528308868408,
1653
+ "learning_rate": 2.661878266889586e-05,
1654
+ "loss": 0.1496,
1655
+ "step": 92000
1656
+ },
1657
+ {
1658
+ "epoch": 8.090757189341307,
1659
+ "eval_accuracy": 0.91535988127628,
1660
+ "eval_combined_score": 0.9018415488882543,
1661
+ "eval_f1": 0.8883232165002285,
1662
+ "eval_loss": 0.22659851610660553,
1663
+ "eval_runtime": 56.6387,
1664
+ "eval_samples_per_second": 713.823,
1665
+ "eval_steps_per_second": 1.395,
1666
+ "step": 92000
1667
+ },
1668
+ {
1669
+ "epoch": 8.17870020226893,
1670
+ "grad_norm": 1.0939350128173828,
1671
+ "learning_rate": 2.428906539107102e-05,
1672
+ "loss": 0.1522,
1673
+ "step": 93000
1674
+ },
1675
+ {
1676
+ "epoch": 8.17870020226893,
1677
+ "eval_accuracy": 0.9154340836012862,
1678
+ "eval_combined_score": 0.9024917147129352,
1679
+ "eval_f1": 0.8895493458245841,
1680
+ "eval_loss": 0.2251831442117691,
1681
+ "eval_runtime": 56.5957,
1682
+ "eval_samples_per_second": 714.365,
1683
+ "eval_steps_per_second": 1.396,
1684
+ "step": 93000
1685
+ },
1686
+ {
1687
+ "epoch": 8.266643215196552,
1688
+ "grad_norm": 1.2754027843475342,
1689
+ "learning_rate": 2.2057026132833862e-05,
1690
+ "loss": 0.1532,
1691
+ "step": 94000
1692
+ },
1693
+ {
1694
+ "epoch": 8.266643215196552,
1695
+ "eval_accuracy": 0.9155082859262924,
1696
+ "eval_combined_score": 0.9019517383630417,
1697
+ "eval_f1": 0.8883951907997909,
1698
+ "eval_loss": 0.22450992465019226,
1699
+ "eval_runtime": 56.4448,
1700
+ "eval_samples_per_second": 716.274,
1701
+ "eval_steps_per_second": 1.4,
1702
+ "step": 94000
1703
+ },
1704
+ {
1705
+ "epoch": 8.354586228124175,
1706
+ "grad_norm": 1.2699517011642456,
1707
+ "learning_rate": 1.992439919975663e-05,
1708
+ "loss": 0.1505,
1709
+ "step": 95000
1710
+ },
1711
+ {
1712
+ "epoch": 8.354586228124175,
1713
+ "eval_accuracy": 0.9145683898095474,
1714
+ "eval_combined_score": 0.9012709619001031,
1715
+ "eval_f1": 0.887973533990659,
1716
+ "eval_loss": 0.22519494593143463,
1717
+ "eval_runtime": 52.4291,
1718
+ "eval_samples_per_second": 771.137,
1719
+ "eval_steps_per_second": 1.507,
1720
+ "step": 95000
1721
+ },
1722
+ {
1723
+ "epoch": 8.442529241051798,
1724
+ "grad_norm": 1.124017357826233,
1725
+ "learning_rate": 1.7892841653541984e-05,
1726
+ "loss": 0.1462,
1727
+ "step": 96000
1728
+ },
1729
+ {
1730
+ "epoch": 8.442529241051798,
1731
+ "eval_accuracy": 0.9155577541429631,
1732
+ "eval_combined_score": 0.9018336751836997,
1733
+ "eval_f1": 0.8881095962244363,
1734
+ "eval_loss": 0.2291862517595291,
1735
+ "eval_runtime": 81.0833,
1736
+ "eval_samples_per_second": 498.623,
1737
+ "eval_steps_per_second": 0.974,
1738
+ "step": 96000
1739
+ },
1740
+ {
1741
+ "epoch": 8.530472253979422,
1742
+ "grad_norm": 0.7893108129501343,
1743
+ "learning_rate": 1.596393202447782e-05,
1744
+ "loss": 0.1538,
1745
+ "step": 97000
1746
+ },
1747
+ {
1748
+ "epoch": 8.530472253979422,
1749
+ "eval_accuracy": 0.9159287657679941,
1750
+ "eval_combined_score": 0.9021827174755224,
1751
+ "eval_f1": 0.8884366691830505,
1752
+ "eval_loss": 0.222365140914917,
1753
+ "eval_runtime": 81.217,
1754
+ "eval_samples_per_second": 497.802,
1755
+ "eval_steps_per_second": 0.973,
1756
+ "step": 97000
1757
+ },
1758
+ {
1759
+ "epoch": 8.618415266907045,
1760
+ "grad_norm": 0.7923777103424072,
1761
+ "learning_rate": 1.4139169084911189e-05,
1762
+ "loss": 0.1503,
1763
+ "step": 98000
1764
+ },
1765
+ {
1766
+ "epoch": 8.618415266907045,
1767
+ "eval_accuracy": 0.9156319564679694,
1768
+ "eval_combined_score": 0.9022205810308683,
1769
+ "eval_f1": 0.8888092055937673,
1770
+ "eval_loss": 0.2236052006483078,
1771
+ "eval_runtime": 81.2109,
1772
+ "eval_samples_per_second": 497.84,
1773
+ "eval_steps_per_second": 0.973,
1774
+ "step": 98000
1775
+ },
1776
+ {
1777
+ "epoch": 8.706358279834667,
1778
+ "grad_norm": 0.9545219540596008,
1779
+ "learning_rate": 1.2419970684695196e-05,
1780
+ "loss": 0.1562,
1781
+ "step": 99000
1782
+ },
1783
+ {
1784
+ "epoch": 8.706358279834667,
1785
+ "eval_accuracy": 0.9157061587929756,
1786
+ "eval_combined_score": 0.9024204183685112,
1787
+ "eval_f1": 0.8891346779440469,
1788
+ "eval_loss": 0.22190338373184204,
1789
+ "eval_runtime": 52.158,
1790
+ "eval_samples_per_second": 775.145,
1791
+ "eval_steps_per_second": 1.515,
1792
+ "step": 99000
1793
+ },
1794
+ {
1795
+ "epoch": 8.79430129276229,
1796
+ "grad_norm": 1.394586205482483,
1797
+ "learning_rate": 1.0807672649512177e-05,
1798
+ "loss": 0.1504,
1799
+ "step": 100000
1800
+ },
1801
+ {
1802
+ "epoch": 8.79430129276229,
1803
+ "eval_accuracy": 0.9154340836012862,
1804
+ "eval_combined_score": 0.9019621740533419,
1805
+ "eval_f1": 0.8884902645053977,
1806
+ "eval_loss": 0.22529225051403046,
1807
+ "eval_runtime": 51.9658,
1808
+ "eval_samples_per_second": 778.012,
1809
+ "eval_steps_per_second": 1.52,
1810
+ "step": 100000
1811
+ },
1812
+ {
1813
+ "epoch": 8.882244305689913,
1814
+ "grad_norm": 1.0966379642486572,
1815
+ "learning_rate": 9.3035277429309e-06,
1816
+ "loss": 0.15,
1817
+ "step": 101000
1818
+ },
1819
+ {
1820
+ "epoch": 8.882244305689913,
1821
+ "eval_accuracy": 0.9157308929013109,
1822
+ "eval_combined_score": 0.9022409566547371,
1823
+ "eval_f1": 0.8887510204081632,
1824
+ "eval_loss": 0.22496555745601654,
1825
+ "eval_runtime": 52.0674,
1826
+ "eval_samples_per_second": 776.493,
1827
+ "eval_steps_per_second": 1.517,
1828
+ "step": 101000
1829
+ },
1830
+ {
1831
+ "epoch": 8.970187318617535,
1832
+ "grad_norm": 0.7741194367408752,
1833
+ "learning_rate": 7.908704693002666e-06,
1834
+ "loss": 0.1463,
1835
+ "step": 102000
1836
+ },
1837
+ {
1838
+ "epoch": 8.970187318617535,
1839
+ "eval_accuracy": 0.9152856789512738,
1840
+ "eval_combined_score": 0.9019753168022955,
1841
+ "eval_f1": 0.8886649546533173,
1842
+ "eval_loss": 0.22557702660560608,
1843
+ "eval_runtime": 52.2974,
1844
+ "eval_samples_per_second": 773.079,
1845
+ "eval_steps_per_second": 1.511,
1846
+ "step": 102000
1847
+ },
1848
+ {
1849
+ "epoch": 9.05813033154516,
1850
+ "grad_norm": 1.8358986377716064,
1851
+ "learning_rate": 6.624287284154212e-06,
1852
+ "loss": 0.1543,
1853
+ "step": 103000
1854
+ },
1855
+ {
1856
+ "epoch": 9.05813033154516,
1857
+ "eval_accuracy": 0.9157556270096463,
1858
+ "eval_combined_score": 0.9022750970362323,
1859
+ "eval_f1": 0.8887945670628183,
1860
+ "eval_loss": 0.22461377084255219,
1861
+ "eval_runtime": 81.1692,
1862
+ "eval_samples_per_second": 498.095,
1863
+ "eval_steps_per_second": 0.973,
1864
+ "step": 103000
1865
+ },
1866
+ {
1867
+ "epoch": 9.146073344472782,
1868
+ "grad_norm": 1.2145054340362549,
1869
+ "learning_rate": 5.451273515081639e-06,
1870
+ "loss": 0.1498,
1871
+ "step": 104000
1872
+ },
1873
+ {
1874
+ "epoch": 9.146073344472782,
1875
+ "eval_accuracy": 0.9150878060845906,
1876
+ "eval_combined_score": 0.9016809811820421,
1877
+ "eval_f1": 0.8882741562794936,
1878
+ "eval_loss": 0.2247944474220276,
1879
+ "eval_runtime": 80.9979,
1880
+ "eval_samples_per_second": 499.149,
1881
+ "eval_steps_per_second": 0.975,
1882
+ "step": 104000
1883
+ },
1884
+ {
1885
+ "epoch": 9.234016357400405,
1886
+ "grad_norm": 0.44356486201286316,
1887
+ "learning_rate": 4.3905748233003915e-06,
1888
+ "loss": 0.1468,
1889
+ "step": 105000
1890
+ },
1891
+ {
1892
+ "epoch": 9.234016357400405,
1893
+ "eval_accuracy": 0.9159782339846648,
1894
+ "eval_combined_score": 0.9024553586140425,
1895
+ "eval_f1": 0.88893248324342,
1896
+ "eval_loss": 0.2253103107213974,
1897
+ "eval_runtime": 81.4817,
1898
+ "eval_samples_per_second": 496.185,
1899
+ "eval_steps_per_second": 0.97,
1900
+ "step": 105000
1901
+ },
1902
+ {
1903
+ "epoch": 9.321959370328027,
1904
+ "grad_norm": 2.883650302886963,
1905
+ "learning_rate": 3.4430153769539838e-06,
1906
+ "loss": 0.148,
1907
+ "step": 106000
1908
+ },
1909
+ {
1910
+ "epoch": 9.321959370328027,
1911
+ "eval_accuracy": 0.9157803611179817,
1912
+ "eval_combined_score": 0.9025477237114792,
1913
+ "eval_f1": 0.8893150863049768,
1914
+ "eval_loss": 0.22751761972904205,
1915
+ "eval_runtime": 52.5389,
1916
+ "eval_samples_per_second": 769.525,
1917
+ "eval_steps_per_second": 1.504,
1918
+ "step": 106000
1919
+ },
1920
+ {
1921
+ "epoch": 9.40990238325565,
1922
+ "grad_norm": 1.5495615005493164,
1923
+ "learning_rate": 2.609331434431139e-06,
1924
+ "loss": 0.1527,
1925
+ "step": 107000
1926
+ },
1927
+ {
1928
+ "epoch": 9.40990238325565,
1929
+ "eval_accuracy": 0.9156814246846401,
1930
+ "eval_combined_score": 0.9023430403467481,
1931
+ "eval_f1": 0.8890046560088561,
1932
+ "eval_loss": 0.22490988671779633,
1933
+ "eval_runtime": 51.5829,
1934
+ "eval_samples_per_second": 783.787,
1935
+ "eval_steps_per_second": 1.532,
1936
+ "step": 107000
1937
+ },
1938
+ {
1939
+ "epoch": 9.497845396183273,
1940
+ "grad_norm": 0.8224658370018005,
1941
+ "learning_rate": 1.890170772289401e-06,
1942
+ "loss": 0.1511,
1943
+ "step": 108000
1944
+ },
1945
+ {
1946
+ "epoch": 9.497845396183273,
1947
+ "eval_accuracy": 0.9155824882512985,
1948
+ "eval_combined_score": 0.902134220545205,
1949
+ "eval_f1": 0.8886859528391116,
1950
+ "eval_loss": 0.2251313477754593,
1951
+ "eval_runtime": 52.5892,
1952
+ "eval_samples_per_second": 768.789,
1953
+ "eval_steps_per_second": 1.502,
1954
+ "step": 108000
1955
+ },
1956
+ {
1957
+ "epoch": 9.585788409110895,
1958
+ "grad_norm": 1.336452603340149,
1959
+ "learning_rate": 1.286092181929571e-06,
1960
+ "loss": 0.1508,
1961
+ "step": 109000
1962
+ },
1963
+ {
1964
+ "epoch": 9.585788409110895,
1965
+ "eval_accuracy": 0.9155577541429631,
1966
+ "eval_combined_score": 0.9021327693651313,
1967
+ "eval_f1": 0.8887077845872995,
1968
+ "eval_loss": 0.22481171786785126,
1969
+ "eval_runtime": 51.2622,
1970
+ "eval_samples_per_second": 788.691,
1971
+ "eval_steps_per_second": 1.541,
1972
+ "step": 109000
1973
+ },
1974
+ {
1975
+ "epoch": 9.67373142203852,
1976
+ "grad_norm": 2.6884238719940186,
1977
+ "learning_rate": 7.975650354119345e-07,
1978
+ "loss": 0.1413,
1979
+ "step": 110000
1980
+ },
1981
+ {
1982
+ "epoch": 9.67373142203852,
1983
+ "eval_accuracy": 0.9155824882512985,
1984
+ "eval_combined_score": 0.902112429156156,
1985
+ "eval_f1": 0.8886423700610134,
1986
+ "eval_loss": 0.22579550743103027,
1987
+ "eval_runtime": 80.932,
1988
+ "eval_samples_per_second": 499.555,
1989
+ "eval_steps_per_second": 0.976,
1990
+ "step": 110000
1991
+ },
1992
+ {
1993
+ "epoch": 9.761674434966142,
1994
+ "grad_norm": 1.046630620956421,
1995
+ "learning_rate": 4.249689207519447e-07,
1996
+ "loss": 0.1506,
1997
+ "step": 111000
1998
+ },
1999
+ {
2000
+ "epoch": 9.761674434966142,
2001
+ "eval_accuracy": 0.9152856789512738,
2002
+ "eval_combined_score": 0.9018519940610743,
2003
+ "eval_f1": 0.8884183091708747,
2004
+ "eval_loss": 0.22598420083522797,
2005
+ "eval_runtime": 81.1014,
2006
+ "eval_samples_per_second": 498.512,
2007
+ "eval_steps_per_second": 0.974,
2008
+ "step": 111000
2009
+ },
2010
+ {
2011
+ "epoch": 9.849617447893765,
2012
+ "grad_norm": 0.9446746110916138,
2013
+ "learning_rate": 1.6859334697840177e-07,
2014
+ "loss": 0.1532,
2015
+ "step": 112000
2016
+ },
2017
+ {
2018
+ "epoch": 9.849617447893765,
2019
+ "eval_accuracy": 0.9155082859262924,
2020
+ "eval_combined_score": 0.9021443233369171,
2021
+ "eval_f1": 0.8887803607475419,
2022
+ "eval_loss": 0.22602435946464539,
2023
+ "eval_runtime": 81.1781,
2024
+ "eval_samples_per_second": 498.041,
2025
+ "eval_steps_per_second": 0.973,
2026
+ "step": 112000
2027
+ },
2028
+ {
2029
+ "epoch": 9.937560460821388,
2030
+ "grad_norm": 0.8685068488121033,
2031
+ "learning_rate": 2.863751918346091e-08,
2032
+ "loss": 0.1452,
2033
+ "step": 113000
2034
+ },
2035
+ {
2036
+ "epoch": 9.937560460821388,
2037
+ "eval_accuracy": 0.9155082859262924,
2038
+ "eval_combined_score": 0.9021443233369171,
2039
+ "eval_f1": 0.8887803607475419,
2040
+ "eval_loss": 0.22605791687965393,
2041
+ "eval_runtime": 52.1365,
2042
+ "eval_samples_per_second": 775.464,
2043
+ "eval_steps_per_second": 1.515,
2044
+ "step": 113000
2045
+ },
2046
+ {
2047
+ "epoch": 10.0,
2048
+ "step": 113710,
2049
+ "total_flos": 6.076865681478144e+17,
2050
+ "train_loss": 0.19076461288778232,
2051
+ "train_runtime": 59328.5284,
2052
+ "train_samples_per_second": 61.327,
2053
+ "train_steps_per_second": 1.917
2054
+ }
2055
+ ],
2056
+ "logging_steps": 1000,
2057
+ "max_steps": 113710,
2058
+ "num_input_tokens_seen": 0,
2059
+ "num_train_epochs": 10,
2060
+ "save_steps": 1000,
2061
+ "stateful_callbacks": {
2062
+ "TrainerControl": {
2063
+ "args": {
2064
+ "should_epoch_stop": false,
2065
+ "should_evaluate": false,
2066
+ "should_log": false,
2067
+ "should_save": true,
2068
+ "should_training_stop": true
2069
+ },
2070
+ "attributes": {}
2071
+ }
2072
+ },
2073
+ "total_flos": 6.076865681478144e+17,
2074
+ "train_batch_size": 32,
2075
+ "trial_name": null,
2076
+ "trial_params": null
2077
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/RTE.tsv ADDED
@@ -0,0 +1,3001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ index prediction
2
+ 0 not_entailment
3
+ 1 entailment
4
+ 2 entailment
5
+ 3 not_entailment
6
+ 4 entailment
7
+ 5 entailment
8
+ 6 entailment
9
+ 7 not_entailment
10
+ 8 not_entailment
11
+ 9 entailment
12
+ 10 not_entailment
13
+ 11 entailment
14
+ 12 not_entailment
15
+ 13 not_entailment
16
+ 14 not_entailment
17
+ 15 not_entailment
18
+ 16 not_entailment
19
+ 17 entailment
20
+ 18 entailment
21
+ 19 not_entailment
22
+ 20 entailment
23
+ 21 not_entailment
24
+ 22 not_entailment
25
+ 23 not_entailment
26
+ 24 not_entailment
27
+ 25 not_entailment
28
+ 26 entailment
29
+ 27 entailment
30
+ 28 entailment
31
+ 29 entailment
32
+ 30 not_entailment
33
+ 31 entailment
34
+ 32 not_entailment
35
+ 33 not_entailment
36
+ 34 not_entailment
37
+ 35 entailment
38
+ 36 not_entailment
39
+ 37 entailment
40
+ 38 entailment
41
+ 39 entailment
42
+ 40 entailment
43
+ 41 not_entailment
44
+ 42 entailment
45
+ 43 not_entailment
46
+ 44 not_entailment
47
+ 45 entailment
48
+ 46 entailment
49
+ 47 entailment
50
+ 48 entailment
51
+ 49 not_entailment
52
+ 50 not_entailment
53
+ 51 entailment
54
+ 52 not_entailment
55
+ 53 entailment
56
+ 54 entailment
57
+ 55 not_entailment
58
+ 56 entailment
59
+ 57 not_entailment
60
+ 58 entailment
61
+ 59 entailment
62
+ 60 entailment
63
+ 61 not_entailment
64
+ 62 not_entailment
65
+ 63 entailment
66
+ 64 entailment
67
+ 65 not_entailment
68
+ 66 entailment
69
+ 67 not_entailment
70
+ 68 not_entailment
71
+ 69 entailment
72
+ 70 entailment
73
+ 71 entailment
74
+ 72 entailment
75
+ 73 entailment
76
+ 74 entailment
77
+ 75 entailment
78
+ 76 not_entailment
79
+ 77 entailment
80
+ 78 entailment
81
+ 79 not_entailment
82
+ 80 not_entailment
83
+ 81 entailment
84
+ 82 not_entailment
85
+ 83 not_entailment
86
+ 84 entailment
87
+ 85 not_entailment
88
+ 86 entailment
89
+ 87 not_entailment
90
+ 88 entailment
91
+ 89 entailment
92
+ 90 not_entailment
93
+ 91 entailment
94
+ 92 not_entailment
95
+ 93 entailment
96
+ 94 entailment
97
+ 95 not_entailment
98
+ 96 entailment
99
+ 97 entailment
100
+ 98 not_entailment
101
+ 99 entailment
102
+ 100 entailment
103
+ 101 entailment
104
+ 102 not_entailment
105
+ 103 entailment
106
+ 104 entailment
107
+ 105 entailment
108
+ 106 entailment
109
+ 107 not_entailment
110
+ 108 not_entailment
111
+ 109 not_entailment
112
+ 110 entailment
113
+ 111 not_entailment
114
+ 112 not_entailment
115
+ 113 not_entailment
116
+ 114 entailment
117
+ 115 entailment
118
+ 116 entailment
119
+ 117 not_entailment
120
+ 118 not_entailment
121
+ 119 not_entailment
122
+ 120 not_entailment
123
+ 121 not_entailment
124
+ 122 not_entailment
125
+ 123 not_entailment
126
+ 124 entailment
127
+ 125 entailment
128
+ 126 not_entailment
129
+ 127 entailment
130
+ 128 entailment
131
+ 129 entailment
132
+ 130 entailment
133
+ 131 entailment
134
+ 132 not_entailment
135
+ 133 not_entailment
136
+ 134 entailment
137
+ 135 not_entailment
138
+ 136 not_entailment
139
+ 137 not_entailment
140
+ 138 entailment
141
+ 139 not_entailment
142
+ 140 entailment
143
+ 141 not_entailment
144
+ 142 entailment
145
+ 143 entailment
146
+ 144 entailment
147
+ 145 not_entailment
148
+ 146 not_entailment
149
+ 147 not_entailment
150
+ 148 not_entailment
151
+ 149 entailment
152
+ 150 entailment
153
+ 151 entailment
154
+ 152 not_entailment
155
+ 153 entailment
156
+ 154 entailment
157
+ 155 not_entailment
158
+ 156 not_entailment
159
+ 157 entailment
160
+ 158 not_entailment
161
+ 159 not_entailment
162
+ 160 entailment
163
+ 161 not_entailment
164
+ 162 entailment
165
+ 163 not_entailment
166
+ 164 not_entailment
167
+ 165 entailment
168
+ 166 entailment
169
+ 167 entailment
170
+ 168 entailment
171
+ 169 not_entailment
172
+ 170 entailment
173
+ 171 entailment
174
+ 172 not_entailment
175
+ 173 entailment
176
+ 174 entailment
177
+ 175 not_entailment
178
+ 176 not_entailment
179
+ 177 not_entailment
180
+ 178 entailment
181
+ 179 not_entailment
182
+ 180 not_entailment
183
+ 181 not_entailment
184
+ 182 entailment
185
+ 183 not_entailment
186
+ 184 entailment
187
+ 185 not_entailment
188
+ 186 entailment
189
+ 187 entailment
190
+ 188 not_entailment
191
+ 189 not_entailment
192
+ 190 entailment
193
+ 191 entailment
194
+ 192 entailment
195
+ 193 not_entailment
196
+ 194 entailment
197
+ 195 entailment
198
+ 196 entailment
199
+ 197 not_entailment
200
+ 198 entailment
201
+ 199 not_entailment
202
+ 200 entailment
203
+ 201 entailment
204
+ 202 entailment
205
+ 203 not_entailment
206
+ 204 not_entailment
207
+ 205 entailment
208
+ 206 not_entailment
209
+ 207 not_entailment
210
+ 208 not_entailment
211
+ 209 entailment
212
+ 210 not_entailment
213
+ 211 not_entailment
214
+ 212 entailment
215
+ 213 entailment
216
+ 214 entailment
217
+ 215 entailment
218
+ 216 not_entailment
219
+ 217 not_entailment
220
+ 218 not_entailment
221
+ 219 not_entailment
222
+ 220 entailment
223
+ 221 not_entailment
224
+ 222 not_entailment
225
+ 223 not_entailment
226
+ 224 entailment
227
+ 225 not_entailment
228
+ 226 entailment
229
+ 227 not_entailment
230
+ 228 not_entailment
231
+ 229 not_entailment
232
+ 230 not_entailment
233
+ 231 entailment
234
+ 232 not_entailment
235
+ 233 not_entailment
236
+ 234 entailment
237
+ 235 not_entailment
238
+ 236 not_entailment
239
+ 237 entailment
240
+ 238 not_entailment
241
+ 239 entailment
242
+ 240 entailment
243
+ 241 not_entailment
244
+ 242 entailment
245
+ 243 entailment
246
+ 244 not_entailment
247
+ 245 not_entailment
248
+ 246 entailment
249
+ 247 not_entailment
250
+ 248 entailment
251
+ 249 not_entailment
252
+ 250 entailment
253
+ 251 not_entailment
254
+ 252 entailment
255
+ 253 not_entailment
256
+ 254 entailment
257
+ 255 entailment
258
+ 256 not_entailment
259
+ 257 not_entailment
260
+ 258 entailment
261
+ 259 not_entailment
262
+ 260 not_entailment
263
+ 261 not_entailment
264
+ 262 entailment
265
+ 263 not_entailment
266
+ 264 entailment
267
+ 265 entailment
268
+ 266 not_entailment
269
+ 267 not_entailment
270
+ 268 entailment
271
+ 269 not_entailment
272
+ 270 not_entailment
273
+ 271 entailment
274
+ 272 not_entailment
275
+ 273 not_entailment
276
+ 274 entailment
277
+ 275 not_entailment
278
+ 276 not_entailment
279
+ 277 entailment
280
+ 278 not_entailment
281
+ 279 not_entailment
282
+ 280 not_entailment
283
+ 281 entailment
284
+ 282 entailment
285
+ 283 not_entailment
286
+ 284 not_entailment
287
+ 285 not_entailment
288
+ 286 entailment
289
+ 287 not_entailment
290
+ 288 not_entailment
291
+ 289 entailment
292
+ 290 not_entailment
293
+ 291 not_entailment
294
+ 292 not_entailment
295
+ 293 entailment
296
+ 294 entailment
297
+ 295 not_entailment
298
+ 296 not_entailment
299
+ 297 not_entailment
300
+ 298 entailment
301
+ 299 not_entailment
302
+ 300 not_entailment
303
+ 301 entailment
304
+ 302 entailment
305
+ 303 not_entailment
306
+ 304 entailment
307
+ 305 not_entailment
308
+ 306 not_entailment
309
+ 307 not_entailment
310
+ 308 not_entailment
311
+ 309 not_entailment
312
+ 310 not_entailment
313
+ 311 entailment
314
+ 312 not_entailment
315
+ 313 not_entailment
316
+ 314 entailment
317
+ 315 not_entailment
318
+ 316 entailment
319
+ 317 not_entailment
320
+ 318 entailment
321
+ 319 entailment
322
+ 320 not_entailment
323
+ 321 not_entailment
324
+ 322 entailment
325
+ 323 entailment
326
+ 324 entailment
327
+ 325 not_entailment
328
+ 326 not_entailment
329
+ 327 entailment
330
+ 328 not_entailment
331
+ 329 not_entailment
332
+ 330 not_entailment
333
+ 331 not_entailment
334
+ 332 not_entailment
335
+ 333 not_entailment
336
+ 334 not_entailment
337
+ 335 entailment
338
+ 336 not_entailment
339
+ 337 entailment
340
+ 338 not_entailment
341
+ 339 entailment
342
+ 340 entailment
343
+ 341 entailment
344
+ 342 entailment
345
+ 343 not_entailment
346
+ 344 not_entailment
347
+ 345 entailment
348
+ 346 entailment
349
+ 347 entailment
350
+ 348 not_entailment
351
+ 349 not_entailment
352
+ 350 entailment
353
+ 351 entailment
354
+ 352 not_entailment
355
+ 353 not_entailment
356
+ 354 entailment
357
+ 355 not_entailment
358
+ 356 entailment
359
+ 357 not_entailment
360
+ 358 entailment
361
+ 359 not_entailment
362
+ 360 entailment
363
+ 361 entailment
364
+ 362 entailment
365
+ 363 not_entailment
366
+ 364 entailment
367
+ 365 not_entailment
368
+ 366 not_entailment
369
+ 367 entailment
370
+ 368 entailment
371
+ 369 not_entailment
372
+ 370 not_entailment
373
+ 371 not_entailment
374
+ 372 not_entailment
375
+ 373 entailment
376
+ 374 not_entailment
377
+ 375 entailment
378
+ 376 not_entailment
379
+ 377 entailment
380
+ 378 not_entailment
381
+ 379 not_entailment
382
+ 380 not_entailment
383
+ 381 not_entailment
384
+ 382 not_entailment
385
+ 383 entailment
386
+ 384 not_entailment
387
+ 385 entailment
388
+ 386 not_entailment
389
+ 387 entailment
390
+ 388 entailment
391
+ 389 not_entailment
392
+ 390 not_entailment
393
+ 391 entailment
394
+ 392 not_entailment
395
+ 393 entailment
396
+ 394 entailment
397
+ 395 entailment
398
+ 396 entailment
399
+ 397 not_entailment
400
+ 398 not_entailment
401
+ 399 entailment
402
+ 400 entailment
403
+ 401 entailment
404
+ 402 entailment
405
+ 403 entailment
406
+ 404 not_entailment
407
+ 405 entailment
408
+ 406 not_entailment
409
+ 407 entailment
410
+ 408 not_entailment
411
+ 409 entailment
412
+ 410 not_entailment
413
+ 411 entailment
414
+ 412 entailment
415
+ 413 entailment
416
+ 414 not_entailment
417
+ 415 not_entailment
418
+ 416 not_entailment
419
+ 417 not_entailment
420
+ 418 not_entailment
421
+ 419 not_entailment
422
+ 420 entailment
423
+ 421 entailment
424
+ 422 not_entailment
425
+ 423 entailment
426
+ 424 entailment
427
+ 425 entailment
428
+ 426 entailment
429
+ 427 not_entailment
430
+ 428 entailment
431
+ 429 entailment
432
+ 430 entailment
433
+ 431 not_entailment
434
+ 432 entailment
435
+ 433 entailment
436
+ 434 not_entailment
437
+ 435 not_entailment
438
+ 436 not_entailment
439
+ 437 entailment
440
+ 438 entailment
441
+ 439 not_entailment
442
+ 440 not_entailment
443
+ 441 entailment
444
+ 442 entailment
445
+ 443 entailment
446
+ 444 not_entailment
447
+ 445 entailment
448
+ 446 entailment
449
+ 447 entailment
450
+ 448 entailment
451
+ 449 entailment
452
+ 450 entailment
453
+ 451 entailment
454
+ 452 not_entailment
455
+ 453 entailment
456
+ 454 not_entailment
457
+ 455 not_entailment
458
+ 456 entailment
459
+ 457 entailment
460
+ 458 entailment
461
+ 459 entailment
462
+ 460 entailment
463
+ 461 entailment
464
+ 462 entailment
465
+ 463 not_entailment
466
+ 464 entailment
467
+ 465 entailment
468
+ 466 entailment
469
+ 467 not_entailment
470
+ 468 entailment
471
+ 469 entailment
472
+ 470 not_entailment
473
+ 471 entailment
474
+ 472 not_entailment
475
+ 473 entailment
476
+ 474 entailment
477
+ 475 not_entailment
478
+ 476 not_entailment
479
+ 477 not_entailment
480
+ 478 not_entailment
481
+ 479 entailment
482
+ 480 not_entailment
483
+ 481 entailment
484
+ 482 not_entailment
485
+ 483 entailment
486
+ 484 entailment
487
+ 485 entailment
488
+ 486 entailment
489
+ 487 entailment
490
+ 488 not_entailment
491
+ 489 entailment
492
+ 490 entailment
493
+ 491 entailment
494
+ 492 not_entailment
495
+ 493 entailment
496
+ 494 entailment
497
+ 495 not_entailment
498
+ 496 not_entailment
499
+ 497 entailment
500
+ 498 entailment
501
+ 499 entailment
502
+ 500 entailment
503
+ 501 entailment
504
+ 502 entailment
505
+ 503 entailment
506
+ 504 not_entailment
507
+ 505 entailment
508
+ 506 entailment
509
+ 507 not_entailment
510
+ 508 entailment
511
+ 509 not_entailment
512
+ 510 not_entailment
513
+ 511 entailment
514
+ 512 not_entailment
515
+ 513 entailment
516
+ 514 entailment
517
+ 515 not_entailment
518
+ 516 not_entailment
519
+ 517 not_entailment
520
+ 518 not_entailment
521
+ 519 not_entailment
522
+ 520 not_entailment
523
+ 521 not_entailment
524
+ 522 not_entailment
525
+ 523 entailment
526
+ 524 entailment
527
+ 525 entailment
528
+ 526 not_entailment
529
+ 527 entailment
530
+ 528 entailment
531
+ 529 not_entailment
532
+ 530 entailment
533
+ 531 not_entailment
534
+ 532 not_entailment
535
+ 533 entailment
536
+ 534 entailment
537
+ 535 entailment
538
+ 536 not_entailment
539
+ 537 entailment
540
+ 538 entailment
541
+ 539 not_entailment
542
+ 540 entailment
543
+ 541 not_entailment
544
+ 542 not_entailment
545
+ 543 entailment
546
+ 544 entailment
547
+ 545 not_entailment
548
+ 546 entailment
549
+ 547 entailment
550
+ 548 entailment
551
+ 549 entailment
552
+ 550 not_entailment
553
+ 551 entailment
554
+ 552 entailment
555
+ 553 not_entailment
556
+ 554 entailment
557
+ 555 not_entailment
558
+ 556 not_entailment
559
+ 557 not_entailment
560
+ 558 not_entailment
561
+ 559 not_entailment
562
+ 560 entailment
563
+ 561 entailment
564
+ 562 not_entailment
565
+ 563 entailment
566
+ 564 entailment
567
+ 565 entailment
568
+ 566 entailment
569
+ 567 entailment
570
+ 568 not_entailment
571
+ 569 entailment
572
+ 570 entailment
573
+ 571 entailment
574
+ 572 entailment
575
+ 573 not_entailment
576
+ 574 entailment
577
+ 575 not_entailment
578
+ 576 entailment
579
+ 577 entailment
580
+ 578 not_entailment
581
+ 579 entailment
582
+ 580 entailment
583
+ 581 not_entailment
584
+ 582 not_entailment
585
+ 583 entailment
586
+ 584 entailment
587
+ 585 entailment
588
+ 586 entailment
589
+ 587 not_entailment
590
+ 588 entailment
591
+ 589 not_entailment
592
+ 590 entailment
593
+ 591 not_entailment
594
+ 592 not_entailment
595
+ 593 not_entailment
596
+ 594 entailment
597
+ 595 not_entailment
598
+ 596 entailment
599
+ 597 not_entailment
600
+ 598 entailment
601
+ 599 not_entailment
602
+ 600 entailment
603
+ 601 entailment
604
+ 602 not_entailment
605
+ 603 not_entailment
606
+ 604 not_entailment
607
+ 605 not_entailment
608
+ 606 entailment
609
+ 607 entailment
610
+ 608 not_entailment
611
+ 609 entailment
612
+ 610 entailment
613
+ 611 not_entailment
614
+ 612 entailment
615
+ 613 not_entailment
616
+ 614 not_entailment
617
+ 615 entailment
618
+ 616 entailment
619
+ 617 entailment
620
+ 618 not_entailment
621
+ 619 entailment
622
+ 620 entailment
623
+ 621 entailment
624
+ 622 entailment
625
+ 623 not_entailment
626
+ 624 entailment
627
+ 625 entailment
628
+ 626 not_entailment
629
+ 627 entailment
630
+ 628 not_entailment
631
+ 629 not_entailment
632
+ 630 entailment
633
+ 631 entailment
634
+ 632 not_entailment
635
+ 633 not_entailment
636
+ 634 not_entailment
637
+ 635 entailment
638
+ 636 entailment
639
+ 637 entailment
640
+ 638 not_entailment
641
+ 639 entailment
642
+ 640 entailment
643
+ 641 not_entailment
644
+ 642 entailment
645
+ 643 not_entailment
646
+ 644 entailment
647
+ 645 not_entailment
648
+ 646 entailment
649
+ 647 entailment
650
+ 648 not_entailment
651
+ 649 not_entailment
652
+ 650 not_entailment
653
+ 651 not_entailment
654
+ 652 entailment
655
+ 653 not_entailment
656
+ 654 entailment
657
+ 655 entailment
658
+ 656 entailment
659
+ 657 entailment
660
+ 658 not_entailment
661
+ 659 not_entailment
662
+ 660 not_entailment
663
+ 661 not_entailment
664
+ 662 entailment
665
+ 663 entailment
666
+ 664 not_entailment
667
+ 665 not_entailment
668
+ 666 entailment
669
+ 667 entailment
670
+ 668 not_entailment
671
+ 669 not_entailment
672
+ 670 entailment
673
+ 671 entailment
674
+ 672 not_entailment
675
+ 673 not_entailment
676
+ 674 not_entailment
677
+ 675 not_entailment
678
+ 676 entailment
679
+ 677 entailment
680
+ 678 entailment
681
+ 679 entailment
682
+ 680 entailment
683
+ 681 entailment
684
+ 682 not_entailment
685
+ 683 entailment
686
+ 684 entailment
687
+ 685 entailment
688
+ 686 entailment
689
+ 687 entailment
690
+ 688 not_entailment
691
+ 689 not_entailment
692
+ 690 entailment
693
+ 691 not_entailment
694
+ 692 entailment
695
+ 693 entailment
696
+ 694 entailment
697
+ 695 not_entailment
698
+ 696 not_entailment
699
+ 697 entailment
700
+ 698 entailment
701
+ 699 entailment
702
+ 700 entailment
703
+ 701 entailment
704
+ 702 entailment
705
+ 703 not_entailment
706
+ 704 not_entailment
707
+ 705 not_entailment
708
+ 706 not_entailment
709
+ 707 entailment
710
+ 708 entailment
711
+ 709 not_entailment
712
+ 710 not_entailment
713
+ 711 entailment
714
+ 712 entailment
715
+ 713 entailment
716
+ 714 entailment
717
+ 715 not_entailment
718
+ 716 not_entailment
719
+ 717 not_entailment
720
+ 718 not_entailment
721
+ 719 not_entailment
722
+ 720 entailment
723
+ 721 entailment
724
+ 722 entailment
725
+ 723 not_entailment
726
+ 724 entailment
727
+ 725 entailment
728
+ 726 not_entailment
729
+ 727 entailment
730
+ 728 entailment
731
+ 729 entailment
732
+ 730 entailment
733
+ 731 entailment
734
+ 732 entailment
735
+ 733 entailment
736
+ 734 entailment
737
+ 735 entailment
738
+ 736 entailment
739
+ 737 not_entailment
740
+ 738 entailment
741
+ 739 entailment
742
+ 740 not_entailment
743
+ 741 entailment
744
+ 742 not_entailment
745
+ 743 not_entailment
746
+ 744 entailment
747
+ 745 entailment
748
+ 746 not_entailment
749
+ 747 not_entailment
750
+ 748 not_entailment
751
+ 749 not_entailment
752
+ 750 entailment
753
+ 751 entailment
754
+ 752 not_entailment
755
+ 753 not_entailment
756
+ 754 not_entailment
757
+ 755 not_entailment
758
+ 756 entailment
759
+ 757 not_entailment
760
+ 758 not_entailment
761
+ 759 not_entailment
762
+ 760 entailment
763
+ 761 entailment
764
+ 762 entailment
765
+ 763 not_entailment
766
+ 764 not_entailment
767
+ 765 not_entailment
768
+ 766 entailment
769
+ 767 not_entailment
770
+ 768 entailment
771
+ 769 entailment
772
+ 770 entailment
773
+ 771 not_entailment
774
+ 772 not_entailment
775
+ 773 not_entailment
776
+ 774 not_entailment
777
+ 775 entailment
778
+ 776 entailment
779
+ 777 entailment
780
+ 778 not_entailment
781
+ 779 not_entailment
782
+ 780 entailment
783
+ 781 entailment
784
+ 782 entailment
785
+ 783 entailment
786
+ 784 entailment
787
+ 785 entailment
788
+ 786 entailment
789
+ 787 entailment
790
+ 788 entailment
791
+ 789 entailment
792
+ 790 not_entailment
793
+ 791 not_entailment
794
+ 792 entailment
795
+ 793 entailment
796
+ 794 not_entailment
797
+ 795 entailment
798
+ 796 entailment
799
+ 797 entailment
800
+ 798 entailment
801
+ 799 not_entailment
802
+ 800 entailment
803
+ 801 entailment
804
+ 802 entailment
805
+ 803 entailment
806
+ 804 not_entailment
807
+ 805 entailment
808
+ 806 not_entailment
809
+ 807 entailment
810
+ 808 entailment
811
+ 809 not_entailment
812
+ 810 not_entailment
813
+ 811 entailment
814
+ 812 not_entailment
815
+ 813 entailment
816
+ 814 entailment
817
+ 815 entailment
818
+ 816 entailment
819
+ 817 entailment
820
+ 818 entailment
821
+ 819 not_entailment
822
+ 820 entailment
823
+ 821 not_entailment
824
+ 822 entailment
825
+ 823 entailment
826
+ 824 entailment
827
+ 825 entailment
828
+ 826 entailment
829
+ 827 entailment
830
+ 828 not_entailment
831
+ 829 entailment
832
+ 830 entailment
833
+ 831 entailment
834
+ 832 entailment
835
+ 833 not_entailment
836
+ 834 entailment
837
+ 835 entailment
838
+ 836 entailment
839
+ 837 not_entailment
840
+ 838 not_entailment
841
+ 839 entailment
842
+ 840 entailment
843
+ 841 entailment
844
+ 842 not_entailment
845
+ 843 not_entailment
846
+ 844 entailment
847
+ 845 entailment
848
+ 846 entailment
849
+ 847 entailment
850
+ 848 entailment
851
+ 849 not_entailment
852
+ 850 not_entailment
853
+ 851 entailment
854
+ 852 entailment
855
+ 853 entailment
856
+ 854 not_entailment
857
+ 855 not_entailment
858
+ 856 not_entailment
859
+ 857 not_entailment
860
+ 858 not_entailment
861
+ 859 entailment
862
+ 860 entailment
863
+ 861 not_entailment
864
+ 862 entailment
865
+ 863 not_entailment
866
+ 864 entailment
867
+ 865 entailment
868
+ 866 entailment
869
+ 867 entailment
870
+ 868 entailment
871
+ 869 entailment
872
+ 870 entailment
873
+ 871 entailment
874
+ 872 entailment
875
+ 873 entailment
876
+ 874 entailment
877
+ 875 entailment
878
+ 876 not_entailment
879
+ 877 entailment
880
+ 878 entailment
881
+ 879 entailment
882
+ 880 entailment
883
+ 881 entailment
884
+ 882 entailment
885
+ 883 entailment
886
+ 884 entailment
887
+ 885 entailment
888
+ 886 entailment
889
+ 887 entailment
890
+ 888 not_entailment
891
+ 889 entailment
892
+ 890 not_entailment
893
+ 891 entailment
894
+ 892 entailment
895
+ 893 entailment
896
+ 894 entailment
897
+ 895 entailment
898
+ 896 not_entailment
899
+ 897 not_entailment
900
+ 898 not_entailment
901
+ 899 not_entailment
902
+ 900 not_entailment
903
+ 901 entailment
904
+ 902 entailment
905
+ 903 entailment
906
+ 904 entailment
907
+ 905 entailment
908
+ 906 entailment
909
+ 907 entailment
910
+ 908 entailment
911
+ 909 entailment
912
+ 910 not_entailment
913
+ 911 not_entailment
914
+ 912 not_entailment
915
+ 913 entailment
916
+ 914 entailment
917
+ 915 entailment
918
+ 916 entailment
919
+ 917 entailment
920
+ 918 not_entailment
921
+ 919 entailment
922
+ 920 entailment
923
+ 921 entailment
924
+ 922 entailment
925
+ 923 not_entailment
926
+ 924 not_entailment
927
+ 925 entailment
928
+ 926 entailment
929
+ 927 entailment
930
+ 928 entailment
931
+ 929 entailment
932
+ 930 not_entailment
933
+ 931 entailment
934
+ 932 entailment
935
+ 933 not_entailment
936
+ 934 entailment
937
+ 935 entailment
938
+ 936 entailment
939
+ 937 entailment
940
+ 938 not_entailment
941
+ 939 not_entailment
942
+ 940 not_entailment
943
+ 941 entailment
944
+ 942 entailment
945
+ 943 entailment
946
+ 944 not_entailment
947
+ 945 not_entailment
948
+ 946 not_entailment
949
+ 947 not_entailment
950
+ 948 not_entailment
951
+ 949 not_entailment
952
+ 950 entailment
953
+ 951 entailment
954
+ 952 entailment
955
+ 953 not_entailment
956
+ 954 entailment
957
+ 955 not_entailment
958
+ 956 entailment
959
+ 957 not_entailment
960
+ 958 not_entailment
961
+ 959 not_entailment
962
+ 960 entailment
963
+ 961 entailment
964
+ 962 entailment
965
+ 963 entailment
966
+ 964 entailment
967
+ 965 entailment
968
+ 966 entailment
969
+ 967 entailment
970
+ 968 entailment
971
+ 969 not_entailment
972
+ 970 entailment
973
+ 971 entailment
974
+ 972 entailment
975
+ 973 entailment
976
+ 974 entailment
977
+ 975 entailment
978
+ 976 entailment
979
+ 977 entailment
980
+ 978 not_entailment
981
+ 979 not_entailment
982
+ 980 not_entailment
983
+ 981 not_entailment
984
+ 982 entailment
985
+ 983 entailment
986
+ 984 not_entailment
987
+ 985 not_entailment
988
+ 986 entailment
989
+ 987 entailment
990
+ 988 not_entailment
991
+ 989 entailment
992
+ 990 entailment
993
+ 991 not_entailment
994
+ 992 entailment
995
+ 993 not_entailment
996
+ 994 entailment
997
+ 995 entailment
998
+ 996 entailment
999
+ 997 not_entailment
1000
+ 998 entailment
1001
+ 999 not_entailment
1002
+ 1000 entailment
1003
+ 1001 not_entailment
1004
+ 1002 entailment
1005
+ 1003 not_entailment
1006
+ 1004 not_entailment
1007
+ 1005 entailment
1008
+ 1006 entailment
1009
+ 1007 entailment
1010
+ 1008 entailment
1011
+ 1009 not_entailment
1012
+ 1010 entailment
1013
+ 1011 not_entailment
1014
+ 1012 not_entailment
1015
+ 1013 not_entailment
1016
+ 1014 not_entailment
1017
+ 1015 entailment
1018
+ 1016 entailment
1019
+ 1017 entailment
1020
+ 1018 not_entailment
1021
+ 1019 entailment
1022
+ 1020 not_entailment
1023
+ 1021 entailment
1024
+ 1022 not_entailment
1025
+ 1023 entailment
1026
+ 1024 not_entailment
1027
+ 1025 not_entailment
1028
+ 1026 not_entailment
1029
+ 1027 not_entailment
1030
+ 1028 entailment
1031
+ 1029 not_entailment
1032
+ 1030 not_entailment
1033
+ 1031 not_entailment
1034
+ 1032 entailment
1035
+ 1033 entailment
1036
+ 1034 entailment
1037
+ 1035 entailment
1038
+ 1036 not_entailment
1039
+ 1037 not_entailment
1040
+ 1038 not_entailment
1041
+ 1039 entailment
1042
+ 1040 not_entailment
1043
+ 1041 not_entailment
1044
+ 1042 entailment
1045
+ 1043 not_entailment
1046
+ 1044 not_entailment
1047
+ 1045 entailment
1048
+ 1046 not_entailment
1049
+ 1047 not_entailment
1050
+ 1048 entailment
1051
+ 1049 entailment
1052
+ 1050 not_entailment
1053
+ 1051 entailment
1054
+ 1052 not_entailment
1055
+ 1053 not_entailment
1056
+ 1054 entailment
1057
+ 1055 entailment
1058
+ 1056 not_entailment
1059
+ 1057 entailment
1060
+ 1058 not_entailment
1061
+ 1059 entailment
1062
+ 1060 entailment
1063
+ 1061 not_entailment
1064
+ 1062 not_entailment
1065
+ 1063 entailment
1066
+ 1064 not_entailment
1067
+ 1065 entailment
1068
+ 1066 entailment
1069
+ 1067 entailment
1070
+ 1068 entailment
1071
+ 1069 not_entailment
1072
+ 1070 entailment
1073
+ 1071 not_entailment
1074
+ 1072 entailment
1075
+ 1073 entailment
1076
+ 1074 not_entailment
1077
+ 1075 not_entailment
1078
+ 1076 not_entailment
1079
+ 1077 entailment
1080
+ 1078 not_entailment
1081
+ 1079 entailment
1082
+ 1080 entailment
1083
+ 1081 not_entailment
1084
+ 1082 not_entailment
1085
+ 1083 entailment
1086
+ 1084 entailment
1087
+ 1085 entailment
1088
+ 1086 entailment
1089
+ 1087 entailment
1090
+ 1088 entailment
1091
+ 1089 not_entailment
1092
+ 1090 not_entailment
1093
+ 1091 entailment
1094
+ 1092 not_entailment
1095
+ 1093 not_entailment
1096
+ 1094 not_entailment
1097
+ 1095 not_entailment
1098
+ 1096 entailment
1099
+ 1097 entailment
1100
+ 1098 not_entailment
1101
+ 1099 not_entailment
1102
+ 1100 not_entailment
1103
+ 1101 not_entailment
1104
+ 1102 not_entailment
1105
+ 1103 not_entailment
1106
+ 1104 not_entailment
1107
+ 1105 not_entailment
1108
+ 1106 not_entailment
1109
+ 1107 not_entailment
1110
+ 1108 not_entailment
1111
+ 1109 not_entailment
1112
+ 1110 not_entailment
1113
+ 1111 not_entailment
1114
+ 1112 not_entailment
1115
+ 1113 entailment
1116
+ 1114 entailment
1117
+ 1115 not_entailment
1118
+ 1116 not_entailment
1119
+ 1117 not_entailment
1120
+ 1118 not_entailment
1121
+ 1119 entailment
1122
+ 1120 not_entailment
1123
+ 1121 not_entailment
1124
+ 1122 not_entailment
1125
+ 1123 not_entailment
1126
+ 1124 entailment
1127
+ 1125 entailment
1128
+ 1126 entailment
1129
+ 1127 entailment
1130
+ 1128 not_entailment
1131
+ 1129 not_entailment
1132
+ 1130 not_entailment
1133
+ 1131 entailment
1134
+ 1132 not_entailment
1135
+ 1133 entailment
1136
+ 1134 not_entailment
1137
+ 1135 not_entailment
1138
+ 1136 not_entailment
1139
+ 1137 not_entailment
1140
+ 1138 not_entailment
1141
+ 1139 entailment
1142
+ 1140 not_entailment
1143
+ 1141 not_entailment
1144
+ 1142 entailment
1145
+ 1143 not_entailment
1146
+ 1144 not_entailment
1147
+ 1145 entailment
1148
+ 1146 not_entailment
1149
+ 1147 entailment
1150
+ 1148 not_entailment
1151
+ 1149 not_entailment
1152
+ 1150 not_entailment
1153
+ 1151 not_entailment
1154
+ 1152 not_entailment
1155
+ 1153 entailment
1156
+ 1154 entailment
1157
+ 1155 entailment
1158
+ 1156 entailment
1159
+ 1157 entailment
1160
+ 1158 not_entailment
1161
+ 1159 not_entailment
1162
+ 1160 entailment
1163
+ 1161 not_entailment
1164
+ 1162 not_entailment
1165
+ 1163 not_entailment
1166
+ 1164 not_entailment
1167
+ 1165 entailment
1168
+ 1166 not_entailment
1169
+ 1167 entailment
1170
+ 1168 not_entailment
1171
+ 1169 not_entailment
1172
+ 1170 entailment
1173
+ 1171 entailment
1174
+ 1172 not_entailment
1175
+ 1173 entailment
1176
+ 1174 not_entailment
1177
+ 1175 entailment
1178
+ 1176 entailment
1179
+ 1177 entailment
1180
+ 1178 not_entailment
1181
+ 1179 not_entailment
1182
+ 1180 not_entailment
1183
+ 1181 not_entailment
1184
+ 1182 not_entailment
1185
+ 1183 not_entailment
1186
+ 1184 not_entailment
1187
+ 1185 not_entailment
1188
+ 1186 not_entailment
1189
+ 1187 not_entailment
1190
+ 1188 not_entailment
1191
+ 1189 not_entailment
1192
+ 1190 not_entailment
1193
+ 1191 not_entailment
1194
+ 1192 not_entailment
1195
+ 1193 not_entailment
1196
+ 1194 not_entailment
1197
+ 1195 not_entailment
1198
+ 1196 not_entailment
1199
+ 1197 not_entailment
1200
+ 1198 entailment
1201
+ 1199 not_entailment
1202
+ 1200 entailment
1203
+ 1201 entailment
1204
+ 1202 entailment
1205
+ 1203 not_entailment
1206
+ 1204 not_entailment
1207
+ 1205 entailment
1208
+ 1206 entailment
1209
+ 1207 entailment
1210
+ 1208 not_entailment
1211
+ 1209 not_entailment
1212
+ 1210 entailment
1213
+ 1211 entailment
1214
+ 1212 not_entailment
1215
+ 1213 entailment
1216
+ 1214 entailment
1217
+ 1215 entailment
1218
+ 1216 entailment
1219
+ 1217 not_entailment
1220
+ 1218 entailment
1221
+ 1219 entailment
1222
+ 1220 entailment
1223
+ 1221 entailment
1224
+ 1222 entailment
1225
+ 1223 entailment
1226
+ 1224 not_entailment
1227
+ 1225 entailment
1228
+ 1226 entailment
1229
+ 1227 not_entailment
1230
+ 1228 entailment
1231
+ 1229 entailment
1232
+ 1230 entailment
1233
+ 1231 entailment
1234
+ 1232 entailment
1235
+ 1233 not_entailment
1236
+ 1234 entailment
1237
+ 1235 entailment
1238
+ 1236 entailment
1239
+ 1237 entailment
1240
+ 1238 entailment
1241
+ 1239 entailment
1242
+ 1240 not_entailment
1243
+ 1241 entailment
1244
+ 1242 entailment
1245
+ 1243 entailment
1246
+ 1244 entailment
1247
+ 1245 entailment
1248
+ 1246 entailment
1249
+ 1247 not_entailment
1250
+ 1248 entailment
1251
+ 1249 entailment
1252
+ 1250 not_entailment
1253
+ 1251 entailment
1254
+ 1252 entailment
1255
+ 1253 entailment
1256
+ 1254 not_entailment
1257
+ 1255 entailment
1258
+ 1256 not_entailment
1259
+ 1257 not_entailment
1260
+ 1258 entailment
1261
+ 1259 not_entailment
1262
+ 1260 not_entailment
1263
+ 1261 entailment
1264
+ 1262 not_entailment
1265
+ 1263 not_entailment
1266
+ 1264 not_entailment
1267
+ 1265 entailment
1268
+ 1266 entailment
1269
+ 1267 entailment
1270
+ 1268 not_entailment
1271
+ 1269 not_entailment
1272
+ 1270 entailment
1273
+ 1271 entailment
1274
+ 1272 entailment
1275
+ 1273 entailment
1276
+ 1274 not_entailment
1277
+ 1275 entailment
1278
+ 1276 not_entailment
1279
+ 1277 entailment
1280
+ 1278 entailment
1281
+ 1279 entailment
1282
+ 1280 entailment
1283
+ 1281 not_entailment
1284
+ 1282 not_entailment
1285
+ 1283 not_entailment
1286
+ 1284 not_entailment
1287
+ 1285 entailment
1288
+ 1286 not_entailment
1289
+ 1287 entailment
1290
+ 1288 entailment
1291
+ 1289 entailment
1292
+ 1290 entailment
1293
+ 1291 entailment
1294
+ 1292 entailment
1295
+ 1293 not_entailment
1296
+ 1294 entailment
1297
+ 1295 entailment
1298
+ 1296 not_entailment
1299
+ 1297 entailment
1300
+ 1298 entailment
1301
+ 1299 entailment
1302
+ 1300 entailment
1303
+ 1301 not_entailment
1304
+ 1302 entailment
1305
+ 1303 not_entailment
1306
+ 1304 entailment
1307
+ 1305 entailment
1308
+ 1306 not_entailment
1309
+ 1307 not_entailment
1310
+ 1308 entailment
1311
+ 1309 not_entailment
1312
+ 1310 not_entailment
1313
+ 1311 not_entailment
1314
+ 1312 not_entailment
1315
+ 1313 entailment
1316
+ 1314 entailment
1317
+ 1315 not_entailment
1318
+ 1316 not_entailment
1319
+ 1317 entailment
1320
+ 1318 not_entailment
1321
+ 1319 not_entailment
1322
+ 1320 entailment
1323
+ 1321 not_entailment
1324
+ 1322 not_entailment
1325
+ 1323 not_entailment
1326
+ 1324 not_entailment
1327
+ 1325 not_entailment
1328
+ 1326 not_entailment
1329
+ 1327 entailment
1330
+ 1328 not_entailment
1331
+ 1329 entailment
1332
+ 1330 entailment
1333
+ 1331 not_entailment
1334
+ 1332 not_entailment
1335
+ 1333 not_entailment
1336
+ 1334 entailment
1337
+ 1335 entailment
1338
+ 1336 not_entailment
1339
+ 1337 entailment
1340
+ 1338 entailment
1341
+ 1339 entailment
1342
+ 1340 entailment
1343
+ 1341 entailment
1344
+ 1342 entailment
1345
+ 1343 entailment
1346
+ 1344 not_entailment
1347
+ 1345 not_entailment
1348
+ 1346 entailment
1349
+ 1347 entailment
1350
+ 1348 entailment
1351
+ 1349 entailment
1352
+ 1350 not_entailment
1353
+ 1351 entailment
1354
+ 1352 entailment
1355
+ 1353 not_entailment
1356
+ 1354 not_entailment
1357
+ 1355 entailment
1358
+ 1356 entailment
1359
+ 1357 not_entailment
1360
+ 1358 entailment
1361
+ 1359 entailment
1362
+ 1360 not_entailment
1363
+ 1361 not_entailment
1364
+ 1362 not_entailment
1365
+ 1363 not_entailment
1366
+ 1364 not_entailment
1367
+ 1365 entailment
1368
+ 1366 entailment
1369
+ 1367 entailment
1370
+ 1368 entailment
1371
+ 1369 not_entailment
1372
+ 1370 entailment
1373
+ 1371 entailment
1374
+ 1372 entailment
1375
+ 1373 entailment
1376
+ 1374 not_entailment
1377
+ 1375 entailment
1378
+ 1376 not_entailment
1379
+ 1377 not_entailment
1380
+ 1378 not_entailment
1381
+ 1379 not_entailment
1382
+ 1380 not_entailment
1383
+ 1381 entailment
1384
+ 1382 entailment
1385
+ 1383 not_entailment
1386
+ 1384 not_entailment
1387
+ 1385 entailment
1388
+ 1386 entailment
1389
+ 1387 entailment
1390
+ 1388 not_entailment
1391
+ 1389 not_entailment
1392
+ 1390 entailment
1393
+ 1391 not_entailment
1394
+ 1392 not_entailment
1395
+ 1393 entailment
1396
+ 1394 entailment
1397
+ 1395 entailment
1398
+ 1396 not_entailment
1399
+ 1397 entailment
1400
+ 1398 not_entailment
1401
+ 1399 not_entailment
1402
+ 1400 entailment
1403
+ 1401 not_entailment
1404
+ 1402 entailment
1405
+ 1403 entailment
1406
+ 1404 not_entailment
1407
+ 1405 entailment
1408
+ 1406 not_entailment
1409
+ 1407 entailment
1410
+ 1408 not_entailment
1411
+ 1409 not_entailment
1412
+ 1410 entailment
1413
+ 1411 entailment
1414
+ 1412 not_entailment
1415
+ 1413 entailment
1416
+ 1414 not_entailment
1417
+ 1415 entailment
1418
+ 1416 not_entailment
1419
+ 1417 entailment
1420
+ 1418 entailment
1421
+ 1419 entailment
1422
+ 1420 not_entailment
1423
+ 1421 entailment
1424
+ 1422 entailment
1425
+ 1423 entailment
1426
+ 1424 entailment
1427
+ 1425 entailment
1428
+ 1426 entailment
1429
+ 1427 entailment
1430
+ 1428 entailment
1431
+ 1429 not_entailment
1432
+ 1430 entailment
1433
+ 1431 entailment
1434
+ 1432 not_entailment
1435
+ 1433 entailment
1436
+ 1434 entailment
1437
+ 1435 not_entailment
1438
+ 1436 entailment
1439
+ 1437 not_entailment
1440
+ 1438 entailment
1441
+ 1439 entailment
1442
+ 1440 entailment
1443
+ 1441 entailment
1444
+ 1442 entailment
1445
+ 1443 not_entailment
1446
+ 1444 entailment
1447
+ 1445 entailment
1448
+ 1446 not_entailment
1449
+ 1447 entailment
1450
+ 1448 not_entailment
1451
+ 1449 entailment
1452
+ 1450 entailment
1453
+ 1451 not_entailment
1454
+ 1452 not_entailment
1455
+ 1453 not_entailment
1456
+ 1454 not_entailment
1457
+ 1455 entailment
1458
+ 1456 entailment
1459
+ 1457 entailment
1460
+ 1458 entailment
1461
+ 1459 entailment
1462
+ 1460 not_entailment
1463
+ 1461 entailment
1464
+ 1462 entailment
1465
+ 1463 entailment
1466
+ 1464 entailment
1467
+ 1465 entailment
1468
+ 1466 entailment
1469
+ 1467 entailment
1470
+ 1468 entailment
1471
+ 1469 not_entailment
1472
+ 1470 entailment
1473
+ 1471 not_entailment
1474
+ 1472 not_entailment
1475
+ 1473 entailment
1476
+ 1474 entailment
1477
+ 1475 entailment
1478
+ 1476 entailment
1479
+ 1477 entailment
1480
+ 1478 entailment
1481
+ 1479 not_entailment
1482
+ 1480 not_entailment
1483
+ 1481 not_entailment
1484
+ 1482 entailment
1485
+ 1483 entailment
1486
+ 1484 entailment
1487
+ 1485 not_entailment
1488
+ 1486 entailment
1489
+ 1487 entailment
1490
+ 1488 entailment
1491
+ 1489 entailment
1492
+ 1490 entailment
1493
+ 1491 entailment
1494
+ 1492 entailment
1495
+ 1493 entailment
1496
+ 1494 entailment
1497
+ 1495 entailment
1498
+ 1496 entailment
1499
+ 1497 not_entailment
1500
+ 1498 not_entailment
1501
+ 1499 not_entailment
1502
+ 1500 not_entailment
1503
+ 1501 entailment
1504
+ 1502 entailment
1505
+ 1503 not_entailment
1506
+ 1504 entailment
1507
+ 1505 entailment
1508
+ 1506 not_entailment
1509
+ 1507 entailment
1510
+ 1508 not_entailment
1511
+ 1509 not_entailment
1512
+ 1510 entailment
1513
+ 1511 entailment
1514
+ 1512 entailment
1515
+ 1513 not_entailment
1516
+ 1514 not_entailment
1517
+ 1515 entailment
1518
+ 1516 entailment
1519
+ 1517 entailment
1520
+ 1518 entailment
1521
+ 1519 entailment
1522
+ 1520 entailment
1523
+ 1521 entailment
1524
+ 1522 entailment
1525
+ 1523 entailment
1526
+ 1524 entailment
1527
+ 1525 entailment
1528
+ 1526 not_entailment
1529
+ 1527 entailment
1530
+ 1528 not_entailment
1531
+ 1529 not_entailment
1532
+ 1530 entailment
1533
+ 1531 entailment
1534
+ 1532 entailment
1535
+ 1533 entailment
1536
+ 1534 entailment
1537
+ 1535 entailment
1538
+ 1536 entailment
1539
+ 1537 not_entailment
1540
+ 1538 entailment
1541
+ 1539 not_entailment
1542
+ 1540 not_entailment
1543
+ 1541 not_entailment
1544
+ 1542 not_entailment
1545
+ 1543 entailment
1546
+ 1544 entailment
1547
+ 1545 not_entailment
1548
+ 1546 entailment
1549
+ 1547 entailment
1550
+ 1548 entailment
1551
+ 1549 entailment
1552
+ 1550 entailment
1553
+ 1551 entailment
1554
+ 1552 entailment
1555
+ 1553 entailment
1556
+ 1554 not_entailment
1557
+ 1555 entailment
1558
+ 1556 entailment
1559
+ 1557 entailment
1560
+ 1558 not_entailment
1561
+ 1559 not_entailment
1562
+ 1560 entailment
1563
+ 1561 entailment
1564
+ 1562 entailment
1565
+ 1563 entailment
1566
+ 1564 not_entailment
1567
+ 1565 entailment
1568
+ 1566 not_entailment
1569
+ 1567 entailment
1570
+ 1568 entailment
1571
+ 1569 not_entailment
1572
+ 1570 entailment
1573
+ 1571 entailment
1574
+ 1572 entailment
1575
+ 1573 entailment
1576
+ 1574 not_entailment
1577
+ 1575 not_entailment
1578
+ 1576 entailment
1579
+ 1577 not_entailment
1580
+ 1578 entailment
1581
+ 1579 not_entailment
1582
+ 1580 not_entailment
1583
+ 1581 entailment
1584
+ 1582 not_entailment
1585
+ 1583 not_entailment
1586
+ 1584 not_entailment
1587
+ 1585 entailment
1588
+ 1586 entailment
1589
+ 1587 entailment
1590
+ 1588 entailment
1591
+ 1589 not_entailment
1592
+ 1590 entailment
1593
+ 1591 entailment
1594
+ 1592 not_entailment
1595
+ 1593 not_entailment
1596
+ 1594 entailment
1597
+ 1595 entailment
1598
+ 1596 not_entailment
1599
+ 1597 entailment
1600
+ 1598 entailment
1601
+ 1599 not_entailment
1602
+ 1600 entailment
1603
+ 1601 entailment
1604
+ 1602 not_entailment
1605
+ 1603 not_entailment
1606
+ 1604 not_entailment
1607
+ 1605 entailment
1608
+ 1606 entailment
1609
+ 1607 not_entailment
1610
+ 1608 entailment
1611
+ 1609 entailment
1612
+ 1610 entailment
1613
+ 1611 not_entailment
1614
+ 1612 entailment
1615
+ 1613 entailment
1616
+ 1614 not_entailment
1617
+ 1615 entailment
1618
+ 1616 entailment
1619
+ 1617 entailment
1620
+ 1618 not_entailment
1621
+ 1619 entailment
1622
+ 1620 not_entailment
1623
+ 1621 entailment
1624
+ 1622 not_entailment
1625
+ 1623 entailment
1626
+ 1624 entailment
1627
+ 1625 not_entailment
1628
+ 1626 entailment
1629
+ 1627 entailment
1630
+ 1628 not_entailment
1631
+ 1629 entailment
1632
+ 1630 not_entailment
1633
+ 1631 not_entailment
1634
+ 1632 entailment
1635
+ 1633 entailment
1636
+ 1634 entailment
1637
+ 1635 entailment
1638
+ 1636 not_entailment
1639
+ 1637 not_entailment
1640
+ 1638 not_entailment
1641
+ 1639 not_entailment
1642
+ 1640 entailment
1643
+ 1641 not_entailment
1644
+ 1642 not_entailment
1645
+ 1643 entailment
1646
+ 1644 not_entailment
1647
+ 1645 not_entailment
1648
+ 1646 not_entailment
1649
+ 1647 not_entailment
1650
+ 1648 not_entailment
1651
+ 1649 not_entailment
1652
+ 1650 entailment
1653
+ 1651 entailment
1654
+ 1652 not_entailment
1655
+ 1653 not_entailment
1656
+ 1654 entailment
1657
+ 1655 entailment
1658
+ 1656 entailment
1659
+ 1657 entailment
1660
+ 1658 entailment
1661
+ 1659 entailment
1662
+ 1660 not_entailment
1663
+ 1661 entailment
1664
+ 1662 entailment
1665
+ 1663 not_entailment
1666
+ 1664 not_entailment
1667
+ 1665 not_entailment
1668
+ 1666 entailment
1669
+ 1667 not_entailment
1670
+ 1668 entailment
1671
+ 1669 entailment
1672
+ 1670 not_entailment
1673
+ 1671 entailment
1674
+ 1672 not_entailment
1675
+ 1673 not_entailment
1676
+ 1674 entailment
1677
+ 1675 not_entailment
1678
+ 1676 entailment
1679
+ 1677 entailment
1680
+ 1678 not_entailment
1681
+ 1679 entailment
1682
+ 1680 entailment
1683
+ 1681 entailment
1684
+ 1682 not_entailment
1685
+ 1683 entailment
1686
+ 1684 entailment
1687
+ 1685 entailment
1688
+ 1686 entailment
1689
+ 1687 entailment
1690
+ 1688 not_entailment
1691
+ 1689 entailment
1692
+ 1690 entailment
1693
+ 1691 entailment
1694
+ 1692 not_entailment
1695
+ 1693 not_entailment
1696
+ 1694 not_entailment
1697
+ 1695 entailment
1698
+ 1696 entailment
1699
+ 1697 entailment
1700
+ 1698 entailment
1701
+ 1699 not_entailment
1702
+ 1700 entailment
1703
+ 1701 entailment
1704
+ 1702 entailment
1705
+ 1703 entailment
1706
+ 1704 entailment
1707
+ 1705 entailment
1708
+ 1706 not_entailment
1709
+ 1707 entailment
1710
+ 1708 not_entailment
1711
+ 1709 not_entailment
1712
+ 1710 not_entailment
1713
+ 1711 not_entailment
1714
+ 1712 entailment
1715
+ 1713 entailment
1716
+ 1714 not_entailment
1717
+ 1715 entailment
1718
+ 1716 entailment
1719
+ 1717 entailment
1720
+ 1718 not_entailment
1721
+ 1719 not_entailment
1722
+ 1720 not_entailment
1723
+ 1721 entailment
1724
+ 1722 entailment
1725
+ 1723 entailment
1726
+ 1724 entailment
1727
+ 1725 entailment
1728
+ 1726 not_entailment
1729
+ 1727 entailment
1730
+ 1728 entailment
1731
+ 1729 entailment
1732
+ 1730 entailment
1733
+ 1731 entailment
1734
+ 1732 not_entailment
1735
+ 1733 entailment
1736
+ 1734 entailment
1737
+ 1735 not_entailment
1738
+ 1736 entailment
1739
+ 1737 entailment
1740
+ 1738 not_entailment
1741
+ 1739 not_entailment
1742
+ 1740 entailment
1743
+ 1741 entailment
1744
+ 1742 not_entailment
1745
+ 1743 not_entailment
1746
+ 1744 entailment
1747
+ 1745 not_entailment
1748
+ 1746 entailment
1749
+ 1747 entailment
1750
+ 1748 not_entailment
1751
+ 1749 not_entailment
1752
+ 1750 not_entailment
1753
+ 1751 entailment
1754
+ 1752 not_entailment
1755
+ 1753 entailment
1756
+ 1754 not_entailment
1757
+ 1755 not_entailment
1758
+ 1756 not_entailment
1759
+ 1757 not_entailment
1760
+ 1758 entailment
1761
+ 1759 entailment
1762
+ 1760 entailment
1763
+ 1761 not_entailment
1764
+ 1762 entailment
1765
+ 1763 entailment
1766
+ 1764 entailment
1767
+ 1765 entailment
1768
+ 1766 not_entailment
1769
+ 1767 not_entailment
1770
+ 1768 entailment
1771
+ 1769 entailment
1772
+ 1770 entailment
1773
+ 1771 not_entailment
1774
+ 1772 not_entailment
1775
+ 1773 entailment
1776
+ 1774 not_entailment
1777
+ 1775 entailment
1778
+ 1776 not_entailment
1779
+ 1777 entailment
1780
+ 1778 not_entailment
1781
+ 1779 entailment
1782
+ 1780 not_entailment
1783
+ 1781 not_entailment
1784
+ 1782 not_entailment
1785
+ 1783 entailment
1786
+ 1784 not_entailment
1787
+ 1785 entailment
1788
+ 1786 entailment
1789
+ 1787 entailment
1790
+ 1788 entailment
1791
+ 1789 entailment
1792
+ 1790 entailment
1793
+ 1791 entailment
1794
+ 1792 entailment
1795
+ 1793 not_entailment
1796
+ 1794 entailment
1797
+ 1795 entailment
1798
+ 1796 not_entailment
1799
+ 1797 not_entailment
1800
+ 1798 entailment
1801
+ 1799 not_entailment
1802
+ 1800 entailment
1803
+ 1801 not_entailment
1804
+ 1802 entailment
1805
+ 1803 not_entailment
1806
+ 1804 not_entailment
1807
+ 1805 not_entailment
1808
+ 1806 not_entailment
1809
+ 1807 not_entailment
1810
+ 1808 not_entailment
1811
+ 1809 entailment
1812
+ 1810 entailment
1813
+ 1811 entailment
1814
+ 1812 not_entailment
1815
+ 1813 entailment
1816
+ 1814 entailment
1817
+ 1815 entailment
1818
+ 1816 entailment
1819
+ 1817 not_entailment
1820
+ 1818 entailment
1821
+ 1819 entailment
1822
+ 1820 not_entailment
1823
+ 1821 not_entailment
1824
+ 1822 entailment
1825
+ 1823 not_entailment
1826
+ 1824 entailment
1827
+ 1825 not_entailment
1828
+ 1826 entailment
1829
+ 1827 not_entailment
1830
+ 1828 entailment
1831
+ 1829 entailment
1832
+ 1830 entailment
1833
+ 1831 not_entailment
1834
+ 1832 not_entailment
1835
+ 1833 not_entailment
1836
+ 1834 entailment
1837
+ 1835 entailment
1838
+ 1836 not_entailment
1839
+ 1837 entailment
1840
+ 1838 not_entailment
1841
+ 1839 not_entailment
1842
+ 1840 entailment
1843
+ 1841 entailment
1844
+ 1842 not_entailment
1845
+ 1843 entailment
1846
+ 1844 not_entailment
1847
+ 1845 not_entailment
1848
+ 1846 not_entailment
1849
+ 1847 not_entailment
1850
+ 1848 not_entailment
1851
+ 1849 entailment
1852
+ 1850 entailment
1853
+ 1851 not_entailment
1854
+ 1852 entailment
1855
+ 1853 not_entailment
1856
+ 1854 entailment
1857
+ 1855 not_entailment
1858
+ 1856 not_entailment
1859
+ 1857 entailment
1860
+ 1858 entailment
1861
+ 1859 not_entailment
1862
+ 1860 entailment
1863
+ 1861 entailment
1864
+ 1862 not_entailment
1865
+ 1863 entailment
1866
+ 1864 entailment
1867
+ 1865 not_entailment
1868
+ 1866 entailment
1869
+ 1867 not_entailment
1870
+ 1868 entailment
1871
+ 1869 entailment
1872
+ 1870 not_entailment
1873
+ 1871 entailment
1874
+ 1872 entailment
1875
+ 1873 entailment
1876
+ 1874 entailment
1877
+ 1875 entailment
1878
+ 1876 not_entailment
1879
+ 1877 not_entailment
1880
+ 1878 not_entailment
1881
+ 1879 not_entailment
1882
+ 1880 not_entailment
1883
+ 1881 entailment
1884
+ 1882 not_entailment
1885
+ 1883 not_entailment
1886
+ 1884 entailment
1887
+ 1885 not_entailment
1888
+ 1886 not_entailment
1889
+ 1887 entailment
1890
+ 1888 not_entailment
1891
+ 1889 entailment
1892
+ 1890 not_entailment
1893
+ 1891 not_entailment
1894
+ 1892 entailment
1895
+ 1893 entailment
1896
+ 1894 entailment
1897
+ 1895 not_entailment
1898
+ 1896 entailment
1899
+ 1897 entailment
1900
+ 1898 not_entailment
1901
+ 1899 entailment
1902
+ 1900 not_entailment
1903
+ 1901 not_entailment
1904
+ 1902 not_entailment
1905
+ 1903 not_entailment
1906
+ 1904 not_entailment
1907
+ 1905 entailment
1908
+ 1906 entailment
1909
+ 1907 not_entailment
1910
+ 1908 not_entailment
1911
+ 1909 entailment
1912
+ 1910 entailment
1913
+ 1911 entailment
1914
+ 1912 not_entailment
1915
+ 1913 not_entailment
1916
+ 1914 entailment
1917
+ 1915 not_entailment
1918
+ 1916 not_entailment
1919
+ 1917 entailment
1920
+ 1918 not_entailment
1921
+ 1919 entailment
1922
+ 1920 not_entailment
1923
+ 1921 not_entailment
1924
+ 1922 entailment
1925
+ 1923 entailment
1926
+ 1924 entailment
1927
+ 1925 entailment
1928
+ 1926 not_entailment
1929
+ 1927 not_entailment
1930
+ 1928 not_entailment
1931
+ 1929 entailment
1932
+ 1930 not_entailment
1933
+ 1931 entailment
1934
+ 1932 entailment
1935
+ 1933 not_entailment
1936
+ 1934 not_entailment
1937
+ 1935 entailment
1938
+ 1936 not_entailment
1939
+ 1937 not_entailment
1940
+ 1938 not_entailment
1941
+ 1939 entailment
1942
+ 1940 entailment
1943
+ 1941 not_entailment
1944
+ 1942 not_entailment
1945
+ 1943 entailment
1946
+ 1944 entailment
1947
+ 1945 not_entailment
1948
+ 1946 entailment
1949
+ 1947 entailment
1950
+ 1948 not_entailment
1951
+ 1949 not_entailment
1952
+ 1950 not_entailment
1953
+ 1951 not_entailment
1954
+ 1952 not_entailment
1955
+ 1953 not_entailment
1956
+ 1954 not_entailment
1957
+ 1955 not_entailment
1958
+ 1956 entailment
1959
+ 1957 entailment
1960
+ 1958 not_entailment
1961
+ 1959 entailment
1962
+ 1960 entailment
1963
+ 1961 entailment
1964
+ 1962 entailment
1965
+ 1963 not_entailment
1966
+ 1964 not_entailment
1967
+ 1965 entailment
1968
+ 1966 not_entailment
1969
+ 1967 entailment
1970
+ 1968 not_entailment
1971
+ 1969 not_entailment
1972
+ 1970 entailment
1973
+ 1971 entailment
1974
+ 1972 not_entailment
1975
+ 1973 entailment
1976
+ 1974 not_entailment
1977
+ 1975 not_entailment
1978
+ 1976 entailment
1979
+ 1977 not_entailment
1980
+ 1978 entailment
1981
+ 1979 entailment
1982
+ 1980 entailment
1983
+ 1981 not_entailment
1984
+ 1982 not_entailment
1985
+ 1983 entailment
1986
+ 1984 entailment
1987
+ 1985 entailment
1988
+ 1986 not_entailment
1989
+ 1987 not_entailment
1990
+ 1988 entailment
1991
+ 1989 entailment
1992
+ 1990 entailment
1993
+ 1991 entailment
1994
+ 1992 not_entailment
1995
+ 1993 not_entailment
1996
+ 1994 not_entailment
1997
+ 1995 entailment
1998
+ 1996 entailment
1999
+ 1997 entailment
2000
+ 1998 entailment
2001
+ 1999 entailment
2002
+ 2000 entailment
2003
+ 2001 not_entailment
2004
+ 2002 entailment
2005
+ 2003 entailment
2006
+ 2004 not_entailment
2007
+ 2005 entailment
2008
+ 2006 not_entailment
2009
+ 2007 not_entailment
2010
+ 2008 not_entailment
2011
+ 2009 not_entailment
2012
+ 2010 entailment
2013
+ 2011 entailment
2014
+ 2012 not_entailment
2015
+ 2013 entailment
2016
+ 2014 entailment
2017
+ 2015 entailment
2018
+ 2016 entailment
2019
+ 2017 not_entailment
2020
+ 2018 not_entailment
2021
+ 2019 entailment
2022
+ 2020 not_entailment
2023
+ 2021 not_entailment
2024
+ 2022 entailment
2025
+ 2023 entailment
2026
+ 2024 entailment
2027
+ 2025 entailment
2028
+ 2026 entailment
2029
+ 2027 entailment
2030
+ 2028 not_entailment
2031
+ 2029 entailment
2032
+ 2030 not_entailment
2033
+ 2031 entailment
2034
+ 2032 entailment
2035
+ 2033 entailment
2036
+ 2034 entailment
2037
+ 2035 not_entailment
2038
+ 2036 entailment
2039
+ 2037 not_entailment
2040
+ 2038 entailment
2041
+ 2039 entailment
2042
+ 2040 entailment
2043
+ 2041 entailment
2044
+ 2042 entailment
2045
+ 2043 entailment
2046
+ 2044 entailment
2047
+ 2045 entailment
2048
+ 2046 entailment
2049
+ 2047 not_entailment
2050
+ 2048 not_entailment
2051
+ 2049 entailment
2052
+ 2050 entailment
2053
+ 2051 entailment
2054
+ 2052 not_entailment
2055
+ 2053 not_entailment
2056
+ 2054 not_entailment
2057
+ 2055 not_entailment
2058
+ 2056 not_entailment
2059
+ 2057 entailment
2060
+ 2058 entailment
2061
+ 2059 not_entailment
2062
+ 2060 not_entailment
2063
+ 2061 not_entailment
2064
+ 2062 not_entailment
2065
+ 2063 entailment
2066
+ 2064 not_entailment
2067
+ 2065 entailment
2068
+ 2066 entailment
2069
+ 2067 entailment
2070
+ 2068 entailment
2071
+ 2069 entailment
2072
+ 2070 entailment
2073
+ 2071 not_entailment
2074
+ 2072 entailment
2075
+ 2073 entailment
2076
+ 2074 not_entailment
2077
+ 2075 entailment
2078
+ 2076 not_entailment
2079
+ 2077 entailment
2080
+ 2078 entailment
2081
+ 2079 entailment
2082
+ 2080 entailment
2083
+ 2081 entailment
2084
+ 2082 not_entailment
2085
+ 2083 entailment
2086
+ 2084 not_entailment
2087
+ 2085 not_entailment
2088
+ 2086 entailment
2089
+ 2087 entailment
2090
+ 2088 entailment
2091
+ 2089 entailment
2092
+ 2090 not_entailment
2093
+ 2091 not_entailment
2094
+ 2092 not_entailment
2095
+ 2093 not_entailment
2096
+ 2094 entailment
2097
+ 2095 not_entailment
2098
+ 2096 not_entailment
2099
+ 2097 entailment
2100
+ 2098 not_entailment
2101
+ 2099 entailment
2102
+ 2100 not_entailment
2103
+ 2101 not_entailment
2104
+ 2102 entailment
2105
+ 2103 not_entailment
2106
+ 2104 not_entailment
2107
+ 2105 not_entailment
2108
+ 2106 entailment
2109
+ 2107 entailment
2110
+ 2108 entailment
2111
+ 2109 not_entailment
2112
+ 2110 entailment
2113
+ 2111 entailment
2114
+ 2112 entailment
2115
+ 2113 entailment
2116
+ 2114 entailment
2117
+ 2115 entailment
2118
+ 2116 not_entailment
2119
+ 2117 entailment
2120
+ 2118 not_entailment
2121
+ 2119 not_entailment
2122
+ 2120 entailment
2123
+ 2121 not_entailment
2124
+ 2122 entailment
2125
+ 2123 entailment
2126
+ 2124 not_entailment
2127
+ 2125 not_entailment
2128
+ 2126 entailment
2129
+ 2127 entailment
2130
+ 2128 entailment
2131
+ 2129 not_entailment
2132
+ 2130 entailment
2133
+ 2131 not_entailment
2134
+ 2132 not_entailment
2135
+ 2133 not_entailment
2136
+ 2134 not_entailment
2137
+ 2135 entailment
2138
+ 2136 not_entailment
2139
+ 2137 entailment
2140
+ 2138 not_entailment
2141
+ 2139 entailment
2142
+ 2140 entailment
2143
+ 2141 not_entailment
2144
+ 2142 entailment
2145
+ 2143 not_entailment
2146
+ 2144 entailment
2147
+ 2145 entailment
2148
+ 2146 entailment
2149
+ 2147 not_entailment
2150
+ 2148 not_entailment
2151
+ 2149 entailment
2152
+ 2150 entailment
2153
+ 2151 not_entailment
2154
+ 2152 entailment
2155
+ 2153 not_entailment
2156
+ 2154 entailment
2157
+ 2155 entailment
2158
+ 2156 entailment
2159
+ 2157 entailment
2160
+ 2158 not_entailment
2161
+ 2159 entailment
2162
+ 2160 not_entailment
2163
+ 2161 entailment
2164
+ 2162 entailment
2165
+ 2163 not_entailment
2166
+ 2164 not_entailment
2167
+ 2165 not_entailment
2168
+ 2166 not_entailment
2169
+ 2167 entailment
2170
+ 2168 entailment
2171
+ 2169 not_entailment
2172
+ 2170 not_entailment
2173
+ 2171 entailment
2174
+ 2172 not_entailment
2175
+ 2173 not_entailment
2176
+ 2174 entailment
2177
+ 2175 not_entailment
2178
+ 2176 not_entailment
2179
+ 2177 entailment
2180
+ 2178 entailment
2181
+ 2179 entailment
2182
+ 2180 not_entailment
2183
+ 2181 not_entailment
2184
+ 2182 entailment
2185
+ 2183 not_entailment
2186
+ 2184 not_entailment
2187
+ 2185 not_entailment
2188
+ 2186 entailment
2189
+ 2187 entailment
2190
+ 2188 entailment
2191
+ 2189 not_entailment
2192
+ 2190 not_entailment
2193
+ 2191 entailment
2194
+ 2192 entailment
2195
+ 2193 not_entailment
2196
+ 2194 not_entailment
2197
+ 2195 not_entailment
2198
+ 2196 not_entailment
2199
+ 2197 entailment
2200
+ 2198 not_entailment
2201
+ 2199 entailment
2202
+ 2200 entailment
2203
+ 2201 entailment
2204
+ 2202 entailment
2205
+ 2203 not_entailment
2206
+ 2204 not_entailment
2207
+ 2205 entailment
2208
+ 2206 not_entailment
2209
+ 2207 not_entailment
2210
+ 2208 not_entailment
2211
+ 2209 not_entailment
2212
+ 2210 not_entailment
2213
+ 2211 entailment
2214
+ 2212 entailment
2215
+ 2213 entailment
2216
+ 2214 entailment
2217
+ 2215 entailment
2218
+ 2216 not_entailment
2219
+ 2217 not_entailment
2220
+ 2218 not_entailment
2221
+ 2219 not_entailment
2222
+ 2220 not_entailment
2223
+ 2221 entailment
2224
+ 2222 entailment
2225
+ 2223 not_entailment
2226
+ 2224 not_entailment
2227
+ 2225 not_entailment
2228
+ 2226 not_entailment
2229
+ 2227 not_entailment
2230
+ 2228 not_entailment
2231
+ 2229 not_entailment
2232
+ 2230 not_entailment
2233
+ 2231 not_entailment
2234
+ 2232 not_entailment
2235
+ 2233 not_entailment
2236
+ 2234 entailment
2237
+ 2235 not_entailment
2238
+ 2236 not_entailment
2239
+ 2237 not_entailment
2240
+ 2238 entailment
2241
+ 2239 entailment
2242
+ 2240 not_entailment
2243
+ 2241 not_entailment
2244
+ 2242 entailment
2245
+ 2243 entailment
2246
+ 2244 entailment
2247
+ 2245 entailment
2248
+ 2246 not_entailment
2249
+ 2247 not_entailment
2250
+ 2248 entailment
2251
+ 2249 not_entailment
2252
+ 2250 entailment
2253
+ 2251 not_entailment
2254
+ 2252 entailment
2255
+ 2253 entailment
2256
+ 2254 entailment
2257
+ 2255 not_entailment
2258
+ 2256 entailment
2259
+ 2257 not_entailment
2260
+ 2258 not_entailment
2261
+ 2259 entailment
2262
+ 2260 entailment
2263
+ 2261 entailment
2264
+ 2262 not_entailment
2265
+ 2263 not_entailment
2266
+ 2264 not_entailment
2267
+ 2265 not_entailment
2268
+ 2266 not_entailment
2269
+ 2267 entailment
2270
+ 2268 not_entailment
2271
+ 2269 not_entailment
2272
+ 2270 not_entailment
2273
+ 2271 entailment
2274
+ 2272 entailment
2275
+ 2273 entailment
2276
+ 2274 entailment
2277
+ 2275 entailment
2278
+ 2276 not_entailment
2279
+ 2277 not_entailment
2280
+ 2278 not_entailment
2281
+ 2279 entailment
2282
+ 2280 not_entailment
2283
+ 2281 not_entailment
2284
+ 2282 entailment
2285
+ 2283 not_entailment
2286
+ 2284 not_entailment
2287
+ 2285 entailment
2288
+ 2286 not_entailment
2289
+ 2287 not_entailment
2290
+ 2288 entailment
2291
+ 2289 entailment
2292
+ 2290 entailment
2293
+ 2291 entailment
2294
+ 2292 entailment
2295
+ 2293 entailment
2296
+ 2294 entailment
2297
+ 2295 entailment
2298
+ 2296 entailment
2299
+ 2297 entailment
2300
+ 2298 entailment
2301
+ 2299 entailment
2302
+ 2300 not_entailment
2303
+ 2301 entailment
2304
+ 2302 not_entailment
2305
+ 2303 entailment
2306
+ 2304 entailment
2307
+ 2305 entailment
2308
+ 2306 entailment
2309
+ 2307 not_entailment
2310
+ 2308 not_entailment
2311
+ 2309 not_entailment
2312
+ 2310 entailment
2313
+ 2311 not_entailment
2314
+ 2312 not_entailment
2315
+ 2313 entailment
2316
+ 2314 not_entailment
2317
+ 2315 entailment
2318
+ 2316 entailment
2319
+ 2317 entailment
2320
+ 2318 entailment
2321
+ 2319 entailment
2322
+ 2320 not_entailment
2323
+ 2321 entailment
2324
+ 2322 entailment
2325
+ 2323 not_entailment
2326
+ 2324 entailment
2327
+ 2325 entailment
2328
+ 2326 not_entailment
2329
+ 2327 not_entailment
2330
+ 2328 not_entailment
2331
+ 2329 not_entailment
2332
+ 2330 entailment
2333
+ 2331 entailment
2334
+ 2332 entailment
2335
+ 2333 entailment
2336
+ 2334 not_entailment
2337
+ 2335 not_entailment
2338
+ 2336 entailment
2339
+ 2337 entailment
2340
+ 2338 entailment
2341
+ 2339 not_entailment
2342
+ 2340 not_entailment
2343
+ 2341 entailment
2344
+ 2342 not_entailment
2345
+ 2343 entailment
2346
+ 2344 not_entailment
2347
+ 2345 entailment
2348
+ 2346 entailment
2349
+ 2347 entailment
2350
+ 2348 not_entailment
2351
+ 2349 entailment
2352
+ 2350 entailment
2353
+ 2351 not_entailment
2354
+ 2352 not_entailment
2355
+ 2353 not_entailment
2356
+ 2354 not_entailment
2357
+ 2355 entailment
2358
+ 2356 entailment
2359
+ 2357 not_entailment
2360
+ 2358 entailment
2361
+ 2359 entailment
2362
+ 2360 entailment
2363
+ 2361 not_entailment
2364
+ 2362 not_entailment
2365
+ 2363 not_entailment
2366
+ 2364 not_entailment
2367
+ 2365 not_entailment
2368
+ 2366 not_entailment
2369
+ 2367 entailment
2370
+ 2368 entailment
2371
+ 2369 entailment
2372
+ 2370 entailment
2373
+ 2371 entailment
2374
+ 2372 entailment
2375
+ 2373 entailment
2376
+ 2374 entailment
2377
+ 2375 entailment
2378
+ 2376 not_entailment
2379
+ 2377 entailment
2380
+ 2378 not_entailment
2381
+ 2379 not_entailment
2382
+ 2380 not_entailment
2383
+ 2381 not_entailment
2384
+ 2382 entailment
2385
+ 2383 entailment
2386
+ 2384 entailment
2387
+ 2385 entailment
2388
+ 2386 entailment
2389
+ 2387 entailment
2390
+ 2388 entailment
2391
+ 2389 entailment
2392
+ 2390 not_entailment
2393
+ 2391 entailment
2394
+ 2392 entailment
2395
+ 2393 entailment
2396
+ 2394 entailment
2397
+ 2395 entailment
2398
+ 2396 not_entailment
2399
+ 2397 not_entailment
2400
+ 2398 entailment
2401
+ 2399 entailment
2402
+ 2400 not_entailment
2403
+ 2401 not_entailment
2404
+ 2402 not_entailment
2405
+ 2403 entailment
2406
+ 2404 not_entailment
2407
+ 2405 entailment
2408
+ 2406 entailment
2409
+ 2407 entailment
2410
+ 2408 not_entailment
2411
+ 2409 not_entailment
2412
+ 2410 not_entailment
2413
+ 2411 not_entailment
2414
+ 2412 not_entailment
2415
+ 2413 not_entailment
2416
+ 2414 entailment
2417
+ 2415 entailment
2418
+ 2416 not_entailment
2419
+ 2417 not_entailment
2420
+ 2418 not_entailment
2421
+ 2419 entailment
2422
+ 2420 entailment
2423
+ 2421 entailment
2424
+ 2422 not_entailment
2425
+ 2423 not_entailment
2426
+ 2424 not_entailment
2427
+ 2425 entailment
2428
+ 2426 entailment
2429
+ 2427 entailment
2430
+ 2428 not_entailment
2431
+ 2429 not_entailment
2432
+ 2430 entailment
2433
+ 2431 not_entailment
2434
+ 2432 entailment
2435
+ 2433 not_entailment
2436
+ 2434 not_entailment
2437
+ 2435 entailment
2438
+ 2436 entailment
2439
+ 2437 entailment
2440
+ 2438 not_entailment
2441
+ 2439 not_entailment
2442
+ 2440 not_entailment
2443
+ 2441 entailment
2444
+ 2442 entailment
2445
+ 2443 not_entailment
2446
+ 2444 entailment
2447
+ 2445 entailment
2448
+ 2446 not_entailment
2449
+ 2447 not_entailment
2450
+ 2448 entailment
2451
+ 2449 entailment
2452
+ 2450 entailment
2453
+ 2451 entailment
2454
+ 2452 entailment
2455
+ 2453 entailment
2456
+ 2454 not_entailment
2457
+ 2455 not_entailment
2458
+ 2456 entailment
2459
+ 2457 entailment
2460
+ 2458 not_entailment
2461
+ 2459 entailment
2462
+ 2460 entailment
2463
+ 2461 not_entailment
2464
+ 2462 entailment
2465
+ 2463 entailment
2466
+ 2464 entailment
2467
+ 2465 entailment
2468
+ 2466 not_entailment
2469
+ 2467 entailment
2470
+ 2468 entailment
2471
+ 2469 entailment
2472
+ 2470 entailment
2473
+ 2471 entailment
2474
+ 2472 not_entailment
2475
+ 2473 entailment
2476
+ 2474 entailment
2477
+ 2475 not_entailment
2478
+ 2476 entailment
2479
+ 2477 entailment
2480
+ 2478 entailment
2481
+ 2479 entailment
2482
+ 2480 entailment
2483
+ 2481 entailment
2484
+ 2482 entailment
2485
+ 2483 entailment
2486
+ 2484 not_entailment
2487
+ 2485 entailment
2488
+ 2486 not_entailment
2489
+ 2487 entailment
2490
+ 2488 not_entailment
2491
+ 2489 not_entailment
2492
+ 2490 entailment
2493
+ 2491 entailment
2494
+ 2492 not_entailment
2495
+ 2493 entailment
2496
+ 2494 not_entailment
2497
+ 2495 not_entailment
2498
+ 2496 not_entailment
2499
+ 2497 not_entailment
2500
+ 2498 entailment
2501
+ 2499 not_entailment
2502
+ 2500 entailment
2503
+ 2501 not_entailment
2504
+ 2502 entailment
2505
+ 2503 entailment
2506
+ 2504 entailment
2507
+ 2505 entailment
2508
+ 2506 entailment
2509
+ 2507 entailment
2510
+ 2508 not_entailment
2511
+ 2509 not_entailment
2512
+ 2510 not_entailment
2513
+ 2511 entailment
2514
+ 2512 entailment
2515
+ 2513 not_entailment
2516
+ 2514 not_entailment
2517
+ 2515 entailment
2518
+ 2516 not_entailment
2519
+ 2517 not_entailment
2520
+ 2518 entailment
2521
+ 2519 entailment
2522
+ 2520 entailment
2523
+ 2521 not_entailment
2524
+ 2522 entailment
2525
+ 2523 entailment
2526
+ 2524 not_entailment
2527
+ 2525 not_entailment
2528
+ 2526 not_entailment
2529
+ 2527 not_entailment
2530
+ 2528 not_entailment
2531
+ 2529 entailment
2532
+ 2530 not_entailment
2533
+ 2531 not_entailment
2534
+ 2532 not_entailment
2535
+ 2533 entailment
2536
+ 2534 not_entailment
2537
+ 2535 entailment
2538
+ 2536 entailment
2539
+ 2537 not_entailment
2540
+ 2538 not_entailment
2541
+ 2539 entailment
2542
+ 2540 not_entailment
2543
+ 2541 entailment
2544
+ 2542 entailment
2545
+ 2543 entailment
2546
+ 2544 entailment
2547
+ 2545 entailment
2548
+ 2546 not_entailment
2549
+ 2547 entailment
2550
+ 2548 entailment
2551
+ 2549 entailment
2552
+ 2550 entailment
2553
+ 2551 entailment
2554
+ 2552 entailment
2555
+ 2553 entailment
2556
+ 2554 entailment
2557
+ 2555 entailment
2558
+ 2556 entailment
2559
+ 2557 entailment
2560
+ 2558 entailment
2561
+ 2559 entailment
2562
+ 2560 not_entailment
2563
+ 2561 entailment
2564
+ 2562 entailment
2565
+ 2563 entailment
2566
+ 2564 entailment
2567
+ 2565 entailment
2568
+ 2566 not_entailment
2569
+ 2567 not_entailment
2570
+ 2568 entailment
2571
+ 2569 entailment
2572
+ 2570 entailment
2573
+ 2571 not_entailment
2574
+ 2572 not_entailment
2575
+ 2573 entailment
2576
+ 2574 not_entailment
2577
+ 2575 entailment
2578
+ 2576 not_entailment
2579
+ 2577 entailment
2580
+ 2578 not_entailment
2581
+ 2579 not_entailment
2582
+ 2580 entailment
2583
+ 2581 not_entailment
2584
+ 2582 not_entailment
2585
+ 2583 not_entailment
2586
+ 2584 entailment
2587
+ 2585 not_entailment
2588
+ 2586 entailment
2589
+ 2587 entailment
2590
+ 2588 not_entailment
2591
+ 2589 entailment
2592
+ 2590 entailment
2593
+ 2591 not_entailment
2594
+ 2592 not_entailment
2595
+ 2593 entailment
2596
+ 2594 entailment
2597
+ 2595 not_entailment
2598
+ 2596 entailment
2599
+ 2597 not_entailment
2600
+ 2598 entailment
2601
+ 2599 not_entailment
2602
+ 2600 entailment
2603
+ 2601 entailment
2604
+ 2602 not_entailment
2605
+ 2603 not_entailment
2606
+ 2604 not_entailment
2607
+ 2605 not_entailment
2608
+ 2606 not_entailment
2609
+ 2607 not_entailment
2610
+ 2608 entailment
2611
+ 2609 not_entailment
2612
+ 2610 not_entailment
2613
+ 2611 entailment
2614
+ 2612 not_entailment
2615
+ 2613 entailment
2616
+ 2614 not_entailment
2617
+ 2615 not_entailment
2618
+ 2616 not_entailment
2619
+ 2617 entailment
2620
+ 2618 entailment
2621
+ 2619 not_entailment
2622
+ 2620 not_entailment
2623
+ 2621 entailment
2624
+ 2622 entailment
2625
+ 2623 not_entailment
2626
+ 2624 entailment
2627
+ 2625 entailment
2628
+ 2626 entailment
2629
+ 2627 entailment
2630
+ 2628 not_entailment
2631
+ 2629 not_entailment
2632
+ 2630 not_entailment
2633
+ 2631 entailment
2634
+ 2632 not_entailment
2635
+ 2633 not_entailment
2636
+ 2634 entailment
2637
+ 2635 entailment
2638
+ 2636 not_entailment
2639
+ 2637 not_entailment
2640
+ 2638 not_entailment
2641
+ 2639 entailment
2642
+ 2640 not_entailment
2643
+ 2641 entailment
2644
+ 2642 not_entailment
2645
+ 2643 not_entailment
2646
+ 2644 not_entailment
2647
+ 2645 entailment
2648
+ 2646 entailment
2649
+ 2647 not_entailment
2650
+ 2648 entailment
2651
+ 2649 entailment
2652
+ 2650 not_entailment
2653
+ 2651 not_entailment
2654
+ 2652 not_entailment
2655
+ 2653 entailment
2656
+ 2654 entailment
2657
+ 2655 entailment
2658
+ 2656 entailment
2659
+ 2657 not_entailment
2660
+ 2658 not_entailment
2661
+ 2659 entailment
2662
+ 2660 entailment
2663
+ 2661 not_entailment
2664
+ 2662 entailment
2665
+ 2663 entailment
2666
+ 2664 not_entailment
2667
+ 2665 entailment
2668
+ 2666 not_entailment
2669
+ 2667 entailment
2670
+ 2668 entailment
2671
+ 2669 not_entailment
2672
+ 2670 not_entailment
2673
+ 2671 not_entailment
2674
+ 2672 entailment
2675
+ 2673 not_entailment
2676
+ 2674 not_entailment
2677
+ 2675 not_entailment
2678
+ 2676 not_entailment
2679
+ 2677 not_entailment
2680
+ 2678 not_entailment
2681
+ 2679 not_entailment
2682
+ 2680 entailment
2683
+ 2681 not_entailment
2684
+ 2682 not_entailment
2685
+ 2683 not_entailment
2686
+ 2684 not_entailment
2687
+ 2685 not_entailment
2688
+ 2686 not_entailment
2689
+ 2687 entailment
2690
+ 2688 entailment
2691
+ 2689 entailment
2692
+ 2690 not_entailment
2693
+ 2691 entailment
2694
+ 2692 entailment
2695
+ 2693 entailment
2696
+ 2694 not_entailment
2697
+ 2695 entailment
2698
+ 2696 entailment
2699
+ 2697 not_entailment
2700
+ 2698 entailment
2701
+ 2699 entailment
2702
+ 2700 entailment
2703
+ 2701 not_entailment
2704
+ 2702 entailment
2705
+ 2703 not_entailment
2706
+ 2704 entailment
2707
+ 2705 entailment
2708
+ 2706 not_entailment
2709
+ 2707 not_entailment
2710
+ 2708 entailment
2711
+ 2709 not_entailment
2712
+ 2710 not_entailment
2713
+ 2711 entailment
2714
+ 2712 entailment
2715
+ 2713 not_entailment
2716
+ 2714 entailment
2717
+ 2715 entailment
2718
+ 2716 entailment
2719
+ 2717 entailment
2720
+ 2718 entailment
2721
+ 2719 not_entailment
2722
+ 2720 not_entailment
2723
+ 2721 entailment
2724
+ 2722 not_entailment
2725
+ 2723 not_entailment
2726
+ 2724 not_entailment
2727
+ 2725 not_entailment
2728
+ 2726 not_entailment
2729
+ 2727 not_entailment
2730
+ 2728 not_entailment
2731
+ 2729 entailment
2732
+ 2730 not_entailment
2733
+ 2731 not_entailment
2734
+ 2732 entailment
2735
+ 2733 not_entailment
2736
+ 2734 not_entailment
2737
+ 2735 not_entailment
2738
+ 2736 entailment
2739
+ 2737 not_entailment
2740
+ 2738 not_entailment
2741
+ 2739 entailment
2742
+ 2740 not_entailment
2743
+ 2741 not_entailment
2744
+ 2742 entailment
2745
+ 2743 entailment
2746
+ 2744 entailment
2747
+ 2745 not_entailment
2748
+ 2746 entailment
2749
+ 2747 not_entailment
2750
+ 2748 entailment
2751
+ 2749 not_entailment
2752
+ 2750 entailment
2753
+ 2751 entailment
2754
+ 2752 entailment
2755
+ 2753 entailment
2756
+ 2754 not_entailment
2757
+ 2755 not_entailment
2758
+ 2756 not_entailment
2759
+ 2757 not_entailment
2760
+ 2758 not_entailment
2761
+ 2759 not_entailment
2762
+ 2760 entailment
2763
+ 2761 not_entailment
2764
+ 2762 not_entailment
2765
+ 2763 not_entailment
2766
+ 2764 not_entailment
2767
+ 2765 entailment
2768
+ 2766 not_entailment
2769
+ 2767 not_entailment
2770
+ 2768 not_entailment
2771
+ 2769 entailment
2772
+ 2770 entailment
2773
+ 2771 not_entailment
2774
+ 2772 entailment
2775
+ 2773 entailment
2776
+ 2774 entailment
2777
+ 2775 not_entailment
2778
+ 2776 entailment
2779
+ 2777 entailment
2780
+ 2778 not_entailment
2781
+ 2779 not_entailment
2782
+ 2780 entailment
2783
+ 2781 not_entailment
2784
+ 2782 entailment
2785
+ 2783 not_entailment
2786
+ 2784 not_entailment
2787
+ 2785 not_entailment
2788
+ 2786 not_entailment
2789
+ 2787 entailment
2790
+ 2788 not_entailment
2791
+ 2789 not_entailment
2792
+ 2790 not_entailment
2793
+ 2791 not_entailment
2794
+ 2792 not_entailment
2795
+ 2793 entailment
2796
+ 2794 not_entailment
2797
+ 2795 not_entailment
2798
+ 2796 entailment
2799
+ 2797 entailment
2800
+ 2798 not_entailment
2801
+ 2799 entailment
2802
+ 2800 entailment
2803
+ 2801 not_entailment
2804
+ 2802 not_entailment
2805
+ 2803 entailment
2806
+ 2804 not_entailment
2807
+ 2805 entailment
2808
+ 2806 entailment
2809
+ 2807 entailment
2810
+ 2808 not_entailment
2811
+ 2809 entailment
2812
+ 2810 entailment
2813
+ 2811 not_entailment
2814
+ 2812 entailment
2815
+ 2813 entailment
2816
+ 2814 entailment
2817
+ 2815 entailment
2818
+ 2816 not_entailment
2819
+ 2817 not_entailment
2820
+ 2818 entailment
2821
+ 2819 not_entailment
2822
+ 2820 not_entailment
2823
+ 2821 not_entailment
2824
+ 2822 not_entailment
2825
+ 2823 entailment
2826
+ 2824 entailment
2827
+ 2825 entailment
2828
+ 2826 entailment
2829
+ 2827 entailment
2830
+ 2828 entailment
2831
+ 2829 not_entailment
2832
+ 2830 not_entailment
2833
+ 2831 not_entailment
2834
+ 2832 not_entailment
2835
+ 2833 not_entailment
2836
+ 2834 entailment
2837
+ 2835 entailment
2838
+ 2836 not_entailment
2839
+ 2837 not_entailment
2840
+ 2838 not_entailment
2841
+ 2839 not_entailment
2842
+ 2840 not_entailment
2843
+ 2841 entailment
2844
+ 2842 entailment
2845
+ 2843 entailment
2846
+ 2844 not_entailment
2847
+ 2845 entailment
2848
+ 2846 entailment
2849
+ 2847 not_entailment
2850
+ 2848 not_entailment
2851
+ 2849 not_entailment
2852
+ 2850 not_entailment
2853
+ 2851 not_entailment
2854
+ 2852 entailment
2855
+ 2853 entailment
2856
+ 2854 not_entailment
2857
+ 2855 not_entailment
2858
+ 2856 not_entailment
2859
+ 2857 entailment
2860
+ 2858 entailment
2861
+ 2859 not_entailment
2862
+ 2860 not_entailment
2863
+ 2861 not_entailment
2864
+ 2862 not_entailment
2865
+ 2863 not_entailment
2866
+ 2864 entailment
2867
+ 2865 entailment
2868
+ 2866 entailment
2869
+ 2867 not_entailment
2870
+ 2868 entailment
2871
+ 2869 not_entailment
2872
+ 2870 not_entailment
2873
+ 2871 entailment
2874
+ 2872 entailment
2875
+ 2873 entailment
2876
+ 2874 not_entailment
2877
+ 2875 not_entailment
2878
+ 2876 entailment
2879
+ 2877 entailment
2880
+ 2878 not_entailment
2881
+ 2879 entailment
2882
+ 2880 entailment
2883
+ 2881 not_entailment
2884
+ 2882 entailment
2885
+ 2883 not_entailment
2886
+ 2884 entailment
2887
+ 2885 not_entailment
2888
+ 2886 not_entailment
2889
+ 2887 entailment
2890
+ 2888 not_entailment
2891
+ 2889 entailment
2892
+ 2890 entailment
2893
+ 2891 entailment
2894
+ 2892 not_entailment
2895
+ 2893 entailment
2896
+ 2894 not_entailment
2897
+ 2895 entailment
2898
+ 2896 not_entailment
2899
+ 2897 not_entailment
2900
+ 2898 not_entailment
2901
+ 2899 entailment
2902
+ 2900 not_entailment
2903
+ 2901 entailment
2904
+ 2902 entailment
2905
+ 2903 entailment
2906
+ 2904 not_entailment
2907
+ 2905 not_entailment
2908
+ 2906 not_entailment
2909
+ 2907 not_entailment
2910
+ 2908 entailment
2911
+ 2909 entailment
2912
+ 2910 entailment
2913
+ 2911 entailment
2914
+ 2912 not_entailment
2915
+ 2913 not_entailment
2916
+ 2914 entailment
2917
+ 2915 not_entailment
2918
+ 2916 not_entailment
2919
+ 2917 not_entailment
2920
+ 2918 entailment
2921
+ 2919 entailment
2922
+ 2920 entailment
2923
+ 2921 not_entailment
2924
+ 2922 entailment
2925
+ 2923 entailment
2926
+ 2924 not_entailment
2927
+ 2925 not_entailment
2928
+ 2926 not_entailment
2929
+ 2927 not_entailment
2930
+ 2928 entailment
2931
+ 2929 not_entailment
2932
+ 2930 not_entailment
2933
+ 2931 entailment
2934
+ 2932 not_entailment
2935
+ 2933 entailment
2936
+ 2934 entailment
2937
+ 2935 entailment
2938
+ 2936 entailment
2939
+ 2937 not_entailment
2940
+ 2938 not_entailment
2941
+ 2939 entailment
2942
+ 2940 not_entailment
2943
+ 2941 entailment
2944
+ 2942 not_entailment
2945
+ 2943 entailment
2946
+ 2944 entailment
2947
+ 2945 entailment
2948
+ 2946 not_entailment
2949
+ 2947 not_entailment
2950
+ 2948 entailment
2951
+ 2949 entailment
2952
+ 2950 not_entailment
2953
+ 2951 entailment
2954
+ 2952 entailment
2955
+ 2953 entailment
2956
+ 2954 not_entailment
2957
+ 2955 entailment
2958
+ 2956 not_entailment
2959
+ 2957 not_entailment
2960
+ 2958 not_entailment
2961
+ 2959 not_entailment
2962
+ 2960 entailment
2963
+ 2961 not_entailment
2964
+ 2962 entailment
2965
+ 2963 not_entailment
2966
+ 2964 not_entailment
2967
+ 2965 not_entailment
2968
+ 2966 not_entailment
2969
+ 2967 entailment
2970
+ 2968 not_entailment
2971
+ 2969 entailment
2972
+ 2970 entailment
2973
+ 2971 entailment
2974
+ 2972 not_entailment
2975
+ 2973 entailment
2976
+ 2974 entailment
2977
+ 2975 entailment
2978
+ 2976 entailment
2979
+ 2977 entailment
2980
+ 2978 entailment
2981
+ 2979 not_entailment
2982
+ 2980 entailment
2983
+ 2981 not_entailment
2984
+ 2982 entailment
2985
+ 2983 entailment
2986
+ 2984 entailment
2987
+ 2985 not_entailment
2988
+ 2986 not_entailment
2989
+ 2987 not_entailment
2990
+ 2988 entailment
2991
+ 2989 entailment
2992
+ 2990 not_entailment
2993
+ 2991 entailment
2994
+ 2992 entailment
2995
+ 2993 not_entailment
2996
+ 2994 entailment
2997
+ 2995 entailment
2998
+ 2996 not_entailment
2999
+ 2997 entailment
3000
+ 2998 not_entailment
3001
+ 2999 not_entailment
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "eval_accuracy": 0.8086642599277978,
4
+ "eval_loss": 0.5369592905044556,
5
+ "eval_runtime": 0.4898,
6
+ "eval_samples": 277,
7
+ "eval_samples_per_second": 565.514,
8
+ "eval_steps_per_second": 2.042
9
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "eval_accuracy": 0.8086642599277978,
4
+ "eval_loss": 0.5369592905044556,
5
+ "eval_runtime": 0.4898,
6
+ "eval_samples": 277,
7
+ "eval_samples_per_second": 565.514,
8
+ "eval_steps_per_second": 2.042
9
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft/tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128000": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "[CLS]",
47
+ "do_lower_case": false,
48
+ "eos_token": "[SEP]",
49
+ "extra_special_tokens": {},
50
+ "mask_token": "[MASK]",
51
+ "model_max_length": 512,
52
+ "pad_token": "[PAD]",
53
+ "padding_side": "right",
54
+ "sep_token": "[SEP]",
55
+ "sp_model_kwargs": {},
56
+ "split_by_punct": false,
57
+ "tokenizer_class": "DebertaV2Tokenizer",
58
+ "unk_token": "[UNK]",
59
+ "vocab_type": "spm"
60
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: microsoft/deberta-v3-base
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:microsoft/deberta-v3-base
6
+ - transformers
7
+ ---
8
+
9
+ # Model Card for Model ID
10
+
11
+ <!-- Provide a quick summary of what the model is/does. -->
12
+
13
+
14
+
15
+ ## Model Details
16
+
17
+ ### Model Description
18
+
19
+ <!-- Provide a longer summary of what this model is. -->
20
+
21
+
22
+
23
+ - **Developed by:** [More Information Needed]
24
+ - **Funded by [optional]:** [More Information Needed]
25
+ - **Shared by [optional]:** [More Information Needed]
26
+ - **Model type:** [More Information Needed]
27
+ - **Language(s) (NLP):** [More Information Needed]
28
+ - **License:** [More Information Needed]
29
+ - **Finetuned from model [optional]:** [More Information Needed]
30
+
31
+ ### Model Sources [optional]
32
+
33
+ <!-- Provide the basic links for the model. -->
34
+
35
+ - **Repository:** [More Information Needed]
36
+ - **Paper [optional]:** [More Information Needed]
37
+ - **Demo [optional]:** [More Information Needed]
38
+
39
+ ## Uses
40
+
41
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
42
+
43
+ ### Direct Use
44
+
45
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
46
+
47
+ [More Information Needed]
48
+
49
+ ### Downstream Use [optional]
50
+
51
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
52
+
53
+ [More Information Needed]
54
+
55
+ ### Out-of-Scope Use
56
+
57
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
58
+
59
+ [More Information Needed]
60
+
61
+ ## Bias, Risks, and Limitations
62
+
63
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
64
+
65
+ [More Information Needed]
66
+
67
+ ### Recommendations
68
+
69
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
70
+
71
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
72
+
73
+ ## How to Get Started with the Model
74
+
75
+ Use the code below to get started with the model.
76
+
77
+ [More Information Needed]
78
+
79
+ ## Training Details
80
+
81
+ ### Training Data
82
+
83
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
84
+
85
+ [More Information Needed]
86
+
87
+ ### Training Procedure
88
+
89
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
90
+
91
+ #### Preprocessing [optional]
92
+
93
+ [More Information Needed]
94
+
95
+
96
+ #### Training Hyperparameters
97
+
98
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
99
+
100
+ #### Speeds, Sizes, Times [optional]
101
+
102
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
103
+
104
+ [More Information Needed]
105
+
106
+ ## Evaluation
107
+
108
+ <!-- This section describes the evaluation protocols and provides the results. -->
109
+
110
+ ### Testing Data, Factors & Metrics
111
+
112
+ #### Testing Data
113
+
114
+ <!-- This should link to a Dataset Card if possible. -->
115
+
116
+ [More Information Needed]
117
+
118
+ #### Factors
119
+
120
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
121
+
122
+ [More Information Needed]
123
+
124
+ #### Metrics
125
+
126
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
127
+
128
+ [More Information Needed]
129
+
130
+ ### Results
131
+
132
+ [More Information Needed]
133
+
134
+ #### Summary
135
+
136
+
137
+
138
+ ## Model Examination [optional]
139
+
140
+ <!-- Relevant interpretability work for the model goes here -->
141
+
142
+ [More Information Needed]
143
+
144
+ ## Environmental Impact
145
+
146
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
147
+
148
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
149
+
150
+ - **Hardware Type:** [More Information Needed]
151
+ - **Hours used:** [More Information Needed]
152
+ - **Cloud Provider:** [More Information Needed]
153
+ - **Compute Region:** [More Information Needed]
154
+ - **Carbon Emitted:** [More Information Needed]
155
+
156
+ ## Technical Specifications [optional]
157
+
158
+ ### Model Architecture and Objective
159
+
160
+ [More Information Needed]
161
+
162
+ ### Compute Infrastructure
163
+
164
+ [More Information Needed]
165
+
166
+ #### Hardware
167
+
168
+ [More Information Needed]
169
+
170
+ #### Software
171
+
172
+ [More Information Needed]
173
+
174
+ ## Citation [optional]
175
+
176
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
177
+
178
+ **BibTeX:**
179
+
180
+ [More Information Needed]
181
+
182
+ **APA:**
183
+
184
+ [More Information Needed]
185
+
186
+ ## Glossary [optional]
187
+
188
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
189
+
190
+ [More Information Needed]
191
+
192
+ ## More Information [optional]
193
+
194
+ [More Information Needed]
195
+
196
+ ## Model Card Authors [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Contact
201
+
202
+ [More Information Needed]
203
+ ### Framework versions
204
+
205
+ - PEFT 0.18.0
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/ft2/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": {
3
+ "base_model_class": "DebertaV2ForSequenceClassification",
4
+ "parent_library": "transformers.models.deberta_v2.modeling_deberta_v2"
5
+ },
6
+ "base_model_name_or_path": "microsoft/deberta-v3-base",
7
+ "bias": "none",
8
+ "boft_block_num": 0,
9
+ "boft_block_size": 4,
10
+ "boft_dropout": 0.05,
11
+ "boft_n_butterfly_factor": 2,
12
+ "exclude_modules": null,
13
+ "fan_in_fan_out": false,
14
+ "inference_mode": true,
15
+ "init_weights": true,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "modules_to_save": [
19
+ "classifier",
20
+ "pooler"
21
+ ],
22
+ "peft_type": "BOFT",
23
+ "peft_version": "0.18.0",
24
+ "revision": null,
25
+ "target_modules": [
26
+ "key_proj",
27
+ "attention.output.dense",
28
+ "output.dense",
29
+ "value_proj",
30
+ "intermediate.dense",
31
+ "query_proj"
32
+ ],
33
+ "task_type": null
34
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/trainer_state.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.8086642599277978,
4
+ "best_model_checkpoint": "./glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=22d14h42m42/checkpoint-200",
5
+ "epoch": 6.0,
6
+ "eval_steps": 100,
7
+ "global_step": 468,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.282051282051282,
14
+ "grad_norm": 1.0231401920318604,
15
+ "learning_rate": 0.00038620126895479395,
16
+ "loss": 0.677,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 1.282051282051282,
21
+ "eval_accuracy": 0.7364620938628159,
22
+ "eval_loss": 0.52269047498703,
23
+ "eval_runtime": 0.5615,
24
+ "eval_samples_per_second": 493.282,
25
+ "eval_steps_per_second": 1.781,
26
+ "step": 100
27
+ },
28
+ {
29
+ "epoch": 2.564102564102564,
30
+ "grad_norm": 3.1564369201660156,
31
+ "learning_rate": 0.00028421638445081326,
32
+ "loss": 0.3847,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 2.564102564102564,
37
+ "eval_accuracy": 0.8086642599277978,
38
+ "eval_loss": 0.5369592905044556,
39
+ "eval_runtime": 0.4935,
40
+ "eval_samples_per_second": 561.324,
41
+ "eval_steps_per_second": 2.026,
42
+ "step": 200
43
+ },
44
+ {
45
+ "epoch": 3.8461538461538463,
46
+ "grad_norm": 4.220888614654541,
47
+ "learning_rate": 0.0001355917773399892,
48
+ "loss": 0.2557,
49
+ "step": 300
50
+ },
51
+ {
52
+ "epoch": 3.8461538461538463,
53
+ "eval_accuracy": 0.7942238267148014,
54
+ "eval_loss": 0.572320818901062,
55
+ "eval_runtime": 0.4881,
56
+ "eval_samples_per_second": 567.514,
57
+ "eval_steps_per_second": 2.049,
58
+ "step": 300
59
+ },
60
+ {
61
+ "epoch": 5.128205128205128,
62
+ "grad_norm": 2.0307106971740723,
63
+ "learning_rate": 2.2636970803626878e-05,
64
+ "loss": 0.1431,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 5.128205128205128,
69
+ "eval_accuracy": 0.7906137184115524,
70
+ "eval_loss": 0.6627222895622253,
71
+ "eval_runtime": 0.4917,
72
+ "eval_samples_per_second": 563.36,
73
+ "eval_steps_per_second": 2.034,
74
+ "step": 400
75
+ },
76
+ {
77
+ "epoch": 6.0,
78
+ "step": 468,
79
+ "total_flos": 2495241758361600.0,
80
+ "train_loss": 0.32858998551327956,
81
+ "train_runtime": 148.0171,
82
+ "train_samples_per_second": 100.934,
83
+ "train_steps_per_second": 3.162
84
+ }
85
+ ],
86
+ "logging_steps": 100,
87
+ "max_steps": 468,
88
+ "num_input_tokens_seen": 0,
89
+ "num_train_epochs": 6,
90
+ "save_steps": 100,
91
+ "stateful_callbacks": {
92
+ "TrainerControl": {
93
+ "args": {
94
+ "should_epoch_stop": false,
95
+ "should_evaluate": false,
96
+ "should_log": false,
97
+ "should_save": true,
98
+ "should_training_stop": true
99
+ },
100
+ "attributes": {}
101
+ }
102
+ },
103
+ "total_flos": 2495241758361600.0,
104
+ "train_batch_size": 32,
105
+ "trial_name": null,
106
+ "trial_params": null
107
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "eval_accuracy": 0.8231046931407943,
4
+ "eval_loss": 0.5094519853591919,
5
+ "eval_runtime": 0.8486,
6
+ "eval_samples": 277,
7
+ "eval_samples_per_second": 326.406,
8
+ "eval_steps_per_second": 1.178
9
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 6.0,
3
+ "eval_accuracy": 0.8231046931407943,
4
+ "eval_loss": 0.5094519853591919,
5
+ "eval_runtime": 0.8486,
6
+ "eval_samples": 277,
7
+ "eval_samples_per_second": 326.406,
8
+ "eval_steps_per_second": 1.178
9
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[MASK]": 128000
3
+ }
reproduction/glue_expBOFT/rte/dr0.05,mlr4e-04,clr4e-04,ep=6.0t=25d17h57m30,sd43/ft/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": {
9
+ "content": "[UNK]",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ }
15
+ }