allstax committed on
Commit
2d6f02f
1 Parent(s): 0c6b34f

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. checkpoint-10400/README.md +202 -0
  2. checkpoint-10400/adapter_config.json +33 -0
  3. checkpoint-10400/adapter_model.safetensors +3 -0
  4. checkpoint-10400/added_tokens.json +40 -0
  5. checkpoint-10400/merges.txt +0 -0
  6. checkpoint-10400/rng_state.pth +3 -0
  7. checkpoint-10400/scheduler.pt +3 -0
  8. checkpoint-10400/special_tokens_map.json +24 -0
  9. checkpoint-10400/tokenizer_config.json +327 -0
  10. checkpoint-10400/trainer_state.json +984 -0
  11. checkpoint-10400/training_args.bin +3 -0
  12. checkpoint-10400/vocab.json +0 -0
  13. checkpoint-11200/README.md +202 -0
  14. checkpoint-11200/adapter_config.json +33 -0
  15. checkpoint-11200/adapter_model.safetensors +3 -0
  16. checkpoint-11200/added_tokens.json +40 -0
  17. checkpoint-11200/merges.txt +0 -0
  18. checkpoint-11200/rng_state.pth +3 -0
  19. checkpoint-11200/scheduler.pt +3 -0
  20. checkpoint-11200/special_tokens_map.json +24 -0
  21. checkpoint-11200/tokenizer_config.json +327 -0
  22. checkpoint-11200/trainer_state.json +1057 -0
  23. checkpoint-11200/training_args.bin +3 -0
  24. checkpoint-11200/vocab.json +0 -0
  25. checkpoint-12000/README.md +202 -0
  26. checkpoint-12000/adapter_config.json +33 -0
  27. checkpoint-12000/adapter_model.safetensors +3 -0
  28. checkpoint-12000/added_tokens.json +40 -0
  29. checkpoint-12000/merges.txt +0 -0
  30. checkpoint-12000/rng_state.pth +3 -0
  31. checkpoint-12000/scheduler.pt +3 -0
  32. checkpoint-12000/special_tokens_map.json +24 -0
  33. checkpoint-12000/tokenizer_config.json +327 -0
  34. checkpoint-12000/trainer_state.json +1137 -0
  35. checkpoint-12000/training_args.bin +3 -0
  36. checkpoint-12000/vocab.json +0 -0
  37. checkpoint-12800/README.md +202 -0
  38. checkpoint-12800/adapter_config.json +33 -0
  39. checkpoint-12800/adapter_model.safetensors +3 -0
  40. checkpoint-12800/added_tokens.json +40 -0
  41. checkpoint-12800/merges.txt +0 -0
  42. checkpoint-12800/rng_state.pth +3 -0
  43. checkpoint-12800/scheduler.pt +3 -0
  44. checkpoint-12800/special_tokens_map.json +24 -0
  45. checkpoint-12800/tokenizer_config.json +327 -0
  46. checkpoint-12800/trainer_state.json +1210 -0
  47. checkpoint-12800/training_args.bin +3 -0
  48. checkpoint-12800/vocab.json +0 -0
  49. checkpoint-13600/README.md +202 -0
  50. checkpoint-13600/adapter_config.json +33 -0
checkpoint-10400/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: /workspace/model-export/allstax/shorting-phi-e4
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-10400/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/workspace/model-export/allstax/shorting-phi-e4",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "fc2",
26
+ "dense",
27
+ "v_proj",
28
+ "fc1"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
checkpoint-10400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51dce85121adbcc0b434bb7b3f6f5c2a87374603878c55a6d72999664efa6473
3
+ size 377538512
checkpoint-10400/added_tokens.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257
40
+ }
checkpoint-10400/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeab18862b0b22c21a073ab4545dd2fa480ab14027b9feba3fb9276d21cd5bf4
3
+ size 14244
checkpoint-10400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67ad33ca7cb2d596ed7f7c6c17c9b9f03635dba3151d321dd15363dfc4114458
3
+ size 1064
checkpoint-10400/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-10400/tokenizer_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": " ",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "50258": {
22
+ "content": " ",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50259": {
30
+ "content": " ",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50260": {
38
+ "content": " ",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50261": {
46
+ "content": " ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50262": {
54
+ "content": " ",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50263": {
62
+ "content": " ",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50264": {
70
+ "content": " ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50265": {
78
+ "content": " ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50266": {
86
+ "content": " ",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50267": {
94
+ "content": " ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50268": {
102
+ "content": " ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50269": {
110
+ "content": " ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50270": {
118
+ "content": " ",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50271": {
126
+ "content": " ",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50272": {
134
+ "content": " ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50273": {
142
+ "content": " ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50274": {
150
+ "content": " ",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50275": {
158
+ "content": " ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50276": {
166
+ "content": " ",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50277": {
174
+ "content": " ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50278": {
182
+ "content": " ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50279": {
190
+ "content": " ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50280": {
198
+ "content": " ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50281": {
206
+ "content": " ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50282": {
214
+ "content": " ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50283": {
222
+ "content": " ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50284": {
230
+ "content": " ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50285": {
238
+ "content": " ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50286": {
246
+ "content": " ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50287": {
254
+ "content": "\t\t\t\t\t\t\t\t\t",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50288": {
262
+ "content": "\t\t\t\t\t\t\t\t",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50289": {
270
+ "content": "\t\t\t\t\t\t\t",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50290": {
278
+ "content": "\t\t\t\t\t\t",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50291": {
286
+ "content": "\t\t\t\t\t",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50292": {
294
+ "content": "\t\t\t\t",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50293": {
302
+ "content": "\t\t\t",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50294": {
310
+ "content": "\t\t",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ }
317
+ },
318
+ "bos_token": "<|endoftext|>",
319
+ "clean_up_tokenization_spaces": true,
320
+ "eos_token": "<|endoftext|>",
321
+ "errors": "replace",
322
+ "model_max_length": 2048,
323
+ "pad_token": "<|endoftext|>",
324
+ "padding_side": "left",
325
+ "tokenizer_class": "CodeGenTokenizer",
326
+ "unk_token": "<|endoftext|>"
327
+ }
checkpoint-10400/trainer_state.json ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9535598037867327,
5
+ "eval_steps": 200,
6
+ "global_step": 10400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "eval_bertscore": 0.7401605248451233,
14
+ "eval_loss": 1.9530484676361084,
15
+ "eval_rouge1": 0.6562857460474375,
16
+ "eval_rouge2": 0.3640670727106235,
17
+ "eval_rougeL": 0.5655212336424695,
18
+ "eval_rougeLsum": 0.6414840198810386,
19
+ "eval_runtime": 21.7196,
20
+ "eval_samples_per_second": 1.381,
21
+ "eval_steps_per_second": 0.691,
22
+ "step": 200
23
+ },
24
+ {
25
+ "epoch": 0.02,
26
+ "grad_norm": 0.25105270743370056,
27
+ "learning_rate": 0.00019771674842969145,
28
+ "loss": 1.7353,
29
+ "step": 250
30
+ },
31
+ {
32
+ "epoch": 0.04,
33
+ "eval_bertscore": 0.7432050108909607,
34
+ "eval_loss": 1.9583823680877686,
35
+ "eval_rouge1": 0.6554226269617707,
36
+ "eval_rouge2": 0.36661086995296877,
37
+ "eval_rougeL": 0.5637448790342183,
38
+ "eval_rougeLsum": 0.6419796784912521,
39
+ "eval_runtime": 21.9623,
40
+ "eval_samples_per_second": 1.366,
41
+ "eval_steps_per_second": 0.683,
42
+ "step": 400
43
+ },
44
+ {
45
+ "epoch": 0.05,
46
+ "grad_norm": 0.26550447940826416,
47
+ "learning_rate": 0.00019542432717436156,
48
+ "loss": 1.7786,
49
+ "step": 500
50
+ },
51
+ {
52
+ "epoch": 0.06,
53
+ "eval_bertscore": 0.7469045519828796,
54
+ "eval_loss": 1.9245686531066895,
55
+ "eval_rouge1": 0.6662431635890791,
56
+ "eval_rouge2": 0.3735263724826765,
57
+ "eval_rougeL": 0.5755071616151013,
58
+ "eval_rougeLsum": 0.6538383087686117,
59
+ "eval_runtime": 21.5302,
60
+ "eval_samples_per_second": 1.393,
61
+ "eval_steps_per_second": 0.697,
62
+ "step": 600
63
+ },
64
+ {
65
+ "epoch": 0.07,
66
+ "grad_norm": 0.1538015753030777,
67
+ "learning_rate": 0.0001931319059190317,
68
+ "loss": 1.8851,
69
+ "step": 750
70
+ },
71
+ {
72
+ "epoch": 0.07,
73
+ "eval_bertscore": 0.7442477941513062,
74
+ "eval_loss": 1.9187489748001099,
75
+ "eval_rouge1": 0.6606221897489035,
76
+ "eval_rouge2": 0.368654563659435,
77
+ "eval_rougeL": 0.5731546210408094,
78
+ "eval_rougeLsum": 0.6470590823125606,
79
+ "eval_runtime": 21.9831,
80
+ "eval_samples_per_second": 1.365,
81
+ "eval_steps_per_second": 0.682,
82
+ "step": 800
83
+ },
84
+ {
85
+ "epoch": 0.09,
86
+ "grad_norm": 0.1681252270936966,
87
+ "learning_rate": 0.0001908394846637018,
88
+ "loss": 1.8919,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 0.09,
93
+ "eval_bertscore": 0.7458053231239319,
94
+ "eval_loss": 1.9159075021743774,
95
+ "eval_rouge1": 0.6621259186456026,
96
+ "eval_rouge2": 0.372024043683234,
97
+ "eval_rougeL": 0.5743354509339939,
98
+ "eval_rougeLsum": 0.6491550893780276,
99
+ "eval_runtime": 21.7159,
100
+ "eval_samples_per_second": 1.381,
101
+ "eval_steps_per_second": 0.691,
102
+ "step": 1000
103
+ },
104
+ {
105
+ "epoch": 0.11,
106
+ "eval_bertscore": 0.7468854784965515,
107
+ "eval_loss": 1.9140182733535767,
108
+ "eval_rouge1": 0.6626581781149132,
109
+ "eval_rouge2": 0.37318557504782157,
110
+ "eval_rougeL": 0.5759264203594217,
111
+ "eval_rougeLsum": 0.6490702446275723,
112
+ "eval_runtime": 21.6486,
113
+ "eval_samples_per_second": 1.386,
114
+ "eval_steps_per_second": 0.693,
115
+ "step": 1200
116
+ },
117
+ {
118
+ "epoch": 0.11,
119
+ "grad_norm": 0.1552441120147705,
120
+ "learning_rate": 0.00018854706340837193,
121
+ "loss": 1.9052,
122
+ "step": 1250
123
+ },
124
+ {
125
+ "epoch": 0.13,
126
+ "eval_bertscore": 0.7475314736366272,
127
+ "eval_loss": 1.913794755935669,
128
+ "eval_rouge1": 0.6648687174353192,
129
+ "eval_rouge2": 0.3760379232448734,
130
+ "eval_rougeL": 0.5784915488164926,
131
+ "eval_rougeLsum": 0.6513864520108938,
132
+ "eval_runtime": 21.664,
133
+ "eval_samples_per_second": 1.385,
134
+ "eval_steps_per_second": 0.692,
135
+ "step": 1400
136
+ },
137
+ {
138
+ "epoch": 0.14,
139
+ "grad_norm": 0.14638397097587585,
140
+ "learning_rate": 0.00018625464215304204,
141
+ "loss": 1.8843,
142
+ "step": 1500
143
+ },
144
+ {
145
+ "epoch": 0.15,
146
+ "eval_bertscore": 0.747238039970398,
147
+ "eval_loss": 1.9117029905319214,
148
+ "eval_rouge1": 0.6638085237198453,
149
+ "eval_rouge2": 0.3742779818055127,
150
+ "eval_rougeL": 0.5754209460423059,
151
+ "eval_rougeLsum": 0.6506476155592722,
152
+ "eval_runtime": 21.9308,
153
+ "eval_samples_per_second": 1.368,
154
+ "eval_steps_per_second": 0.684,
155
+ "step": 1600
156
+ },
157
+ {
158
+ "epoch": 0.16,
159
+ "grad_norm": 0.15738993883132935,
160
+ "learning_rate": 0.00018396222089771218,
161
+ "loss": 1.8964,
162
+ "step": 1750
163
+ },
164
+ {
165
+ "epoch": 0.17,
166
+ "eval_bertscore": 0.7473016381263733,
167
+ "eval_loss": 1.9117563962936401,
168
+ "eval_rouge1": 0.6620053151663765,
169
+ "eval_rouge2": 0.37406692119411245,
170
+ "eval_rougeL": 0.5758911607323577,
171
+ "eval_rougeLsum": 0.6494070575604445,
172
+ "eval_runtime": 21.6727,
173
+ "eval_samples_per_second": 1.384,
174
+ "eval_steps_per_second": 0.692,
175
+ "step": 1800
176
+ },
177
+ {
178
+ "epoch": 0.18,
179
+ "grad_norm": 0.1588907092809677,
180
+ "learning_rate": 0.00018166979964238228,
181
+ "loss": 1.8827,
182
+ "step": 2000
183
+ },
184
+ {
185
+ "epoch": 0.18,
186
+ "eval_bertscore": 0.7485987544059753,
187
+ "eval_loss": 1.9126006364822388,
188
+ "eval_rouge1": 0.6641836156334741,
189
+ "eval_rouge2": 0.37320215574735827,
190
+ "eval_rougeL": 0.5783015040447993,
191
+ "eval_rougeLsum": 0.6522235940423647,
192
+ "eval_runtime": 21.9759,
193
+ "eval_samples_per_second": 1.365,
194
+ "eval_steps_per_second": 0.683,
195
+ "step": 2000
196
+ },
197
+ {
198
+ "epoch": 0.2,
199
+ "eval_bertscore": 0.7482583522796631,
200
+ "eval_loss": 1.9075205326080322,
201
+ "eval_rouge1": 0.6658219484766166,
202
+ "eval_rouge2": 0.37723364952258465,
203
+ "eval_rougeL": 0.5769040785174693,
204
+ "eval_rougeLsum": 0.6511328888044219,
205
+ "eval_runtime": 21.5892,
206
+ "eval_samples_per_second": 1.39,
207
+ "eval_steps_per_second": 0.695,
208
+ "step": 2200
209
+ },
210
+ {
211
+ "epoch": 0.21,
212
+ "grad_norm": 0.15247465670108795,
213
+ "learning_rate": 0.00017937737838705242,
214
+ "loss": 1.8831,
215
+ "step": 2250
216
+ },
217
+ {
218
+ "epoch": 0.22,
219
+ "eval_bertscore": 0.7460805177688599,
220
+ "eval_loss": 1.9088668823242188,
221
+ "eval_rouge1": 0.6627321043292516,
222
+ "eval_rouge2": 0.3696581195003696,
223
+ "eval_rougeL": 0.5740988544467178,
224
+ "eval_rougeLsum": 0.6478729042661874,
225
+ "eval_runtime": 21.9221,
226
+ "eval_samples_per_second": 1.368,
227
+ "eval_steps_per_second": 0.684,
228
+ "step": 2400
229
+ },
230
+ {
231
+ "epoch": 0.23,
232
+ "grad_norm": 0.1587379276752472,
233
+ "learning_rate": 0.00017708495713172253,
234
+ "loss": 1.8829,
235
+ "step": 2500
236
+ },
237
+ {
238
+ "epoch": 0.24,
239
+ "eval_bertscore": 0.7472203373908997,
240
+ "eval_loss": 1.906219482421875,
241
+ "eval_rouge1": 0.6637415370426804,
242
+ "eval_rouge2": 0.37565276875837994,
243
+ "eval_rougeL": 0.5773879369079004,
244
+ "eval_rougeLsum": 0.6488719947518645,
245
+ "eval_runtime": 21.8112,
246
+ "eval_samples_per_second": 1.375,
247
+ "eval_steps_per_second": 0.688,
248
+ "step": 2600
249
+ },
250
+ {
251
+ "epoch": 0.25,
252
+ "grad_norm": 0.1558646410703659,
253
+ "learning_rate": 0.00017479253587639266,
254
+ "loss": 1.8978,
255
+ "step": 2750
256
+ },
257
+ {
258
+ "epoch": 0.26,
259
+ "eval_bertscore": 0.7466126680374146,
260
+ "eval_loss": 1.9045982360839844,
261
+ "eval_rouge1": 0.6616225540296956,
262
+ "eval_rouge2": 0.37370762164745913,
263
+ "eval_rougeL": 0.5759418528371097,
264
+ "eval_rougeLsum": 0.6479977636906877,
265
+ "eval_runtime": 21.8772,
266
+ "eval_samples_per_second": 1.371,
267
+ "eval_steps_per_second": 0.686,
268
+ "step": 2800
269
+ },
270
+ {
271
+ "epoch": 0.28,
272
+ "grad_norm": 0.14783035218715668,
273
+ "learning_rate": 0.00017250011462106277,
274
+ "loss": 1.8978,
275
+ "step": 3000
276
+ },
277
+ {
278
+ "epoch": 0.28,
279
+ "eval_bertscore": 0.7485571503639221,
280
+ "eval_loss": 1.9035439491271973,
281
+ "eval_rouge1": 0.6664050030501707,
282
+ "eval_rouge2": 0.379492440917784,
283
+ "eval_rougeL": 0.5806973731221475,
284
+ "eval_rougeLsum": 0.6524346156604702,
285
+ "eval_runtime": 21.9217,
286
+ "eval_samples_per_second": 1.369,
287
+ "eval_steps_per_second": 0.684,
288
+ "step": 3000
289
+ },
290
+ {
291
+ "epoch": 0.29,
292
+ "eval_bertscore": 0.7483461499214172,
293
+ "eval_loss": 1.9022458791732788,
294
+ "eval_rouge1": 0.6618989733136488,
295
+ "eval_rouge2": 0.37377379177271053,
296
+ "eval_rougeL": 0.5780989082173933,
297
+ "eval_rougeLsum": 0.6490379362631586,
298
+ "eval_runtime": 21.7847,
299
+ "eval_samples_per_second": 1.377,
300
+ "eval_steps_per_second": 0.689,
301
+ "step": 3200
302
+ },
303
+ {
304
+ "epoch": 0.3,
305
+ "grad_norm": 0.16484151780605316,
306
+ "learning_rate": 0.0001702076933657329,
307
+ "loss": 1.8715,
308
+ "step": 3250
309
+ },
310
+ {
311
+ "epoch": 0.31,
312
+ "eval_bertscore": 0.7490711212158203,
313
+ "eval_loss": 1.9013088941574097,
314
+ "eval_rouge1": 0.6638141306545007,
315
+ "eval_rouge2": 0.37356255553691553,
316
+ "eval_rougeL": 0.577975450251653,
317
+ "eval_rougeLsum": 0.6492478632295806,
318
+ "eval_runtime": 21.8807,
319
+ "eval_samples_per_second": 1.371,
320
+ "eval_steps_per_second": 0.686,
321
+ "step": 3400
322
+ },
323
+ {
324
+ "epoch": 0.32,
325
+ "grad_norm": 0.14130128920078278,
326
+ "learning_rate": 0.000167915272110403,
327
+ "loss": 1.8819,
328
+ "step": 3500
329
+ },
330
+ {
331
+ "epoch": 0.33,
332
+ "eval_bertscore": 0.7475283741950989,
333
+ "eval_loss": 1.9002223014831543,
334
+ "eval_rouge1": 0.6628836314413511,
335
+ "eval_rouge2": 0.37179988805094977,
336
+ "eval_rougeL": 0.5764222388923268,
337
+ "eval_rougeLsum": 0.649864229310889,
338
+ "eval_runtime": 22.124,
339
+ "eval_samples_per_second": 1.356,
340
+ "eval_steps_per_second": 0.678,
341
+ "step": 3600
342
+ },
343
+ {
344
+ "epoch": 0.34,
345
+ "grad_norm": 0.1494186818599701,
346
+ "learning_rate": 0.00016562285085507315,
347
+ "loss": 1.8828,
348
+ "step": 3750
349
+ },
350
+ {
351
+ "epoch": 0.35,
352
+ "eval_bertscore": 0.7486498951911926,
353
+ "eval_loss": 1.9011151790618896,
354
+ "eval_rouge1": 0.6669673680023924,
355
+ "eval_rouge2": 0.3771780440183751,
356
+ "eval_rougeL": 0.5792518624130161,
357
+ "eval_rougeLsum": 0.6534484242953056,
358
+ "eval_runtime": 21.813,
359
+ "eval_samples_per_second": 1.375,
360
+ "eval_steps_per_second": 0.688,
361
+ "step": 3800
362
+ },
363
+ {
364
+ "epoch": 0.37,
365
+ "grad_norm": 0.14803479611873627,
366
+ "learning_rate": 0.00016333042959974325,
367
+ "loss": 1.8761,
368
+ "step": 4000
369
+ },
370
+ {
371
+ "epoch": 0.37,
372
+ "eval_bertscore": 0.7471507787704468,
373
+ "eval_loss": 1.9001713991165161,
374
+ "eval_rouge1": 0.6651735220672027,
375
+ "eval_rouge2": 0.3736698451416937,
376
+ "eval_rougeL": 0.5779938808281732,
377
+ "eval_rougeLsum": 0.6509815118131576,
378
+ "eval_runtime": 21.5004,
379
+ "eval_samples_per_second": 1.395,
380
+ "eval_steps_per_second": 0.698,
381
+ "step": 4000
382
+ },
383
+ {
384
+ "epoch": 0.39,
385
+ "eval_bertscore": 0.7485501766204834,
386
+ "eval_loss": 1.8993827104568481,
387
+ "eval_rouge1": 0.6646424082737133,
388
+ "eval_rouge2": 0.37318485364862475,
389
+ "eval_rougeL": 0.5773338159759467,
390
+ "eval_rougeLsum": 0.6507594353103527,
391
+ "eval_runtime": 21.2963,
392
+ "eval_samples_per_second": 1.409,
393
+ "eval_steps_per_second": 0.704,
394
+ "step": 4200
395
+ },
396
+ {
397
+ "epoch": 0.39,
398
+ "grad_norm": 0.15562959015369415,
399
+ "learning_rate": 0.0001610380083444134,
400
+ "loss": 1.8672,
401
+ "step": 4250
402
+ },
403
+ {
404
+ "epoch": 0.4,
405
+ "eval_bertscore": 0.7469989061355591,
406
+ "eval_loss": 1.900540828704834,
407
+ "eval_rouge1": 0.6620664558691891,
408
+ "eval_rouge2": 0.37299419371215703,
409
+ "eval_rougeL": 0.5765442194831125,
410
+ "eval_rougeLsum": 0.6472642385429858,
411
+ "eval_runtime": 21.9086,
412
+ "eval_samples_per_second": 1.369,
413
+ "eval_steps_per_second": 0.685,
414
+ "step": 4400
415
+ },
416
+ {
417
+ "epoch": 0.41,
418
+ "grad_norm": 0.15420928597450256,
419
+ "learning_rate": 0.0001587455870890835,
420
+ "loss": 1.8754,
421
+ "step": 4500
422
+ },
423
+ {
424
+ "epoch": 0.42,
425
+ "eval_bertscore": 0.7475299835205078,
426
+ "eval_loss": 1.8988685607910156,
427
+ "eval_rouge1": 0.6656661780424216,
428
+ "eval_rouge2": 0.37467258880478527,
429
+ "eval_rougeL": 0.5770800519970718,
430
+ "eval_rougeLsum": 0.6522703864288166,
431
+ "eval_runtime": 22.063,
432
+ "eval_samples_per_second": 1.36,
433
+ "eval_steps_per_second": 0.68,
434
+ "step": 4600
435
+ },
436
+ {
437
+ "epoch": 0.44,
438
+ "grad_norm": 0.15809176862239838,
439
+ "learning_rate": 0.00015645316583375363,
440
+ "loss": 1.8848,
441
+ "step": 4750
442
+ },
443
+ {
444
+ "epoch": 0.44,
445
+ "eval_bertscore": 0.7490234375,
446
+ "eval_loss": 1.8991097211837769,
447
+ "eval_rouge1": 0.6651730257289085,
448
+ "eval_rouge2": 0.3778893043274054,
449
+ "eval_rougeL": 0.5782673838033503,
450
+ "eval_rougeLsum": 0.6516865674488727,
451
+ "eval_runtime": 22.0202,
452
+ "eval_samples_per_second": 1.362,
453
+ "eval_steps_per_second": 0.681,
454
+ "step": 4800
455
+ },
456
+ {
457
+ "epoch": 0.46,
458
+ "grad_norm": 0.17979757487773895,
459
+ "learning_rate": 0.00015416074457842374,
460
+ "loss": 1.8851,
461
+ "step": 5000
462
+ },
463
+ {
464
+ "epoch": 0.46,
465
+ "eval_bertscore": 0.7492111325263977,
466
+ "eval_loss": 1.897339940071106,
467
+ "eval_rouge1": 0.665920573890169,
468
+ "eval_rouge2": 0.37917993898535385,
469
+ "eval_rougeL": 0.5800236892888617,
470
+ "eval_rougeLsum": 0.6529131688355863,
471
+ "eval_runtime": 21.6103,
472
+ "eval_samples_per_second": 1.388,
473
+ "eval_steps_per_second": 0.694,
474
+ "step": 5000
475
+ },
476
+ {
477
+ "epoch": 0.48,
478
+ "eval_bertscore": 0.7491253614425659,
479
+ "eval_loss": 1.897528052330017,
480
+ "eval_rouge1": 0.6653452054219615,
481
+ "eval_rouge2": 0.3759208437918665,
482
+ "eval_rougeL": 0.5776757077854651,
483
+ "eval_rougeLsum": 0.6511876484723524,
484
+ "eval_runtime": 21.3101,
485
+ "eval_samples_per_second": 1.408,
486
+ "eval_steps_per_second": 0.704,
487
+ "step": 5200
488
+ },
489
+ {
490
+ "epoch": 0.48,
491
+ "grad_norm": 0.16869671642780304,
492
+ "learning_rate": 0.00015186832332309387,
493
+ "loss": 1.8783,
494
+ "step": 5250
495
+ },
496
+ {
497
+ "epoch": 0.5,
498
+ "eval_bertscore": 0.7494469881057739,
499
+ "eval_loss": 1.895969271659851,
500
+ "eval_rouge1": 0.6660951369469854,
501
+ "eval_rouge2": 0.3764077134133328,
502
+ "eval_rougeL": 0.578785826234568,
503
+ "eval_rougeLsum": 0.6525967284041656,
504
+ "eval_runtime": 21.7955,
505
+ "eval_samples_per_second": 1.376,
506
+ "eval_steps_per_second": 0.688,
507
+ "step": 5400
508
+ },
509
+ {
510
+ "epoch": 0.5,
511
+ "grad_norm": 0.15996231138706207,
512
+ "learning_rate": 0.00014957590206776398,
513
+ "loss": 1.8805,
514
+ "step": 5500
515
+ },
516
+ {
517
+ "epoch": 0.51,
518
+ "eval_bertscore": 0.7486470341682434,
519
+ "eval_loss": 1.8955131769180298,
520
+ "eval_rouge1": 0.6670292173522965,
521
+ "eval_rouge2": 0.37457018529010144,
522
+ "eval_rougeL": 0.5775243235432015,
523
+ "eval_rougeLsum": 0.652574079807632,
524
+ "eval_runtime": 21.7576,
525
+ "eval_samples_per_second": 1.379,
526
+ "eval_steps_per_second": 0.689,
527
+ "step": 5600
528
+ },
529
+ {
530
+ "epoch": 0.53,
531
+ "grad_norm": 0.17192547023296356,
532
+ "learning_rate": 0.00014728348081243412,
533
+ "loss": 1.8884,
534
+ "step": 5750
535
+ },
536
+ {
537
+ "epoch": 0.53,
538
+ "eval_bertscore": 0.7483081817626953,
539
+ "eval_loss": 1.895763874053955,
540
+ "eval_rouge1": 0.6659275328276997,
541
+ "eval_rouge2": 0.3778666475350364,
542
+ "eval_rougeL": 0.579425140056643,
543
+ "eval_rougeLsum": 0.6515870828784887,
544
+ "eval_runtime": 21.6648,
545
+ "eval_samples_per_second": 1.385,
546
+ "eval_steps_per_second": 0.692,
547
+ "step": 5800
548
+ },
549
+ {
550
+ "epoch": 0.55,
551
+ "grad_norm": 0.15838442742824554,
552
+ "learning_rate": 0.00014499105955710422,
553
+ "loss": 1.8913,
554
+ "step": 6000
555
+ },
556
+ {
557
+ "epoch": 0.55,
558
+ "eval_bertscore": 0.7493732571601868,
559
+ "eval_loss": 1.8914682865142822,
560
+ "eval_rouge1": 0.6669695240447069,
561
+ "eval_rouge2": 0.3769441114214874,
562
+ "eval_rougeL": 0.5798986667152066,
563
+ "eval_rougeLsum": 0.6534527583592111,
564
+ "eval_runtime": 21.4686,
565
+ "eval_samples_per_second": 1.397,
566
+ "eval_steps_per_second": 0.699,
567
+ "step": 6000
568
+ },
569
+ {
570
+ "epoch": 0.57,
571
+ "eval_bertscore": 0.7510559558868408,
572
+ "eval_loss": 1.8923884630203247,
573
+ "eval_rouge1": 0.6677938121282943,
574
+ "eval_rouge2": 0.37854575387307554,
575
+ "eval_rougeL": 0.5817052753830161,
576
+ "eval_rougeLsum": 0.6534737907551461,
577
+ "eval_runtime": 21.593,
578
+ "eval_samples_per_second": 1.389,
579
+ "eval_steps_per_second": 0.695,
580
+ "step": 6200
581
+ },
582
+ {
583
+ "epoch": 0.57,
584
+ "grad_norm": 0.15312573313713074,
585
+ "learning_rate": 0.00014269863830177433,
586
+ "loss": 1.8705,
587
+ "step": 6250
588
+ },
589
+ {
590
+ "epoch": 0.59,
591
+ "eval_bertscore": 0.7479371428489685,
592
+ "eval_loss": 1.891802430152893,
593
+ "eval_rouge1": 0.6658674357402252,
594
+ "eval_rouge2": 0.3757712649269345,
595
+ "eval_rougeL": 0.5791817270712349,
596
+ "eval_rougeLsum": 0.6509960265397259,
597
+ "eval_runtime": 21.8726,
598
+ "eval_samples_per_second": 1.372,
599
+ "eval_steps_per_second": 0.686,
600
+ "step": 6400
601
+ },
602
+ {
603
+ "epoch": 0.6,
604
+ "grad_norm": 0.15844614803791046,
605
+ "learning_rate": 0.00014040621704644447,
606
+ "loss": 1.8643,
607
+ "step": 6500
608
+ },
609
+ {
610
+ "epoch": 0.61,
611
+ "eval_bertscore": 0.7484550476074219,
612
+ "eval_loss": 1.8903728723526,
613
+ "eval_rouge1": 0.6683828816523312,
614
+ "eval_rouge2": 0.37811618722345436,
615
+ "eval_rougeL": 0.5802581730590705,
616
+ "eval_rougeLsum": 0.6534402764651661,
617
+ "eval_runtime": 21.8343,
618
+ "eval_samples_per_second": 1.374,
619
+ "eval_steps_per_second": 0.687,
620
+ "step": 6600
621
+ },
622
+ {
623
+ "epoch": 0.62,
624
+ "grad_norm": 0.1661410629749298,
625
+ "learning_rate": 0.00013811379579111458,
626
+ "loss": 1.877,
627
+ "step": 6750
628
+ },
629
+ {
630
+ "epoch": 0.62,
631
+ "eval_bertscore": 0.747416615486145,
632
+ "eval_loss": 1.8915189504623413,
633
+ "eval_rouge1": 0.6644777881148224,
634
+ "eval_rouge2": 0.3747657029706615,
635
+ "eval_rougeL": 0.5793454557198501,
636
+ "eval_rougeLsum": 0.6521716611395593,
637
+ "eval_runtime": 21.523,
638
+ "eval_samples_per_second": 1.394,
639
+ "eval_steps_per_second": 0.697,
640
+ "step": 6800
641
+ },
642
+ {
643
+ "epoch": 0.64,
644
+ "grad_norm": 0.16483080387115479,
645
+ "learning_rate": 0.00013582137453578468,
646
+ "loss": 1.8792,
647
+ "step": 7000
648
+ },
649
+ {
650
+ "epoch": 0.64,
651
+ "eval_bertscore": 0.7480576634407043,
652
+ "eval_loss": 1.8913365602493286,
653
+ "eval_rouge1": 0.6655764268912302,
654
+ "eval_rouge2": 0.3757671289735428,
655
+ "eval_rougeL": 0.577951380212153,
656
+ "eval_rougeLsum": 0.6507587412359694,
657
+ "eval_runtime": 21.3067,
658
+ "eval_samples_per_second": 1.408,
659
+ "eval_steps_per_second": 0.704,
660
+ "step": 7000
661
+ },
662
+ {
663
+ "epoch": 0.66,
664
+ "eval_bertscore": 0.7505319714546204,
665
+ "eval_loss": 1.889721155166626,
666
+ "eval_rouge1": 0.6706532239207523,
667
+ "eval_rouge2": 0.37986537729431724,
668
+ "eval_rougeL": 0.5824624008038861,
669
+ "eval_rougeLsum": 0.6571986550416876,
670
+ "eval_runtime": 21.8193,
671
+ "eval_samples_per_second": 1.375,
672
+ "eval_steps_per_second": 0.687,
673
+ "step": 7200
674
+ },
675
+ {
676
+ "epoch": 0.66,
677
+ "grad_norm": 0.1685444712638855,
678
+ "learning_rate": 0.00013352895328045482,
679
+ "loss": 1.8748,
680
+ "step": 7250
681
+ },
682
+ {
683
+ "epoch": 0.68,
684
+ "eval_bertscore": 0.7472131252288818,
685
+ "eval_loss": 1.889514684677124,
686
+ "eval_rouge1": 0.6647481520892182,
687
+ "eval_rouge2": 0.3727968089505218,
688
+ "eval_rougeL": 0.5772333167389081,
689
+ "eval_rougeLsum": 0.6503920840351167,
690
+ "eval_runtime": 21.5794,
691
+ "eval_samples_per_second": 1.39,
692
+ "eval_steps_per_second": 0.695,
693
+ "step": 7400
694
+ },
695
+ {
696
+ "epoch": 0.69,
697
+ "grad_norm": 0.16196218132972717,
698
+ "learning_rate": 0.00013123653202512493,
699
+ "loss": 1.8958,
700
+ "step": 7500
701
+ },
702
+ {
703
+ "epoch": 0.7,
704
+ "eval_bertscore": 0.7467525005340576,
705
+ "eval_loss": 1.8874704837799072,
706
+ "eval_rouge1": 0.6652789954777591,
707
+ "eval_rouge2": 0.3747211875622626,
708
+ "eval_rougeL": 0.5781018250975862,
709
+ "eval_rougeLsum": 0.6512065884264598,
710
+ "eval_runtime": 21.6436,
711
+ "eval_samples_per_second": 1.386,
712
+ "eval_steps_per_second": 0.693,
713
+ "step": 7600
714
+ },
715
+ {
716
+ "epoch": 0.71,
717
+ "grad_norm": 0.17379231750965118,
718
+ "learning_rate": 0.00012894411076979506,
719
+ "loss": 1.8655,
720
+ "step": 7750
721
+ },
722
+ {
723
+ "epoch": 0.72,
724
+ "eval_bertscore": 0.7478018403053284,
725
+ "eval_loss": 1.8879252672195435,
726
+ "eval_rouge1": 0.6676077444849423,
727
+ "eval_rouge2": 0.37550824667101645,
728
+ "eval_rougeL": 0.5792625587400696,
729
+ "eval_rougeLsum": 0.6537654224373248,
730
+ "eval_runtime": 21.8026,
731
+ "eval_samples_per_second": 1.376,
732
+ "eval_steps_per_second": 0.688,
733
+ "step": 7800
734
+ },
735
+ {
736
+ "epoch": 0.73,
737
+ "grad_norm": 0.17975503206253052,
738
+ "learning_rate": 0.00012665168951446517,
739
+ "loss": 1.8593,
740
+ "step": 8000
741
+ },
742
+ {
743
+ "epoch": 0.73,
744
+ "eval_bertscore": 0.7490061521530151,
745
+ "eval_loss": 1.8872514963150024,
746
+ "eval_rouge1": 0.6677074837057098,
747
+ "eval_rouge2": 0.37723681410973775,
748
+ "eval_rougeL": 0.5806554105436175,
749
+ "eval_rougeLsum": 0.6531691046113964,
750
+ "eval_runtime": 21.2682,
751
+ "eval_samples_per_second": 1.411,
752
+ "eval_steps_per_second": 0.705,
753
+ "step": 8000
754
+ },
755
+ {
756
+ "epoch": 0.75,
757
+ "eval_bertscore": 0.7476587295532227,
758
+ "eval_loss": 1.8857940435409546,
759
+ "eval_rouge1": 0.6675733171919529,
760
+ "eval_rouge2": 0.37667421034338344,
761
+ "eval_rougeL": 0.5804128987718613,
762
+ "eval_rougeLsum": 0.6534287804714597,
763
+ "eval_runtime": 21.5325,
764
+ "eval_samples_per_second": 1.393,
765
+ "eval_steps_per_second": 0.697,
766
+ "step": 8200
767
+ },
768
+ {
769
+ "epoch": 0.76,
770
+ "grad_norm": 0.1596900373697281,
771
+ "learning_rate": 0.0001243592682591353,
772
+ "loss": 1.8627,
773
+ "step": 8250
774
+ },
775
+ {
776
+ "epoch": 0.77,
777
+ "eval_bertscore": 0.7444086074829102,
778
+ "eval_loss": 1.8874648809432983,
779
+ "eval_rouge1": 0.6633779669482168,
780
+ "eval_rouge2": 0.3710094509675216,
781
+ "eval_rougeL": 0.5760576627400225,
782
+ "eval_rougeLsum": 0.6499803336918719,
783
+ "eval_runtime": 21.4464,
784
+ "eval_samples_per_second": 1.399,
785
+ "eval_steps_per_second": 0.699,
786
+ "step": 8400
787
+ },
788
+ {
789
+ "epoch": 0.78,
790
+ "grad_norm": 0.16890183091163635,
791
+ "learning_rate": 0.00012206684700380542,
792
+ "loss": 1.8534,
793
+ "step": 8500
794
+ },
795
+ {
796
+ "epoch": 0.79,
797
+ "eval_bertscore": 0.7483052611351013,
798
+ "eval_loss": 1.8880757093429565,
799
+ "eval_rouge1": 0.6686948143176776,
800
+ "eval_rouge2": 0.3803796130427515,
801
+ "eval_rougeL": 0.5802459813261722,
802
+ "eval_rougeLsum": 0.6536962466082527,
803
+ "eval_runtime": 21.5416,
804
+ "eval_samples_per_second": 1.393,
805
+ "eval_steps_per_second": 0.696,
806
+ "step": 8600
807
+ },
808
+ {
809
+ "epoch": 0.8,
810
+ "grad_norm": 0.1596900373697281,
811
+ "learning_rate": 0.00011977442574847555,
812
+ "loss": 1.882,
813
+ "step": 8750
814
+ },
815
+ {
816
+ "epoch": 0.81,
817
+ "eval_bertscore": 0.748338520526886,
818
+ "eval_loss": 1.8871524333953857,
819
+ "eval_rouge1": 0.6673919143770407,
820
+ "eval_rouge2": 0.3761761743795482,
821
+ "eval_rougeL": 0.5797615995019129,
822
+ "eval_rougeLsum": 0.6526650363891257,
823
+ "eval_runtime": 21.8432,
824
+ "eval_samples_per_second": 1.373,
825
+ "eval_steps_per_second": 0.687,
826
+ "step": 8800
827
+ },
828
+ {
829
+ "epoch": 0.83,
830
+ "grad_norm": 0.16380883753299713,
831
+ "learning_rate": 0.00011748200449314565,
832
+ "loss": 1.8781,
833
+ "step": 9000
834
+ },
835
+ {
836
+ "epoch": 0.83,
837
+ "eval_bertscore": 0.7473989129066467,
838
+ "eval_loss": 1.885389804840088,
839
+ "eval_rouge1": 0.6660513187618474,
840
+ "eval_rouge2": 0.3728645884799071,
841
+ "eval_rougeL": 0.5767833607673931,
842
+ "eval_rougeLsum": 0.6518177265346137,
843
+ "eval_runtime": 21.5415,
844
+ "eval_samples_per_second": 1.393,
845
+ "eval_steps_per_second": 0.696,
846
+ "step": 9000
847
+ },
848
+ {
849
+ "epoch": 0.84,
850
+ "eval_bertscore": 0.7469697594642639,
851
+ "eval_loss": 1.8835673332214355,
852
+ "eval_rouge1": 0.6655382276884847,
853
+ "eval_rouge2": 0.3743925229327822,
854
+ "eval_rougeL": 0.5808516524350132,
855
+ "eval_rougeLsum": 0.6518276923554284,
856
+ "eval_runtime": 21.7289,
857
+ "eval_samples_per_second": 1.381,
858
+ "eval_steps_per_second": 0.69,
859
+ "step": 9200
860
+ },
861
+ {
862
+ "epoch": 0.85,
863
+ "grad_norm": 0.17286422848701477,
864
+ "learning_rate": 0.00011518958323781579,
865
+ "loss": 1.8672,
866
+ "step": 9250
867
+ },
868
+ {
869
+ "epoch": 0.86,
870
+ "eval_bertscore": 0.7491498589515686,
871
+ "eval_loss": 1.8845998048782349,
872
+ "eval_rouge1": 0.6670160490080832,
873
+ "eval_rouge2": 0.37860182825781935,
874
+ "eval_rougeL": 0.5797856034485049,
875
+ "eval_rougeLsum": 0.6531203725936218,
876
+ "eval_runtime": 21.5625,
877
+ "eval_samples_per_second": 1.391,
878
+ "eval_steps_per_second": 0.696,
879
+ "step": 9400
880
+ },
881
+ {
882
+ "epoch": 0.87,
883
+ "grad_norm": 0.16658568382263184,
884
+ "learning_rate": 0.0001128971619824859,
885
+ "loss": 1.8691,
886
+ "step": 9500
887
+ },
888
+ {
889
+ "epoch": 0.88,
890
+ "eval_bertscore": 0.7493313550949097,
891
+ "eval_loss": 1.8821747303009033,
892
+ "eval_rouge1": 0.6659791441681278,
893
+ "eval_rouge2": 0.3796033834485131,
894
+ "eval_rougeL": 0.580414529806212,
895
+ "eval_rougeLsum": 0.6528068238734432,
896
+ "eval_runtime": 21.8698,
897
+ "eval_samples_per_second": 1.372,
898
+ "eval_steps_per_second": 0.686,
899
+ "step": 9600
900
+ },
901
+ {
902
+ "epoch": 0.89,
903
+ "grad_norm": 0.1733073741197586,
904
+ "learning_rate": 0.00011060474072715603,
905
+ "loss": 1.8575,
906
+ "step": 9750
907
+ },
908
+ {
909
+ "epoch": 0.9,
910
+ "eval_bertscore": 0.7497690320014954,
911
+ "eval_loss": 1.8809062242507935,
912
+ "eval_rouge1": 0.6683202809005669,
913
+ "eval_rouge2": 0.379647408271533,
914
+ "eval_rougeL": 0.5812799059293663,
915
+ "eval_rougeLsum": 0.6549076224428805,
916
+ "eval_runtime": 21.461,
917
+ "eval_samples_per_second": 1.398,
918
+ "eval_steps_per_second": 0.699,
919
+ "step": 9800
920
+ },
921
+ {
922
+ "epoch": 0.92,
923
+ "grad_norm": 0.16828681528568268,
924
+ "learning_rate": 0.00010831231947182614,
925
+ "loss": 1.8799,
926
+ "step": 10000
927
+ },
928
+ {
929
+ "epoch": 0.92,
930
+ "eval_bertscore": 0.7487274408340454,
931
+ "eval_loss": 1.8800114393234253,
932
+ "eval_rouge1": 0.6694707226380743,
933
+ "eval_rouge2": 0.37780830529690856,
934
+ "eval_rougeL": 0.5789377835641822,
935
+ "eval_rougeLsum": 0.6540561492044448,
936
+ "eval_runtime": 21.6228,
937
+ "eval_samples_per_second": 1.387,
938
+ "eval_steps_per_second": 0.694,
939
+ "step": 10000
940
+ },
941
+ {
942
+ "epoch": 0.94,
943
+ "eval_bertscore": 0.7495086789131165,
944
+ "eval_loss": 1.8811218738555908,
945
+ "eval_rouge1": 0.6714277794869861,
946
+ "eval_rouge2": 0.3814957239141348,
947
+ "eval_rougeL": 0.5817721016839257,
948
+ "eval_rougeLsum": 0.6566092952916721,
949
+ "eval_runtime": 23.1282,
950
+ "eval_samples_per_second": 1.297,
951
+ "eval_steps_per_second": 0.649,
952
+ "step": 10200
953
+ },
954
+ {
955
+ "epoch": 0.94,
956
+ "grad_norm": 0.16498848795890808,
957
+ "learning_rate": 0.00010601989821649627,
958
+ "loss": 1.8656,
959
+ "step": 10250
960
+ },
961
+ {
962
+ "epoch": 0.95,
963
+ "eval_bertscore": 0.749505877494812,
964
+ "eval_loss": 1.8809926509857178,
965
+ "eval_rouge1": 0.6720420767359538,
966
+ "eval_rouge2": 0.38239237549289784,
967
+ "eval_rougeL": 0.5825845512902208,
968
+ "eval_rougeLsum": 0.6590116525116119,
969
+ "eval_runtime": 21.5266,
970
+ "eval_samples_per_second": 1.394,
971
+ "eval_steps_per_second": 0.697,
972
+ "step": 10400
973
+ }
974
+ ],
975
+ "logging_steps": 250,
976
+ "max_steps": 21812,
977
+ "num_input_tokens_seen": 0,
978
+ "num_train_epochs": 2,
979
+ "save_steps": 800,
980
+ "total_flos": 7.01080295964672e+17,
981
+ "train_batch_size": 2,
982
+ "trial_name": null,
983
+ "trial_params": null
984
+ }
checkpoint-10400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a96e9baacd033c0a419444553d18b70e4f76e7b37401a6dcc6b00ceb2cc1e1
3
+ size 5048
checkpoint-10400/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: /workspace/model-export/allstax/shorting-phi-e4
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-11200/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/workspace/model-export/allstax/shorting-phi-e4",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "fc2",
26
+ "dense",
27
+ "v_proj",
28
+ "fc1"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
checkpoint-11200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a2f5c6b989c35a3db4011ee4dfec0141525254934771eac4e371531a833a39
3
+ size 377538512
checkpoint-11200/added_tokens.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257
40
+ }
checkpoint-11200/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-11200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b3a32f52a1df736803b7539d39f9005fb8892e3edcc445f5f3ce290d7b2577d
3
+ size 14244
checkpoint-11200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f74e468da13cfc6ceddf7876075afca1baaa207a5e6a9f73e42f07352188c6e
3
+ size 1064
checkpoint-11200/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-11200/tokenizer_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": " ",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "50258": {
22
+ "content": " ",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50259": {
30
+ "content": " ",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50260": {
38
+ "content": " ",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50261": {
46
+ "content": " ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50262": {
54
+ "content": " ",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50263": {
62
+ "content": " ",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50264": {
70
+ "content": " ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50265": {
78
+ "content": " ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50266": {
86
+ "content": " ",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50267": {
94
+ "content": " ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50268": {
102
+ "content": " ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50269": {
110
+ "content": " ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50270": {
118
+ "content": " ",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50271": {
126
+ "content": " ",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50272": {
134
+ "content": " ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50273": {
142
+ "content": " ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50274": {
150
+ "content": " ",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50275": {
158
+ "content": " ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50276": {
166
+ "content": " ",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50277": {
174
+ "content": " ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50278": {
182
+ "content": " ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50279": {
190
+ "content": " ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50280": {
198
+ "content": " ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50281": {
206
+ "content": " ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50282": {
214
+ "content": " ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50283": {
222
+ "content": " ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50284": {
230
+ "content": " ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50285": {
238
+ "content": " ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50286": {
246
+ "content": " ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50287": {
254
+ "content": "\t\t\t\t\t\t\t\t\t",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50288": {
262
+ "content": "\t\t\t\t\t\t\t\t",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50289": {
270
+ "content": "\t\t\t\t\t\t\t",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50290": {
278
+ "content": "\t\t\t\t\t\t",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50291": {
286
+ "content": "\t\t\t\t\t",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50292": {
294
+ "content": "\t\t\t\t",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50293": {
302
+ "content": "\t\t\t",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50294": {
310
+ "content": "\t\t",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ }
317
+ },
318
+ "bos_token": "<|endoftext|>",
319
+ "clean_up_tokenization_spaces": true,
320
+ "eos_token": "<|endoftext|>",
321
+ "errors": "replace",
322
+ "model_max_length": 2048,
323
+ "pad_token": "<|endoftext|>",
324
+ "padding_side": "left",
325
+ "tokenizer_class": "CodeGenTokenizer",
326
+ "unk_token": "<|endoftext|>"
327
+ }
checkpoint-11200/trainer_state.json ADDED
@@ -0,0 +1,1057 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0269105579241737,
5
+ "eval_steps": 200,
6
+ "global_step": 11200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "eval_bertscore": 0.7401605248451233,
14
+ "eval_loss": 1.9530484676361084,
15
+ "eval_rouge1": 0.6562857460474375,
16
+ "eval_rouge2": 0.3640670727106235,
17
+ "eval_rougeL": 0.5655212336424695,
18
+ "eval_rougeLsum": 0.6414840198810386,
19
+ "eval_runtime": 21.7196,
20
+ "eval_samples_per_second": 1.381,
21
+ "eval_steps_per_second": 0.691,
22
+ "step": 200
23
+ },
24
+ {
25
+ "epoch": 0.02,
26
+ "grad_norm": 0.25105270743370056,
27
+ "learning_rate": 0.00019771674842969145,
28
+ "loss": 1.7353,
29
+ "step": 250
30
+ },
31
+ {
32
+ "epoch": 0.04,
33
+ "eval_bertscore": 0.7432050108909607,
34
+ "eval_loss": 1.9583823680877686,
35
+ "eval_rouge1": 0.6554226269617707,
36
+ "eval_rouge2": 0.36661086995296877,
37
+ "eval_rougeL": 0.5637448790342183,
38
+ "eval_rougeLsum": 0.6419796784912521,
39
+ "eval_runtime": 21.9623,
40
+ "eval_samples_per_second": 1.366,
41
+ "eval_steps_per_second": 0.683,
42
+ "step": 400
43
+ },
44
+ {
45
+ "epoch": 0.05,
46
+ "grad_norm": 0.26550447940826416,
47
+ "learning_rate": 0.00019542432717436156,
48
+ "loss": 1.7786,
49
+ "step": 500
50
+ },
51
+ {
52
+ "epoch": 0.06,
53
+ "eval_bertscore": 0.7469045519828796,
54
+ "eval_loss": 1.9245686531066895,
55
+ "eval_rouge1": 0.6662431635890791,
56
+ "eval_rouge2": 0.3735263724826765,
57
+ "eval_rougeL": 0.5755071616151013,
58
+ "eval_rougeLsum": 0.6538383087686117,
59
+ "eval_runtime": 21.5302,
60
+ "eval_samples_per_second": 1.393,
61
+ "eval_steps_per_second": 0.697,
62
+ "step": 600
63
+ },
64
+ {
65
+ "epoch": 0.07,
66
+ "grad_norm": 0.1538015753030777,
67
+ "learning_rate": 0.0001931319059190317,
68
+ "loss": 1.8851,
69
+ "step": 750
70
+ },
71
+ {
72
+ "epoch": 0.07,
73
+ "eval_bertscore": 0.7442477941513062,
74
+ "eval_loss": 1.9187489748001099,
75
+ "eval_rouge1": 0.6606221897489035,
76
+ "eval_rouge2": 0.368654563659435,
77
+ "eval_rougeL": 0.5731546210408094,
78
+ "eval_rougeLsum": 0.6470590823125606,
79
+ "eval_runtime": 21.9831,
80
+ "eval_samples_per_second": 1.365,
81
+ "eval_steps_per_second": 0.682,
82
+ "step": 800
83
+ },
84
+ {
85
+ "epoch": 0.09,
86
+ "grad_norm": 0.1681252270936966,
87
+ "learning_rate": 0.0001908394846637018,
88
+ "loss": 1.8919,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 0.09,
93
+ "eval_bertscore": 0.7458053231239319,
94
+ "eval_loss": 1.9159075021743774,
95
+ "eval_rouge1": 0.6621259186456026,
96
+ "eval_rouge2": 0.372024043683234,
97
+ "eval_rougeL": 0.5743354509339939,
98
+ "eval_rougeLsum": 0.6491550893780276,
99
+ "eval_runtime": 21.7159,
100
+ "eval_samples_per_second": 1.381,
101
+ "eval_steps_per_second": 0.691,
102
+ "step": 1000
103
+ },
104
+ {
105
+ "epoch": 0.11,
106
+ "eval_bertscore": 0.7468854784965515,
107
+ "eval_loss": 1.9140182733535767,
108
+ "eval_rouge1": 0.6626581781149132,
109
+ "eval_rouge2": 0.37318557504782157,
110
+ "eval_rougeL": 0.5759264203594217,
111
+ "eval_rougeLsum": 0.6490702446275723,
112
+ "eval_runtime": 21.6486,
113
+ "eval_samples_per_second": 1.386,
114
+ "eval_steps_per_second": 0.693,
115
+ "step": 1200
116
+ },
117
+ {
118
+ "epoch": 0.11,
119
+ "grad_norm": 0.1552441120147705,
120
+ "learning_rate": 0.00018854706340837193,
121
+ "loss": 1.9052,
122
+ "step": 1250
123
+ },
124
+ {
125
+ "epoch": 0.13,
126
+ "eval_bertscore": 0.7475314736366272,
127
+ "eval_loss": 1.913794755935669,
128
+ "eval_rouge1": 0.6648687174353192,
129
+ "eval_rouge2": 0.3760379232448734,
130
+ "eval_rougeL": 0.5784915488164926,
131
+ "eval_rougeLsum": 0.6513864520108938,
132
+ "eval_runtime": 21.664,
133
+ "eval_samples_per_second": 1.385,
134
+ "eval_steps_per_second": 0.692,
135
+ "step": 1400
136
+ },
137
+ {
138
+ "epoch": 0.14,
139
+ "grad_norm": 0.14638397097587585,
140
+ "learning_rate": 0.00018625464215304204,
141
+ "loss": 1.8843,
142
+ "step": 1500
143
+ },
144
+ {
145
+ "epoch": 0.15,
146
+ "eval_bertscore": 0.747238039970398,
147
+ "eval_loss": 1.9117029905319214,
148
+ "eval_rouge1": 0.6638085237198453,
149
+ "eval_rouge2": 0.3742779818055127,
150
+ "eval_rougeL": 0.5754209460423059,
151
+ "eval_rougeLsum": 0.6506476155592722,
152
+ "eval_runtime": 21.9308,
153
+ "eval_samples_per_second": 1.368,
154
+ "eval_steps_per_second": 0.684,
155
+ "step": 1600
156
+ },
157
+ {
158
+ "epoch": 0.16,
159
+ "grad_norm": 0.15738993883132935,
160
+ "learning_rate": 0.00018396222089771218,
161
+ "loss": 1.8964,
162
+ "step": 1750
163
+ },
164
+ {
165
+ "epoch": 0.17,
166
+ "eval_bertscore": 0.7473016381263733,
167
+ "eval_loss": 1.9117563962936401,
168
+ "eval_rouge1": 0.6620053151663765,
169
+ "eval_rouge2": 0.37406692119411245,
170
+ "eval_rougeL": 0.5758911607323577,
171
+ "eval_rougeLsum": 0.6494070575604445,
172
+ "eval_runtime": 21.6727,
173
+ "eval_samples_per_second": 1.384,
174
+ "eval_steps_per_second": 0.692,
175
+ "step": 1800
176
+ },
177
+ {
178
+ "epoch": 0.18,
179
+ "grad_norm": 0.1588907092809677,
180
+ "learning_rate": 0.00018166979964238228,
181
+ "loss": 1.8827,
182
+ "step": 2000
183
+ },
184
+ {
185
+ "epoch": 0.18,
186
+ "eval_bertscore": 0.7485987544059753,
187
+ "eval_loss": 1.9126006364822388,
188
+ "eval_rouge1": 0.6641836156334741,
189
+ "eval_rouge2": 0.37320215574735827,
190
+ "eval_rougeL": 0.5783015040447993,
191
+ "eval_rougeLsum": 0.6522235940423647,
192
+ "eval_runtime": 21.9759,
193
+ "eval_samples_per_second": 1.365,
194
+ "eval_steps_per_second": 0.683,
195
+ "step": 2000
196
+ },
197
+ {
198
+ "epoch": 0.2,
199
+ "eval_bertscore": 0.7482583522796631,
200
+ "eval_loss": 1.9075205326080322,
201
+ "eval_rouge1": 0.6658219484766166,
202
+ "eval_rouge2": 0.37723364952258465,
203
+ "eval_rougeL": 0.5769040785174693,
204
+ "eval_rougeLsum": 0.6511328888044219,
205
+ "eval_runtime": 21.5892,
206
+ "eval_samples_per_second": 1.39,
207
+ "eval_steps_per_second": 0.695,
208
+ "step": 2200
209
+ },
210
+ {
211
+ "epoch": 0.21,
212
+ "grad_norm": 0.15247465670108795,
213
+ "learning_rate": 0.00017937737838705242,
214
+ "loss": 1.8831,
215
+ "step": 2250
216
+ },
217
+ {
218
+ "epoch": 0.22,
219
+ "eval_bertscore": 0.7460805177688599,
220
+ "eval_loss": 1.9088668823242188,
221
+ "eval_rouge1": 0.6627321043292516,
222
+ "eval_rouge2": 0.3696581195003696,
223
+ "eval_rougeL": 0.5740988544467178,
224
+ "eval_rougeLsum": 0.6478729042661874,
225
+ "eval_runtime": 21.9221,
226
+ "eval_samples_per_second": 1.368,
227
+ "eval_steps_per_second": 0.684,
228
+ "step": 2400
229
+ },
230
+ {
231
+ "epoch": 0.23,
232
+ "grad_norm": 0.1587379276752472,
233
+ "learning_rate": 0.00017708495713172253,
234
+ "loss": 1.8829,
235
+ "step": 2500
236
+ },
237
+ {
238
+ "epoch": 0.24,
239
+ "eval_bertscore": 0.7472203373908997,
240
+ "eval_loss": 1.906219482421875,
241
+ "eval_rouge1": 0.6637415370426804,
242
+ "eval_rouge2": 0.37565276875837994,
243
+ "eval_rougeL": 0.5773879369079004,
244
+ "eval_rougeLsum": 0.6488719947518645,
245
+ "eval_runtime": 21.8112,
246
+ "eval_samples_per_second": 1.375,
247
+ "eval_steps_per_second": 0.688,
248
+ "step": 2600
249
+ },
250
+ {
251
+ "epoch": 0.25,
252
+ "grad_norm": 0.1558646410703659,
253
+ "learning_rate": 0.00017479253587639266,
254
+ "loss": 1.8978,
255
+ "step": 2750
256
+ },
257
+ {
258
+ "epoch": 0.26,
259
+ "eval_bertscore": 0.7466126680374146,
260
+ "eval_loss": 1.9045982360839844,
261
+ "eval_rouge1": 0.6616225540296956,
262
+ "eval_rouge2": 0.37370762164745913,
263
+ "eval_rougeL": 0.5759418528371097,
264
+ "eval_rougeLsum": 0.6479977636906877,
265
+ "eval_runtime": 21.8772,
266
+ "eval_samples_per_second": 1.371,
267
+ "eval_steps_per_second": 0.686,
268
+ "step": 2800
269
+ },
270
+ {
271
+ "epoch": 0.28,
272
+ "grad_norm": 0.14783035218715668,
273
+ "learning_rate": 0.00017250011462106277,
274
+ "loss": 1.8978,
275
+ "step": 3000
276
+ },
277
+ {
278
+ "epoch": 0.28,
279
+ "eval_bertscore": 0.7485571503639221,
280
+ "eval_loss": 1.9035439491271973,
281
+ "eval_rouge1": 0.6664050030501707,
282
+ "eval_rouge2": 0.379492440917784,
283
+ "eval_rougeL": 0.5806973731221475,
284
+ "eval_rougeLsum": 0.6524346156604702,
285
+ "eval_runtime": 21.9217,
286
+ "eval_samples_per_second": 1.369,
287
+ "eval_steps_per_second": 0.684,
288
+ "step": 3000
289
+ },
290
+ {
291
+ "epoch": 0.29,
292
+ "eval_bertscore": 0.7483461499214172,
293
+ "eval_loss": 1.9022458791732788,
294
+ "eval_rouge1": 0.6618989733136488,
295
+ "eval_rouge2": 0.37377379177271053,
296
+ "eval_rougeL": 0.5780989082173933,
297
+ "eval_rougeLsum": 0.6490379362631586,
298
+ "eval_runtime": 21.7847,
299
+ "eval_samples_per_second": 1.377,
300
+ "eval_steps_per_second": 0.689,
301
+ "step": 3200
302
+ },
303
+ {
304
+ "epoch": 0.3,
305
+ "grad_norm": 0.16484151780605316,
306
+ "learning_rate": 0.0001702076933657329,
307
+ "loss": 1.8715,
308
+ "step": 3250
309
+ },
310
+ {
311
+ "epoch": 0.31,
312
+ "eval_bertscore": 0.7490711212158203,
313
+ "eval_loss": 1.9013088941574097,
314
+ "eval_rouge1": 0.6638141306545007,
315
+ "eval_rouge2": 0.37356255553691553,
316
+ "eval_rougeL": 0.577975450251653,
317
+ "eval_rougeLsum": 0.6492478632295806,
318
+ "eval_runtime": 21.8807,
319
+ "eval_samples_per_second": 1.371,
320
+ "eval_steps_per_second": 0.686,
321
+ "step": 3400
322
+ },
323
+ {
324
+ "epoch": 0.32,
325
+ "grad_norm": 0.14130128920078278,
326
+ "learning_rate": 0.000167915272110403,
327
+ "loss": 1.8819,
328
+ "step": 3500
329
+ },
330
+ {
331
+ "epoch": 0.33,
332
+ "eval_bertscore": 0.7475283741950989,
333
+ "eval_loss": 1.9002223014831543,
334
+ "eval_rouge1": 0.6628836314413511,
335
+ "eval_rouge2": 0.37179988805094977,
336
+ "eval_rougeL": 0.5764222388923268,
337
+ "eval_rougeLsum": 0.649864229310889,
338
+ "eval_runtime": 22.124,
339
+ "eval_samples_per_second": 1.356,
340
+ "eval_steps_per_second": 0.678,
341
+ "step": 3600
342
+ },
343
+ {
344
+ "epoch": 0.34,
345
+ "grad_norm": 0.1494186818599701,
346
+ "learning_rate": 0.00016562285085507315,
347
+ "loss": 1.8828,
348
+ "step": 3750
349
+ },
350
+ {
351
+ "epoch": 0.35,
352
+ "eval_bertscore": 0.7486498951911926,
353
+ "eval_loss": 1.9011151790618896,
354
+ "eval_rouge1": 0.6669673680023924,
355
+ "eval_rouge2": 0.3771780440183751,
356
+ "eval_rougeL": 0.5792518624130161,
357
+ "eval_rougeLsum": 0.6534484242953056,
358
+ "eval_runtime": 21.813,
359
+ "eval_samples_per_second": 1.375,
360
+ "eval_steps_per_second": 0.688,
361
+ "step": 3800
362
+ },
363
+ {
364
+ "epoch": 0.37,
365
+ "grad_norm": 0.14803479611873627,
366
+ "learning_rate": 0.00016333042959974325,
367
+ "loss": 1.8761,
368
+ "step": 4000
369
+ },
370
+ {
371
+ "epoch": 0.37,
372
+ "eval_bertscore": 0.7471507787704468,
373
+ "eval_loss": 1.9001713991165161,
374
+ "eval_rouge1": 0.6651735220672027,
375
+ "eval_rouge2": 0.3736698451416937,
376
+ "eval_rougeL": 0.5779938808281732,
377
+ "eval_rougeLsum": 0.6509815118131576,
378
+ "eval_runtime": 21.5004,
379
+ "eval_samples_per_second": 1.395,
380
+ "eval_steps_per_second": 0.698,
381
+ "step": 4000
382
+ },
383
+ {
384
+ "epoch": 0.39,
385
+ "eval_bertscore": 0.7485501766204834,
386
+ "eval_loss": 1.8993827104568481,
387
+ "eval_rouge1": 0.6646424082737133,
388
+ "eval_rouge2": 0.37318485364862475,
389
+ "eval_rougeL": 0.5773338159759467,
390
+ "eval_rougeLsum": 0.6507594353103527,
391
+ "eval_runtime": 21.2963,
392
+ "eval_samples_per_second": 1.409,
393
+ "eval_steps_per_second": 0.704,
394
+ "step": 4200
395
+ },
396
+ {
397
+ "epoch": 0.39,
398
+ "grad_norm": 0.15562959015369415,
399
+ "learning_rate": 0.0001610380083444134,
400
+ "loss": 1.8672,
401
+ "step": 4250
402
+ },
403
+ {
404
+ "epoch": 0.4,
405
+ "eval_bertscore": 0.7469989061355591,
406
+ "eval_loss": 1.900540828704834,
407
+ "eval_rouge1": 0.6620664558691891,
408
+ "eval_rouge2": 0.37299419371215703,
409
+ "eval_rougeL": 0.5765442194831125,
410
+ "eval_rougeLsum": 0.6472642385429858,
411
+ "eval_runtime": 21.9086,
412
+ "eval_samples_per_second": 1.369,
413
+ "eval_steps_per_second": 0.685,
414
+ "step": 4400
415
+ },
416
+ {
417
+ "epoch": 0.41,
418
+ "grad_norm": 0.15420928597450256,
419
+ "learning_rate": 0.0001587455870890835,
420
+ "loss": 1.8754,
421
+ "step": 4500
422
+ },
423
+ {
424
+ "epoch": 0.42,
425
+ "eval_bertscore": 0.7475299835205078,
426
+ "eval_loss": 1.8988685607910156,
427
+ "eval_rouge1": 0.6656661780424216,
428
+ "eval_rouge2": 0.37467258880478527,
429
+ "eval_rougeL": 0.5770800519970718,
430
+ "eval_rougeLsum": 0.6522703864288166,
431
+ "eval_runtime": 22.063,
432
+ "eval_samples_per_second": 1.36,
433
+ "eval_steps_per_second": 0.68,
434
+ "step": 4600
435
+ },
436
+ {
437
+ "epoch": 0.44,
438
+ "grad_norm": 0.15809176862239838,
439
+ "learning_rate": 0.00015645316583375363,
440
+ "loss": 1.8848,
441
+ "step": 4750
442
+ },
443
+ {
444
+ "epoch": 0.44,
445
+ "eval_bertscore": 0.7490234375,
446
+ "eval_loss": 1.8991097211837769,
447
+ "eval_rouge1": 0.6651730257289085,
448
+ "eval_rouge2": 0.3778893043274054,
449
+ "eval_rougeL": 0.5782673838033503,
450
+ "eval_rougeLsum": 0.6516865674488727,
451
+ "eval_runtime": 22.0202,
452
+ "eval_samples_per_second": 1.362,
453
+ "eval_steps_per_second": 0.681,
454
+ "step": 4800
455
+ },
456
+ {
457
+ "epoch": 0.46,
458
+ "grad_norm": 0.17979757487773895,
459
+ "learning_rate": 0.00015416074457842374,
460
+ "loss": 1.8851,
461
+ "step": 5000
462
+ },
463
+ {
464
+ "epoch": 0.46,
465
+ "eval_bertscore": 0.7492111325263977,
466
+ "eval_loss": 1.897339940071106,
467
+ "eval_rouge1": 0.665920573890169,
468
+ "eval_rouge2": 0.37917993898535385,
469
+ "eval_rougeL": 0.5800236892888617,
470
+ "eval_rougeLsum": 0.6529131688355863,
471
+ "eval_runtime": 21.6103,
472
+ "eval_samples_per_second": 1.388,
473
+ "eval_steps_per_second": 0.694,
474
+ "step": 5000
475
+ },
476
+ {
477
+ "epoch": 0.48,
478
+ "eval_bertscore": 0.7491253614425659,
479
+ "eval_loss": 1.897528052330017,
480
+ "eval_rouge1": 0.6653452054219615,
481
+ "eval_rouge2": 0.3759208437918665,
482
+ "eval_rougeL": 0.5776757077854651,
483
+ "eval_rougeLsum": 0.6511876484723524,
484
+ "eval_runtime": 21.3101,
485
+ "eval_samples_per_second": 1.408,
486
+ "eval_steps_per_second": 0.704,
487
+ "step": 5200
488
+ },
489
+ {
490
+ "epoch": 0.48,
491
+ "grad_norm": 0.16869671642780304,
492
+ "learning_rate": 0.00015186832332309387,
493
+ "loss": 1.8783,
494
+ "step": 5250
495
+ },
496
+ {
497
+ "epoch": 0.5,
498
+ "eval_bertscore": 0.7494469881057739,
499
+ "eval_loss": 1.895969271659851,
500
+ "eval_rouge1": 0.6660951369469854,
501
+ "eval_rouge2": 0.3764077134133328,
502
+ "eval_rougeL": 0.578785826234568,
503
+ "eval_rougeLsum": 0.6525967284041656,
504
+ "eval_runtime": 21.7955,
505
+ "eval_samples_per_second": 1.376,
506
+ "eval_steps_per_second": 0.688,
507
+ "step": 5400
508
+ },
509
+ {
510
+ "epoch": 0.5,
511
+ "grad_norm": 0.15996231138706207,
512
+ "learning_rate": 0.00014957590206776398,
513
+ "loss": 1.8805,
514
+ "step": 5500
515
+ },
516
+ {
517
+ "epoch": 0.51,
518
+ "eval_bertscore": 0.7486470341682434,
519
+ "eval_loss": 1.8955131769180298,
520
+ "eval_rouge1": 0.6670292173522965,
521
+ "eval_rouge2": 0.37457018529010144,
522
+ "eval_rougeL": 0.5775243235432015,
523
+ "eval_rougeLsum": 0.652574079807632,
524
+ "eval_runtime": 21.7576,
525
+ "eval_samples_per_second": 1.379,
526
+ "eval_steps_per_second": 0.689,
527
+ "step": 5600
528
+ },
529
+ {
530
+ "epoch": 0.53,
531
+ "grad_norm": 0.17192547023296356,
532
+ "learning_rate": 0.00014728348081243412,
533
+ "loss": 1.8884,
534
+ "step": 5750
535
+ },
536
+ {
537
+ "epoch": 0.53,
538
+ "eval_bertscore": 0.7483081817626953,
539
+ "eval_loss": 1.895763874053955,
540
+ "eval_rouge1": 0.6659275328276997,
541
+ "eval_rouge2": 0.3778666475350364,
542
+ "eval_rougeL": 0.579425140056643,
543
+ "eval_rougeLsum": 0.6515870828784887,
544
+ "eval_runtime": 21.6648,
545
+ "eval_samples_per_second": 1.385,
546
+ "eval_steps_per_second": 0.692,
547
+ "step": 5800
548
+ },
549
+ {
550
+ "epoch": 0.55,
551
+ "grad_norm": 0.15838442742824554,
552
+ "learning_rate": 0.00014499105955710422,
553
+ "loss": 1.8913,
554
+ "step": 6000
555
+ },
556
+ {
557
+ "epoch": 0.55,
558
+ "eval_bertscore": 0.7493732571601868,
559
+ "eval_loss": 1.8914682865142822,
560
+ "eval_rouge1": 0.6669695240447069,
561
+ "eval_rouge2": 0.3769441114214874,
562
+ "eval_rougeL": 0.5798986667152066,
563
+ "eval_rougeLsum": 0.6534527583592111,
564
+ "eval_runtime": 21.4686,
565
+ "eval_samples_per_second": 1.397,
566
+ "eval_steps_per_second": 0.699,
567
+ "step": 6000
568
+ },
569
+ {
570
+ "epoch": 0.57,
571
+ "eval_bertscore": 0.7510559558868408,
572
+ "eval_loss": 1.8923884630203247,
573
+ "eval_rouge1": 0.6677938121282943,
574
+ "eval_rouge2": 0.37854575387307554,
575
+ "eval_rougeL": 0.5817052753830161,
576
+ "eval_rougeLsum": 0.6534737907551461,
577
+ "eval_runtime": 21.593,
578
+ "eval_samples_per_second": 1.389,
579
+ "eval_steps_per_second": 0.695,
580
+ "step": 6200
581
+ },
582
+ {
583
+ "epoch": 0.57,
584
+ "grad_norm": 0.15312573313713074,
585
+ "learning_rate": 0.00014269863830177433,
586
+ "loss": 1.8705,
587
+ "step": 6250
588
+ },
589
+ {
590
+ "epoch": 0.59,
591
+ "eval_bertscore": 0.7479371428489685,
592
+ "eval_loss": 1.891802430152893,
593
+ "eval_rouge1": 0.6658674357402252,
594
+ "eval_rouge2": 0.3757712649269345,
595
+ "eval_rougeL": 0.5791817270712349,
596
+ "eval_rougeLsum": 0.6509960265397259,
597
+ "eval_runtime": 21.8726,
598
+ "eval_samples_per_second": 1.372,
599
+ "eval_steps_per_second": 0.686,
600
+ "step": 6400
601
+ },
602
+ {
603
+ "epoch": 0.6,
604
+ "grad_norm": 0.15844614803791046,
605
+ "learning_rate": 0.00014040621704644447,
606
+ "loss": 1.8643,
607
+ "step": 6500
608
+ },
609
+ {
610
+ "epoch": 0.61,
611
+ "eval_bertscore": 0.7484550476074219,
612
+ "eval_loss": 1.8903728723526,
613
+ "eval_rouge1": 0.6683828816523312,
614
+ "eval_rouge2": 0.37811618722345436,
615
+ "eval_rougeL": 0.5802581730590705,
616
+ "eval_rougeLsum": 0.6534402764651661,
617
+ "eval_runtime": 21.8343,
618
+ "eval_samples_per_second": 1.374,
619
+ "eval_steps_per_second": 0.687,
620
+ "step": 6600
621
+ },
622
+ {
623
+ "epoch": 0.62,
624
+ "grad_norm": 0.1661410629749298,
625
+ "learning_rate": 0.00013811379579111458,
626
+ "loss": 1.877,
627
+ "step": 6750
628
+ },
629
+ {
630
+ "epoch": 0.62,
631
+ "eval_bertscore": 0.747416615486145,
632
+ "eval_loss": 1.8915189504623413,
633
+ "eval_rouge1": 0.6644777881148224,
634
+ "eval_rouge2": 0.3747657029706615,
635
+ "eval_rougeL": 0.5793454557198501,
636
+ "eval_rougeLsum": 0.6521716611395593,
637
+ "eval_runtime": 21.523,
638
+ "eval_samples_per_second": 1.394,
639
+ "eval_steps_per_second": 0.697,
640
+ "step": 6800
641
+ },
642
+ {
643
+ "epoch": 0.64,
644
+ "grad_norm": 0.16483080387115479,
645
+ "learning_rate": 0.00013582137453578468,
646
+ "loss": 1.8792,
647
+ "step": 7000
648
+ },
649
+ {
650
+ "epoch": 0.64,
651
+ "eval_bertscore": 0.7480576634407043,
652
+ "eval_loss": 1.8913365602493286,
653
+ "eval_rouge1": 0.6655764268912302,
654
+ "eval_rouge2": 0.3757671289735428,
655
+ "eval_rougeL": 0.577951380212153,
656
+ "eval_rougeLsum": 0.6507587412359694,
657
+ "eval_runtime": 21.3067,
658
+ "eval_samples_per_second": 1.408,
659
+ "eval_steps_per_second": 0.704,
660
+ "step": 7000
661
+ },
662
+ {
663
+ "epoch": 0.66,
664
+ "eval_bertscore": 0.7505319714546204,
665
+ "eval_loss": 1.889721155166626,
666
+ "eval_rouge1": 0.6706532239207523,
667
+ "eval_rouge2": 0.37986537729431724,
668
+ "eval_rougeL": 0.5824624008038861,
669
+ "eval_rougeLsum": 0.6571986550416876,
670
+ "eval_runtime": 21.8193,
671
+ "eval_samples_per_second": 1.375,
672
+ "eval_steps_per_second": 0.687,
673
+ "step": 7200
674
+ },
675
+ {
676
+ "epoch": 0.66,
677
+ "grad_norm": 0.1685444712638855,
678
+ "learning_rate": 0.00013352895328045482,
679
+ "loss": 1.8748,
680
+ "step": 7250
681
+ },
682
+ {
683
+ "epoch": 0.68,
684
+ "eval_bertscore": 0.7472131252288818,
685
+ "eval_loss": 1.889514684677124,
686
+ "eval_rouge1": 0.6647481520892182,
687
+ "eval_rouge2": 0.3727968089505218,
688
+ "eval_rougeL": 0.5772333167389081,
689
+ "eval_rougeLsum": 0.6503920840351167,
690
+ "eval_runtime": 21.5794,
691
+ "eval_samples_per_second": 1.39,
692
+ "eval_steps_per_second": 0.695,
693
+ "step": 7400
694
+ },
695
+ {
696
+ "epoch": 0.69,
697
+ "grad_norm": 0.16196218132972717,
698
+ "learning_rate": 0.00013123653202512493,
699
+ "loss": 1.8958,
700
+ "step": 7500
701
+ },
702
+ {
703
+ "epoch": 0.7,
704
+ "eval_bertscore": 0.7467525005340576,
705
+ "eval_loss": 1.8874704837799072,
706
+ "eval_rouge1": 0.6652789954777591,
707
+ "eval_rouge2": 0.3747211875622626,
708
+ "eval_rougeL": 0.5781018250975862,
709
+ "eval_rougeLsum": 0.6512065884264598,
710
+ "eval_runtime": 21.6436,
711
+ "eval_samples_per_second": 1.386,
712
+ "eval_steps_per_second": 0.693,
713
+ "step": 7600
714
+ },
715
+ {
716
+ "epoch": 0.71,
717
+ "grad_norm": 0.17379231750965118,
718
+ "learning_rate": 0.00012894411076979506,
719
+ "loss": 1.8655,
720
+ "step": 7750
721
+ },
722
+ {
723
+ "epoch": 0.72,
724
+ "eval_bertscore": 0.7478018403053284,
725
+ "eval_loss": 1.8879252672195435,
726
+ "eval_rouge1": 0.6676077444849423,
727
+ "eval_rouge2": 0.37550824667101645,
728
+ "eval_rougeL": 0.5792625587400696,
729
+ "eval_rougeLsum": 0.6537654224373248,
730
+ "eval_runtime": 21.8026,
731
+ "eval_samples_per_second": 1.376,
732
+ "eval_steps_per_second": 0.688,
733
+ "step": 7800
734
+ },
735
+ {
736
+ "epoch": 0.73,
737
+ "grad_norm": 0.17975503206253052,
738
+ "learning_rate": 0.00012665168951446517,
739
+ "loss": 1.8593,
740
+ "step": 8000
741
+ },
742
+ {
743
+ "epoch": 0.73,
744
+ "eval_bertscore": 0.7490061521530151,
745
+ "eval_loss": 1.8872514963150024,
746
+ "eval_rouge1": 0.6677074837057098,
747
+ "eval_rouge2": 0.37723681410973775,
748
+ "eval_rougeL": 0.5806554105436175,
749
+ "eval_rougeLsum": 0.6531691046113964,
750
+ "eval_runtime": 21.2682,
751
+ "eval_samples_per_second": 1.411,
752
+ "eval_steps_per_second": 0.705,
753
+ "step": 8000
754
+ },
755
+ {
756
+ "epoch": 0.75,
757
+ "eval_bertscore": 0.7476587295532227,
758
+ "eval_loss": 1.8857940435409546,
759
+ "eval_rouge1": 0.6675733171919529,
760
+ "eval_rouge2": 0.37667421034338344,
761
+ "eval_rougeL": 0.5804128987718613,
762
+ "eval_rougeLsum": 0.6534287804714597,
763
+ "eval_runtime": 21.5325,
764
+ "eval_samples_per_second": 1.393,
765
+ "eval_steps_per_second": 0.697,
766
+ "step": 8200
767
+ },
768
+ {
769
+ "epoch": 0.76,
770
+ "grad_norm": 0.1596900373697281,
771
+ "learning_rate": 0.0001243592682591353,
772
+ "loss": 1.8627,
773
+ "step": 8250
774
+ },
775
+ {
776
+ "epoch": 0.77,
777
+ "eval_bertscore": 0.7444086074829102,
778
+ "eval_loss": 1.8874648809432983,
779
+ "eval_rouge1": 0.6633779669482168,
780
+ "eval_rouge2": 0.3710094509675216,
781
+ "eval_rougeL": 0.5760576627400225,
782
+ "eval_rougeLsum": 0.6499803336918719,
783
+ "eval_runtime": 21.4464,
784
+ "eval_samples_per_second": 1.399,
785
+ "eval_steps_per_second": 0.699,
786
+ "step": 8400
787
+ },
788
+ {
789
+ "epoch": 0.78,
790
+ "grad_norm": 0.16890183091163635,
791
+ "learning_rate": 0.00012206684700380542,
792
+ "loss": 1.8534,
793
+ "step": 8500
794
+ },
795
+ {
796
+ "epoch": 0.79,
797
+ "eval_bertscore": 0.7483052611351013,
798
+ "eval_loss": 1.8880757093429565,
799
+ "eval_rouge1": 0.6686948143176776,
800
+ "eval_rouge2": 0.3803796130427515,
801
+ "eval_rougeL": 0.5802459813261722,
802
+ "eval_rougeLsum": 0.6536962466082527,
803
+ "eval_runtime": 21.5416,
804
+ "eval_samples_per_second": 1.393,
805
+ "eval_steps_per_second": 0.696,
806
+ "step": 8600
807
+ },
808
+ {
809
+ "epoch": 0.8,
810
+ "grad_norm": 0.1596900373697281,
811
+ "learning_rate": 0.00011977442574847555,
812
+ "loss": 1.882,
813
+ "step": 8750
814
+ },
815
+ {
816
+ "epoch": 0.81,
817
+ "eval_bertscore": 0.748338520526886,
818
+ "eval_loss": 1.8871524333953857,
819
+ "eval_rouge1": 0.6673919143770407,
820
+ "eval_rouge2": 0.3761761743795482,
821
+ "eval_rougeL": 0.5797615995019129,
822
+ "eval_rougeLsum": 0.6526650363891257,
823
+ "eval_runtime": 21.8432,
824
+ "eval_samples_per_second": 1.373,
825
+ "eval_steps_per_second": 0.687,
826
+ "step": 8800
827
+ },
828
+ {
829
+ "epoch": 0.83,
830
+ "grad_norm": 0.16380883753299713,
831
+ "learning_rate": 0.00011748200449314565,
832
+ "loss": 1.8781,
833
+ "step": 9000
834
+ },
835
+ {
836
+ "epoch": 0.83,
837
+ "eval_bertscore": 0.7473989129066467,
838
+ "eval_loss": 1.885389804840088,
839
+ "eval_rouge1": 0.6660513187618474,
840
+ "eval_rouge2": 0.3728645884799071,
841
+ "eval_rougeL": 0.5767833607673931,
842
+ "eval_rougeLsum": 0.6518177265346137,
843
+ "eval_runtime": 21.5415,
844
+ "eval_samples_per_second": 1.393,
845
+ "eval_steps_per_second": 0.696,
846
+ "step": 9000
847
+ },
848
+ {
849
+ "epoch": 0.84,
850
+ "eval_bertscore": 0.7469697594642639,
851
+ "eval_loss": 1.8835673332214355,
852
+ "eval_rouge1": 0.6655382276884847,
853
+ "eval_rouge2": 0.3743925229327822,
854
+ "eval_rougeL": 0.5808516524350132,
855
+ "eval_rougeLsum": 0.6518276923554284,
856
+ "eval_runtime": 21.7289,
857
+ "eval_samples_per_second": 1.381,
858
+ "eval_steps_per_second": 0.69,
859
+ "step": 9200
860
+ },
861
+ {
862
+ "epoch": 0.85,
863
+ "grad_norm": 0.17286422848701477,
864
+ "learning_rate": 0.00011518958323781579,
865
+ "loss": 1.8672,
866
+ "step": 9250
867
+ },
868
+ {
869
+ "epoch": 0.86,
870
+ "eval_bertscore": 0.7491498589515686,
871
+ "eval_loss": 1.8845998048782349,
872
+ "eval_rouge1": 0.6670160490080832,
873
+ "eval_rouge2": 0.37860182825781935,
874
+ "eval_rougeL": 0.5797856034485049,
875
+ "eval_rougeLsum": 0.6531203725936218,
876
+ "eval_runtime": 21.5625,
877
+ "eval_samples_per_second": 1.391,
878
+ "eval_steps_per_second": 0.696,
879
+ "step": 9400
880
+ },
881
+ {
882
+ "epoch": 0.87,
883
+ "grad_norm": 0.16658568382263184,
884
+ "learning_rate": 0.0001128971619824859,
885
+ "loss": 1.8691,
886
+ "step": 9500
887
+ },
888
+ {
889
+ "epoch": 0.88,
890
+ "eval_bertscore": 0.7493313550949097,
891
+ "eval_loss": 1.8821747303009033,
892
+ "eval_rouge1": 0.6659791441681278,
893
+ "eval_rouge2": 0.3796033834485131,
894
+ "eval_rougeL": 0.580414529806212,
895
+ "eval_rougeLsum": 0.6528068238734432,
896
+ "eval_runtime": 21.8698,
897
+ "eval_samples_per_second": 1.372,
898
+ "eval_steps_per_second": 0.686,
899
+ "step": 9600
900
+ },
901
+ {
902
+ "epoch": 0.89,
903
+ "grad_norm": 0.1733073741197586,
904
+ "learning_rate": 0.00011060474072715603,
905
+ "loss": 1.8575,
906
+ "step": 9750
907
+ },
908
+ {
909
+ "epoch": 0.9,
910
+ "eval_bertscore": 0.7497690320014954,
911
+ "eval_loss": 1.8809062242507935,
912
+ "eval_rouge1": 0.6683202809005669,
913
+ "eval_rouge2": 0.379647408271533,
914
+ "eval_rougeL": 0.5812799059293663,
915
+ "eval_rougeLsum": 0.6549076224428805,
916
+ "eval_runtime": 21.461,
917
+ "eval_samples_per_second": 1.398,
918
+ "eval_steps_per_second": 0.699,
919
+ "step": 9800
920
+ },
921
+ {
922
+ "epoch": 0.92,
923
+ "grad_norm": 0.16828681528568268,
924
+ "learning_rate": 0.00010831231947182614,
925
+ "loss": 1.8799,
926
+ "step": 10000
927
+ },
928
+ {
929
+ "epoch": 0.92,
930
+ "eval_bertscore": 0.7487274408340454,
931
+ "eval_loss": 1.8800114393234253,
932
+ "eval_rouge1": 0.6694707226380743,
933
+ "eval_rouge2": 0.37780830529690856,
934
+ "eval_rougeL": 0.5789377835641822,
935
+ "eval_rougeLsum": 0.6540561492044448,
936
+ "eval_runtime": 21.6228,
937
+ "eval_samples_per_second": 1.387,
938
+ "eval_steps_per_second": 0.694,
939
+ "step": 10000
940
+ },
941
+ {
942
+ "epoch": 0.94,
943
+ "eval_bertscore": 0.7495086789131165,
944
+ "eval_loss": 1.8811218738555908,
945
+ "eval_rouge1": 0.6714277794869861,
946
+ "eval_rouge2": 0.3814957239141348,
947
+ "eval_rougeL": 0.5817721016839257,
948
+ "eval_rougeLsum": 0.6566092952916721,
949
+ "eval_runtime": 23.1282,
950
+ "eval_samples_per_second": 1.297,
951
+ "eval_steps_per_second": 0.649,
952
+ "step": 10200
953
+ },
954
+ {
955
+ "epoch": 0.94,
956
+ "grad_norm": 0.16498848795890808,
957
+ "learning_rate": 0.00010601989821649627,
958
+ "loss": 1.8656,
959
+ "step": 10250
960
+ },
961
+ {
962
+ "epoch": 0.95,
963
+ "eval_bertscore": 0.749505877494812,
964
+ "eval_loss": 1.8809926509857178,
965
+ "eval_rouge1": 0.6720420767359538,
966
+ "eval_rouge2": 0.38239237549289784,
967
+ "eval_rougeL": 0.5825845512902208,
968
+ "eval_rougeLsum": 0.6590116525116119,
969
+ "eval_runtime": 21.5266,
970
+ "eval_samples_per_second": 1.394,
971
+ "eval_steps_per_second": 0.697,
972
+ "step": 10400
973
+ },
974
+ {
975
+ "epoch": 0.96,
976
+ "grad_norm": 0.1661728322505951,
977
+ "learning_rate": 0.00010372747696116638,
978
+ "loss": 1.8633,
979
+ "step": 10500
980
+ },
981
+ {
982
+ "epoch": 0.97,
983
+ "eval_bertscore": 0.7484509944915771,
984
+ "eval_loss": 1.8795918226242065,
985
+ "eval_rouge1": 0.66861224256168,
986
+ "eval_rouge2": 0.3810938571231235,
987
+ "eval_rougeL": 0.581338929419374,
988
+ "eval_rougeLsum": 0.6556287448758898,
989
+ "eval_runtime": 21.6144,
990
+ "eval_samples_per_second": 1.388,
991
+ "eval_steps_per_second": 0.694,
992
+ "step": 10600
993
+ },
994
+ {
995
+ "epoch": 0.99,
996
+ "grad_norm": 0.1695539355278015,
997
+ "learning_rate": 0.00010143505570583652,
998
+ "loss": 1.8778,
999
+ "step": 10750
1000
+ },
1001
+ {
1002
+ "epoch": 0.99,
1003
+ "eval_bertscore": 0.747430145740509,
1004
+ "eval_loss": 1.8807307481765747,
1005
+ "eval_rouge1": 0.6659775067192504,
1006
+ "eval_rouge2": 0.37723044840422537,
1007
+ "eval_rougeL": 0.5790798830214317,
1008
+ "eval_rougeLsum": 0.6509981906464294,
1009
+ "eval_runtime": 21.9658,
1010
+ "eval_samples_per_second": 1.366,
1011
+ "eval_steps_per_second": 0.683,
1012
+ "step": 10800
1013
+ },
1014
+ {
1015
+ "epoch": 1.01,
1016
+ "grad_norm": 0.18244074285030365,
1017
+ "learning_rate": 9.914263445050664e-05,
1018
+ "loss": 1.8425,
1019
+ "step": 11000
1020
+ },
1021
+ {
1022
+ "epoch": 1.01,
1023
+ "eval_bertscore": 0.7464674711227417,
1024
+ "eval_loss": 1.8850181102752686,
1025
+ "eval_rouge1": 0.6682062462715245,
1026
+ "eval_rouge2": 0.377961045305675,
1027
+ "eval_rougeL": 0.5785946041032981,
1028
+ "eval_rougeLsum": 0.6544658695180745,
1029
+ "eval_runtime": 21.4985,
1030
+ "eval_samples_per_second": 1.395,
1031
+ "eval_steps_per_second": 0.698,
1032
+ "step": 11000
1033
+ },
1034
+ {
1035
+ "epoch": 1.03,
1036
+ "eval_bertscore": 0.748903214931488,
1037
+ "eval_loss": 1.8819694519042969,
1038
+ "eval_rouge1": 0.6702994540242251,
1039
+ "eval_rouge2": 0.38293287997414793,
1040
+ "eval_rougeL": 0.5814513237567966,
1041
+ "eval_rougeLsum": 0.6559726946972199,
1042
+ "eval_runtime": 21.4139,
1043
+ "eval_samples_per_second": 1.401,
1044
+ "eval_steps_per_second": 0.7,
1045
+ "step": 11200
1046
+ }
1047
+ ],
1048
+ "logging_steps": 250,
1049
+ "max_steps": 21812,
1050
+ "num_input_tokens_seen": 0,
1051
+ "num_train_epochs": 2,
1052
+ "save_steps": 800,
1053
+ "total_flos": 7.549926966086861e+17,
1054
+ "train_batch_size": 2,
1055
+ "trial_name": null,
1056
+ "trial_params": null
1057
+ }
checkpoint-11200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a96e9baacd033c0a419444553d18b70e4f76e7b37401a6dcc6b00ceb2cc1e1
3
+ size 5048
checkpoint-11200/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-12000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: /workspace/model-export/allstax/shorting-phi-e4
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-12000/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/workspace/model-export/allstax/shorting-phi-e4",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "fc2",
26
+ "dense",
27
+ "v_proj",
28
+ "fc1"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
checkpoint-12000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bcc998ff37f69532992962775aed4f6a80b3c8e009e23e20b1bf5dbcbb83fd9
3
+ size 377538512
checkpoint-12000/added_tokens.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257
40
+ }
checkpoint-12000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-12000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:635f5075cdcf9121978808d531e507836afe13157c1a1a5ed7d374290c267ed1
3
+ size 14244
checkpoint-12000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3e3ee4e174f10e984375d35252785e934ed01d8147400bdc0335f19ae1f06e9
3
+ size 1064
checkpoint-12000/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-12000/tokenizer_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": " ",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "50258": {
22
+ "content": " ",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50259": {
30
+ "content": " ",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50260": {
38
+ "content": " ",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50261": {
46
+ "content": " ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50262": {
54
+ "content": " ",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50263": {
62
+ "content": " ",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50264": {
70
+ "content": " ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50265": {
78
+ "content": " ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50266": {
86
+ "content": " ",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50267": {
94
+ "content": " ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50268": {
102
+ "content": " ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50269": {
110
+ "content": " ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50270": {
118
+ "content": " ",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50271": {
126
+ "content": " ",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50272": {
134
+ "content": " ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50273": {
142
+ "content": " ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50274": {
150
+ "content": " ",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50275": {
158
+ "content": " ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50276": {
166
+ "content": " ",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50277": {
174
+ "content": " ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50278": {
182
+ "content": " ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50279": {
190
+ "content": " ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50280": {
198
+ "content": " ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50281": {
206
+ "content": " ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50282": {
214
+ "content": " ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50283": {
222
+ "content": " ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50284": {
230
+ "content": " ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50285": {
238
+ "content": " ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50286": {
246
+ "content": " ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50287": {
254
+ "content": "\t\t\t\t\t\t\t\t\t",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50288": {
262
+ "content": "\t\t\t\t\t\t\t\t",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50289": {
270
+ "content": "\t\t\t\t\t\t\t",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50290": {
278
+ "content": "\t\t\t\t\t\t",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50291": {
286
+ "content": "\t\t\t\t\t",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50292": {
294
+ "content": "\t\t\t\t",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50293": {
302
+ "content": "\t\t\t",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50294": {
310
+ "content": "\t\t",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ }
317
+ },
318
+ "bos_token": "<|endoftext|>",
319
+ "clean_up_tokenization_spaces": true,
320
+ "eos_token": "<|endoftext|>",
321
+ "errors": "replace",
322
+ "model_max_length": 2048,
323
+ "pad_token": "<|endoftext|>",
324
+ "padding_side": "left",
325
+ "tokenizer_class": "CodeGenTokenizer",
326
+ "unk_token": "<|endoftext|>"
327
+ }
checkpoint-12000/trainer_state.json ADDED
@@ -0,0 +1,1137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.1002613120616147,
5
+ "eval_steps": 200,
6
+ "global_step": 12000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "eval_bertscore": 0.7401605248451233,
14
+ "eval_loss": 1.9530484676361084,
15
+ "eval_rouge1": 0.6562857460474375,
16
+ "eval_rouge2": 0.3640670727106235,
17
+ "eval_rougeL": 0.5655212336424695,
18
+ "eval_rougeLsum": 0.6414840198810386,
19
+ "eval_runtime": 21.7196,
20
+ "eval_samples_per_second": 1.381,
21
+ "eval_steps_per_second": 0.691,
22
+ "step": 200
23
+ },
24
+ {
25
+ "epoch": 0.02,
26
+ "grad_norm": 0.25105270743370056,
27
+ "learning_rate": 0.00019771674842969145,
28
+ "loss": 1.7353,
29
+ "step": 250
30
+ },
31
+ {
32
+ "epoch": 0.04,
33
+ "eval_bertscore": 0.7432050108909607,
34
+ "eval_loss": 1.9583823680877686,
35
+ "eval_rouge1": 0.6554226269617707,
36
+ "eval_rouge2": 0.36661086995296877,
37
+ "eval_rougeL": 0.5637448790342183,
38
+ "eval_rougeLsum": 0.6419796784912521,
39
+ "eval_runtime": 21.9623,
40
+ "eval_samples_per_second": 1.366,
41
+ "eval_steps_per_second": 0.683,
42
+ "step": 400
43
+ },
44
+ {
45
+ "epoch": 0.05,
46
+ "grad_norm": 0.26550447940826416,
47
+ "learning_rate": 0.00019542432717436156,
48
+ "loss": 1.7786,
49
+ "step": 500
50
+ },
51
+ {
52
+ "epoch": 0.06,
53
+ "eval_bertscore": 0.7469045519828796,
54
+ "eval_loss": 1.9245686531066895,
55
+ "eval_rouge1": 0.6662431635890791,
56
+ "eval_rouge2": 0.3735263724826765,
57
+ "eval_rougeL": 0.5755071616151013,
58
+ "eval_rougeLsum": 0.6538383087686117,
59
+ "eval_runtime": 21.5302,
60
+ "eval_samples_per_second": 1.393,
61
+ "eval_steps_per_second": 0.697,
62
+ "step": 600
63
+ },
64
+ {
65
+ "epoch": 0.07,
66
+ "grad_norm": 0.1538015753030777,
67
+ "learning_rate": 0.0001931319059190317,
68
+ "loss": 1.8851,
69
+ "step": 750
70
+ },
71
+ {
72
+ "epoch": 0.07,
73
+ "eval_bertscore": 0.7442477941513062,
74
+ "eval_loss": 1.9187489748001099,
75
+ "eval_rouge1": 0.6606221897489035,
76
+ "eval_rouge2": 0.368654563659435,
77
+ "eval_rougeL": 0.5731546210408094,
78
+ "eval_rougeLsum": 0.6470590823125606,
79
+ "eval_runtime": 21.9831,
80
+ "eval_samples_per_second": 1.365,
81
+ "eval_steps_per_second": 0.682,
82
+ "step": 800
83
+ },
84
+ {
85
+ "epoch": 0.09,
86
+ "grad_norm": 0.1681252270936966,
87
+ "learning_rate": 0.0001908394846637018,
88
+ "loss": 1.8919,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 0.09,
93
+ "eval_bertscore": 0.7458053231239319,
94
+ "eval_loss": 1.9159075021743774,
95
+ "eval_rouge1": 0.6621259186456026,
96
+ "eval_rouge2": 0.372024043683234,
97
+ "eval_rougeL": 0.5743354509339939,
98
+ "eval_rougeLsum": 0.6491550893780276,
99
+ "eval_runtime": 21.7159,
100
+ "eval_samples_per_second": 1.381,
101
+ "eval_steps_per_second": 0.691,
102
+ "step": 1000
103
+ },
104
+ {
105
+ "epoch": 0.11,
106
+ "eval_bertscore": 0.7468854784965515,
107
+ "eval_loss": 1.9140182733535767,
108
+ "eval_rouge1": 0.6626581781149132,
109
+ "eval_rouge2": 0.37318557504782157,
110
+ "eval_rougeL": 0.5759264203594217,
111
+ "eval_rougeLsum": 0.6490702446275723,
112
+ "eval_runtime": 21.6486,
113
+ "eval_samples_per_second": 1.386,
114
+ "eval_steps_per_second": 0.693,
115
+ "step": 1200
116
+ },
117
+ {
118
+ "epoch": 0.11,
119
+ "grad_norm": 0.1552441120147705,
120
+ "learning_rate": 0.00018854706340837193,
121
+ "loss": 1.9052,
122
+ "step": 1250
123
+ },
124
+ {
125
+ "epoch": 0.13,
126
+ "eval_bertscore": 0.7475314736366272,
127
+ "eval_loss": 1.913794755935669,
128
+ "eval_rouge1": 0.6648687174353192,
129
+ "eval_rouge2": 0.3760379232448734,
130
+ "eval_rougeL": 0.5784915488164926,
131
+ "eval_rougeLsum": 0.6513864520108938,
132
+ "eval_runtime": 21.664,
133
+ "eval_samples_per_second": 1.385,
134
+ "eval_steps_per_second": 0.692,
135
+ "step": 1400
136
+ },
137
+ {
138
+ "epoch": 0.14,
139
+ "grad_norm": 0.14638397097587585,
140
+ "learning_rate": 0.00018625464215304204,
141
+ "loss": 1.8843,
142
+ "step": 1500
143
+ },
144
+ {
145
+ "epoch": 0.15,
146
+ "eval_bertscore": 0.747238039970398,
147
+ "eval_loss": 1.9117029905319214,
148
+ "eval_rouge1": 0.6638085237198453,
149
+ "eval_rouge2": 0.3742779818055127,
150
+ "eval_rougeL": 0.5754209460423059,
151
+ "eval_rougeLsum": 0.6506476155592722,
152
+ "eval_runtime": 21.9308,
153
+ "eval_samples_per_second": 1.368,
154
+ "eval_steps_per_second": 0.684,
155
+ "step": 1600
156
+ },
157
+ {
158
+ "epoch": 0.16,
159
+ "grad_norm": 0.15738993883132935,
160
+ "learning_rate": 0.00018396222089771218,
161
+ "loss": 1.8964,
162
+ "step": 1750
163
+ },
164
+ {
165
+ "epoch": 0.17,
166
+ "eval_bertscore": 0.7473016381263733,
167
+ "eval_loss": 1.9117563962936401,
168
+ "eval_rouge1": 0.6620053151663765,
169
+ "eval_rouge2": 0.37406692119411245,
170
+ "eval_rougeL": 0.5758911607323577,
171
+ "eval_rougeLsum": 0.6494070575604445,
172
+ "eval_runtime": 21.6727,
173
+ "eval_samples_per_second": 1.384,
174
+ "eval_steps_per_second": 0.692,
175
+ "step": 1800
176
+ },
177
+ {
178
+ "epoch": 0.18,
179
+ "grad_norm": 0.1588907092809677,
180
+ "learning_rate": 0.00018166979964238228,
181
+ "loss": 1.8827,
182
+ "step": 2000
183
+ },
184
+ {
185
+ "epoch": 0.18,
186
+ "eval_bertscore": 0.7485987544059753,
187
+ "eval_loss": 1.9126006364822388,
188
+ "eval_rouge1": 0.6641836156334741,
189
+ "eval_rouge2": 0.37320215574735827,
190
+ "eval_rougeL": 0.5783015040447993,
191
+ "eval_rougeLsum": 0.6522235940423647,
192
+ "eval_runtime": 21.9759,
193
+ "eval_samples_per_second": 1.365,
194
+ "eval_steps_per_second": 0.683,
195
+ "step": 2000
196
+ },
197
+ {
198
+ "epoch": 0.2,
199
+ "eval_bertscore": 0.7482583522796631,
200
+ "eval_loss": 1.9075205326080322,
201
+ "eval_rouge1": 0.6658219484766166,
202
+ "eval_rouge2": 0.37723364952258465,
203
+ "eval_rougeL": 0.5769040785174693,
204
+ "eval_rougeLsum": 0.6511328888044219,
205
+ "eval_runtime": 21.5892,
206
+ "eval_samples_per_second": 1.39,
207
+ "eval_steps_per_second": 0.695,
208
+ "step": 2200
209
+ },
210
+ {
211
+ "epoch": 0.21,
212
+ "grad_norm": 0.15247465670108795,
213
+ "learning_rate": 0.00017937737838705242,
214
+ "loss": 1.8831,
215
+ "step": 2250
216
+ },
217
+ {
218
+ "epoch": 0.22,
219
+ "eval_bertscore": 0.7460805177688599,
220
+ "eval_loss": 1.9088668823242188,
221
+ "eval_rouge1": 0.6627321043292516,
222
+ "eval_rouge2": 0.3696581195003696,
223
+ "eval_rougeL": 0.5740988544467178,
224
+ "eval_rougeLsum": 0.6478729042661874,
225
+ "eval_runtime": 21.9221,
226
+ "eval_samples_per_second": 1.368,
227
+ "eval_steps_per_second": 0.684,
228
+ "step": 2400
229
+ },
230
+ {
231
+ "epoch": 0.23,
232
+ "grad_norm": 0.1587379276752472,
233
+ "learning_rate": 0.00017708495713172253,
234
+ "loss": 1.8829,
235
+ "step": 2500
236
+ },
237
+ {
238
+ "epoch": 0.24,
239
+ "eval_bertscore": 0.7472203373908997,
240
+ "eval_loss": 1.906219482421875,
241
+ "eval_rouge1": 0.6637415370426804,
242
+ "eval_rouge2": 0.37565276875837994,
243
+ "eval_rougeL": 0.5773879369079004,
244
+ "eval_rougeLsum": 0.6488719947518645,
245
+ "eval_runtime": 21.8112,
246
+ "eval_samples_per_second": 1.375,
247
+ "eval_steps_per_second": 0.688,
248
+ "step": 2600
249
+ },
250
+ {
251
+ "epoch": 0.25,
252
+ "grad_norm": 0.1558646410703659,
253
+ "learning_rate": 0.00017479253587639266,
254
+ "loss": 1.8978,
255
+ "step": 2750
256
+ },
257
+ {
258
+ "epoch": 0.26,
259
+ "eval_bertscore": 0.7466126680374146,
260
+ "eval_loss": 1.9045982360839844,
261
+ "eval_rouge1": 0.6616225540296956,
262
+ "eval_rouge2": 0.37370762164745913,
263
+ "eval_rougeL": 0.5759418528371097,
264
+ "eval_rougeLsum": 0.6479977636906877,
265
+ "eval_runtime": 21.8772,
266
+ "eval_samples_per_second": 1.371,
267
+ "eval_steps_per_second": 0.686,
268
+ "step": 2800
269
+ },
270
+ {
271
+ "epoch": 0.28,
272
+ "grad_norm": 0.14783035218715668,
273
+ "learning_rate": 0.00017250011462106277,
274
+ "loss": 1.8978,
275
+ "step": 3000
276
+ },
277
+ {
278
+ "epoch": 0.28,
279
+ "eval_bertscore": 0.7485571503639221,
280
+ "eval_loss": 1.9035439491271973,
281
+ "eval_rouge1": 0.6664050030501707,
282
+ "eval_rouge2": 0.379492440917784,
283
+ "eval_rougeL": 0.5806973731221475,
284
+ "eval_rougeLsum": 0.6524346156604702,
285
+ "eval_runtime": 21.9217,
286
+ "eval_samples_per_second": 1.369,
287
+ "eval_steps_per_second": 0.684,
288
+ "step": 3000
289
+ },
290
+ {
291
+ "epoch": 0.29,
292
+ "eval_bertscore": 0.7483461499214172,
293
+ "eval_loss": 1.9022458791732788,
294
+ "eval_rouge1": 0.6618989733136488,
295
+ "eval_rouge2": 0.37377379177271053,
296
+ "eval_rougeL": 0.5780989082173933,
297
+ "eval_rougeLsum": 0.6490379362631586,
298
+ "eval_runtime": 21.7847,
299
+ "eval_samples_per_second": 1.377,
300
+ "eval_steps_per_second": 0.689,
301
+ "step": 3200
302
+ },
303
+ {
304
+ "epoch": 0.3,
305
+ "grad_norm": 0.16484151780605316,
306
+ "learning_rate": 0.0001702076933657329,
307
+ "loss": 1.8715,
308
+ "step": 3250
309
+ },
310
+ {
311
+ "epoch": 0.31,
312
+ "eval_bertscore": 0.7490711212158203,
313
+ "eval_loss": 1.9013088941574097,
314
+ "eval_rouge1": 0.6638141306545007,
315
+ "eval_rouge2": 0.37356255553691553,
316
+ "eval_rougeL": 0.577975450251653,
317
+ "eval_rougeLsum": 0.6492478632295806,
318
+ "eval_runtime": 21.8807,
319
+ "eval_samples_per_second": 1.371,
320
+ "eval_steps_per_second": 0.686,
321
+ "step": 3400
322
+ },
323
+ {
324
+ "epoch": 0.32,
325
+ "grad_norm": 0.14130128920078278,
326
+ "learning_rate": 0.000167915272110403,
327
+ "loss": 1.8819,
328
+ "step": 3500
329
+ },
330
+ {
331
+ "epoch": 0.33,
332
+ "eval_bertscore": 0.7475283741950989,
333
+ "eval_loss": 1.9002223014831543,
334
+ "eval_rouge1": 0.6628836314413511,
335
+ "eval_rouge2": 0.37179988805094977,
336
+ "eval_rougeL": 0.5764222388923268,
337
+ "eval_rougeLsum": 0.649864229310889,
338
+ "eval_runtime": 22.124,
339
+ "eval_samples_per_second": 1.356,
340
+ "eval_steps_per_second": 0.678,
341
+ "step": 3600
342
+ },
343
+ {
344
+ "epoch": 0.34,
345
+ "grad_norm": 0.1494186818599701,
346
+ "learning_rate": 0.00016562285085507315,
347
+ "loss": 1.8828,
348
+ "step": 3750
349
+ },
350
+ {
351
+ "epoch": 0.35,
352
+ "eval_bertscore": 0.7486498951911926,
353
+ "eval_loss": 1.9011151790618896,
354
+ "eval_rouge1": 0.6669673680023924,
355
+ "eval_rouge2": 0.3771780440183751,
356
+ "eval_rougeL": 0.5792518624130161,
357
+ "eval_rougeLsum": 0.6534484242953056,
358
+ "eval_runtime": 21.813,
359
+ "eval_samples_per_second": 1.375,
360
+ "eval_steps_per_second": 0.688,
361
+ "step": 3800
362
+ },
363
+ {
364
+ "epoch": 0.37,
365
+ "grad_norm": 0.14803479611873627,
366
+ "learning_rate": 0.00016333042959974325,
367
+ "loss": 1.8761,
368
+ "step": 4000
369
+ },
370
+ {
371
+ "epoch": 0.37,
372
+ "eval_bertscore": 0.7471507787704468,
373
+ "eval_loss": 1.9001713991165161,
374
+ "eval_rouge1": 0.6651735220672027,
375
+ "eval_rouge2": 0.3736698451416937,
376
+ "eval_rougeL": 0.5779938808281732,
377
+ "eval_rougeLsum": 0.6509815118131576,
378
+ "eval_runtime": 21.5004,
379
+ "eval_samples_per_second": 1.395,
380
+ "eval_steps_per_second": 0.698,
381
+ "step": 4000
382
+ },
383
+ {
384
+ "epoch": 0.39,
385
+ "eval_bertscore": 0.7485501766204834,
386
+ "eval_loss": 1.8993827104568481,
387
+ "eval_rouge1": 0.6646424082737133,
388
+ "eval_rouge2": 0.37318485364862475,
389
+ "eval_rougeL": 0.5773338159759467,
390
+ "eval_rougeLsum": 0.6507594353103527,
391
+ "eval_runtime": 21.2963,
392
+ "eval_samples_per_second": 1.409,
393
+ "eval_steps_per_second": 0.704,
394
+ "step": 4200
395
+ },
396
+ {
397
+ "epoch": 0.39,
398
+ "grad_norm": 0.15562959015369415,
399
+ "learning_rate": 0.0001610380083444134,
400
+ "loss": 1.8672,
401
+ "step": 4250
402
+ },
403
+ {
404
+ "epoch": 0.4,
405
+ "eval_bertscore": 0.7469989061355591,
406
+ "eval_loss": 1.900540828704834,
407
+ "eval_rouge1": 0.6620664558691891,
408
+ "eval_rouge2": 0.37299419371215703,
409
+ "eval_rougeL": 0.5765442194831125,
410
+ "eval_rougeLsum": 0.6472642385429858,
411
+ "eval_runtime": 21.9086,
412
+ "eval_samples_per_second": 1.369,
413
+ "eval_steps_per_second": 0.685,
414
+ "step": 4400
415
+ },
416
+ {
417
+ "epoch": 0.41,
418
+ "grad_norm": 0.15420928597450256,
419
+ "learning_rate": 0.0001587455870890835,
420
+ "loss": 1.8754,
421
+ "step": 4500
422
+ },
423
+ {
424
+ "epoch": 0.42,
425
+ "eval_bertscore": 0.7475299835205078,
426
+ "eval_loss": 1.8988685607910156,
427
+ "eval_rouge1": 0.6656661780424216,
428
+ "eval_rouge2": 0.37467258880478527,
429
+ "eval_rougeL": 0.5770800519970718,
430
+ "eval_rougeLsum": 0.6522703864288166,
431
+ "eval_runtime": 22.063,
432
+ "eval_samples_per_second": 1.36,
433
+ "eval_steps_per_second": 0.68,
434
+ "step": 4600
435
+ },
436
+ {
437
+ "epoch": 0.44,
438
+ "grad_norm": 0.15809176862239838,
439
+ "learning_rate": 0.00015645316583375363,
440
+ "loss": 1.8848,
441
+ "step": 4750
442
+ },
443
+ {
444
+ "epoch": 0.44,
445
+ "eval_bertscore": 0.7490234375,
446
+ "eval_loss": 1.8991097211837769,
447
+ "eval_rouge1": 0.6651730257289085,
448
+ "eval_rouge2": 0.3778893043274054,
449
+ "eval_rougeL": 0.5782673838033503,
450
+ "eval_rougeLsum": 0.6516865674488727,
451
+ "eval_runtime": 22.0202,
452
+ "eval_samples_per_second": 1.362,
453
+ "eval_steps_per_second": 0.681,
454
+ "step": 4800
455
+ },
456
+ {
457
+ "epoch": 0.46,
458
+ "grad_norm": 0.17979757487773895,
459
+ "learning_rate": 0.00015416074457842374,
460
+ "loss": 1.8851,
461
+ "step": 5000
462
+ },
463
+ {
464
+ "epoch": 0.46,
465
+ "eval_bertscore": 0.7492111325263977,
466
+ "eval_loss": 1.897339940071106,
467
+ "eval_rouge1": 0.665920573890169,
468
+ "eval_rouge2": 0.37917993898535385,
469
+ "eval_rougeL": 0.5800236892888617,
470
+ "eval_rougeLsum": 0.6529131688355863,
471
+ "eval_runtime": 21.6103,
472
+ "eval_samples_per_second": 1.388,
473
+ "eval_steps_per_second": 0.694,
474
+ "step": 5000
475
+ },
476
+ {
477
+ "epoch": 0.48,
478
+ "eval_bertscore": 0.7491253614425659,
479
+ "eval_loss": 1.897528052330017,
480
+ "eval_rouge1": 0.6653452054219615,
481
+ "eval_rouge2": 0.3759208437918665,
482
+ "eval_rougeL": 0.5776757077854651,
483
+ "eval_rougeLsum": 0.6511876484723524,
484
+ "eval_runtime": 21.3101,
485
+ "eval_samples_per_second": 1.408,
486
+ "eval_steps_per_second": 0.704,
487
+ "step": 5200
488
+ },
489
+ {
490
+ "epoch": 0.48,
491
+ "grad_norm": 0.16869671642780304,
492
+ "learning_rate": 0.00015186832332309387,
493
+ "loss": 1.8783,
494
+ "step": 5250
495
+ },
496
+ {
497
+ "epoch": 0.5,
498
+ "eval_bertscore": 0.7494469881057739,
499
+ "eval_loss": 1.895969271659851,
500
+ "eval_rouge1": 0.6660951369469854,
501
+ "eval_rouge2": 0.3764077134133328,
502
+ "eval_rougeL": 0.578785826234568,
503
+ "eval_rougeLsum": 0.6525967284041656,
504
+ "eval_runtime": 21.7955,
505
+ "eval_samples_per_second": 1.376,
506
+ "eval_steps_per_second": 0.688,
507
+ "step": 5400
508
+ },
509
+ {
510
+ "epoch": 0.5,
511
+ "grad_norm": 0.15996231138706207,
512
+ "learning_rate": 0.00014957590206776398,
513
+ "loss": 1.8805,
514
+ "step": 5500
515
+ },
516
+ {
517
+ "epoch": 0.51,
518
+ "eval_bertscore": 0.7486470341682434,
519
+ "eval_loss": 1.8955131769180298,
520
+ "eval_rouge1": 0.6670292173522965,
521
+ "eval_rouge2": 0.37457018529010144,
522
+ "eval_rougeL": 0.5775243235432015,
523
+ "eval_rougeLsum": 0.652574079807632,
524
+ "eval_runtime": 21.7576,
525
+ "eval_samples_per_second": 1.379,
526
+ "eval_steps_per_second": 0.689,
527
+ "step": 5600
528
+ },
529
+ {
530
+ "epoch": 0.53,
531
+ "grad_norm": 0.17192547023296356,
532
+ "learning_rate": 0.00014728348081243412,
533
+ "loss": 1.8884,
534
+ "step": 5750
535
+ },
536
+ {
537
+ "epoch": 0.53,
538
+ "eval_bertscore": 0.7483081817626953,
539
+ "eval_loss": 1.895763874053955,
540
+ "eval_rouge1": 0.6659275328276997,
541
+ "eval_rouge2": 0.3778666475350364,
542
+ "eval_rougeL": 0.579425140056643,
543
+ "eval_rougeLsum": 0.6515870828784887,
544
+ "eval_runtime": 21.6648,
545
+ "eval_samples_per_second": 1.385,
546
+ "eval_steps_per_second": 0.692,
547
+ "step": 5800
548
+ },
549
+ {
550
+ "epoch": 0.55,
551
+ "grad_norm": 0.15838442742824554,
552
+ "learning_rate": 0.00014499105955710422,
553
+ "loss": 1.8913,
554
+ "step": 6000
555
+ },
556
+ {
557
+ "epoch": 0.55,
558
+ "eval_bertscore": 0.7493732571601868,
559
+ "eval_loss": 1.8914682865142822,
560
+ "eval_rouge1": 0.6669695240447069,
561
+ "eval_rouge2": 0.3769441114214874,
562
+ "eval_rougeL": 0.5798986667152066,
563
+ "eval_rougeLsum": 0.6534527583592111,
564
+ "eval_runtime": 21.4686,
565
+ "eval_samples_per_second": 1.397,
566
+ "eval_steps_per_second": 0.699,
567
+ "step": 6000
568
+ },
569
+ {
570
+ "epoch": 0.57,
571
+ "eval_bertscore": 0.7510559558868408,
572
+ "eval_loss": 1.8923884630203247,
573
+ "eval_rouge1": 0.6677938121282943,
574
+ "eval_rouge2": 0.37854575387307554,
575
+ "eval_rougeL": 0.5817052753830161,
576
+ "eval_rougeLsum": 0.6534737907551461,
577
+ "eval_runtime": 21.593,
578
+ "eval_samples_per_second": 1.389,
579
+ "eval_steps_per_second": 0.695,
580
+ "step": 6200
581
+ },
582
+ {
583
+ "epoch": 0.57,
584
+ "grad_norm": 0.15312573313713074,
585
+ "learning_rate": 0.00014269863830177433,
586
+ "loss": 1.8705,
587
+ "step": 6250
588
+ },
589
+ {
590
+ "epoch": 0.59,
591
+ "eval_bertscore": 0.7479371428489685,
592
+ "eval_loss": 1.891802430152893,
593
+ "eval_rouge1": 0.6658674357402252,
594
+ "eval_rouge2": 0.3757712649269345,
595
+ "eval_rougeL": 0.5791817270712349,
596
+ "eval_rougeLsum": 0.6509960265397259,
597
+ "eval_runtime": 21.8726,
598
+ "eval_samples_per_second": 1.372,
599
+ "eval_steps_per_second": 0.686,
600
+ "step": 6400
601
+ },
602
+ {
603
+ "epoch": 0.6,
604
+ "grad_norm": 0.15844614803791046,
605
+ "learning_rate": 0.00014040621704644447,
606
+ "loss": 1.8643,
607
+ "step": 6500
608
+ },
609
+ {
610
+ "epoch": 0.61,
611
+ "eval_bertscore": 0.7484550476074219,
612
+ "eval_loss": 1.8903728723526,
613
+ "eval_rouge1": 0.6683828816523312,
614
+ "eval_rouge2": 0.37811618722345436,
615
+ "eval_rougeL": 0.5802581730590705,
616
+ "eval_rougeLsum": 0.6534402764651661,
617
+ "eval_runtime": 21.8343,
618
+ "eval_samples_per_second": 1.374,
619
+ "eval_steps_per_second": 0.687,
620
+ "step": 6600
621
+ },
622
+ {
623
+ "epoch": 0.62,
624
+ "grad_norm": 0.1661410629749298,
625
+ "learning_rate": 0.00013811379579111458,
626
+ "loss": 1.877,
627
+ "step": 6750
628
+ },
629
+ {
630
+ "epoch": 0.62,
631
+ "eval_bertscore": 0.747416615486145,
632
+ "eval_loss": 1.8915189504623413,
633
+ "eval_rouge1": 0.6644777881148224,
634
+ "eval_rouge2": 0.3747657029706615,
635
+ "eval_rougeL": 0.5793454557198501,
636
+ "eval_rougeLsum": 0.6521716611395593,
637
+ "eval_runtime": 21.523,
638
+ "eval_samples_per_second": 1.394,
639
+ "eval_steps_per_second": 0.697,
640
+ "step": 6800
641
+ },
642
+ {
643
+ "epoch": 0.64,
644
+ "grad_norm": 0.16483080387115479,
645
+ "learning_rate": 0.00013582137453578468,
646
+ "loss": 1.8792,
647
+ "step": 7000
648
+ },
649
+ {
650
+ "epoch": 0.64,
651
+ "eval_bertscore": 0.7480576634407043,
652
+ "eval_loss": 1.8913365602493286,
653
+ "eval_rouge1": 0.6655764268912302,
654
+ "eval_rouge2": 0.3757671289735428,
655
+ "eval_rougeL": 0.577951380212153,
656
+ "eval_rougeLsum": 0.6507587412359694,
657
+ "eval_runtime": 21.3067,
658
+ "eval_samples_per_second": 1.408,
659
+ "eval_steps_per_second": 0.704,
660
+ "step": 7000
661
+ },
662
+ {
663
+ "epoch": 0.66,
664
+ "eval_bertscore": 0.7505319714546204,
665
+ "eval_loss": 1.889721155166626,
666
+ "eval_rouge1": 0.6706532239207523,
667
+ "eval_rouge2": 0.37986537729431724,
668
+ "eval_rougeL": 0.5824624008038861,
669
+ "eval_rougeLsum": 0.6571986550416876,
670
+ "eval_runtime": 21.8193,
671
+ "eval_samples_per_second": 1.375,
672
+ "eval_steps_per_second": 0.687,
673
+ "step": 7200
674
+ },
675
+ {
676
+ "epoch": 0.66,
677
+ "grad_norm": 0.1685444712638855,
678
+ "learning_rate": 0.00013352895328045482,
679
+ "loss": 1.8748,
680
+ "step": 7250
681
+ },
682
+ {
683
+ "epoch": 0.68,
684
+ "eval_bertscore": 0.7472131252288818,
685
+ "eval_loss": 1.889514684677124,
686
+ "eval_rouge1": 0.6647481520892182,
687
+ "eval_rouge2": 0.3727968089505218,
688
+ "eval_rougeL": 0.5772333167389081,
689
+ "eval_rougeLsum": 0.6503920840351167,
690
+ "eval_runtime": 21.5794,
691
+ "eval_samples_per_second": 1.39,
692
+ "eval_steps_per_second": 0.695,
693
+ "step": 7400
694
+ },
695
+ {
696
+ "epoch": 0.69,
697
+ "grad_norm": 0.16196218132972717,
698
+ "learning_rate": 0.00013123653202512493,
699
+ "loss": 1.8958,
700
+ "step": 7500
701
+ },
702
+ {
703
+ "epoch": 0.7,
704
+ "eval_bertscore": 0.7467525005340576,
705
+ "eval_loss": 1.8874704837799072,
706
+ "eval_rouge1": 0.6652789954777591,
707
+ "eval_rouge2": 0.3747211875622626,
708
+ "eval_rougeL": 0.5781018250975862,
709
+ "eval_rougeLsum": 0.6512065884264598,
710
+ "eval_runtime": 21.6436,
711
+ "eval_samples_per_second": 1.386,
712
+ "eval_steps_per_second": 0.693,
713
+ "step": 7600
714
+ },
715
+ {
716
+ "epoch": 0.71,
717
+ "grad_norm": 0.17379231750965118,
718
+ "learning_rate": 0.00012894411076979506,
719
+ "loss": 1.8655,
720
+ "step": 7750
721
+ },
722
+ {
723
+ "epoch": 0.72,
724
+ "eval_bertscore": 0.7478018403053284,
725
+ "eval_loss": 1.8879252672195435,
726
+ "eval_rouge1": 0.6676077444849423,
727
+ "eval_rouge2": 0.37550824667101645,
728
+ "eval_rougeL": 0.5792625587400696,
729
+ "eval_rougeLsum": 0.6537654224373248,
730
+ "eval_runtime": 21.8026,
731
+ "eval_samples_per_second": 1.376,
732
+ "eval_steps_per_second": 0.688,
733
+ "step": 7800
734
+ },
735
+ {
736
+ "epoch": 0.73,
737
+ "grad_norm": 0.17975503206253052,
738
+ "learning_rate": 0.00012665168951446517,
739
+ "loss": 1.8593,
740
+ "step": 8000
741
+ },
742
+ {
743
+ "epoch": 0.73,
744
+ "eval_bertscore": 0.7490061521530151,
745
+ "eval_loss": 1.8872514963150024,
746
+ "eval_rouge1": 0.6677074837057098,
747
+ "eval_rouge2": 0.37723681410973775,
748
+ "eval_rougeL": 0.5806554105436175,
749
+ "eval_rougeLsum": 0.6531691046113964,
750
+ "eval_runtime": 21.2682,
751
+ "eval_samples_per_second": 1.411,
752
+ "eval_steps_per_second": 0.705,
753
+ "step": 8000
754
+ },
755
+ {
756
+ "epoch": 0.75,
757
+ "eval_bertscore": 0.7476587295532227,
758
+ "eval_loss": 1.8857940435409546,
759
+ "eval_rouge1": 0.6675733171919529,
760
+ "eval_rouge2": 0.37667421034338344,
761
+ "eval_rougeL": 0.5804128987718613,
762
+ "eval_rougeLsum": 0.6534287804714597,
763
+ "eval_runtime": 21.5325,
764
+ "eval_samples_per_second": 1.393,
765
+ "eval_steps_per_second": 0.697,
766
+ "step": 8200
767
+ },
768
+ {
769
+ "epoch": 0.76,
770
+ "grad_norm": 0.1596900373697281,
771
+ "learning_rate": 0.0001243592682591353,
772
+ "loss": 1.8627,
773
+ "step": 8250
774
+ },
775
+ {
776
+ "epoch": 0.77,
777
+ "eval_bertscore": 0.7444086074829102,
778
+ "eval_loss": 1.8874648809432983,
779
+ "eval_rouge1": 0.6633779669482168,
780
+ "eval_rouge2": 0.3710094509675216,
781
+ "eval_rougeL": 0.5760576627400225,
782
+ "eval_rougeLsum": 0.6499803336918719,
783
+ "eval_runtime": 21.4464,
784
+ "eval_samples_per_second": 1.399,
785
+ "eval_steps_per_second": 0.699,
786
+ "step": 8400
787
+ },
788
+ {
789
+ "epoch": 0.78,
790
+ "grad_norm": 0.16890183091163635,
791
+ "learning_rate": 0.00012206684700380542,
792
+ "loss": 1.8534,
793
+ "step": 8500
794
+ },
795
+ {
796
+ "epoch": 0.79,
797
+ "eval_bertscore": 0.7483052611351013,
798
+ "eval_loss": 1.8880757093429565,
799
+ "eval_rouge1": 0.6686948143176776,
800
+ "eval_rouge2": 0.3803796130427515,
801
+ "eval_rougeL": 0.5802459813261722,
802
+ "eval_rougeLsum": 0.6536962466082527,
803
+ "eval_runtime": 21.5416,
804
+ "eval_samples_per_second": 1.393,
805
+ "eval_steps_per_second": 0.696,
806
+ "step": 8600
807
+ },
808
+ {
809
+ "epoch": 0.8,
810
+ "grad_norm": 0.1596900373697281,
811
+ "learning_rate": 0.00011977442574847555,
812
+ "loss": 1.882,
813
+ "step": 8750
814
+ },
815
+ {
816
+ "epoch": 0.81,
817
+ "eval_bertscore": 0.748338520526886,
818
+ "eval_loss": 1.8871524333953857,
819
+ "eval_rouge1": 0.6673919143770407,
820
+ "eval_rouge2": 0.3761761743795482,
821
+ "eval_rougeL": 0.5797615995019129,
822
+ "eval_rougeLsum": 0.6526650363891257,
823
+ "eval_runtime": 21.8432,
824
+ "eval_samples_per_second": 1.373,
825
+ "eval_steps_per_second": 0.687,
826
+ "step": 8800
827
+ },
828
+ {
829
+ "epoch": 0.83,
830
+ "grad_norm": 0.16380883753299713,
831
+ "learning_rate": 0.00011748200449314565,
832
+ "loss": 1.8781,
833
+ "step": 9000
834
+ },
835
+ {
836
+ "epoch": 0.83,
837
+ "eval_bertscore": 0.7473989129066467,
838
+ "eval_loss": 1.885389804840088,
839
+ "eval_rouge1": 0.6660513187618474,
840
+ "eval_rouge2": 0.3728645884799071,
841
+ "eval_rougeL": 0.5767833607673931,
842
+ "eval_rougeLsum": 0.6518177265346137,
843
+ "eval_runtime": 21.5415,
844
+ "eval_samples_per_second": 1.393,
845
+ "eval_steps_per_second": 0.696,
846
+ "step": 9000
847
+ },
848
+ {
849
+ "epoch": 0.84,
850
+ "eval_bertscore": 0.7469697594642639,
851
+ "eval_loss": 1.8835673332214355,
852
+ "eval_rouge1": 0.6655382276884847,
853
+ "eval_rouge2": 0.3743925229327822,
854
+ "eval_rougeL": 0.5808516524350132,
855
+ "eval_rougeLsum": 0.6518276923554284,
856
+ "eval_runtime": 21.7289,
857
+ "eval_samples_per_second": 1.381,
858
+ "eval_steps_per_second": 0.69,
859
+ "step": 9200
860
+ },
861
+ {
862
+ "epoch": 0.85,
863
+ "grad_norm": 0.17286422848701477,
864
+ "learning_rate": 0.00011518958323781579,
865
+ "loss": 1.8672,
866
+ "step": 9250
867
+ },
868
+ {
869
+ "epoch": 0.86,
870
+ "eval_bertscore": 0.7491498589515686,
871
+ "eval_loss": 1.8845998048782349,
872
+ "eval_rouge1": 0.6670160490080832,
873
+ "eval_rouge2": 0.37860182825781935,
874
+ "eval_rougeL": 0.5797856034485049,
875
+ "eval_rougeLsum": 0.6531203725936218,
876
+ "eval_runtime": 21.5625,
877
+ "eval_samples_per_second": 1.391,
878
+ "eval_steps_per_second": 0.696,
879
+ "step": 9400
880
+ },
881
+ {
882
+ "epoch": 0.87,
883
+ "grad_norm": 0.16658568382263184,
884
+ "learning_rate": 0.0001128971619824859,
885
+ "loss": 1.8691,
886
+ "step": 9500
887
+ },
888
+ {
889
+ "epoch": 0.88,
890
+ "eval_bertscore": 0.7493313550949097,
891
+ "eval_loss": 1.8821747303009033,
892
+ "eval_rouge1": 0.6659791441681278,
893
+ "eval_rouge2": 0.3796033834485131,
894
+ "eval_rougeL": 0.580414529806212,
895
+ "eval_rougeLsum": 0.6528068238734432,
896
+ "eval_runtime": 21.8698,
897
+ "eval_samples_per_second": 1.372,
898
+ "eval_steps_per_second": 0.686,
899
+ "step": 9600
900
+ },
901
+ {
902
+ "epoch": 0.89,
903
+ "grad_norm": 0.1733073741197586,
904
+ "learning_rate": 0.00011060474072715603,
905
+ "loss": 1.8575,
906
+ "step": 9750
907
+ },
908
+ {
909
+ "epoch": 0.9,
910
+ "eval_bertscore": 0.7497690320014954,
911
+ "eval_loss": 1.8809062242507935,
912
+ "eval_rouge1": 0.6683202809005669,
913
+ "eval_rouge2": 0.379647408271533,
914
+ "eval_rougeL": 0.5812799059293663,
915
+ "eval_rougeLsum": 0.6549076224428805,
916
+ "eval_runtime": 21.461,
917
+ "eval_samples_per_second": 1.398,
918
+ "eval_steps_per_second": 0.699,
919
+ "step": 9800
920
+ },
921
+ {
922
+ "epoch": 0.92,
923
+ "grad_norm": 0.16828681528568268,
924
+ "learning_rate": 0.00010831231947182614,
925
+ "loss": 1.8799,
926
+ "step": 10000
927
+ },
928
+ {
929
+ "epoch": 0.92,
930
+ "eval_bertscore": 0.7487274408340454,
931
+ "eval_loss": 1.8800114393234253,
932
+ "eval_rouge1": 0.6694707226380743,
933
+ "eval_rouge2": 0.37780830529690856,
934
+ "eval_rougeL": 0.5789377835641822,
935
+ "eval_rougeLsum": 0.6540561492044448,
936
+ "eval_runtime": 21.6228,
937
+ "eval_samples_per_second": 1.387,
938
+ "eval_steps_per_second": 0.694,
939
+ "step": 10000
940
+ },
941
+ {
942
+ "epoch": 0.94,
943
+ "eval_bertscore": 0.7495086789131165,
944
+ "eval_loss": 1.8811218738555908,
945
+ "eval_rouge1": 0.6714277794869861,
946
+ "eval_rouge2": 0.3814957239141348,
947
+ "eval_rougeL": 0.5817721016839257,
948
+ "eval_rougeLsum": 0.6566092952916721,
949
+ "eval_runtime": 23.1282,
950
+ "eval_samples_per_second": 1.297,
951
+ "eval_steps_per_second": 0.649,
952
+ "step": 10200
953
+ },
954
+ {
955
+ "epoch": 0.94,
956
+ "grad_norm": 0.16498848795890808,
957
+ "learning_rate": 0.00010601989821649627,
958
+ "loss": 1.8656,
959
+ "step": 10250
960
+ },
961
+ {
962
+ "epoch": 0.95,
963
+ "eval_bertscore": 0.749505877494812,
964
+ "eval_loss": 1.8809926509857178,
965
+ "eval_rouge1": 0.6720420767359538,
966
+ "eval_rouge2": 0.38239237549289784,
967
+ "eval_rougeL": 0.5825845512902208,
968
+ "eval_rougeLsum": 0.6590116525116119,
969
+ "eval_runtime": 21.5266,
970
+ "eval_samples_per_second": 1.394,
971
+ "eval_steps_per_second": 0.697,
972
+ "step": 10400
973
+ },
974
+ {
975
+ "epoch": 0.96,
976
+ "grad_norm": 0.1661728322505951,
977
+ "learning_rate": 0.00010372747696116638,
978
+ "loss": 1.8633,
979
+ "step": 10500
980
+ },
981
+ {
982
+ "epoch": 0.97,
983
+ "eval_bertscore": 0.7484509944915771,
984
+ "eval_loss": 1.8795918226242065,
985
+ "eval_rouge1": 0.66861224256168,
986
+ "eval_rouge2": 0.3810938571231235,
987
+ "eval_rougeL": 0.581338929419374,
988
+ "eval_rougeLsum": 0.6556287448758898,
989
+ "eval_runtime": 21.6144,
990
+ "eval_samples_per_second": 1.388,
991
+ "eval_steps_per_second": 0.694,
992
+ "step": 10600
993
+ },
994
+ {
995
+ "epoch": 0.99,
996
+ "grad_norm": 0.1695539355278015,
997
+ "learning_rate": 0.00010143505570583652,
998
+ "loss": 1.8778,
999
+ "step": 10750
1000
+ },
1001
+ {
1002
+ "epoch": 0.99,
1003
+ "eval_bertscore": 0.747430145740509,
1004
+ "eval_loss": 1.8807307481765747,
1005
+ "eval_rouge1": 0.6659775067192504,
1006
+ "eval_rouge2": 0.37723044840422537,
1007
+ "eval_rougeL": 0.5790798830214317,
1008
+ "eval_rougeLsum": 0.6509981906464294,
1009
+ "eval_runtime": 21.9658,
1010
+ "eval_samples_per_second": 1.366,
1011
+ "eval_steps_per_second": 0.683,
1012
+ "step": 10800
1013
+ },
1014
+ {
1015
+ "epoch": 1.01,
1016
+ "grad_norm": 0.18244074285030365,
1017
+ "learning_rate": 9.914263445050664e-05,
1018
+ "loss": 1.8425,
1019
+ "step": 11000
1020
+ },
1021
+ {
1022
+ "epoch": 1.01,
1023
+ "eval_bertscore": 0.7464674711227417,
1024
+ "eval_loss": 1.8850181102752686,
1025
+ "eval_rouge1": 0.6682062462715245,
1026
+ "eval_rouge2": 0.377961045305675,
1027
+ "eval_rougeL": 0.5785946041032981,
1028
+ "eval_rougeLsum": 0.6544658695180745,
1029
+ "eval_runtime": 21.4985,
1030
+ "eval_samples_per_second": 1.395,
1031
+ "eval_steps_per_second": 0.698,
1032
+ "step": 11000
1033
+ },
1034
+ {
1035
+ "epoch": 1.03,
1036
+ "eval_bertscore": 0.748903214931488,
1037
+ "eval_loss": 1.8819694519042969,
1038
+ "eval_rouge1": 0.6702994540242251,
1039
+ "eval_rouge2": 0.38293287997414793,
1040
+ "eval_rougeL": 0.5814513237567966,
1041
+ "eval_rougeLsum": 0.6559726946972199,
1042
+ "eval_runtime": 21.4139,
1043
+ "eval_samples_per_second": 1.401,
1044
+ "eval_steps_per_second": 0.7,
1045
+ "step": 11200
1046
+ },
1047
+ {
1048
+ "epoch": 1.03,
1049
+ "grad_norm": 0.17693208158016205,
1050
+ "learning_rate": 9.685021319517676e-05,
1051
+ "loss": 1.8016,
1052
+ "step": 11250
1053
+ },
1054
+ {
1055
+ "epoch": 1.05,
1056
+ "eval_bertscore": 0.7492591738700867,
1057
+ "eval_loss": 1.8827041387557983,
1058
+ "eval_rouge1": 0.6691453593118961,
1059
+ "eval_rouge2": 0.3798853572019327,
1060
+ "eval_rougeL": 0.5809966833392892,
1061
+ "eval_rougeLsum": 0.6558794288097127,
1062
+ "eval_runtime": 21.4988,
1063
+ "eval_samples_per_second": 1.395,
1064
+ "eval_steps_per_second": 0.698,
1065
+ "step": 11400
1066
+ },
1067
+ {
1068
+ "epoch": 1.05,
1069
+ "grad_norm": 0.19059012830257416,
1070
+ "learning_rate": 9.455779193984687e-05,
1071
+ "loss": 1.806,
1072
+ "step": 11500
1073
+ },
1074
+ {
1075
+ "epoch": 1.06,
1076
+ "eval_bertscore": 0.7471604943275452,
1077
+ "eval_loss": 1.8825455904006958,
1078
+ "eval_rouge1": 0.666961451977486,
1079
+ "eval_rouge2": 0.37886614565714727,
1080
+ "eval_rougeL": 0.5782594534845417,
1081
+ "eval_rougeLsum": 0.6527754475869945,
1082
+ "eval_runtime": 21.6349,
1083
+ "eval_samples_per_second": 1.387,
1084
+ "eval_steps_per_second": 0.693,
1085
+ "step": 11600
1086
+ },
1087
+ {
1088
+ "epoch": 1.08,
1089
+ "grad_norm": 0.17817597091197968,
1090
+ "learning_rate": 9.226537068451699e-05,
1091
+ "loss": 1.8087,
1092
+ "step": 11750
1093
+ },
1094
+ {
1095
+ "epoch": 1.08,
1096
+ "eval_bertscore": 0.7492148876190186,
1097
+ "eval_loss": 1.8826088905334473,
1098
+ "eval_rouge1": 0.6677645500651761,
1099
+ "eval_rouge2": 0.3804313457558821,
1100
+ "eval_rougeL": 0.5808965378999502,
1101
+ "eval_rougeLsum": 0.654710106622618,
1102
+ "eval_runtime": 21.938,
1103
+ "eval_samples_per_second": 1.367,
1104
+ "eval_steps_per_second": 0.684,
1105
+ "step": 11800
1106
+ },
1107
+ {
1108
+ "epoch": 1.1,
1109
+ "grad_norm": 0.1762418895959854,
1110
+ "learning_rate": 8.997294942918711e-05,
1111
+ "loss": 1.806,
1112
+ "step": 12000
1113
+ },
1114
+ {
1115
+ "epoch": 1.1,
1116
+ "eval_bertscore": 0.748414933681488,
1117
+ "eval_loss": 1.8811677694320679,
1118
+ "eval_rouge1": 0.6688262090158613,
1119
+ "eval_rouge2": 0.38050452253222067,
1120
+ "eval_rougeL": 0.5800878428874158,
1121
+ "eval_rougeLsum": 0.6541444781570895,
1122
+ "eval_runtime": 21.4244,
1123
+ "eval_samples_per_second": 1.4,
1124
+ "eval_steps_per_second": 0.7,
1125
+ "step": 12000
1126
+ }
1127
+ ],
1128
+ "logging_steps": 250,
1129
+ "max_steps": 21812,
1130
+ "num_input_tokens_seen": 0,
1131
+ "num_train_epochs": 2,
1132
+ "save_steps": 800,
1133
+ "total_flos": 8.089219501444301e+17,
1134
+ "train_batch_size": 2,
1135
+ "trial_name": null,
1136
+ "trial_params": null
1137
+ }
checkpoint-12000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a96e9baacd033c0a419444553d18b70e4f76e7b37401a6dcc6b00ceb2cc1e1
3
+ size 5048
checkpoint-12000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-12800/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: /workspace/model-export/allstax/shorting-phi-e4
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-12800/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/workspace/model-export/allstax/shorting-phi-e4",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "fc2",
26
+ "dense",
27
+ "v_proj",
28
+ "fc1"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
checkpoint-12800/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45ea403e2897753e58b3237fba48226ec8727b1f71a6a5240ea6ce9de07d06fd
3
+ size 377538512
checkpoint-12800/added_tokens.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257
40
+ }
checkpoint-12800/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-12800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6858becda9bbffc109417b22693ab8a93fd73729c27d87fb5b49d548b53bd2ae
3
+ size 14244
checkpoint-12800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2001511c42d4b0cf82379a62d959b6e6e00ccf86af40fb6577c26c9f1abd0b73
3
+ size 1064
checkpoint-12800/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-12800/tokenizer_config.json ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": " ",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "50258": {
22
+ "content": " ",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50259": {
30
+ "content": " ",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50260": {
38
+ "content": " ",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50261": {
46
+ "content": " ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50262": {
54
+ "content": " ",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50263": {
62
+ "content": " ",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50264": {
70
+ "content": " ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50265": {
78
+ "content": " ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50266": {
86
+ "content": " ",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50267": {
94
+ "content": " ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50268": {
102
+ "content": " ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50269": {
110
+ "content": " ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50270": {
118
+ "content": " ",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50271": {
126
+ "content": " ",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50272": {
134
+ "content": " ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50273": {
142
+ "content": " ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50274": {
150
+ "content": " ",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50275": {
158
+ "content": " ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50276": {
166
+ "content": " ",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50277": {
174
+ "content": " ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50278": {
182
+ "content": " ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50279": {
190
+ "content": " ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50280": {
198
+ "content": " ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50281": {
206
+ "content": " ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50282": {
214
+ "content": " ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50283": {
222
+ "content": " ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50284": {
230
+ "content": " ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50285": {
238
+ "content": " ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50286": {
246
+ "content": " ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50287": {
254
+ "content": "\t\t\t\t\t\t\t\t\t",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50288": {
262
+ "content": "\t\t\t\t\t\t\t\t",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50289": {
270
+ "content": "\t\t\t\t\t\t\t",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50290": {
278
+ "content": "\t\t\t\t\t\t",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50291": {
286
+ "content": "\t\t\t\t\t",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50292": {
294
+ "content": "\t\t\t\t",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50293": {
302
+ "content": "\t\t\t",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50294": {
310
+ "content": "\t\t",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ }
317
+ },
318
+ "bos_token": "<|endoftext|>",
319
+ "clean_up_tokenization_spaces": true,
320
+ "eos_token": "<|endoftext|>",
321
+ "errors": "replace",
322
+ "model_max_length": 2048,
323
+ "pad_token": "<|endoftext|>",
324
+ "padding_side": "left",
325
+ "tokenizer_class": "CodeGenTokenizer",
326
+ "unk_token": "<|endoftext|>"
327
+ }
checkpoint-12800/trainer_state.json ADDED
@@ -0,0 +1,1210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.1736120661990557,
5
+ "eval_steps": 200,
6
+ "global_step": 12800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "eval_bertscore": 0.7401605248451233,
14
+ "eval_loss": 1.9530484676361084,
15
+ "eval_rouge1": 0.6562857460474375,
16
+ "eval_rouge2": 0.3640670727106235,
17
+ "eval_rougeL": 0.5655212336424695,
18
+ "eval_rougeLsum": 0.6414840198810386,
19
+ "eval_runtime": 21.7196,
20
+ "eval_samples_per_second": 1.381,
21
+ "eval_steps_per_second": 0.691,
22
+ "step": 200
23
+ },
24
+ {
25
+ "epoch": 0.02,
26
+ "grad_norm": 0.25105270743370056,
27
+ "learning_rate": 0.00019771674842969145,
28
+ "loss": 1.7353,
29
+ "step": 250
30
+ },
31
+ {
32
+ "epoch": 0.04,
33
+ "eval_bertscore": 0.7432050108909607,
34
+ "eval_loss": 1.9583823680877686,
35
+ "eval_rouge1": 0.6554226269617707,
36
+ "eval_rouge2": 0.36661086995296877,
37
+ "eval_rougeL": 0.5637448790342183,
38
+ "eval_rougeLsum": 0.6419796784912521,
39
+ "eval_runtime": 21.9623,
40
+ "eval_samples_per_second": 1.366,
41
+ "eval_steps_per_second": 0.683,
42
+ "step": 400
43
+ },
44
+ {
45
+ "epoch": 0.05,
46
+ "grad_norm": 0.26550447940826416,
47
+ "learning_rate": 0.00019542432717436156,
48
+ "loss": 1.7786,
49
+ "step": 500
50
+ },
51
+ {
52
+ "epoch": 0.06,
53
+ "eval_bertscore": 0.7469045519828796,
54
+ "eval_loss": 1.9245686531066895,
55
+ "eval_rouge1": 0.6662431635890791,
56
+ "eval_rouge2": 0.3735263724826765,
57
+ "eval_rougeL": 0.5755071616151013,
58
+ "eval_rougeLsum": 0.6538383087686117,
59
+ "eval_runtime": 21.5302,
60
+ "eval_samples_per_second": 1.393,
61
+ "eval_steps_per_second": 0.697,
62
+ "step": 600
63
+ },
64
+ {
65
+ "epoch": 0.07,
66
+ "grad_norm": 0.1538015753030777,
67
+ "learning_rate": 0.0001931319059190317,
68
+ "loss": 1.8851,
69
+ "step": 750
70
+ },
71
+ {
72
+ "epoch": 0.07,
73
+ "eval_bertscore": 0.7442477941513062,
74
+ "eval_loss": 1.9187489748001099,
75
+ "eval_rouge1": 0.6606221897489035,
76
+ "eval_rouge2": 0.368654563659435,
77
+ "eval_rougeL": 0.5731546210408094,
78
+ "eval_rougeLsum": 0.6470590823125606,
79
+ "eval_runtime": 21.9831,
80
+ "eval_samples_per_second": 1.365,
81
+ "eval_steps_per_second": 0.682,
82
+ "step": 800
83
+ },
84
+ {
85
+ "epoch": 0.09,
86
+ "grad_norm": 0.1681252270936966,
87
+ "learning_rate": 0.0001908394846637018,
88
+ "loss": 1.8919,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 0.09,
93
+ "eval_bertscore": 0.7458053231239319,
94
+ "eval_loss": 1.9159075021743774,
95
+ "eval_rouge1": 0.6621259186456026,
96
+ "eval_rouge2": 0.372024043683234,
97
+ "eval_rougeL": 0.5743354509339939,
98
+ "eval_rougeLsum": 0.6491550893780276,
99
+ "eval_runtime": 21.7159,
100
+ "eval_samples_per_second": 1.381,
101
+ "eval_steps_per_second": 0.691,
102
+ "step": 1000
103
+ },
104
+ {
105
+ "epoch": 0.11,
106
+ "eval_bertscore": 0.7468854784965515,
107
+ "eval_loss": 1.9140182733535767,
108
+ "eval_rouge1": 0.6626581781149132,
109
+ "eval_rouge2": 0.37318557504782157,
110
+ "eval_rougeL": 0.5759264203594217,
111
+ "eval_rougeLsum": 0.6490702446275723,
112
+ "eval_runtime": 21.6486,
113
+ "eval_samples_per_second": 1.386,
114
+ "eval_steps_per_second": 0.693,
115
+ "step": 1200
116
+ },
117
+ {
118
+ "epoch": 0.11,
119
+ "grad_norm": 0.1552441120147705,
120
+ "learning_rate": 0.00018854706340837193,
121
+ "loss": 1.9052,
122
+ "step": 1250
123
+ },
124
+ {
125
+ "epoch": 0.13,
126
+ "eval_bertscore": 0.7475314736366272,
127
+ "eval_loss": 1.913794755935669,
128
+ "eval_rouge1": 0.6648687174353192,
129
+ "eval_rouge2": 0.3760379232448734,
130
+ "eval_rougeL": 0.5784915488164926,
131
+ "eval_rougeLsum": 0.6513864520108938,
132
+ "eval_runtime": 21.664,
133
+ "eval_samples_per_second": 1.385,
134
+ "eval_steps_per_second": 0.692,
135
+ "step": 1400
136
+ },
137
+ {
138
+ "epoch": 0.14,
139
+ "grad_norm": 0.14638397097587585,
140
+ "learning_rate": 0.00018625464215304204,
141
+ "loss": 1.8843,
142
+ "step": 1500
143
+ },
144
+ {
145
+ "epoch": 0.15,
146
+ "eval_bertscore": 0.747238039970398,
147
+ "eval_loss": 1.9117029905319214,
148
+ "eval_rouge1": 0.6638085237198453,
149
+ "eval_rouge2": 0.3742779818055127,
150
+ "eval_rougeL": 0.5754209460423059,
151
+ "eval_rougeLsum": 0.6506476155592722,
152
+ "eval_runtime": 21.9308,
153
+ "eval_samples_per_second": 1.368,
154
+ "eval_steps_per_second": 0.684,
155
+ "step": 1600
156
+ },
157
+ {
158
+ "epoch": 0.16,
159
+ "grad_norm": 0.15738993883132935,
160
+ "learning_rate": 0.00018396222089771218,
161
+ "loss": 1.8964,
162
+ "step": 1750
163
+ },
164
+ {
165
+ "epoch": 0.17,
166
+ "eval_bertscore": 0.7473016381263733,
167
+ "eval_loss": 1.9117563962936401,
168
+ "eval_rouge1": 0.6620053151663765,
169
+ "eval_rouge2": 0.37406692119411245,
170
+ "eval_rougeL": 0.5758911607323577,
171
+ "eval_rougeLsum": 0.6494070575604445,
172
+ "eval_runtime": 21.6727,
173
+ "eval_samples_per_second": 1.384,
174
+ "eval_steps_per_second": 0.692,
175
+ "step": 1800
176
+ },
177
+ {
178
+ "epoch": 0.18,
179
+ "grad_norm": 0.1588907092809677,
180
+ "learning_rate": 0.00018166979964238228,
181
+ "loss": 1.8827,
182
+ "step": 2000
183
+ },
184
+ {
185
+ "epoch": 0.18,
186
+ "eval_bertscore": 0.7485987544059753,
187
+ "eval_loss": 1.9126006364822388,
188
+ "eval_rouge1": 0.6641836156334741,
189
+ "eval_rouge2": 0.37320215574735827,
190
+ "eval_rougeL": 0.5783015040447993,
191
+ "eval_rougeLsum": 0.6522235940423647,
192
+ "eval_runtime": 21.9759,
193
+ "eval_samples_per_second": 1.365,
194
+ "eval_steps_per_second": 0.683,
195
+ "step": 2000
196
+ },
197
+ {
198
+ "epoch": 0.2,
199
+ "eval_bertscore": 0.7482583522796631,
200
+ "eval_loss": 1.9075205326080322,
201
+ "eval_rouge1": 0.6658219484766166,
202
+ "eval_rouge2": 0.37723364952258465,
203
+ "eval_rougeL": 0.5769040785174693,
204
+ "eval_rougeLsum": 0.6511328888044219,
205
+ "eval_runtime": 21.5892,
206
+ "eval_samples_per_second": 1.39,
207
+ "eval_steps_per_second": 0.695,
208
+ "step": 2200
209
+ },
210
+ {
211
+ "epoch": 0.21,
212
+ "grad_norm": 0.15247465670108795,
213
+ "learning_rate": 0.00017937737838705242,
214
+ "loss": 1.8831,
215
+ "step": 2250
216
+ },
217
+ {
218
+ "epoch": 0.22,
219
+ "eval_bertscore": 0.7460805177688599,
220
+ "eval_loss": 1.9088668823242188,
221
+ "eval_rouge1": 0.6627321043292516,
222
+ "eval_rouge2": 0.3696581195003696,
223
+ "eval_rougeL": 0.5740988544467178,
224
+ "eval_rougeLsum": 0.6478729042661874,
225
+ "eval_runtime": 21.9221,
226
+ "eval_samples_per_second": 1.368,
227
+ "eval_steps_per_second": 0.684,
228
+ "step": 2400
229
+ },
230
+ {
231
+ "epoch": 0.23,
232
+ "grad_norm": 0.1587379276752472,
233
+ "learning_rate": 0.00017708495713172253,
234
+ "loss": 1.8829,
235
+ "step": 2500
236
+ },
237
+ {
238
+ "epoch": 0.24,
239
+ "eval_bertscore": 0.7472203373908997,
240
+ "eval_loss": 1.906219482421875,
241
+ "eval_rouge1": 0.6637415370426804,
242
+ "eval_rouge2": 0.37565276875837994,
243
+ "eval_rougeL": 0.5773879369079004,
244
+ "eval_rougeLsum": 0.6488719947518645,
245
+ "eval_runtime": 21.8112,
246
+ "eval_samples_per_second": 1.375,
247
+ "eval_steps_per_second": 0.688,
248
+ "step": 2600
249
+ },
250
+ {
251
+ "epoch": 0.25,
252
+ "grad_norm": 0.1558646410703659,
253
+ "learning_rate": 0.00017479253587639266,
254
+ "loss": 1.8978,
255
+ "step": 2750
256
+ },
257
+ {
258
+ "epoch": 0.26,
259
+ "eval_bertscore": 0.7466126680374146,
260
+ "eval_loss": 1.9045982360839844,
261
+ "eval_rouge1": 0.6616225540296956,
262
+ "eval_rouge2": 0.37370762164745913,
263
+ "eval_rougeL": 0.5759418528371097,
264
+ "eval_rougeLsum": 0.6479977636906877,
265
+ "eval_runtime": 21.8772,
266
+ "eval_samples_per_second": 1.371,
267
+ "eval_steps_per_second": 0.686,
268
+ "step": 2800
269
+ },
270
+ {
271
+ "epoch": 0.28,
272
+ "grad_norm": 0.14783035218715668,
273
+ "learning_rate": 0.00017250011462106277,
274
+ "loss": 1.8978,
275
+ "step": 3000
276
+ },
277
+ {
278
+ "epoch": 0.28,
279
+ "eval_bertscore": 0.7485571503639221,
280
+ "eval_loss": 1.9035439491271973,
281
+ "eval_rouge1": 0.6664050030501707,
282
+ "eval_rouge2": 0.379492440917784,
283
+ "eval_rougeL": 0.5806973731221475,
284
+ "eval_rougeLsum": 0.6524346156604702,
285
+ "eval_runtime": 21.9217,
286
+ "eval_samples_per_second": 1.369,
287
+ "eval_steps_per_second": 0.684,
288
+ "step": 3000
289
+ },
290
+ {
291
+ "epoch": 0.29,
292
+ "eval_bertscore": 0.7483461499214172,
293
+ "eval_loss": 1.9022458791732788,
294
+ "eval_rouge1": 0.6618989733136488,
295
+ "eval_rouge2": 0.37377379177271053,
296
+ "eval_rougeL": 0.5780989082173933,
297
+ "eval_rougeLsum": 0.6490379362631586,
298
+ "eval_runtime": 21.7847,
299
+ "eval_samples_per_second": 1.377,
300
+ "eval_steps_per_second": 0.689,
301
+ "step": 3200
302
+ },
303
+ {
304
+ "epoch": 0.3,
305
+ "grad_norm": 0.16484151780605316,
306
+ "learning_rate": 0.0001702076933657329,
307
+ "loss": 1.8715,
308
+ "step": 3250
309
+ },
310
+ {
311
+ "epoch": 0.31,
312
+ "eval_bertscore": 0.7490711212158203,
313
+ "eval_loss": 1.9013088941574097,
314
+ "eval_rouge1": 0.6638141306545007,
315
+ "eval_rouge2": 0.37356255553691553,
316
+ "eval_rougeL": 0.577975450251653,
317
+ "eval_rougeLsum": 0.6492478632295806,
318
+ "eval_runtime": 21.8807,
319
+ "eval_samples_per_second": 1.371,
320
+ "eval_steps_per_second": 0.686,
321
+ "step": 3400
322
+ },
323
+ {
324
+ "epoch": 0.32,
325
+ "grad_norm": 0.14130128920078278,
326
+ "learning_rate": 0.000167915272110403,
327
+ "loss": 1.8819,
328
+ "step": 3500
329
+ },
330
+ {
331
+ "epoch": 0.33,
332
+ "eval_bertscore": 0.7475283741950989,
333
+ "eval_loss": 1.9002223014831543,
334
+ "eval_rouge1": 0.6628836314413511,
335
+ "eval_rouge2": 0.37179988805094977,
336
+ "eval_rougeL": 0.5764222388923268,
337
+ "eval_rougeLsum": 0.649864229310889,
338
+ "eval_runtime": 22.124,
339
+ "eval_samples_per_second": 1.356,
340
+ "eval_steps_per_second": 0.678,
341
+ "step": 3600
342
+ },
343
+ {
344
+ "epoch": 0.34,
345
+ "grad_norm": 0.1494186818599701,
346
+ "learning_rate": 0.00016562285085507315,
347
+ "loss": 1.8828,
348
+ "step": 3750
349
+ },
350
+ {
351
+ "epoch": 0.35,
352
+ "eval_bertscore": 0.7486498951911926,
353
+ "eval_loss": 1.9011151790618896,
354
+ "eval_rouge1": 0.6669673680023924,
355
+ "eval_rouge2": 0.3771780440183751,
356
+ "eval_rougeL": 0.5792518624130161,
357
+ "eval_rougeLsum": 0.6534484242953056,
358
+ "eval_runtime": 21.813,
359
+ "eval_samples_per_second": 1.375,
360
+ "eval_steps_per_second": 0.688,
361
+ "step": 3800
362
+ },
363
+ {
364
+ "epoch": 0.37,
365
+ "grad_norm": 0.14803479611873627,
366
+ "learning_rate": 0.00016333042959974325,
367
+ "loss": 1.8761,
368
+ "step": 4000
369
+ },
370
+ {
371
+ "epoch": 0.37,
372
+ "eval_bertscore": 0.7471507787704468,
373
+ "eval_loss": 1.9001713991165161,
374
+ "eval_rouge1": 0.6651735220672027,
375
+ "eval_rouge2": 0.3736698451416937,
376
+ "eval_rougeL": 0.5779938808281732,
377
+ "eval_rougeLsum": 0.6509815118131576,
378
+ "eval_runtime": 21.5004,
379
+ "eval_samples_per_second": 1.395,
380
+ "eval_steps_per_second": 0.698,
381
+ "step": 4000
382
+ },
383
+ {
384
+ "epoch": 0.39,
385
+ "eval_bertscore": 0.7485501766204834,
386
+ "eval_loss": 1.8993827104568481,
387
+ "eval_rouge1": 0.6646424082737133,
388
+ "eval_rouge2": 0.37318485364862475,
389
+ "eval_rougeL": 0.5773338159759467,
390
+ "eval_rougeLsum": 0.6507594353103527,
391
+ "eval_runtime": 21.2963,
392
+ "eval_samples_per_second": 1.409,
393
+ "eval_steps_per_second": 0.704,
394
+ "step": 4200
395
+ },
396
+ {
397
+ "epoch": 0.39,
398
+ "grad_norm": 0.15562959015369415,
399
+ "learning_rate": 0.0001610380083444134,
400
+ "loss": 1.8672,
401
+ "step": 4250
402
+ },
403
+ {
404
+ "epoch": 0.4,
405
+ "eval_bertscore": 0.7469989061355591,
406
+ "eval_loss": 1.900540828704834,
407
+ "eval_rouge1": 0.6620664558691891,
408
+ "eval_rouge2": 0.37299419371215703,
409
+ "eval_rougeL": 0.5765442194831125,
410
+ "eval_rougeLsum": 0.6472642385429858,
411
+ "eval_runtime": 21.9086,
412
+ "eval_samples_per_second": 1.369,
413
+ "eval_steps_per_second": 0.685,
414
+ "step": 4400
415
+ },
416
+ {
417
+ "epoch": 0.41,
418
+ "grad_norm": 0.15420928597450256,
419
+ "learning_rate": 0.0001587455870890835,
420
+ "loss": 1.8754,
421
+ "step": 4500
422
+ },
423
+ {
424
+ "epoch": 0.42,
425
+ "eval_bertscore": 0.7475299835205078,
426
+ "eval_loss": 1.8988685607910156,
427
+ "eval_rouge1": 0.6656661780424216,
428
+ "eval_rouge2": 0.37467258880478527,
429
+ "eval_rougeL": 0.5770800519970718,
430
+ "eval_rougeLsum": 0.6522703864288166,
431
+ "eval_runtime": 22.063,
432
+ "eval_samples_per_second": 1.36,
433
+ "eval_steps_per_second": 0.68,
434
+ "step": 4600
435
+ },
436
+ {
437
+ "epoch": 0.44,
438
+ "grad_norm": 0.15809176862239838,
439
+ "learning_rate": 0.00015645316583375363,
440
+ "loss": 1.8848,
441
+ "step": 4750
442
+ },
443
+ {
444
+ "epoch": 0.44,
445
+ "eval_bertscore": 0.7490234375,
446
+ "eval_loss": 1.8991097211837769,
447
+ "eval_rouge1": 0.6651730257289085,
448
+ "eval_rouge2": 0.3778893043274054,
449
+ "eval_rougeL": 0.5782673838033503,
450
+ "eval_rougeLsum": 0.6516865674488727,
451
+ "eval_runtime": 22.0202,
452
+ "eval_samples_per_second": 1.362,
453
+ "eval_steps_per_second": 0.681,
454
+ "step": 4800
455
+ },
456
+ {
457
+ "epoch": 0.46,
458
+ "grad_norm": 0.17979757487773895,
459
+ "learning_rate": 0.00015416074457842374,
460
+ "loss": 1.8851,
461
+ "step": 5000
462
+ },
463
+ {
464
+ "epoch": 0.46,
465
+ "eval_bertscore": 0.7492111325263977,
466
+ "eval_loss": 1.897339940071106,
467
+ "eval_rouge1": 0.665920573890169,
468
+ "eval_rouge2": 0.37917993898535385,
469
+ "eval_rougeL": 0.5800236892888617,
470
+ "eval_rougeLsum": 0.6529131688355863,
471
+ "eval_runtime": 21.6103,
472
+ "eval_samples_per_second": 1.388,
473
+ "eval_steps_per_second": 0.694,
474
+ "step": 5000
475
+ },
476
+ {
477
+ "epoch": 0.48,
478
+ "eval_bertscore": 0.7491253614425659,
479
+ "eval_loss": 1.897528052330017,
480
+ "eval_rouge1": 0.6653452054219615,
481
+ "eval_rouge2": 0.3759208437918665,
482
+ "eval_rougeL": 0.5776757077854651,
483
+ "eval_rougeLsum": 0.6511876484723524,
484
+ "eval_runtime": 21.3101,
485
+ "eval_samples_per_second": 1.408,
486
+ "eval_steps_per_second": 0.704,
487
+ "step": 5200
488
+ },
489
+ {
490
+ "epoch": 0.48,
491
+ "grad_norm": 0.16869671642780304,
492
+ "learning_rate": 0.00015186832332309387,
493
+ "loss": 1.8783,
494
+ "step": 5250
495
+ },
496
+ {
497
+ "epoch": 0.5,
498
+ "eval_bertscore": 0.7494469881057739,
499
+ "eval_loss": 1.895969271659851,
500
+ "eval_rouge1": 0.6660951369469854,
501
+ "eval_rouge2": 0.3764077134133328,
502
+ "eval_rougeL": 0.578785826234568,
503
+ "eval_rougeLsum": 0.6525967284041656,
504
+ "eval_runtime": 21.7955,
505
+ "eval_samples_per_second": 1.376,
506
+ "eval_steps_per_second": 0.688,
507
+ "step": 5400
508
+ },
509
+ {
510
+ "epoch": 0.5,
511
+ "grad_norm": 0.15996231138706207,
512
+ "learning_rate": 0.00014957590206776398,
513
+ "loss": 1.8805,
514
+ "step": 5500
515
+ },
516
+ {
517
+ "epoch": 0.51,
518
+ "eval_bertscore": 0.7486470341682434,
519
+ "eval_loss": 1.8955131769180298,
520
+ "eval_rouge1": 0.6670292173522965,
521
+ "eval_rouge2": 0.37457018529010144,
522
+ "eval_rougeL": 0.5775243235432015,
523
+ "eval_rougeLsum": 0.652574079807632,
524
+ "eval_runtime": 21.7576,
525
+ "eval_samples_per_second": 1.379,
526
+ "eval_steps_per_second": 0.689,
527
+ "step": 5600
528
+ },
529
+ {
530
+ "epoch": 0.53,
531
+ "grad_norm": 0.17192547023296356,
532
+ "learning_rate": 0.00014728348081243412,
533
+ "loss": 1.8884,
534
+ "step": 5750
535
+ },
536
+ {
537
+ "epoch": 0.53,
538
+ "eval_bertscore": 0.7483081817626953,
539
+ "eval_loss": 1.895763874053955,
540
+ "eval_rouge1": 0.6659275328276997,
541
+ "eval_rouge2": 0.3778666475350364,
542
+ "eval_rougeL": 0.579425140056643,
543
+ "eval_rougeLsum": 0.6515870828784887,
544
+ "eval_runtime": 21.6648,
545
+ "eval_samples_per_second": 1.385,
546
+ "eval_steps_per_second": 0.692,
547
+ "step": 5800
548
+ },
549
+ {
550
+ "epoch": 0.55,
551
+ "grad_norm": 0.15838442742824554,
552
+ "learning_rate": 0.00014499105955710422,
553
+ "loss": 1.8913,
554
+ "step": 6000
555
+ },
556
+ {
557
+ "epoch": 0.55,
558
+ "eval_bertscore": 0.7493732571601868,
559
+ "eval_loss": 1.8914682865142822,
560
+ "eval_rouge1": 0.6669695240447069,
561
+ "eval_rouge2": 0.3769441114214874,
562
+ "eval_rougeL": 0.5798986667152066,
563
+ "eval_rougeLsum": 0.6534527583592111,
564
+ "eval_runtime": 21.4686,
565
+ "eval_samples_per_second": 1.397,
566
+ "eval_steps_per_second": 0.699,
567
+ "step": 6000
568
+ },
569
+ {
570
+ "epoch": 0.57,
571
+ "eval_bertscore": 0.7510559558868408,
572
+ "eval_loss": 1.8923884630203247,
573
+ "eval_rouge1": 0.6677938121282943,
574
+ "eval_rouge2": 0.37854575387307554,
575
+ "eval_rougeL": 0.5817052753830161,
576
+ "eval_rougeLsum": 0.6534737907551461,
577
+ "eval_runtime": 21.593,
578
+ "eval_samples_per_second": 1.389,
579
+ "eval_steps_per_second": 0.695,
580
+ "step": 6200
581
+ },
582
+ {
583
+ "epoch": 0.57,
584
+ "grad_norm": 0.15312573313713074,
585
+ "learning_rate": 0.00014269863830177433,
586
+ "loss": 1.8705,
587
+ "step": 6250
588
+ },
589
+ {
590
+ "epoch": 0.59,
591
+ "eval_bertscore": 0.7479371428489685,
592
+ "eval_loss": 1.891802430152893,
593
+ "eval_rouge1": 0.6658674357402252,
594
+ "eval_rouge2": 0.3757712649269345,
595
+ "eval_rougeL": 0.5791817270712349,
596
+ "eval_rougeLsum": 0.6509960265397259,
597
+ "eval_runtime": 21.8726,
598
+ "eval_samples_per_second": 1.372,
599
+ "eval_steps_per_second": 0.686,
600
+ "step": 6400
601
+ },
602
+ {
603
+ "epoch": 0.6,
604
+ "grad_norm": 0.15844614803791046,
605
+ "learning_rate": 0.00014040621704644447,
606
+ "loss": 1.8643,
607
+ "step": 6500
608
+ },
609
+ {
610
+ "epoch": 0.61,
611
+ "eval_bertscore": 0.7484550476074219,
612
+ "eval_loss": 1.8903728723526,
613
+ "eval_rouge1": 0.6683828816523312,
614
+ "eval_rouge2": 0.37811618722345436,
615
+ "eval_rougeL": 0.5802581730590705,
616
+ "eval_rougeLsum": 0.6534402764651661,
617
+ "eval_runtime": 21.8343,
618
+ "eval_samples_per_second": 1.374,
619
+ "eval_steps_per_second": 0.687,
620
+ "step": 6600
621
+ },
622
+ {
623
+ "epoch": 0.62,
624
+ "grad_norm": 0.1661410629749298,
625
+ "learning_rate": 0.00013811379579111458,
626
+ "loss": 1.877,
627
+ "step": 6750
628
+ },
629
+ {
630
+ "epoch": 0.62,
631
+ "eval_bertscore": 0.747416615486145,
632
+ "eval_loss": 1.8915189504623413,
633
+ "eval_rouge1": 0.6644777881148224,
634
+ "eval_rouge2": 0.3747657029706615,
635
+ "eval_rougeL": 0.5793454557198501,
636
+ "eval_rougeLsum": 0.6521716611395593,
637
+ "eval_runtime": 21.523,
638
+ "eval_samples_per_second": 1.394,
639
+ "eval_steps_per_second": 0.697,
640
+ "step": 6800
641
+ },
642
+ {
643
+ "epoch": 0.64,
644
+ "grad_norm": 0.16483080387115479,
645
+ "learning_rate": 0.00013582137453578468,
646
+ "loss": 1.8792,
647
+ "step": 7000
648
+ },
649
+ {
650
+ "epoch": 0.64,
651
+ "eval_bertscore": 0.7480576634407043,
652
+ "eval_loss": 1.8913365602493286,
653
+ "eval_rouge1": 0.6655764268912302,
654
+ "eval_rouge2": 0.3757671289735428,
655
+ "eval_rougeL": 0.577951380212153,
656
+ "eval_rougeLsum": 0.6507587412359694,
657
+ "eval_runtime": 21.3067,
658
+ "eval_samples_per_second": 1.408,
659
+ "eval_steps_per_second": 0.704,
660
+ "step": 7000
661
+ },
662
+ {
663
+ "epoch": 0.66,
664
+ "eval_bertscore": 0.7505319714546204,
665
+ "eval_loss": 1.889721155166626,
666
+ "eval_rouge1": 0.6706532239207523,
667
+ "eval_rouge2": 0.37986537729431724,
668
+ "eval_rougeL": 0.5824624008038861,
669
+ "eval_rougeLsum": 0.6571986550416876,
670
+ "eval_runtime": 21.8193,
671
+ "eval_samples_per_second": 1.375,
672
+ "eval_steps_per_second": 0.687,
673
+ "step": 7200
674
+ },
675
+ {
676
+ "epoch": 0.66,
677
+ "grad_norm": 0.1685444712638855,
678
+ "learning_rate": 0.00013352895328045482,
679
+ "loss": 1.8748,
680
+ "step": 7250
681
+ },
682
+ {
683
+ "epoch": 0.68,
684
+ "eval_bertscore": 0.7472131252288818,
685
+ "eval_loss": 1.889514684677124,
686
+ "eval_rouge1": 0.6647481520892182,
687
+ "eval_rouge2": 0.3727968089505218,
688
+ "eval_rougeL": 0.5772333167389081,
689
+ "eval_rougeLsum": 0.6503920840351167,
690
+ "eval_runtime": 21.5794,
691
+ "eval_samples_per_second": 1.39,
692
+ "eval_steps_per_second": 0.695,
693
+ "step": 7400
694
+ },
695
+ {
696
+ "epoch": 0.69,
697
+ "grad_norm": 0.16196218132972717,
698
+ "learning_rate": 0.00013123653202512493,
699
+ "loss": 1.8958,
700
+ "step": 7500
701
+ },
702
+ {
703
+ "epoch": 0.7,
704
+ "eval_bertscore": 0.7467525005340576,
705
+ "eval_loss": 1.8874704837799072,
706
+ "eval_rouge1": 0.6652789954777591,
707
+ "eval_rouge2": 0.3747211875622626,
708
+ "eval_rougeL": 0.5781018250975862,
709
+ "eval_rougeLsum": 0.6512065884264598,
710
+ "eval_runtime": 21.6436,
711
+ "eval_samples_per_second": 1.386,
712
+ "eval_steps_per_second": 0.693,
713
+ "step": 7600
714
+ },
715
+ {
716
+ "epoch": 0.71,
717
+ "grad_norm": 0.17379231750965118,
718
+ "learning_rate": 0.00012894411076979506,
719
+ "loss": 1.8655,
720
+ "step": 7750
721
+ },
722
+ {
723
+ "epoch": 0.72,
724
+ "eval_bertscore": 0.7478018403053284,
725
+ "eval_loss": 1.8879252672195435,
726
+ "eval_rouge1": 0.6676077444849423,
727
+ "eval_rouge2": 0.37550824667101645,
728
+ "eval_rougeL": 0.5792625587400696,
729
+ "eval_rougeLsum": 0.6537654224373248,
730
+ "eval_runtime": 21.8026,
731
+ "eval_samples_per_second": 1.376,
732
+ "eval_steps_per_second": 0.688,
733
+ "step": 7800
734
+ },
735
+ {
736
+ "epoch": 0.73,
737
+ "grad_norm": 0.17975503206253052,
738
+ "learning_rate": 0.00012665168951446517,
739
+ "loss": 1.8593,
740
+ "step": 8000
741
+ },
742
+ {
743
+ "epoch": 0.73,
744
+ "eval_bertscore": 0.7490061521530151,
745
+ "eval_loss": 1.8872514963150024,
746
+ "eval_rouge1": 0.6677074837057098,
747
+ "eval_rouge2": 0.37723681410973775,
748
+ "eval_rougeL": 0.5806554105436175,
749
+ "eval_rougeLsum": 0.6531691046113964,
750
+ "eval_runtime": 21.2682,
751
+ "eval_samples_per_second": 1.411,
752
+ "eval_steps_per_second": 0.705,
753
+ "step": 8000
754
+ },
755
+ {
756
+ "epoch": 0.75,
757
+ "eval_bertscore": 0.7476587295532227,
758
+ "eval_loss": 1.8857940435409546,
759
+ "eval_rouge1": 0.6675733171919529,
760
+ "eval_rouge2": 0.37667421034338344,
761
+ "eval_rougeL": 0.5804128987718613,
762
+ "eval_rougeLsum": 0.6534287804714597,
763
+ "eval_runtime": 21.5325,
764
+ "eval_samples_per_second": 1.393,
765
+ "eval_steps_per_second": 0.697,
766
+ "step": 8200
767
+ },
768
+ {
769
+ "epoch": 0.76,
770
+ "grad_norm": 0.1596900373697281,
771
+ "learning_rate": 0.0001243592682591353,
772
+ "loss": 1.8627,
773
+ "step": 8250
774
+ },
775
+ {
776
+ "epoch": 0.77,
777
+ "eval_bertscore": 0.7444086074829102,
778
+ "eval_loss": 1.8874648809432983,
779
+ "eval_rouge1": 0.6633779669482168,
780
+ "eval_rouge2": 0.3710094509675216,
781
+ "eval_rougeL": 0.5760576627400225,
782
+ "eval_rougeLsum": 0.6499803336918719,
783
+ "eval_runtime": 21.4464,
784
+ "eval_samples_per_second": 1.399,
785
+ "eval_steps_per_second": 0.699,
786
+ "step": 8400
787
+ },
788
+ {
789
+ "epoch": 0.78,
790
+ "grad_norm": 0.16890183091163635,
791
+ "learning_rate": 0.00012206684700380542,
792
+ "loss": 1.8534,
793
+ "step": 8500
794
+ },
795
+ {
796
+ "epoch": 0.79,
797
+ "eval_bertscore": 0.7483052611351013,
798
+ "eval_loss": 1.8880757093429565,
799
+ "eval_rouge1": 0.6686948143176776,
800
+ "eval_rouge2": 0.3803796130427515,
801
+ "eval_rougeL": 0.5802459813261722,
802
+ "eval_rougeLsum": 0.6536962466082527,
803
+ "eval_runtime": 21.5416,
804
+ "eval_samples_per_second": 1.393,
805
+ "eval_steps_per_second": 0.696,
806
+ "step": 8600
807
+ },
808
+ {
809
+ "epoch": 0.8,
810
+ "grad_norm": 0.1596900373697281,
811
+ "learning_rate": 0.00011977442574847555,
812
+ "loss": 1.882,
813
+ "step": 8750
814
+ },
815
+ {
816
+ "epoch": 0.81,
817
+ "eval_bertscore": 0.748338520526886,
818
+ "eval_loss": 1.8871524333953857,
819
+ "eval_rouge1": 0.6673919143770407,
820
+ "eval_rouge2": 0.3761761743795482,
821
+ "eval_rougeL": 0.5797615995019129,
822
+ "eval_rougeLsum": 0.6526650363891257,
823
+ "eval_runtime": 21.8432,
824
+ "eval_samples_per_second": 1.373,
825
+ "eval_steps_per_second": 0.687,
826
+ "step": 8800
827
+ },
828
+ {
829
+ "epoch": 0.83,
830
+ "grad_norm": 0.16380883753299713,
831
+ "learning_rate": 0.00011748200449314565,
832
+ "loss": 1.8781,
833
+ "step": 9000
834
+ },
835
+ {
836
+ "epoch": 0.83,
837
+ "eval_bertscore": 0.7473989129066467,
838
+ "eval_loss": 1.885389804840088,
839
+ "eval_rouge1": 0.6660513187618474,
840
+ "eval_rouge2": 0.3728645884799071,
841
+ "eval_rougeL": 0.5767833607673931,
842
+ "eval_rougeLsum": 0.6518177265346137,
843
+ "eval_runtime": 21.5415,
844
+ "eval_samples_per_second": 1.393,
845
+ "eval_steps_per_second": 0.696,
846
+ "step": 9000
847
+ },
848
+ {
849
+ "epoch": 0.84,
850
+ "eval_bertscore": 0.7469697594642639,
851
+ "eval_loss": 1.8835673332214355,
852
+ "eval_rouge1": 0.6655382276884847,
853
+ "eval_rouge2": 0.3743925229327822,
854
+ "eval_rougeL": 0.5808516524350132,
855
+ "eval_rougeLsum": 0.6518276923554284,
856
+ "eval_runtime": 21.7289,
857
+ "eval_samples_per_second": 1.381,
858
+ "eval_steps_per_second": 0.69,
859
+ "step": 9200
860
+ },
861
+ {
862
+ "epoch": 0.85,
863
+ "grad_norm": 0.17286422848701477,
864
+ "learning_rate": 0.00011518958323781579,
865
+ "loss": 1.8672,
866
+ "step": 9250
867
+ },
868
+ {
869
+ "epoch": 0.86,
870
+ "eval_bertscore": 0.7491498589515686,
871
+ "eval_loss": 1.8845998048782349,
872
+ "eval_rouge1": 0.6670160490080832,
873
+ "eval_rouge2": 0.37860182825781935,
874
+ "eval_rougeL": 0.5797856034485049,
875
+ "eval_rougeLsum": 0.6531203725936218,
876
+ "eval_runtime": 21.5625,
877
+ "eval_samples_per_second": 1.391,
878
+ "eval_steps_per_second": 0.696,
879
+ "step": 9400
880
+ },
881
+ {
882
+ "epoch": 0.87,
883
+ "grad_norm": 0.16658568382263184,
884
+ "learning_rate": 0.0001128971619824859,
885
+ "loss": 1.8691,
886
+ "step": 9500
887
+ },
888
+ {
889
+ "epoch": 0.88,
890
+ "eval_bertscore": 0.7493313550949097,
891
+ "eval_loss": 1.8821747303009033,
892
+ "eval_rouge1": 0.6659791441681278,
893
+ "eval_rouge2": 0.3796033834485131,
894
+ "eval_rougeL": 0.580414529806212,
895
+ "eval_rougeLsum": 0.6528068238734432,
896
+ "eval_runtime": 21.8698,
897
+ "eval_samples_per_second": 1.372,
898
+ "eval_steps_per_second": 0.686,
899
+ "step": 9600
900
+ },
901
+ {
902
+ "epoch": 0.89,
903
+ "grad_norm": 0.1733073741197586,
904
+ "learning_rate": 0.00011060474072715603,
905
+ "loss": 1.8575,
906
+ "step": 9750
907
+ },
908
+ {
909
+ "epoch": 0.9,
910
+ "eval_bertscore": 0.7497690320014954,
911
+ "eval_loss": 1.8809062242507935,
912
+ "eval_rouge1": 0.6683202809005669,
913
+ "eval_rouge2": 0.379647408271533,
914
+ "eval_rougeL": 0.5812799059293663,
915
+ "eval_rougeLsum": 0.6549076224428805,
916
+ "eval_runtime": 21.461,
917
+ "eval_samples_per_second": 1.398,
918
+ "eval_steps_per_second": 0.699,
919
+ "step": 9800
920
+ },
921
+ {
922
+ "epoch": 0.92,
923
+ "grad_norm": 0.16828681528568268,
924
+ "learning_rate": 0.00010831231947182614,
925
+ "loss": 1.8799,
926
+ "step": 10000
927
+ },
928
+ {
929
+ "epoch": 0.92,
930
+ "eval_bertscore": 0.7487274408340454,
931
+ "eval_loss": 1.8800114393234253,
932
+ "eval_rouge1": 0.6694707226380743,
933
+ "eval_rouge2": 0.37780830529690856,
934
+ "eval_rougeL": 0.5789377835641822,
935
+ "eval_rougeLsum": 0.6540561492044448,
936
+ "eval_runtime": 21.6228,
937
+ "eval_samples_per_second": 1.387,
938
+ "eval_steps_per_second": 0.694,
939
+ "step": 10000
940
+ },
941
+ {
942
+ "epoch": 0.94,
943
+ "eval_bertscore": 0.7495086789131165,
944
+ "eval_loss": 1.8811218738555908,
945
+ "eval_rouge1": 0.6714277794869861,
946
+ "eval_rouge2": 0.3814957239141348,
947
+ "eval_rougeL": 0.5817721016839257,
948
+ "eval_rougeLsum": 0.6566092952916721,
949
+ "eval_runtime": 23.1282,
950
+ "eval_samples_per_second": 1.297,
951
+ "eval_steps_per_second": 0.649,
952
+ "step": 10200
953
+ },
954
+ {
955
+ "epoch": 0.94,
956
+ "grad_norm": 0.16498848795890808,
957
+ "learning_rate": 0.00010601989821649627,
958
+ "loss": 1.8656,
959
+ "step": 10250
960
+ },
961
+ {
962
+ "epoch": 0.95,
963
+ "eval_bertscore": 0.749505877494812,
964
+ "eval_loss": 1.8809926509857178,
965
+ "eval_rouge1": 0.6720420767359538,
966
+ "eval_rouge2": 0.38239237549289784,
967
+ "eval_rougeL": 0.5825845512902208,
968
+ "eval_rougeLsum": 0.6590116525116119,
969
+ "eval_runtime": 21.5266,
970
+ "eval_samples_per_second": 1.394,
971
+ "eval_steps_per_second": 0.697,
972
+ "step": 10400
973
+ },
974
+ {
975
+ "epoch": 0.96,
976
+ "grad_norm": 0.1661728322505951,
977
+ "learning_rate": 0.00010372747696116638,
978
+ "loss": 1.8633,
979
+ "step": 10500
980
+ },
981
+ {
982
+ "epoch": 0.97,
983
+ "eval_bertscore": 0.7484509944915771,
984
+ "eval_loss": 1.8795918226242065,
985
+ "eval_rouge1": 0.66861224256168,
986
+ "eval_rouge2": 0.3810938571231235,
987
+ "eval_rougeL": 0.581338929419374,
988
+ "eval_rougeLsum": 0.6556287448758898,
989
+ "eval_runtime": 21.6144,
990
+ "eval_samples_per_second": 1.388,
991
+ "eval_steps_per_second": 0.694,
992
+ "step": 10600
993
+ },
994
+ {
995
+ "epoch": 0.99,
996
+ "grad_norm": 0.1695539355278015,
997
+ "learning_rate": 0.00010143505570583652,
998
+ "loss": 1.8778,
999
+ "step": 10750
1000
+ },
1001
+ {
1002
+ "epoch": 0.99,
1003
+ "eval_bertscore": 0.747430145740509,
1004
+ "eval_loss": 1.8807307481765747,
1005
+ "eval_rouge1": 0.6659775067192504,
1006
+ "eval_rouge2": 0.37723044840422537,
1007
+ "eval_rougeL": 0.5790798830214317,
1008
+ "eval_rougeLsum": 0.6509981906464294,
1009
+ "eval_runtime": 21.9658,
1010
+ "eval_samples_per_second": 1.366,
1011
+ "eval_steps_per_second": 0.683,
1012
+ "step": 10800
1013
+ },
1014
+ {
1015
+ "epoch": 1.01,
1016
+ "grad_norm": 0.18244074285030365,
1017
+ "learning_rate": 9.914263445050664e-05,
1018
+ "loss": 1.8425,
1019
+ "step": 11000
1020
+ },
1021
+ {
1022
+ "epoch": 1.01,
1023
+ "eval_bertscore": 0.7464674711227417,
1024
+ "eval_loss": 1.8850181102752686,
1025
+ "eval_rouge1": 0.6682062462715245,
1026
+ "eval_rouge2": 0.377961045305675,
1027
+ "eval_rougeL": 0.5785946041032981,
1028
+ "eval_rougeLsum": 0.6544658695180745,
1029
+ "eval_runtime": 21.4985,
1030
+ "eval_samples_per_second": 1.395,
1031
+ "eval_steps_per_second": 0.698,
1032
+ "step": 11000
1033
+ },
1034
+ {
1035
+ "epoch": 1.03,
1036
+ "eval_bertscore": 0.748903214931488,
1037
+ "eval_loss": 1.8819694519042969,
1038
+ "eval_rouge1": 0.6702994540242251,
1039
+ "eval_rouge2": 0.38293287997414793,
1040
+ "eval_rougeL": 0.5814513237567966,
1041
+ "eval_rougeLsum": 0.6559726946972199,
1042
+ "eval_runtime": 21.4139,
1043
+ "eval_samples_per_second": 1.401,
1044
+ "eval_steps_per_second": 0.7,
1045
+ "step": 11200
1046
+ },
1047
+ {
1048
+ "epoch": 1.03,
1049
+ "grad_norm": 0.17693208158016205,
1050
+ "learning_rate": 9.685021319517676e-05,
1051
+ "loss": 1.8016,
1052
+ "step": 11250
1053
+ },
1054
+ {
1055
+ "epoch": 1.05,
1056
+ "eval_bertscore": 0.7492591738700867,
1057
+ "eval_loss": 1.8827041387557983,
1058
+ "eval_rouge1": 0.6691453593118961,
1059
+ "eval_rouge2": 0.3798853572019327,
1060
+ "eval_rougeL": 0.5809966833392892,
1061
+ "eval_rougeLsum": 0.6558794288097127,
1062
+ "eval_runtime": 21.4988,
1063
+ "eval_samples_per_second": 1.395,
1064
+ "eval_steps_per_second": 0.698,
1065
+ "step": 11400
1066
+ },
1067
+ {
1068
+ "epoch": 1.05,
1069
+ "grad_norm": 0.19059012830257416,
1070
+ "learning_rate": 9.455779193984687e-05,
1071
+ "loss": 1.806,
1072
+ "step": 11500
1073
+ },
1074
+ {
1075
+ "epoch": 1.06,
1076
+ "eval_bertscore": 0.7471604943275452,
1077
+ "eval_loss": 1.8825455904006958,
1078
+ "eval_rouge1": 0.666961451977486,
1079
+ "eval_rouge2": 0.37886614565714727,
1080
+ "eval_rougeL": 0.5782594534845417,
1081
+ "eval_rougeLsum": 0.6527754475869945,
1082
+ "eval_runtime": 21.6349,
1083
+ "eval_samples_per_second": 1.387,
1084
+ "eval_steps_per_second": 0.693,
1085
+ "step": 11600
1086
+ },
1087
+ {
1088
+ "epoch": 1.08,
1089
+ "grad_norm": 0.17817597091197968,
1090
+ "learning_rate": 9.226537068451699e-05,
1091
+ "loss": 1.8087,
1092
+ "step": 11750
1093
+ },
1094
+ {
1095
+ "epoch": 1.08,
1096
+ "eval_bertscore": 0.7492148876190186,
1097
+ "eval_loss": 1.8826088905334473,
1098
+ "eval_rouge1": 0.6677645500651761,
1099
+ "eval_rouge2": 0.3804313457558821,
1100
+ "eval_rougeL": 0.5808965378999502,
1101
+ "eval_rougeLsum": 0.654710106622618,
1102
+ "eval_runtime": 21.938,
1103
+ "eval_samples_per_second": 1.367,
1104
+ "eval_steps_per_second": 0.684,
1105
+ "step": 11800
1106
+ },
1107
+ {
1108
+ "epoch": 1.1,
1109
+ "grad_norm": 0.1762418895959854,
1110
+ "learning_rate": 8.997294942918711e-05,
1111
+ "loss": 1.806,
1112
+ "step": 12000
1113
+ },
1114
+ {
1115
+ "epoch": 1.1,
1116
+ "eval_bertscore": 0.748414933681488,
1117
+ "eval_loss": 1.8811677694320679,
1118
+ "eval_rouge1": 0.6688262090158613,
1119
+ "eval_rouge2": 0.38050452253222067,
1120
+ "eval_rougeL": 0.5800878428874158,
1121
+ "eval_rougeLsum": 0.6541444781570895,
1122
+ "eval_runtime": 21.4244,
1123
+ "eval_samples_per_second": 1.4,
1124
+ "eval_steps_per_second": 0.7,
1125
+ "step": 12000
1126
+ },
1127
+ {
1128
+ "epoch": 1.12,
1129
+ "eval_bertscore": 0.7469298243522644,
1130
+ "eval_loss": 1.8827059268951416,
1131
+ "eval_rouge1": 0.667125898932208,
1132
+ "eval_rouge2": 0.37762418321204805,
1133
+ "eval_rougeL": 0.5799787290068156,
1134
+ "eval_rougeLsum": 0.6549075242794395,
1135
+ "eval_runtime": 21.4415,
1136
+ "eval_samples_per_second": 1.399,
1137
+ "eval_steps_per_second": 0.7,
1138
+ "step": 12200
1139
+ },
1140
+ {
1141
+ "epoch": 1.12,
1142
+ "grad_norm": 0.1830277293920517,
1143
+ "learning_rate": 8.768969785887855e-05,
1144
+ "loss": 1.805,
1145
+ "step": 12250
1146
+ },
1147
+ {
1148
+ "epoch": 1.14,
1149
+ "eval_bertscore": 0.7483015060424805,
1150
+ "eval_loss": 1.881732702255249,
1151
+ "eval_rouge1": 0.6693189289720543,
1152
+ "eval_rouge2": 0.37779647405803307,
1153
+ "eval_rougeL": 0.579417997628969,
1154
+ "eval_rougeLsum": 0.6561505915526004,
1155
+ "eval_runtime": 21.6773,
1156
+ "eval_samples_per_second": 1.384,
1157
+ "eval_steps_per_second": 0.692,
1158
+ "step": 12400
1159
+ },
1160
+ {
1161
+ "epoch": 1.15,
1162
+ "grad_norm": 0.20985420048236847,
1163
+ "learning_rate": 8.539727660354867e-05,
1164
+ "loss": 1.8041,
1165
+ "step": 12500
1166
+ },
1167
+ {
1168
+ "epoch": 1.16,
1169
+ "eval_bertscore": 0.7499834895133972,
1170
+ "eval_loss": 1.88084077835083,
1171
+ "eval_rouge1": 0.6683080555468759,
1172
+ "eval_rouge2": 0.38032152133281283,
1173
+ "eval_rougeL": 0.5810300348705915,
1174
+ "eval_rougeLsum": 0.6551372270660842,
1175
+ "eval_runtime": 21.938,
1176
+ "eval_samples_per_second": 1.367,
1177
+ "eval_steps_per_second": 0.684,
1178
+ "step": 12600
1179
+ },
1180
+ {
1181
+ "epoch": 1.17,
1182
+ "grad_norm": 0.20808811485767365,
1183
+ "learning_rate": 8.310485534821879e-05,
1184
+ "loss": 1.8043,
1185
+ "step": 12750
1186
+ },
1187
+ {
1188
+ "epoch": 1.17,
1189
+ "eval_bertscore": 0.7486943602561951,
1190
+ "eval_loss": 1.8817172050476074,
1191
+ "eval_rouge1": 0.6679189638795886,
1192
+ "eval_rouge2": 0.37996382910514515,
1193
+ "eval_rougeL": 0.5796001956914257,
1194
+ "eval_rougeLsum": 0.6541573673696073,
1195
+ "eval_runtime": 21.4517,
1196
+ "eval_samples_per_second": 1.398,
1197
+ "eval_steps_per_second": 0.699,
1198
+ "step": 12800
1199
+ }
1200
+ ],
1201
+ "logging_steps": 250,
1202
+ "max_steps": 21812,
1203
+ "num_input_tokens_seen": 0,
1204
+ "num_train_epochs": 2,
1205
+ "save_steps": 800,
1206
+ "total_flos": 8.628512036801741e+17,
1207
+ "train_batch_size": 2,
1208
+ "trial_name": null,
1209
+ "trial_params": null
1210
+ }
checkpoint-12800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18a96e9baacd033c0a419444553d18b70e4f76e7b37401a6dcc6b00ceb2cc1e1
3
+ size 5048
checkpoint-12800/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13600/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: /workspace/model-export/allstax/shorting-phi-e4
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
checkpoint-13600/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/workspace/model-export/allstax/shorting-phi-e4",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 32,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "q_proj",
25
+ "fc2",
26
+ "dense",
27
+ "v_proj",
28
+ "fc1"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }