ethankasa committed
Commit 568cf9a
1 Parent(s): 26d9445

Upload folder using huggingface_hub

README.md CHANGED
@@ -7,14 +7,14 @@ tags:
  - generated_from_trainer
  base_model: microsoft/phi-2
  model-index:
- - name: trivia
+ - name: glaiveNew
  results: []
  ---

  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
  should probably proofread and complete it, then remove this comment. -->

- # trivia
+ # glaiveNew

  This model is a fine-tuned version of [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) on the trivia dataset.

@@ -35,7 +35,7 @@ More information needed
  ### Training hyperparameters

  The following hyperparameters were used during training:
- - learning_rate: 5e-10
+ - learning_rate: 5e-05
  - train_batch_size: 2
  - eval_batch_size: 8
  - seed: 42
@@ -43,7 +43,7 @@ The following hyperparameters were used during training:
  - total_train_batch_size: 16
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
- - num_epochs: 30.0
+ - num_epochs: 3.0

  ### Training results
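A note on the hyperparameter changes above: the learning rate moves from a vanishingly small 5e-10 to a conventional 5e-05, and training is cut from 30 to 3 epochs. The sketch below is only an illustration of how these values would typically map onto `transformers.TrainingArguments`; it is not taken from this repository, the `output_dir` is assumed, and `gradient_accumulation_steps=8` is inferred from `train_batch_size: 2` and `total_train_batch_size: 16`.

```python
# Minimal sketch (not from this repo): how the README's hyperparameters would
# typically map onto transformers.TrainingArguments. Model and dataset wiring
# is omitted; output_dir is an assumed name.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="glaiveNew",              # assumed; matches the model name in the card
    learning_rate=5e-5,                  # was 5e-10 in the previous run
    per_device_train_batch_size=2,       # train_batch_size
    per_device_eval_batch_size=8,        # eval_batch_size
    gradient_accumulation_steps=8,       # inferred: 2 * 8 = total_train_batch_size of 16
    seed=42,
    lr_scheduler_type="cosine",
    num_train_epochs=3.0,                # was 30.0
    logging_steps=5,                     # matches trainer_state.json
    save_steps=100,                      # matches trainer_state.json
)
```

The Adam betas (0.9, 0.999) and epsilon (1e-08) listed in the card are the library defaults, so no optimizer arguments need to be passed explicitly.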
 
adapter_config.json CHANGED
@@ -20,8 +20,8 @@
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
-   "v_proj",
-   "q_proj"
+   "q_proj",
+   "v_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:35a4860ecef30d83bb4e8ca7d2b06962573b59019755b3618ef7a1d2dd861a98
+ oid sha256:9abcb87ca0fc0f59556142c682feb9092e98296cd75e9449e28864e34339661a
  size 10502640
all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 28.8,
- "total_flos": 1335967054725120.0,
- "train_loss": 7.109028116861979,
- "train_runtime": 593.1625,
- "train_samples_per_second": 5.058,
- "train_steps_per_second": 0.303
+ "epoch": 2.88,
+ "total_flos": 133622157312000.0,
+ "train_loss": 6.60821893480089,
+ "train_runtime": 59.777,
+ "train_samples_per_second": 5.019,
+ "train_steps_per_second": 0.301
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 28.8,
- "total_flos": 1335967054725120.0,
- "train_loss": 7.109028116861979,
- "train_runtime": 593.1625,
- "train_samples_per_second": 5.058,
- "train_steps_per_second": 0.303
+ "epoch": 2.88,
+ "total_flos": 133622157312000.0,
+ "train_loss": 6.60821893480089,
+ "train_runtime": 59.777,
+ "train_samples_per_second": 5.019,
+ "train_steps_per_second": 0.301
  }
trainer_log.jsonl CHANGED
@@ -1,37 +1,4 @@
- {"current_steps": 5, "total_steps": 180, "loss": 7.386, "learning_rate": 4.990486745229364e-10, "epoch": 0.8, "percentage": 2.78, "elapsed_time": "0:00:16", "remaining_time": "0:09:43"}
- {"current_steps": 10, "total_steps": 180, "loss": 6.7817, "learning_rate": 4.962019382530521e-10, "epoch": 1.6, "percentage": 5.56, "elapsed_time": "0:00:33", "remaining_time": "0:09:23"}
- {"current_steps": 15, "total_steps": 180, "loss": 7.1632, "learning_rate": 4.914814565722671e-10, "epoch": 2.4, "percentage": 8.33, "elapsed_time": "0:00:49", "remaining_time": "0:09:04"}
- {"current_steps": 20, "total_steps": 180, "loss": 7.0109, "learning_rate": 4.849231551964771e-10, "epoch": 3.2, "percentage": 11.11, "elapsed_time": "0:01:05", "remaining_time": "0:08:47"}
- {"current_steps": 25, "total_steps": 180, "loss": 7.1004, "learning_rate": 4.765769467591626e-10, "epoch": 4.0, "percentage": 13.89, "elapsed_time": "0:01:21", "remaining_time": "0:08:27"}
- {"current_steps": 30, "total_steps": 180, "loss": 6.9577, "learning_rate": 4.665063509461097e-10, "epoch": 4.8, "percentage": 16.67, "elapsed_time": "0:01:38", "remaining_time": "0:08:11"}
- {"current_steps": 35, "total_steps": 180, "loss": 7.1725, "learning_rate": 4.54788011072248e-10, "epoch": 5.6, "percentage": 19.44, "elapsed_time": "0:01:54", "remaining_time": "0:07:55"}
- {"current_steps": 40, "total_steps": 180, "loss": 7.1535, "learning_rate": 4.415111107797445e-10, "epoch": 6.4, "percentage": 22.22, "elapsed_time": "0:02:11", "remaining_time": "0:07:39"}
- {"current_steps": 45, "total_steps": 180, "loss": 6.9961, "learning_rate": 4.267766952966369e-10, "epoch": 7.2, "percentage": 25.0, "elapsed_time": "0:02:28", "remaining_time": "0:07:24"}
- {"current_steps": 50, "total_steps": 180, "loss": 7.1581, "learning_rate": 4.106969024216348e-10, "epoch": 8.0, "percentage": 27.78, "elapsed_time": "0:02:44", "remaining_time": "0:07:08"}
- {"current_steps": 55, "total_steps": 180, "loss": 7.3208, "learning_rate": 3.933941090877615e-10, "epoch": 8.8, "percentage": 30.56, "elapsed_time": "0:03:02", "remaining_time": "0:06:54"}
- {"current_steps": 60, "total_steps": 180, "loss": 6.7152, "learning_rate": 3.7500000000000005e-10, "epoch": 9.6, "percentage": 33.33, "elapsed_time": "0:03:19", "remaining_time": "0:06:39"}
- {"current_steps": 65, "total_steps": 180, "loss": 7.1168, "learning_rate": 3.556545654351749e-10, "epoch": 10.4, "percentage": 36.11, "elapsed_time": "0:03:36", "remaining_time": "0:06:23"}
- {"current_steps": 70, "total_steps": 180, "loss": 7.143, "learning_rate": 3.3550503583141725e-10, "epoch": 11.2, "percentage": 38.89, "elapsed_time": "0:03:52", "remaining_time": "0:06:06"}
- {"current_steps": 75, "total_steps": 180, "loss": 7.2724, "learning_rate": 3.147047612756302e-10, "epoch": 12.0, "percentage": 41.67, "elapsed_time": "0:04:09", "remaining_time": "0:05:49"}
- {"current_steps": 80, "total_steps": 180, "loss": 6.995, "learning_rate": 2.9341204441673265e-10, "epoch": 12.8, "percentage": 44.44, "elapsed_time": "0:04:26", "remaining_time": "0:05:32"}
- {"current_steps": 85, "total_steps": 180, "loss": 7.1134, "learning_rate": 2.717889356869146e-10, "epoch": 13.6, "percentage": 47.22, "elapsed_time": "0:04:42", "remaining_time": "0:05:15"}
- {"current_steps": 90, "total_steps": 180, "loss": 7.2029, "learning_rate": 2.5e-10, "epoch": 14.4, "percentage": 50.0, "elapsed_time": "0:04:59", "remaining_time": "0:04:59"}
- {"current_steps": 95, "total_steps": 180, "loss": 7.4673, "learning_rate": 2.2821106431308546e-10, "epoch": 15.2, "percentage": 52.78, "elapsed_time": "0:05:15", "remaining_time": "0:04:42"}
- {"current_steps": 100, "total_steps": 180, "loss": 6.8563, "learning_rate": 2.0658795558326743e-10, "epoch": 16.0, "percentage": 55.56, "elapsed_time": "0:05:31", "remaining_time": "0:04:25"}
- {"current_steps": 105, "total_steps": 180, "loss": 7.0683, "learning_rate": 1.852952387243698e-10, "epoch": 16.8, "percentage": 58.33, "elapsed_time": "0:05:48", "remaining_time": "0:04:08"}
- {"current_steps": 110, "total_steps": 180, "loss": 7.1976, "learning_rate": 1.6449496416858284e-10, "epoch": 17.6, "percentage": 61.11, "elapsed_time": "0:06:04", "remaining_time": "0:03:52"}
- {"current_steps": 115, "total_steps": 180, "loss": 7.1575, "learning_rate": 1.443454345648252e-10, "epoch": 18.4, "percentage": 63.89, "elapsed_time": "0:06:20", "remaining_time": "0:03:35"}
- {"current_steps": 120, "total_steps": 180, "loss": 7.073, "learning_rate": 1.2500000000000006e-10, "epoch": 19.2, "percentage": 66.67, "elapsed_time": "0:06:37", "remaining_time": "0:03:18"}
- {"current_steps": 125, "total_steps": 180, "loss": 7.0932, "learning_rate": 1.0660589091223855e-10, "epoch": 20.0, "percentage": 69.44, "elapsed_time": "0:06:53", "remaining_time": "0:03:01"}
- {"current_steps": 130, "total_steps": 180, "loss": 7.1324, "learning_rate": 8.930309757836516e-11, "epoch": 20.8, "percentage": 72.22, "elapsed_time": "0:07:09", "remaining_time": "0:02:45"}
- {"current_steps": 135, "total_steps": 180, "loss": 7.1434, "learning_rate": 7.322330470336314e-11, "epoch": 21.6, "percentage": 75.0, "elapsed_time": "0:07:27", "remaining_time": "0:02:29"}
- {"current_steps": 140, "total_steps": 180, "loss": 7.0423, "learning_rate": 5.848888922025552e-11, "epoch": 22.4, "percentage": 77.78, "elapsed_time": "0:07:42", "remaining_time": "0:02:12"}
- {"current_steps": 145, "total_steps": 180, "loss": 7.3638, "learning_rate": 4.5211988927752025e-11, "epoch": 23.2, "percentage": 80.56, "elapsed_time": "0:07:59", "remaining_time": "0:01:55"}
- {"current_steps": 150, "total_steps": 180, "loss": 6.8454, "learning_rate": 3.3493649053890324e-11, "epoch": 24.0, "percentage": 83.33, "elapsed_time": "0:08:15", "remaining_time": "0:01:39"}
- {"current_steps": 155, "total_steps": 180, "loss": 7.1881, "learning_rate": 2.3423053240837516e-11, "epoch": 24.8, "percentage": 86.11, "elapsed_time": "0:08:31", "remaining_time": "0:01:22"}
- {"current_steps": 160, "total_steps": 180, "loss": 7.1249, "learning_rate": 1.5076844803522922e-11, "epoch": 25.6, "percentage": 88.89, "elapsed_time": "0:08:48", "remaining_time": "0:01:06"}
- {"current_steps": 165, "total_steps": 180, "loss": 7.1924, "learning_rate": 8.51854342773295e-12, "epoch": 26.4, "percentage": 91.67, "elapsed_time": "0:09:04", "remaining_time": "0:00:49"}
- {"current_steps": 170, "total_steps": 180, "loss": 6.9059, "learning_rate": 3.798061746947995e-12, "epoch": 27.2, "percentage": 94.44, "elapsed_time": "0:09:20", "remaining_time": "0:00:32"}
- {"current_steps": 175, "total_steps": 180, "loss": 7.2371, "learning_rate": 9.513254770636138e-13, "epoch": 28.0, "percentage": 97.22, "elapsed_time": "0:09:37", "remaining_time": "0:00:16"}
- {"current_steps": 180, "total_steps": 180, "loss": 7.0767, "learning_rate": 0.0, "epoch": 28.8, "percentage": 100.0, "elapsed_time": "0:09:53", "remaining_time": "0:00:00"}
- {"current_steps": 180, "total_steps": 180, "epoch": 28.8, "percentage": 100.0, "elapsed_time": "0:09:53", "remaining_time": "0:00:00"}
+ {"current_steps": 5, "total_steps": 18, "loss": 7.2633, "learning_rate": 4.1069690242163484e-05, "epoch": 0.8, "percentage": 27.78, "elapsed_time": "0:00:17", "remaining_time": "0:00:45"}
+ {"current_steps": 10, "total_steps": 18, "loss": 6.3067, "learning_rate": 2.0658795558326743e-05, "epoch": 1.6, "percentage": 55.56, "elapsed_time": "0:00:33", "remaining_time": "0:00:27"}
+ {"current_steps": 15, "total_steps": 18, "loss": 6.4305, "learning_rate": 3.3493649053890326e-06, "epoch": 2.4, "percentage": 83.33, "elapsed_time": "0:00:50", "remaining_time": "0:00:10"}
+ {"current_steps": 18, "total_steps": 18, "epoch": 2.88, "percentage": 100.0, "elapsed_time": "0:00:59", "remaining_time": "0:00:00"}
trainer_state.json CHANGED
@@ -1,281 +1,50 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 28.8,
+ "epoch": 2.88,
  "eval_steps": 500,
- "global_step": 180,
+ "global_step": 18,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
  "epoch": 0.8,
- "grad_norm": 2.3331761360168457,
- "learning_rate": 4.990486745229364e-10,
- "loss": 7.386,
+ "grad_norm": 2.860818386077881,
+ "learning_rate": 4.1069690242163484e-05,
+ "loss": 7.2633,
  "step": 5
  },
  {
  "epoch": 1.6,
- "grad_norm": 2.494887351989746,
- "learning_rate": 4.962019382530521e-10,
- "loss": 6.7817,
+ "grad_norm": 4.06754207611084,
+ "learning_rate": 2.0658795558326743e-05,
+ "loss": 6.3067,
  "step": 10
  },
  {
  "epoch": 2.4,
- "grad_norm": 2.389345407485962,
- "learning_rate": 4.914814565722671e-10,
- "loss": 7.1632,
+ "grad_norm": 4.378876686096191,
+ "learning_rate": 3.3493649053890326e-06,
+ "loss": 6.4305,
  "step": 15
  },
  {
- "epoch": 3.2,
- "grad_norm": 2.437628984451294,
- "learning_rate": 4.849231551964771e-10,
- "loss": 7.0109,
- "step": 20
- },
- {
- "epoch": 4.0,
- "grad_norm": 3.2548422813415527,
- "learning_rate": 4.765769467591626e-10,
- "loss": 7.1004,
- "step": 25
- },
- {
- "epoch": 4.8,
- "grad_norm": 2.268507480621338,
- "learning_rate": 4.665063509461097e-10,
- "loss": 6.9577,
- "step": 30
- },
- {
- "epoch": 5.6,
- "grad_norm": 2.739196300506592,
- "learning_rate": 4.54788011072248e-10,
- "loss": 7.1725,
- "step": 35
- },
- {
- "epoch": 6.4,
- "grad_norm": 2.399449348449707,
- "learning_rate": 4.415111107797445e-10,
- "loss": 7.1535,
- "step": 40
- },
- {
- "epoch": 7.2,
- "grad_norm": 2.798766613006592,
- "learning_rate": 4.267766952966369e-10,
- "loss": 6.9961,
- "step": 45
- },
- {
- "epoch": 8.0,
- "grad_norm": 2.742884635925293,
- "learning_rate": 4.106969024216348e-10,
- "loss": 7.1581,
- "step": 50
- },
- {
- "epoch": 8.8,
- "grad_norm": 3.0411534309387207,
- "learning_rate": 3.933941090877615e-10,
- "loss": 7.3208,
- "step": 55
- },
- {
- "epoch": 9.6,
- "grad_norm": 2.590532064437866,
- "learning_rate": 3.7500000000000005e-10,
- "loss": 6.7152,
- "step": 60
- },
- {
- "epoch": 10.4,
- "grad_norm": 2.6501505374908447,
- "learning_rate": 3.556545654351749e-10,
- "loss": 7.1168,
- "step": 65
- },
- {
- "epoch": 11.2,
- "grad_norm": 2.6842129230499268,
- "learning_rate": 3.3550503583141725e-10,
- "loss": 7.143,
- "step": 70
- },
- {
- "epoch": 12.0,
- "grad_norm": 2.7974400520324707,
- "learning_rate": 3.147047612756302e-10,
- "loss": 7.2724,
- "step": 75
- },
- {
- "epoch": 12.8,
- "grad_norm": 2.68391489982605,
- "learning_rate": 2.9341204441673265e-10,
- "loss": 6.995,
- "step": 80
- },
- {
- "epoch": 13.6,
- "grad_norm": 3.0854671001434326,
- "learning_rate": 2.717889356869146e-10,
- "loss": 7.1134,
- "step": 85
- },
- {
- "epoch": 14.4,
- "grad_norm": 2.772061347961426,
- "learning_rate": 2.5e-10,
- "loss": 7.2029,
- "step": 90
- },
- {
- "epoch": 15.2,
- "grad_norm": 2.6627767086029053,
- "learning_rate": 2.2821106431308546e-10,
- "loss": 7.4673,
- "step": 95
- },
- {
- "epoch": 16.0,
- "grad_norm": 2.7043120861053467,
- "learning_rate": 2.0658795558326743e-10,
- "loss": 6.8563,
- "step": 100
- },
- {
- "epoch": 16.8,
- "grad_norm": 2.4765264987945557,
- "learning_rate": 1.852952387243698e-10,
- "loss": 7.0683,
- "step": 105
- },
- {
- "epoch": 17.6,
- "grad_norm": 2.775627613067627,
- "learning_rate": 1.6449496416858284e-10,
- "loss": 7.1976,
- "step": 110
- },
- {
- "epoch": 18.4,
- "grad_norm": 2.3891263008117676,
- "learning_rate": 1.443454345648252e-10,
- "loss": 7.1575,
- "step": 115
- },
- {
- "epoch": 19.2,
- "grad_norm": 2.5396955013275146,
- "learning_rate": 1.2500000000000006e-10,
- "loss": 7.073,
- "step": 120
- },
- {
- "epoch": 20.0,
- "grad_norm": 2.4532394409179688,
- "learning_rate": 1.0660589091223855e-10,
- "loss": 7.0932,
- "step": 125
- },
- {
- "epoch": 20.8,
- "grad_norm": 2.822531223297119,
- "learning_rate": 8.930309757836516e-11,
- "loss": 7.1324,
- "step": 130
- },
- {
- "epoch": 21.6,
- "grad_norm": 2.5621225833892822,
- "learning_rate": 7.322330470336314e-11,
- "loss": 7.1434,
- "step": 135
- },
- {
- "epoch": 22.4,
- "grad_norm": 2.63484263420105,
- "learning_rate": 5.848888922025552e-11,
- "loss": 7.0423,
- "step": 140
- },
- {
- "epoch": 23.2,
- "grad_norm": 2.882169246673584,
- "learning_rate": 4.5211988927752025e-11,
- "loss": 7.3638,
- "step": 145
- },
- {
- "epoch": 24.0,
- "grad_norm": 2.356477975845337,
- "learning_rate": 3.3493649053890324e-11,
- "loss": 6.8454,
- "step": 150
- },
- {
- "epoch": 24.8,
- "grad_norm": 3.1740143299102783,
- "learning_rate": 2.3423053240837516e-11,
- "loss": 7.1881,
- "step": 155
- },
- {
- "epoch": 25.6,
- "grad_norm": 2.634425640106201,
- "learning_rate": 1.5076844803522922e-11,
- "loss": 7.1249,
- "step": 160
- },
- {
- "epoch": 26.4,
- "grad_norm": 2.412172317504883,
- "learning_rate": 8.51854342773295e-12,
- "loss": 7.1924,
- "step": 165
- },
- {
- "epoch": 27.2,
- "grad_norm": 2.655557870864868,
- "learning_rate": 3.798061746947995e-12,
- "loss": 6.9059,
- "step": 170
- },
- {
- "epoch": 28.0,
- "grad_norm": 2.859827756881714,
- "learning_rate": 9.513254770636138e-13,
- "loss": 7.2371,
- "step": 175
- },
- {
- "epoch": 28.8,
- "grad_norm": 2.7501277923583984,
- "learning_rate": 0.0,
- "loss": 7.0767,
- "step": 180
- },
- {
- "epoch": 28.8,
- "step": 180,
- "total_flos": 1335967054725120.0,
- "train_loss": 7.109028116861979,
- "train_runtime": 593.1625,
- "train_samples_per_second": 5.058,
- "train_steps_per_second": 0.303
+ "epoch": 2.88,
+ "step": 18,
+ "total_flos": 133622157312000.0,
+ "train_loss": 6.60821893480089,
+ "train_runtime": 59.777,
+ "train_samples_per_second": 5.019,
+ "train_steps_per_second": 0.301
  }
  ],
  "logging_steps": 5,
- "max_steps": 180,
+ "max_steps": 18,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 30,
+ "num_train_epochs": 3,
  "save_steps": 100,
- "total_flos": 1335967054725120.0,
+ "total_flos": 133622157312000.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:26ca302ed42091be2724cc3ec6e2a8ab6f813368320dc36808ada89634d0b367
- size 5112
+ oid sha256:877561c09d9ec252be597601fb20cc3870d9bf16887509642b7e8821a1e2e78c
+ size 5176
training_loss.png CHANGED