Text Generation
Transformers
PyTorch
gpt_neox
Inference Endpoints
text-generation-inference
theblackcat102 commited on
Commit
40feaac
1 Parent(s): d001628

upload small file first

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "EleutherAI/pythia-12b-deduped",
3
+ "architectures": [
4
+ "GPTNeoXForCausalLM"
5
+ ],
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 0,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 5120,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 20480,
12
+ "layer_norm_eps": 1e-05,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "gpt_neox",
15
+ "num_attention_heads": 40,
16
+ "num_hidden_layers": 36,
17
+ "rotary_emb_base": 10000,
18
+ "rotary_pct": 0.25,
19
+ "tie_word_embeddings": false,
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.25.1",
22
+ "use_cache": true,
23
+ "use_parallel_residual": true,
24
+ "vocab_size": 50281
25
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<prefix>",
4
+ "<human>",
5
+ "</prefix>",
6
+ "<bot>"
7
+ ],
8
+ "bos_token": "<|endoftext|>",
9
+ "eos_token": "<|endoftext|>",
10
+ "pad_token": "<|padding|>",
11
+ "sep_token": "<|endoftext|>",
12
+ "unk_token": "<|endoftext|>"
13
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "EleutherAI/pythia-12b-deduped",
7
+ "special_tokens_map_file": "/fsx/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/3523781c8df75f7741687a4284f6f70e1afa12f4/special_tokens_map.json",
8
+ "tokenizer_class": "GPTNeoXTokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,1408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.22919483853223627,
5
+ "global_step": 1000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1.4084967333570947e-06,
13
+ "loss": 2.2507,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.0,
18
+ "learning_rate": 2.0507482022971233e-06,
19
+ "loss": 1.9542,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 2.385606273598312e-06,
25
+ "loss": 1.8446,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.01,
30
+ "learning_rate": 2.6136695401116585e-06,
31
+ "loss": 1.831,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.01,
36
+ "learning_rate": 2.7868297632261957e-06,
37
+ "loss": 1.8121,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.01,
42
+ "learning_rate": 2.926458092787486e-06,
43
+ "loss": 1.7884,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.02,
48
+ "learning_rate": 3.0434580045013773e-06,
49
+ "loss": 1.755,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.02,
54
+ "learning_rate": 3.1441512086208035e-06,
55
+ "loss": 1.7662,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.02,
60
+ "learning_rate": 3.232532087697698e-06,
61
+ "loss": 1.7246,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.02,
66
+ "learning_rate": 3.3112862237770753e-06,
67
+ "loss": 1.7563,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.03,
72
+ "learning_rate": 3.3823062961420163e-06,
73
+ "loss": 1.7531,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.03,
78
+ "learning_rate": 3.446976436243603e-06,
79
+ "loss": 1.7334,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.03,
84
+ "learning_rate": 3.506339534926595e-06,
85
+ "loss": 1.7231,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.03,
90
+ "learning_rate": 3.5612009452606784e-06,
91
+ "loss": 1.7151,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.03,
96
+ "learning_rate": 3.612195557913627e-06,
97
+ "loss": 1.7218,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.04,
102
+ "learning_rate": 3.65983275401539e-06,
103
+ "loss": 1.7144,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.04,
108
+ "learning_rate": 3.7045274519126395e-06,
109
+ "loss": 1.7195,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.04,
114
+ "learning_rate": 3.7466221106030114e-06,
115
+ "loss": 1.6989,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.04,
120
+ "learning_rate": 3.786402677560832e-06,
121
+ "loss": 1.7034,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.05,
126
+ "learning_rate": 3.824110376935989e-06,
127
+ "loss": 1.7049,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.05,
132
+ "learning_rate": 3.8599505757615295e-06,
133
+ "loss": 1.7457,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 0.05,
138
+ "learning_rate": 3.894099556414216e-06,
139
+ "loss": 1.7092,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 0.05,
144
+ "learning_rate": 3.9267097619885385e-06,
145
+ "loss": 1.7283,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 0.05,
150
+ "learning_rate": 3.95791391001684e-06,
151
+ "loss": 1.6915,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 0.06,
156
+ "learning_rate": 3.987828255432777e-06,
157
+ "loss": 1.6902,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 0.06,
162
+ "learning_rate": 4.016555205552159e-06,
163
+ "loss": 1.7059,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 0.06,
168
+ "learning_rate": 4.044185435607626e-06,
169
+ "loss": 1.7044,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 0.06,
174
+ "learning_rate": 4.070799615107415e-06,
175
+ "loss": 1.6984,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 0.07,
180
+ "learning_rate": 4.096469827889988e-06,
181
+ "loss": 1.7203,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 0.07,
186
+ "learning_rate": 4.121260748862021e-06,
187
+ "loss": 1.7046,
188
+ "step": 300
189
+ },
190
+ {
191
+ "epoch": 0.07,
192
+ "learning_rate": 4.145230625795312e-06,
193
+ "loss": 1.6732,
194
+ "step": 310
195
+ },
196
+ {
197
+ "epoch": 0.07,
198
+ "learning_rate": 4.1684321036962525e-06,
199
+ "loss": 1.6948,
200
+ "step": 320
201
+ },
202
+ {
203
+ "epoch": 0.08,
204
+ "learning_rate": 4.190912921100477e-06,
205
+ "loss": 1.656,
206
+ "step": 330
207
+ },
208
+ {
209
+ "epoch": 0.08,
210
+ "learning_rate": 4.212716501452232e-06,
211
+ "loss": 1.6729,
212
+ "step": 340
213
+ },
214
+ {
215
+ "epoch": 0.08,
216
+ "learning_rate": 4.233882457984791e-06,
217
+ "loss": 1.7092,
218
+ "step": 350
219
+ },
220
+ {
221
+ "epoch": 0.08,
222
+ "learning_rate": 4.2544470268536555e-06,
223
+ "loss": 1.6883,
224
+ "step": 360
225
+ },
226
+ {
227
+ "epoch": 0.08,
228
+ "learning_rate": 4.27444344042015e-06,
229
+ "loss": 1.6877,
230
+ "step": 370
231
+ },
232
+ {
233
+ "epoch": 0.09,
234
+ "learning_rate": 4.293902250342989e-06,
235
+ "loss": 1.6774,
236
+ "step": 380
237
+ },
238
+ {
239
+ "epoch": 0.09,
240
+ "learning_rate": 4.312851608364853e-06,
241
+ "loss": 1.6957,
242
+ "step": 390
243
+ },
244
+ {
245
+ "epoch": 0.09,
246
+ "learning_rate": 4.3313175112718595e-06,
247
+ "loss": 1.6848,
248
+ "step": 400
249
+ },
250
+ {
251
+ "epoch": 0.09,
252
+ "learning_rate": 4.3493240153753665e-06,
253
+ "loss": 1.682,
254
+ "step": 410
255
+ },
256
+ {
257
+ "epoch": 0.1,
258
+ "learning_rate": 4.366893424956263e-06,
259
+ "loss": 1.6724,
260
+ "step": 420
261
+ },
262
+ {
263
+ "epoch": 0.1,
264
+ "learning_rate": 4.38404645837504e-06,
265
+ "loss": 1.7079,
266
+ "step": 430
267
+ },
268
+ {
269
+ "epoch": 0.1,
270
+ "learning_rate": 4.400802394950703e-06,
271
+ "loss": 1.6605,
272
+ "step": 440
273
+ },
274
+ {
275
+ "epoch": 0.1,
276
+ "learning_rate": 4.4171792052198945e-06,
277
+ "loss": 1.6822,
278
+ "step": 450
279
+ },
280
+ {
281
+ "epoch": 0.11,
282
+ "learning_rate": 4.433193666783084e-06,
283
+ "loss": 1.6731,
284
+ "step": 460
285
+ },
286
+ {
287
+ "epoch": 0.11,
288
+ "learning_rate": 4.448861467610187e-06,
289
+ "loss": 1.6648,
290
+ "step": 470
291
+ },
292
+ {
293
+ "epoch": 0.11,
294
+ "learning_rate": 4.4641972984001906e-06,
295
+ "loss": 1.6781,
296
+ "step": 480
297
+ },
298
+ {
299
+ "epoch": 0.11,
300
+ "learning_rate": 4.479214935357724e-06,
301
+ "loss": 1.6752,
302
+ "step": 490
303
+ },
304
+ {
305
+ "epoch": 0.11,
306
+ "learning_rate": 4.493927314555554e-06,
307
+ "loss": 1.6754,
308
+ "step": 500
309
+ },
310
+ {
311
+ "epoch": 0.11,
312
+ "eval_webgpt_accuracy": 0.5055517960817275,
313
+ "eval_webgpt_loss": 2.15625,
314
+ "eval_webgpt_runtime": 39.0916,
315
+ "eval_webgpt_samples_per_second": 100.175,
316
+ "eval_webgpt_steps_per_second": 1.253,
317
+ "step": 500
318
+ },
319
+ {
320
+ "epoch": 0.11,
321
+ "eval_prompt_dialogue_accuracy": 0.6254543673617606,
322
+ "eval_prompt_dialogue_loss": 1.357421875,
323
+ "eval_prompt_dialogue_runtime": 71.3081,
324
+ "eval_prompt_dialogue_samples_per_second": 144.57,
325
+ "eval_prompt_dialogue_steps_per_second": 1.809,
326
+ "step": 500
327
+ },
328
+ {
329
+ "epoch": 0.11,
330
+ "eval_adversarial_qa_accuracy": 0.8029728725380899,
331
+ "eval_adversarial_qa_loss": 0.70654296875,
332
+ "eval_adversarial_qa_runtime": 20.7874,
333
+ "eval_adversarial_qa_samples_per_second": 144.318,
334
+ "eval_adversarial_qa_steps_per_second": 1.828,
335
+ "step": 500
336
+ },
337
+ {
338
+ "epoch": 0.11,
339
+ "eval_xsum_accuracy": 0.632906181388279,
340
+ "eval_xsum_loss": 1.3935546875,
341
+ "eval_xsum_runtime": 122.5752,
342
+ "eval_xsum_samples_per_second": 92.449,
343
+ "eval_xsum_steps_per_second": 1.158,
344
+ "step": 500
345
+ },
346
+ {
347
+ "epoch": 0.11,
348
+ "eval_cnn_dailymail_accuracy": 0.7001129736496595,
349
+ "eval_cnn_dailymail_loss": NaN,
350
+ "eval_cnn_dailymail_runtime": 144.5118,
351
+ "eval_cnn_dailymail_samples_per_second": 92.505,
352
+ "eval_cnn_dailymail_steps_per_second": 1.163,
353
+ "step": 500
354
+ },
355
+ {
356
+ "epoch": 0.11,
357
+ "eval_multi_news_accuracy": 0.5801641857474902,
358
+ "eval_multi_news_loss": NaN,
359
+ "eval_multi_news_runtime": 62.7124,
360
+ "eval_multi_news_samples_per_second": 89.647,
361
+ "eval_multi_news_steps_per_second": 1.132,
362
+ "step": 500
363
+ },
364
+ {
365
+ "epoch": 0.11,
366
+ "eval_scitldr_accuracy": 0.4978125,
367
+ "eval_scitldr_loss": NaN,
368
+ "eval_scitldr_runtime": 8.0438,
369
+ "eval_scitldr_samples_per_second": 76.954,
370
+ "eval_scitldr_steps_per_second": 0.995,
371
+ "step": 500
372
+ },
373
+ {
374
+ "epoch": 0.11,
375
+ "eval_joke_accuracy": 0.5093821076573162,
376
+ "eval_joke_loss": 2.1171875,
377
+ "eval_joke_runtime": 0.9767,
378
+ "eval_joke_samples_per_second": 77.813,
379
+ "eval_joke_steps_per_second": 1.024,
380
+ "step": 500
381
+ },
382
+ {
383
+ "epoch": 0.11,
384
+ "eval_gsm8k_accuracy": 0.7808137739345274,
385
+ "eval_gsm8k_loss": 0.8134765625,
386
+ "eval_gsm8k_runtime": 9.4148,
387
+ "eval_gsm8k_samples_per_second": 140.099,
388
+ "eval_gsm8k_steps_per_second": 1.806,
389
+ "step": 500
390
+ },
391
+ {
392
+ "epoch": 0.11,
393
+ "eval_dive_mt_accuracy": 0.7367253212240054,
394
+ "eval_dive_mt_loss": 1.04296875,
395
+ "eval_dive_mt_runtime": 10.0465,
396
+ "eval_dive_mt_samples_per_second": 128.403,
397
+ "eval_dive_mt_steps_per_second": 1.692,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.11,
402
+ "eval_math_qa_accuracy": 0.608876541257212,
403
+ "eval_math_qa_loss": 1.6689453125,
404
+ "eval_math_qa_runtime": 30.5447,
405
+ "eval_math_qa_samples_per_second": 146.507,
406
+ "eval_math_qa_steps_per_second": 1.833,
407
+ "step": 500
408
+ },
409
+ {
410
+ "epoch": 0.11,
411
+ "eval_essay_instruction_accuracy": 0.6053833226455565,
412
+ "eval_essay_instruction_loss": 1.8876953125,
413
+ "eval_essay_instruction_runtime": 8.3301,
414
+ "eval_essay_instruction_samples_per_second": 49.579,
415
+ "eval_essay_instruction_steps_per_second": 0.72,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 0.11,
420
+ "eval_tldr_news_accuracy": 0.6061431123968348,
421
+ "eval_tldr_news_loss": 1.697265625,
422
+ "eval_tldr_news_runtime": 5.1098,
423
+ "eval_tldr_news_samples_per_second": 155.389,
424
+ "eval_tldr_news_steps_per_second": 1.957,
425
+ "step": 500
426
+ },
427
+ {
428
+ "epoch": 0.11,
429
+ "eval_reddit_eli5_accuracy": 0.46120351563185963,
430
+ "eval_reddit_eli5_loss": 2.423828125,
431
+ "eval_reddit_eli5_runtime": 107.5158,
432
+ "eval_reddit_eli5_samples_per_second": 91.261,
433
+ "eval_reddit_eli5_steps_per_second": 1.144,
434
+ "step": 500
435
+ },
436
+ {
437
+ "epoch": 0.11,
438
+ "eval_reddit_asks_accuracy": 0.4690436591507088,
439
+ "eval_reddit_asks_loss": 2.412109375,
440
+ "eval_reddit_asks_runtime": 32.2161,
441
+ "eval_reddit_asks_samples_per_second": 70.803,
442
+ "eval_reddit_asks_steps_per_second": 0.9,
443
+ "step": 500
444
+ },
445
+ {
446
+ "epoch": 0.11,
447
+ "eval_reddit_askh_accuracy": 0.466922516495131,
448
+ "eval_reddit_askh_loss": 2.513671875,
449
+ "eval_reddit_askh_runtime": 61.0062,
450
+ "eval_reddit_askh_samples_per_second": 80.336,
451
+ "eval_reddit_askh_steps_per_second": 1.016,
452
+ "step": 500
453
+ },
454
+ {
455
+ "epoch": 0.11,
456
+ "eval_wmt2019_zh-en_accuracy": 0.6711313468964325,
457
+ "eval_wmt2019_zh-en_loss": 1.44140625,
458
+ "eval_wmt2019_zh-en_runtime": 27.1705,
459
+ "eval_wmt2019_zh-en_samples_per_second": 146.519,
460
+ "eval_wmt2019_zh-en_steps_per_second": 1.84,
461
+ "step": 500
462
+ },
463
+ {
464
+ "epoch": 0.11,
465
+ "eval_wmt2019_fr-de_accuracy": 0.751335577309082,
466
+ "eval_wmt2019_fr-de_loss": 0.9892578125,
467
+ "eval_wmt2019_fr-de_runtime": 9.8591,
468
+ "eval_wmt2019_fr-de_samples_per_second": 153.361,
469
+ "eval_wmt2019_fr-de_steps_per_second": 1.927,
470
+ "step": 500
471
+ },
472
+ {
473
+ "epoch": 0.11,
474
+ "eval_wmt2019_ru-en_accuracy": 0.7610682787220373,
475
+ "eval_wmt2019_ru-en_loss": 0.92138671875,
476
+ "eval_wmt2019_ru-en_runtime": 21.983,
477
+ "eval_wmt2019_ru-en_samples_per_second": 136.469,
478
+ "eval_wmt2019_ru-en_steps_per_second": 1.729,
479
+ "step": 500
480
+ },
481
+ {
482
+ "epoch": 0.11,
483
+ "eval_wmt2019_de-en_accuracy": 0.7658361423127319,
484
+ "eval_wmt2019_de-en_loss": 0.92041015625,
485
+ "eval_wmt2019_de-en_runtime": 17.0498,
486
+ "eval_wmt2019_de-en_samples_per_second": 175.838,
487
+ "eval_wmt2019_de-en_steps_per_second": 2.229,
488
+ "step": 500
489
+ },
490
+ {
491
+ "epoch": 0.11,
492
+ "eval_ted_trans_de-ja_accuracy": 0.6635957565605807,
493
+ "eval_ted_trans_de-ja_loss": 1.4384765625,
494
+ "eval_ted_trans_de-ja_runtime": 7.9688,
495
+ "eval_ted_trans_de-ja_samples_per_second": 90.101,
496
+ "eval_ted_trans_de-ja_steps_per_second": 1.129,
497
+ "step": 500
498
+ },
499
+ {
500
+ "epoch": 0.11,
501
+ "eval_ted_trans_en-ja_accuracy": 0.6737063575554276,
502
+ "eval_ted_trans_en-ja_loss": 1.3544921875,
503
+ "eval_ted_trans_en-ja_runtime": 9.6629,
504
+ "eval_ted_trans_en-ja_samples_per_second": 82.894,
505
+ "eval_ted_trans_en-ja_steps_per_second": 1.138,
506
+ "step": 500
507
+ },
508
+ {
509
+ "epoch": 0.11,
510
+ "eval_ted_trans_en-hi_accuracy": 0.6986381322957198,
511
+ "eval_ted_trans_en-hi_loss": 1.1357421875,
512
+ "eval_ted_trans_en-hi_runtime": 2.3375,
513
+ "eval_ted_trans_en-hi_samples_per_second": 44.064,
514
+ "eval_ted_trans_en-hi_steps_per_second": 0.856,
515
+ "step": 500
516
+ },
517
+ {
518
+ "epoch": 0.11,
519
+ "eval_ted_trans_en-es_accuracy": 0.7880831502109065,
520
+ "eval_ted_trans_en-es_loss": 0.87353515625,
521
+ "eval_ted_trans_en-es_runtime": 8.2834,
522
+ "eval_ted_trans_en-es_samples_per_second": 99.718,
523
+ "eval_ted_trans_en-es_steps_per_second": 1.328,
524
+ "step": 500
525
+ },
526
+ {
527
+ "epoch": 0.11,
528
+ "eval_private_tuning_accuracy": 0.6889973407198902,
529
+ "eval_private_tuning_loss": 1.130859375,
530
+ "eval_private_tuning_runtime": 142.1785,
531
+ "eval_private_tuning_samples_per_second": 148.954,
532
+ "eval_private_tuning_steps_per_second": 1.864,
533
+ "step": 500
534
+ },
535
+ {
536
+ "epoch": 0.11,
537
+ "eval_samsum_accuracy": 0.6474302924317498,
538
+ "eval_samsum_loss": 1.27734375,
539
+ "eval_samsum_runtime": 12.2877,
540
+ "eval_samsum_samples_per_second": 66.571,
541
+ "eval_samsum_steps_per_second": 0.895,
542
+ "step": 500
543
+ },
544
+ {
545
+ "epoch": 0.11,
546
+ "eval_prosocial_dialogue_accuracy": 0.5408795463448586,
547
+ "eval_prosocial_dialogue_loss": 1.7060546875,
548
+ "eval_prosocial_dialogue_runtime": 48.1618,
549
+ "eval_prosocial_dialogue_samples_per_second": 560.257,
550
+ "eval_prosocial_dialogue_steps_per_second": 7.018,
551
+ "step": 500
552
+ },
553
+ {
554
+ "epoch": 0.11,
555
+ "eval_oa_translated_accuracy": 0.719222779150248,
556
+ "eval_oa_translated_loss": 1.1240234375,
557
+ "eval_oa_translated_runtime": 59.7453,
558
+ "eval_oa_translated_samples_per_second": 86.484,
559
+ "eval_oa_translated_steps_per_second": 1.088,
560
+ "step": 500
561
+ },
562
+ {
563
+ "epoch": 0.11,
564
+ "eval_wikihow_accuracy": 0.622957980862571,
565
+ "eval_wikihow_loss": 1.7578125,
566
+ "eval_wikihow_runtime": 15.5342,
567
+ "eval_wikihow_samples_per_second": 147.61,
568
+ "eval_wikihow_steps_per_second": 1.867,
569
+ "step": 500
570
+ },
571
+ {
572
+ "epoch": 0.11,
573
+ "eval_explain_prosocial_accuracy": 0.6863205647867285,
574
+ "eval_explain_prosocial_loss": 1.310546875,
575
+ "eval_explain_prosocial_runtime": 111.3962,
576
+ "eval_explain_prosocial_samples_per_second": 549.821,
577
+ "eval_explain_prosocial_steps_per_second": 6.876,
578
+ "step": 500
579
+ },
580
+ {
581
+ "epoch": 0.12,
582
+ "learning_rate": 4.5083465988888945e-06,
583
+ "loss": 1.6702,
584
+ "step": 510
585
+ },
586
+ {
587
+ "epoch": 0.12,
588
+ "learning_rate": 4.5224842384899045e-06,
589
+ "loss": 1.6841,
590
+ "step": 520
591
+ },
592
+ {
593
+ "epoch": 0.12,
594
+ "learning_rate": 4.5363510253542444e-06,
595
+ "loss": 1.6574,
596
+ "step": 530
597
+ },
598
+ {
599
+ "epoch": 0.12,
600
+ "learning_rate": 4.549957142832593e-06,
601
+ "loss": 1.673,
602
+ "step": 540
603
+ },
604
+ {
605
+ "epoch": 0.13,
606
+ "learning_rate": 4.563312210555719e-06,
607
+ "loss": 1.6541,
608
+ "step": 550
609
+ },
610
+ {
611
+ "epoch": 0.13,
612
+ "learning_rate": 4.576425325289549e-06,
613
+ "loss": 1.6516,
614
+ "step": 560
615
+ },
616
+ {
617
+ "epoch": 0.13,
618
+ "learning_rate": 4.589305098154845e-06,
619
+ "loss": 1.6717,
620
+ "step": 570
621
+ },
622
+ {
623
+ "epoch": 0.13,
624
+ "learning_rate": 4.601959688592886e-06,
625
+ "loss": 1.6191,
626
+ "step": 580
627
+ },
628
+ {
629
+ "epoch": 0.14,
630
+ "learning_rate": 4.614396835412691e-06,
631
+ "loss": 1.6685,
632
+ "step": 590
633
+ },
634
+ {
635
+ "epoch": 0.14,
636
+ "learning_rate": 4.626623885215616e-06,
637
+ "loss": 1.6424,
638
+ "step": 600
639
+ },
640
+ {
641
+ "epoch": 0.14,
642
+ "learning_rate": 4.638647818458763e-06,
643
+ "loss": 1.6391,
644
+ "step": 610
645
+ },
646
+ {
647
+ "epoch": 0.14,
648
+ "learning_rate": 4.650475273388737e-06,
649
+ "loss": 1.6604,
650
+ "step": 620
651
+ },
652
+ {
653
+ "epoch": 0.14,
654
+ "learning_rate": 4.662112568051194e-06,
655
+ "loss": 1.6693,
656
+ "step": 630
657
+ },
658
+ {
659
+ "epoch": 0.15,
660
+ "learning_rate": 4.673565720558918e-06,
661
+ "loss": 1.6437,
662
+ "step": 640
663
+ },
664
+ {
665
+ "epoch": 0.15,
666
+ "learning_rate": 4.6848404677811685e-06,
667
+ "loss": 1.6688,
668
+ "step": 650
669
+ },
670
+ {
671
+ "epoch": 0.15,
672
+ "learning_rate": 4.695942282599635e-06,
673
+ "loss": 1.6521,
674
+ "step": 660
675
+ },
676
+ {
677
+ "epoch": 0.15,
678
+ "learning_rate": 4.706876389860915e-06,
679
+ "loss": 1.6568,
680
+ "step": 670
681
+ },
682
+ {
683
+ "epoch": 0.16,
684
+ "learning_rate": 4.717647781141908e-06,
685
+ "loss": 1.6462,
686
+ "step": 680
687
+ },
688
+ {
689
+ "epoch": 0.16,
690
+ "learning_rate": 4.7282612284325845e-06,
691
+ "loss": 1.6463,
692
+ "step": 690
693
+ },
694
+ {
695
+ "epoch": 0.16,
696
+ "learning_rate": 4.738721296830016e-06,
697
+ "loss": 1.6495,
698
+ "step": 700
699
+ },
700
+ {
701
+ "epoch": 0.16,
702
+ "learning_rate": 4.749032356328167e-06,
703
+ "loss": 1.6536,
704
+ "step": 710
705
+ },
706
+ {
707
+ "epoch": 0.17,
708
+ "learning_rate": 4.759198592779668e-06,
709
+ "loss": 1.6366,
710
+ "step": 720
711
+ },
712
+ {
713
+ "epoch": 0.17,
714
+ "learning_rate": 4.769224018098397e-06,
715
+ "loss": 1.6626,
716
+ "step": 730
717
+ },
718
+ {
719
+ "epoch": 0.17,
720
+ "learning_rate": 4.7791124797650865e-06,
721
+ "loss": 1.616,
722
+ "step": 740
723
+ },
724
+ {
725
+ "epoch": 0.17,
726
+ "learning_rate": 4.788867669692332e-06,
727
+ "loss": 1.6401,
728
+ "step": 750
729
+ },
730
+ {
731
+ "epoch": 0.17,
732
+ "eval_webgpt_accuracy": 0.5064634455053698,
733
+ "eval_webgpt_loss": 2.14453125,
734
+ "eval_webgpt_runtime": 38.6915,
735
+ "eval_webgpt_samples_per_second": 101.211,
736
+ "eval_webgpt_steps_per_second": 1.266,
737
+ "step": 750
738
+ },
739
+ {
740
+ "epoch": 0.17,
741
+ "eval_prompt_dialogue_accuracy": 0.6281575769303819,
742
+ "eval_prompt_dialogue_loss": 1.3408203125,
743
+ "eval_prompt_dialogue_runtime": 73.877,
744
+ "eval_prompt_dialogue_samples_per_second": 139.543,
745
+ "eval_prompt_dialogue_steps_per_second": 1.746,
746
+ "step": 750
747
+ },
748
+ {
749
+ "epoch": 0.17,
750
+ "eval_adversarial_qa_accuracy": 0.8144184318097362,
751
+ "eval_adversarial_qa_loss": 0.67626953125,
752
+ "eval_adversarial_qa_runtime": 20.0332,
753
+ "eval_adversarial_qa_samples_per_second": 149.751,
754
+ "eval_adversarial_qa_steps_per_second": 1.897,
755
+ "step": 750
756
+ },
757
+ {
758
+ "epoch": 0.17,
759
+ "eval_xsum_accuracy": 0.6340356358723188,
760
+ "eval_xsum_loss": 1.3828125,
761
+ "eval_xsum_runtime": 121.341,
762
+ "eval_xsum_samples_per_second": 93.39,
763
+ "eval_xsum_steps_per_second": 1.17,
764
+ "step": 750
765
+ },
766
+ {
767
+ "epoch": 0.17,
768
+ "eval_cnn_dailymail_accuracy": 0.7028426612927849,
769
+ "eval_cnn_dailymail_loss": NaN,
770
+ "eval_cnn_dailymail_runtime": 144.2535,
771
+ "eval_cnn_dailymail_samples_per_second": 92.67,
772
+ "eval_cnn_dailymail_steps_per_second": 1.165,
773
+ "step": 750
774
+ },
775
+ {
776
+ "epoch": 0.17,
777
+ "eval_multi_news_accuracy": 0.5819939666683152,
778
+ "eval_multi_news_loss": NaN,
779
+ "eval_multi_news_runtime": 61.526,
780
+ "eval_multi_news_samples_per_second": 91.376,
781
+ "eval_multi_news_steps_per_second": 1.154,
782
+ "step": 750
783
+ },
784
+ {
785
+ "epoch": 0.17,
786
+ "eval_scitldr_accuracy": 0.491875,
787
+ "eval_scitldr_loss": NaN,
788
+ "eval_scitldr_runtime": 8.1765,
789
+ "eval_scitldr_samples_per_second": 75.704,
790
+ "eval_scitldr_steps_per_second": 0.978,
791
+ "step": 750
792
+ },
793
+ {
794
+ "epoch": 0.17,
795
+ "eval_joke_accuracy": 0.5185746777862017,
796
+ "eval_joke_loss": 2.07421875,
797
+ "eval_joke_runtime": 0.7033,
798
+ "eval_joke_samples_per_second": 108.069,
799
+ "eval_joke_steps_per_second": 1.422,
800
+ "step": 750
801
+ },
802
+ {
803
+ "epoch": 0.17,
804
+ "eval_gsm8k_accuracy": 0.7870444718962323,
805
+ "eval_gsm8k_loss": 0.78466796875,
806
+ "eval_gsm8k_runtime": 10.9372,
807
+ "eval_gsm8k_samples_per_second": 120.597,
808
+ "eval_gsm8k_steps_per_second": 1.554,
809
+ "step": 750
810
+ },
811
+ {
812
+ "epoch": 0.17,
813
+ "eval_math_qa_accuracy": 0.619624940687815,
814
+ "eval_math_qa_loss": 1.6083984375,
815
+ "eval_math_qa_runtime": 31.4027,
816
+ "eval_math_qa_samples_per_second": 142.504,
817
+ "eval_math_qa_steps_per_second": 1.783,
818
+ "step": 750
819
+ },
820
+ {
821
+ "epoch": 0.17,
822
+ "eval_essay_instruction_accuracy": 0.6070694115826017,
823
+ "eval_essay_instruction_loss": 1.8740234375,
824
+ "eval_essay_instruction_runtime": 8.0521,
825
+ "eval_essay_instruction_samples_per_second": 51.291,
826
+ "eval_essay_instruction_steps_per_second": 0.745,
827
+ "step": 750
828
+ },
829
+ {
830
+ "epoch": 0.17,
831
+ "eval_tldr_news_accuracy": 0.615757678890496,
832
+ "eval_tldr_news_loss": 1.66015625,
833
+ "eval_tldr_news_runtime": 4.1264,
834
+ "eval_tldr_news_samples_per_second": 192.418,
835
+ "eval_tldr_news_steps_per_second": 2.423,
836
+ "step": 750
837
+ },
838
+ {
839
+ "epoch": 0.17,
840
+ "eval_reddit_eli5_accuracy": 0.461742772350252,
841
+ "eval_reddit_eli5_loss": 2.421875,
842
+ "eval_reddit_eli5_runtime": 108.3649,
843
+ "eval_reddit_eli5_samples_per_second": 90.546,
844
+ "eval_reddit_eli5_steps_per_second": 1.135,
845
+ "step": 750
846
+ },
847
+ {
848
+ "epoch": 0.17,
849
+ "eval_reddit_asks_accuracy": 0.4700219866226591,
850
+ "eval_reddit_asks_loss": 2.41015625,
851
+ "eval_reddit_asks_runtime": 31.416,
852
+ "eval_reddit_asks_samples_per_second": 72.606,
853
+ "eval_reddit_asks_steps_per_second": 0.923,
854
+ "step": 750
855
+ },
856
+ {
857
+ "epoch": 0.17,
858
+ "eval_reddit_askh_accuracy": 0.46774579304106356,
859
+ "eval_reddit_askh_loss": 2.5078125,
860
+ "eval_reddit_askh_runtime": 61.093,
861
+ "eval_reddit_askh_samples_per_second": 80.222,
862
+ "eval_reddit_askh_steps_per_second": 1.015,
863
+ "step": 750
864
+ },
865
+ {
866
+ "epoch": 0.17,
867
+ "eval_wmt2019_zh-en_accuracy": 0.6671902987021268,
868
+ "eval_wmt2019_zh-en_loss": 1.4541015625,
869
+ "eval_wmt2019_zh-en_runtime": 27.3556,
870
+ "eval_wmt2019_zh-en_samples_per_second": 145.528,
871
+ "eval_wmt2019_zh-en_steps_per_second": 1.828,
872
+ "step": 750
873
+ },
874
+ {
875
+ "epoch": 0.17,
876
+ "eval_wmt2019_fr-de_accuracy": 0.7487164373574896,
877
+ "eval_wmt2019_fr-de_loss": 0.9892578125,
878
+ "eval_wmt2019_fr-de_runtime": 11.3417,
879
+ "eval_wmt2019_fr-de_samples_per_second": 133.314,
880
+ "eval_wmt2019_fr-de_steps_per_second": 1.675,
881
+ "step": 750
882
+ },
883
+ {
884
+ "epoch": 0.17,
885
+ "eval_wmt2019_ru-en_accuracy": 0.7546621422248451,
886
+ "eval_wmt2019_ru-en_loss": 0.94970703125,
887
+ "eval_wmt2019_ru-en_runtime": 22.6465,
888
+ "eval_wmt2019_ru-en_samples_per_second": 132.471,
889
+ "eval_wmt2019_ru-en_steps_per_second": 1.678,
890
+ "step": 750
891
+ },
892
+ {
893
+ "epoch": 0.17,
894
+ "eval_wmt2019_de-en_accuracy": 0.7651105551969012,
895
+ "eval_wmt2019_de-en_loss": 0.92236328125,
896
+ "eval_wmt2019_de-en_runtime": 16.3647,
897
+ "eval_wmt2019_de-en_samples_per_second": 183.199,
898
+ "eval_wmt2019_de-en_steps_per_second": 2.322,
899
+ "step": 750
900
+ },
901
+ {
902
+ "epoch": 0.17,
903
+ "eval_ted_trans_de-ja_accuracy": 0.6670448957978744,
904
+ "eval_ted_trans_de-ja_loss": 1.4306640625,
905
+ "eval_ted_trans_de-ja_runtime": 8.4143,
906
+ "eval_ted_trans_de-ja_samples_per_second": 85.331,
907
+ "eval_ted_trans_de-ja_steps_per_second": 1.07,
908
+ "step": 750
909
+ },
910
+ {
911
+ "epoch": 0.17,
912
+ "eval_ted_trans_en-ja_accuracy": 0.6718075628588398,
913
+ "eval_ted_trans_en-ja_loss": 1.33203125,
914
+ "eval_ted_trans_en-ja_runtime": 10.3712,
915
+ "eval_ted_trans_en-ja_samples_per_second": 77.233,
916
+ "eval_ted_trans_en-ja_steps_per_second": 1.061,
917
+ "step": 750
918
+ },
919
+ {
920
+ "epoch": 0.17,
921
+ "eval_ted_trans_en-hi_accuracy": 0.6781445982723938,
922
+ "eval_ted_trans_en-hi_loss": 1.2021484375,
923
+ "eval_ted_trans_en-hi_runtime": 1.747,
924
+ "eval_ted_trans_en-hi_samples_per_second": 58.959,
925
+ "eval_ted_trans_en-hi_steps_per_second": 1.145,
926
+ "step": 750
927
+ },
928
+ {
929
+ "epoch": 0.17,
930
+ "eval_ted_trans_en-es_accuracy": 0.787559638615369,
931
+ "eval_ted_trans_en-es_loss": 0.88525390625,
932
+ "eval_ted_trans_en-es_runtime": 9.0268,
933
+ "eval_ted_trans_en-es_samples_per_second": 91.505,
934
+ "eval_ted_trans_en-es_steps_per_second": 1.219,
935
+ "step": 750
936
+ },
937
+ {
938
+ "epoch": 0.17,
939
+ "eval_private_tuning_accuracy": 0.693209861771278,
940
+ "eval_private_tuning_loss": 1.1103515625,
941
+ "eval_private_tuning_runtime": 144.1209,
942
+ "eval_private_tuning_samples_per_second": 146.946,
943
+ "eval_private_tuning_steps_per_second": 1.839,
944
+ "step": 750
945
+ },
946
+ {
947
+ "epoch": 0.17,
948
+ "eval_samsum_accuracy": 0.6467502185951618,
949
+ "eval_samsum_loss": 1.259765625,
950
+ "eval_samsum_runtime": 9.1622,
951
+ "eval_samsum_samples_per_second": 89.28,
952
+ "eval_samsum_steps_per_second": 1.201,
953
+ "step": 750
954
+ },
955
+ {
956
+ "epoch": 0.17,
957
+ "eval_prosocial_dialogue_accuracy": 0.5496461048716204,
958
+ "eval_prosocial_dialogue_loss": 1.6904296875,
959
+ "eval_prosocial_dialogue_runtime": 49.4898,
960
+ "eval_prosocial_dialogue_samples_per_second": 545.224,
961
+ "eval_prosocial_dialogue_steps_per_second": 6.83,
962
+ "step": 750
963
+ },
964
+ {
965
+ "epoch": 0.17,
966
+ "eval_oa_translated_accuracy": 0.7254537658996713,
967
+ "eval_oa_translated_loss": 1.0947265625,
968
+ "eval_oa_translated_runtime": 57.4991,
969
+ "eval_oa_translated_samples_per_second": 89.862,
970
+ "eval_oa_translated_steps_per_second": 1.13,
971
+ "step": 750
972
+ },
973
+ {
974
+ "epoch": 0.17,
975
+ "eval_wikihow_accuracy": 0.6223200665649702,
976
+ "eval_wikihow_loss": 1.744140625,
977
+ "eval_wikihow_runtime": 16.9927,
978
+ "eval_wikihow_samples_per_second": 134.94,
979
+ "eval_wikihow_steps_per_second": 1.707,
980
+ "step": 750
981
+ },
982
+ {
983
+ "epoch": 0.17,
984
+ "eval_explain_prosocial_accuracy": 0.6895944881927522,
985
+ "eval_explain_prosocial_loss": 1.2900390625,
986
+ "eval_explain_prosocial_runtime": 109.8962,
987
+ "eval_explain_prosocial_samples_per_second": 557.326,
988
+ "eval_explain_prosocial_steps_per_second": 6.97,
989
+ "step": 750
990
+ },
991
+ {
992
+ "epoch": 0.17,
993
+ "learning_rate": 4.798493132500121e-06,
994
+ "loss": 1.6331,
995
+ "step": 760
996
+ },
997
+ {
998
+ "epoch": 0.18,
999
+ "learning_rate": 4.8079922732483016e-06,
1000
+ "loss": 1.6242,
1001
+ "step": 770
1002
+ },
1003
+ {
1004
+ "epoch": 0.18,
1005
+ "learning_rate": 4.817368364668191e-06,
1006
+ "loss": 1.6471,
1007
+ "step": 780
1008
+ },
1009
+ {
1010
+ "epoch": 0.18,
1011
+ "learning_rate": 4.8266245539317745e-06,
1012
+ "loss": 1.6592,
1013
+ "step": 790
1014
+ },
1015
+ {
1016
+ "epoch": 0.18,
1017
+ "learning_rate": 4.835763868993521e-06,
1018
+ "loss": 1.6586,
1019
+ "step": 800
1020
+ },
1021
+ {
1022
+ "epoch": 0.19,
1023
+ "learning_rate": 4.844789224536785e-06,
1024
+ "loss": 1.6354,
1025
+ "step": 810
1026
+ },
1027
+ {
1028
+ "epoch": 0.19,
1029
+ "learning_rate": 4.853703427554027e-06,
1030
+ "loss": 1.6602,
1031
+ "step": 820
1032
+ },
1033
+ {
1034
+ "epoch": 0.19,
1035
+ "learning_rate": 4.862509182587578e-06,
1036
+ "loss": 1.652,
1037
+ "step": 830
1038
+ },
1039
+ {
1040
+ "epoch": 0.19,
1041
+ "learning_rate": 4.871209096655434e-06,
1042
+ "loss": 1.6451,
1043
+ "step": 840
1044
+ },
1045
+ {
1046
+ "epoch": 0.19,
1047
+ "learning_rate": 4.879805683884512e-06,
1048
+ "loss": 1.6404,
1049
+ "step": 850
1050
+ },
1051
+ {
1052
+ "epoch": 0.2,
1053
+ "learning_rate": 4.888301369871998e-06,
1054
+ "loss": 1.6411,
1055
+ "step": 860
1056
+ },
1057
+ {
1058
+ "epoch": 0.2,
1059
+ "learning_rate": 4.8966984957936845e-06,
1060
+ "loss": 1.6314,
1061
+ "step": 870
1062
+ },
1063
+ {
1064
+ "epoch": 0.2,
1065
+ "learning_rate": 4.904999322276735e-06,
1066
+ "loss": 1.6189,
1067
+ "step": 880
1068
+ },
1069
+ {
1070
+ "epoch": 0.2,
1071
+ "learning_rate": 4.913206033052878e-06,
1072
+ "loss": 1.6514,
1073
+ "step": 890
1074
+ },
1075
+ {
1076
+ "epoch": 0.21,
1077
+ "learning_rate": 4.921320738406821e-06,
1078
+ "loss": 1.6363,
1079
+ "step": 900
1080
+ },
1081
+ {
1082
+ "epoch": 0.21,
1083
+ "learning_rate": 4.929345478433492e-06,
1084
+ "loss": 1.6398,
1085
+ "step": 910
1086
+ },
1087
+ {
1088
+ "epoch": 0.21,
1089
+ "learning_rate": 4.937282226116702e-06,
1090
+ "loss": 1.6277,
1091
+ "step": 920
1092
+ },
1093
+ {
1094
+ "epoch": 0.21,
1095
+ "learning_rate": 4.945132890240829e-06,
1096
+ "loss": 1.6236,
1097
+ "step": 930
1098
+ },
1099
+ {
1100
+ "epoch": 0.22,
1101
+ "learning_rate": 4.952899318146298e-06,
1102
+ "loss": 1.6353,
1103
+ "step": 940
1104
+ },
1105
+ {
1106
+ "epoch": 0.22,
1107
+ "learning_rate": 4.96058329833879e-06,
1108
+ "loss": 1.6394,
1109
+ "step": 950
1110
+ },
1111
+ {
1112
+ "epoch": 0.22,
1113
+ "learning_rate": 4.968186562961406e-06,
1114
+ "loss": 1.6293,
1115
+ "step": 960
1116
+ },
1117
+ {
1118
+ "epoch": 0.22,
1119
+ "learning_rate": 4.975710790138337e-06,
1120
+ "loss": 1.6259,
1121
+ "step": 970
1122
+ },
1123
+ {
1124
+ "epoch": 0.22,
1125
+ "learning_rate": 4.9831576061979556e-06,
1126
+ "loss": 1.6124,
1127
+ "step": 980
1128
+ },
1129
+ {
1130
+ "epoch": 0.23,
1131
+ "learning_rate": 4.990528587782728e-06,
1132
+ "loss": 1.6569,
1133
+ "step": 990
1134
+ },
1135
+ {
1136
+ "epoch": 0.23,
1137
+ "learning_rate": 4.99782526385276e-06,
1138
+ "loss": 1.638,
1139
+ "step": 1000
1140
+ },
1141
+ {
1142
+ "epoch": 0.23,
1143
+ "eval_webgpt_accuracy": 0.5073254588442403,
1144
+ "eval_webgpt_loss": 2.140625,
1145
+ "eval_webgpt_runtime": 38.7594,
1146
+ "eval_webgpt_samples_per_second": 101.034,
1147
+ "eval_webgpt_steps_per_second": 1.264,
1148
+ "step": 1000
1149
+ },
1150
+ {
1151
+ "epoch": 0.23,
1152
+ "eval_prompt_dialogue_accuracy": 0.6310662447605156,
1153
+ "eval_prompt_dialogue_loss": 1.3232421875,
1154
+ "eval_prompt_dialogue_runtime": 75.506,
1155
+ "eval_prompt_dialogue_samples_per_second": 136.532,
1156
+ "eval_prompt_dialogue_steps_per_second": 1.708,
1157
+ "step": 1000
1158
+ },
1159
+ {
1160
+ "epoch": 0.23,
1161
+ "eval_adversarial_qa_accuracy": 0.8161278335191379,
1162
+ "eval_adversarial_qa_loss": 0.65185546875,
1163
+ "eval_adversarial_qa_runtime": 17.8135,
1164
+ "eval_adversarial_qa_samples_per_second": 168.411,
1165
+ "eval_adversarial_qa_steps_per_second": 2.133,
1166
+ "step": 1000
1167
+ },
1168
+ {
1169
+ "epoch": 0.23,
1170
+ "eval_xsum_accuracy": 0.6358828745144401,
1171
+ "eval_xsum_loss": 1.3759765625,
1172
+ "eval_xsum_runtime": 120.9036,
1173
+ "eval_xsum_samples_per_second": 93.728,
1174
+ "eval_xsum_steps_per_second": 1.174,
1175
+ "step": 1000
1176
+ },
1177
+ {
1178
+ "epoch": 0.23,
1179
+ "eval_cnn_dailymail_accuracy": 0.7034533296693231,
1180
+ "eval_cnn_dailymail_loss": NaN,
1181
+ "eval_cnn_dailymail_runtime": 143.0011,
1182
+ "eval_cnn_dailymail_samples_per_second": 93.482,
1183
+ "eval_cnn_dailymail_steps_per_second": 1.175,
1184
+ "step": 1000
1185
+ },
1186
+ {
1187
+ "epoch": 0.23,
1188
+ "eval_multi_news_accuracy": 0.58382374758914,
1189
+ "eval_multi_news_loss": NaN,
1190
+ "eval_multi_news_runtime": 62.8321,
1191
+ "eval_multi_news_samples_per_second": 89.477,
1192
+ "eval_multi_news_steps_per_second": 1.13,
1193
+ "step": 1000
1194
+ },
1195
+ {
1196
+ "epoch": 0.23,
1197
+ "eval_scitldr_accuracy": 0.495625,
1198
+ "eval_scitldr_loss": NaN,
1199
+ "eval_scitldr_runtime": 7.4344,
1200
+ "eval_scitldr_samples_per_second": 83.262,
1201
+ "eval_scitldr_steps_per_second": 1.076,
1202
+ "step": 1000
1203
+ },
1204
+ {
1205
+ "epoch": 0.23,
1206
+ "eval_joke_accuracy": 0.5290940106141016,
1207
+ "eval_joke_loss": 2.01953125,
1208
+ "eval_joke_runtime": 0.7811,
1209
+ "eval_joke_samples_per_second": 97.299,
1210
+ "eval_joke_steps_per_second": 1.28,
1211
+ "step": 1000
1212
+ },
1213
+ {
1214
+ "epoch": 0.23,
1215
+ "eval_gsm8k_accuracy": 0.7911056207535516,
1216
+ "eval_gsm8k_loss": 0.76318359375,
1217
+ "eval_gsm8k_runtime": 9.9142,
1218
+ "eval_gsm8k_samples_per_second": 133.041,
1219
+ "eval_gsm8k_steps_per_second": 1.715,
1220
+ "step": 1000
1221
+ },
1222
+ {
1223
+ "epoch": 0.23,
1224
+ "eval_math_qa_accuracy": 0.6306243424083842,
1225
+ "eval_math_qa_loss": 1.548828125,
1226
+ "eval_math_qa_runtime": 31.7722,
1227
+ "eval_math_qa_samples_per_second": 140.847,
1228
+ "eval_math_qa_steps_per_second": 1.763,
1229
+ "step": 1000
1230
+ },
1231
+ {
1232
+ "epoch": 0.23,
1233
+ "eval_essay_instruction_accuracy": 0.6082469097583089,
1234
+ "eval_essay_instruction_loss": 1.8671875,
1235
+ "eval_essay_instruction_runtime": 8.3983,
1236
+ "eval_essay_instruction_samples_per_second": 49.176,
1237
+ "eval_essay_instruction_steps_per_second": 0.714,
1238
+ "step": 1000
1239
+ },
1240
+ {
1241
+ "epoch": 0.23,
1242
+ "eval_tldr_news_accuracy": 0.6160980175274398,
1243
+ "eval_tldr_news_loss": 1.6416015625,
1244
+ "eval_tldr_news_runtime": 3.6256,
1245
+ "eval_tldr_news_samples_per_second": 218.998,
1246
+ "eval_tldr_news_steps_per_second": 2.758,
1247
+ "step": 1000
1248
+ },
1249
+ {
1250
+ "epoch": 0.23,
1251
+ "eval_reddit_eli5_accuracy": 0.4611581507518863,
1252
+ "eval_reddit_eli5_loss": 2.419921875,
1253
+ "eval_reddit_eli5_runtime": 107.6335,
1254
+ "eval_reddit_eli5_samples_per_second": 91.161,
1255
+ "eval_reddit_eli5_steps_per_second": 1.143,
1256
+ "step": 1000
1257
+ },
1258
+ {
1259
+ "epoch": 0.23,
1260
+ "eval_reddit_asks_accuracy": 0.4688479936563187,
1261
+ "eval_reddit_asks_loss": 2.408203125,
1262
+ "eval_reddit_asks_runtime": 31.0014,
1263
+ "eval_reddit_asks_samples_per_second": 73.577,
1264
+ "eval_reddit_asks_steps_per_second": 0.935,
1265
+ "step": 1000
1266
+ },
1267
+ {
1268
+ "epoch": 0.23,
1269
+ "eval_reddit_askh_accuracy": 0.4678089398518734,
1270
+ "eval_reddit_askh_loss": 2.505859375,
1271
+ "eval_reddit_askh_runtime": 58.2699,
1272
+ "eval_reddit_askh_samples_per_second": 84.109,
1273
+ "eval_reddit_askh_steps_per_second": 1.064,
1274
+ "step": 1000
1275
+ },
1276
+ {
1277
+ "epoch": 0.23,
1278
+ "eval_wmt2019_zh-en_accuracy": 0.6663175800777634,
1279
+ "eval_wmt2019_zh-en_loss": 1.4599609375,
1280
+ "eval_wmt2019_zh-en_runtime": 29.9768,
1281
+ "eval_wmt2019_zh-en_samples_per_second": 132.803,
1282
+ "eval_wmt2019_zh-en_steps_per_second": 1.668,
1283
+ "step": 1000
1284
+ },
1285
+ {
1286
+ "epoch": 0.23,
1287
+ "eval_wmt2019_fr-de_accuracy": 0.7535103098381767,
1288
+ "eval_wmt2019_fr-de_loss": 0.9736328125,
1289
+ "eval_wmt2019_fr-de_runtime": 10.3533,
1290
+ "eval_wmt2019_fr-de_samples_per_second": 146.04,
1291
+ "eval_wmt2019_fr-de_steps_per_second": 1.835,
1292
+ "step": 1000
1293
+ },
1294
+ {
1295
+ "epoch": 0.23,
1296
+ "eval_wmt2019_ru-en_accuracy": 0.7577544408610879,
1297
+ "eval_wmt2019_ru-en_loss": 0.93603515625,
1298
+ "eval_wmt2019_ru-en_runtime": 22.7659,
1299
+ "eval_wmt2019_ru-en_samples_per_second": 131.776,
1300
+ "eval_wmt2019_ru-en_steps_per_second": 1.669,
1301
+ "step": 1000
1302
+ },
1303
+ {
1304
+ "epoch": 0.23,
1305
+ "eval_wmt2019_de-en_accuracy": 0.7657056972240155,
1306
+ "eval_wmt2019_de-en_loss": 0.9150390625,
1307
+ "eval_wmt2019_de-en_runtime": 15.5528,
1308
+ "eval_wmt2019_de-en_samples_per_second": 192.762,
1309
+ "eval_wmt2019_de-en_steps_per_second": 2.443,
1310
+ "step": 1000
1311
+ },
1312
+ {
1313
+ "epoch": 0.23,
1314
+ "eval_ted_trans_de-ja_accuracy": 0.6747348791651755,
1315
+ "eval_ted_trans_de-ja_loss": 1.3994140625,
1316
+ "eval_ted_trans_de-ja_runtime": 8.3691,
1317
+ "eval_ted_trans_de-ja_samples_per_second": 85.792,
1318
+ "eval_ted_trans_de-ja_steps_per_second": 1.075,
1319
+ "step": 1000
1320
+ },
1321
+ {
1322
+ "epoch": 0.23,
1323
+ "eval_ted_trans_en-ja_accuracy": 0.6812527237431164,
1324
+ "eval_ted_trans_en-ja_loss": 1.3115234375,
1325
+ "eval_ted_trans_en-ja_runtime": 9.3233,
1326
+ "eval_ted_trans_en-ja_samples_per_second": 85.914,
1327
+ "eval_ted_trans_en-ja_steps_per_second": 1.18,
1328
+ "step": 1000
1329
+ },
1330
+ {
1331
+ "epoch": 0.23,
1332
+ "eval_ted_trans_en-hi_accuracy": 0.6843067779174763,
1333
+ "eval_ted_trans_en-hi_loss": 1.19921875,
1334
+ "eval_ted_trans_en-hi_runtime": 2.8696,
1335
+ "eval_ted_trans_en-hi_samples_per_second": 35.893,
1336
+ "eval_ted_trans_en-hi_steps_per_second": 0.697,
1337
+ "step": 1000
1338
+ },
1339
+ {
1340
+ "epoch": 0.23,
1341
+ "eval_ted_trans_en-es_accuracy": 0.7943555499559027,
1342
+ "eval_ted_trans_en-es_loss": 0.85009765625,
1343
+ "eval_ted_trans_en-es_runtime": 8.4213,
1344
+ "eval_ted_trans_en-es_samples_per_second": 98.085,
1345
+ "eval_ted_trans_en-es_steps_per_second": 1.306,
1346
+ "step": 1000
1347
+ },
1348
+ {
1349
+ "epoch": 0.23,
1350
+ "eval_private_tuning_accuracy": 0.6955408449271764,
1351
+ "eval_private_tuning_loss": 1.0966796875,
1352
+ "eval_private_tuning_runtime": 144.1155,
1353
+ "eval_private_tuning_samples_per_second": 146.952,
1354
+ "eval_private_tuning_steps_per_second": 1.839,
1355
+ "step": 1000
1356
+ },
1357
+ {
1358
+ "epoch": 0.23,
1359
+ "eval_samsum_accuracy": 0.651219275235597,
1360
+ "eval_samsum_loss": 1.2451171875,
1361
+ "eval_samsum_runtime": 10.3624,
1362
+ "eval_samsum_samples_per_second": 78.939,
1363
+ "eval_samsum_steps_per_second": 1.062,
1364
+ "step": 1000
1365
+ },
1366
+ {
1367
+ "epoch": 0.23,
1368
+ "eval_prosocial_dialogue_accuracy": 0.5541225644750905,
1369
+ "eval_prosocial_dialogue_loss": 1.671875,
1370
+ "eval_prosocial_dialogue_runtime": 49.1807,
1371
+ "eval_prosocial_dialogue_samples_per_second": 548.65,
1372
+ "eval_prosocial_dialogue_steps_per_second": 6.873,
1373
+ "step": 1000
1374
+ },
1375
+ {
1376
+ "epoch": 0.23,
1377
+ "eval_oa_translated_accuracy": 0.7297566303926173,
1378
+ "eval_oa_translated_loss": 1.0712890625,
1379
+ "eval_oa_translated_runtime": 57.6497,
1380
+ "eval_oa_translated_samples_per_second": 89.628,
1381
+ "eval_oa_translated_steps_per_second": 1.127,
1382
+ "step": 1000
1383
+ },
1384
+ {
1385
+ "epoch": 0.23,
1386
+ "eval_wikihow_accuracy": 0.6223200665649702,
1387
+ "eval_wikihow_loss": 1.73828125,
1388
+ "eval_wikihow_runtime": 16.8591,
1389
+ "eval_wikihow_samples_per_second": 136.009,
1390
+ "eval_wikihow_steps_per_second": 1.72,
1391
+ "step": 1000
1392
+ },
1393
+ {
1394
+ "epoch": 0.23,
1395
+ "eval_explain_prosocial_accuracy": 0.6887323092680878,
1396
+ "eval_explain_prosocial_loss": 1.2744140625,
1397
+ "eval_explain_prosocial_runtime": 111.2104,
1398
+ "eval_explain_prosocial_samples_per_second": 550.74,
1399
+ "eval_explain_prosocial_steps_per_second": 6.888,
1400
+ "step": 1000
1401
+ }
1402
+ ],
1403
+ "max_steps": 17452,
1404
+ "num_train_epochs": 4,
1405
+ "total_flos": 1.6761689779124306e+19,
1406
+ "trial_name": null,
1407
+ "trial_params": null
1408
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1da1a1b5b8776a68d6047a6ee67615efe815e751c08dce644a4cff1eb757bd1
3
+ size 4475