kejian committed on
Commit
ad44e12
1 Parent(s): 5fdcb10

Training in progress, step 1334

Browse files
checkpoint-1334/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.23.0",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
checkpoint-1334/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1334/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:979d1530027db31f9203c7933c3402262eb346b832d52933bc497bbd9b7b1eb5
3
+ size 995611909
checkpoint-1334/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18526a5d91cde5bfc2d5b87457cf22c4434382b1dffb1425c83b153e956c3c94
3
+ size 510401409
checkpoint-1334/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7c2abaeca8b39188ef4d024976e6b313f1edff37e845008546436359f70d704
3
+ size 15533
checkpoint-1334/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5970b634b76e24683de44b37ff55f568f86fe0760701f4b9ffa126dee06d439
3
+ size 557
checkpoint-1334/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef515473b5a52626fce1bd33c4fadf38c02a6a297eb95e59e62ff42672e042d
3
+ size 627
checkpoint-1334/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-1334/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1334/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
checkpoint-1334/trainer_state.json ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.49962546816479403,
5
+ "global_step": 1334,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 3.7037037037037037e-05,
13
+ "loss": 5.8178,
14
+ "theoretical_loss": 10.87642657795271,
15
+ "tokens_seen": 1048576
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 0.000991297767688233,
20
+ "loss": 3.8434,
21
+ "theoretical_loss": 5.240583117265738,
22
+ "tokens_seen": 52428800
23
+ },
24
+ {
25
+ "epoch": 0.04,
26
+ "learning_rate": 0.000972379871358305,
27
+ "loss": 3.2483,
28
+ "theoretical_loss": 4.741048233458233,
29
+ "tokens_seen": 104857600
30
+ },
31
+ {
32
+ "epoch": 0.06,
33
+ "learning_rate": 0.0009534619750283768,
34
+ "loss": 3.0804,
35
+ "theoretical_loss": 4.490755246681026,
36
+ "tokens_seen": 157286400
37
+ },
38
+ {
39
+ "epoch": 0.06,
40
+ "objective/train/advantage_avg": -0.008124944753944874,
41
+ "objective/train/docs_used": 104000,
42
+ "objective/train/instantaneous_batch_size": 32,
43
+ "objective/train/instantaneous_microbatch_size": 32768,
44
+ "objective/train/lm_loss": 5.698267936706543,
45
+ "objective/train/original_loss": 5.698267936706543,
46
+ "objective/train/theoretical_loss": 4.467094755136979,
47
+ "objective/train/tokens_used": 184300000,
48
+ "objective/train/value_avg": -0.0197296142578125,
49
+ "objective/train/value_loss": 0.00596056692302227,
50
+ "objective/train/value_max": -0.0023326873779296875,
51
+ "objective/train/value_min": -0.1529541015625,
52
+ "objective/train/value_reward_corr": 0.23002298967484386,
53
+ "objective/train/value_std": 0.01715087890625,
54
+ "objective/train/weight_avg": 0.9992169737815857,
55
+ "objective/train/weighted_lm_loss": 5.693380355834961,
56
+ "objective/train/weights_max": 1.01529061794281,
57
+ "objective/train/weights_min": 0.9343342781066895,
58
+ "theoretical_loss": 4.467094755136979,
59
+ "tokens_seen": 163840000
60
+ },
61
+ {
62
+ "epoch": 0.07,
63
+ "learning_rate": 0.0009345440786984487,
64
+ "loss": 2.8679,
65
+ "theoretical_loss": 4.3296357903425715,
66
+ "tokens_seen": 209715200
67
+ },
68
+ {
69
+ "epoch": 0.09,
70
+ "learning_rate": 0.0009156261823685207,
71
+ "loss": 2.7211,
72
+ "theoretical_loss": 4.213299841239684,
73
+ "tokens_seen": 262144000
74
+ },
75
+ {
76
+ "epoch": 0.11,
77
+ "learning_rate": 0.0008967082860385925,
78
+ "loss": 2.5736,
79
+ "theoretical_loss": 4.123496734747793,
80
+ "tokens_seen": 314572800
81
+ },
82
+ {
83
+ "debugging/Self-BLEU-5": 0.5365128506817183,
84
+ "debugging/distinct-1-grams": 0.7612814402327299,
85
+ "debugging/distinct-2-grams": 0.9694583753853511,
86
+ "debugging/entropy-1-grams": 6.003629944255698,
87
+ "debugging/entropy-2-grams": 7.054987089269872,
88
+ "debugging/length": 495.25,
89
+ "debugging/num_segments": 16,
90
+ "debugging/raw_token_scores_avg": 0.04385810345411301,
91
+ "debugging/raw_token_scores_std": 0.15687797963619232,
92
+ "epoch": 0.12,
93
+ "objective/train/advantage_avg": -0.026558605954051018,
94
+ "objective/train/docs_used": 197327,
95
+ "objective/train/instantaneous_batch_size": 32,
96
+ "objective/train/instantaneous_microbatch_size": 32768,
97
+ "objective/train/lm_loss": 4.860468864440918,
98
+ "objective/train/original_loss": 4.860468864440918,
99
+ "objective/train/theoretical_loss": 4.10401016644798,
100
+ "objective/train/tokens_used": 348140000,
101
+ "objective/train/value_avg": -0.01727294921875,
102
+ "objective/train/value_loss": 0.02569347620010376,
103
+ "objective/train/value_max": -0.0023593902587890625,
104
+ "objective/train/value_min": -0.274169921875,
105
+ "objective/train/value_reward_corr": -0.0460843086754045,
106
+ "objective/train/value_std": 0.01389312744140625,
107
+ "objective/train/weight_avg": 0.9974696040153503,
108
+ "objective/train/weighted_lm_loss": 4.853564262390137,
109
+ "objective/train/weights_max": 1.0277366638183594,
110
+ "objective/train/weights_min": 0.9056559801101685,
111
+ "theoretical_loss": 4.10401016644798,
112
+ "tokens_seen": 327680000
113
+ },
114
+ {
115
+ "epoch": 0.13,
116
+ "learning_rate": 0.0008777903897086645,
117
+ "loss": 2.4856,
118
+ "theoretical_loss": 4.051065245936996,
119
+ "tokens_seen": 367001600
120
+ },
121
+ {
122
+ "epoch": 0.15,
123
+ "learning_rate": 0.0008588724933787363,
124
+ "loss": 2.4156,
125
+ "theoretical_loss": 3.9908001978004064,
126
+ "tokens_seen": 419430400
127
+ },
128
+ {
129
+ "epoch": 0.17,
130
+ "learning_rate": 0.0008399545970488081,
131
+ "loss": 2.3438,
132
+ "theoretical_loss": 3.939481097700623,
133
+ "tokens_seen": 471859200
134
+ },
135
+ {
136
+ "epoch": 0.18,
137
+ "objective/train/advantage_avg": 0.01378590613603592,
138
+ "objective/train/docs_used": 287192,
139
+ "objective/train/instantaneous_batch_size": 32,
140
+ "objective/train/instantaneous_microbatch_size": 32768,
141
+ "objective/train/lm_loss": 4.726146221160889,
142
+ "objective/train/original_loss": 4.726146697998047,
143
+ "objective/train/theoretical_loss": 3.9220858822757396,
144
+ "objective/train/tokens_used": 511980000,
145
+ "objective/train/value_avg": -0.0184326171875,
146
+ "objective/train/value_loss": 0.0008915589423850179,
147
+ "objective/train/value_max": -0.0024433135986328125,
148
+ "objective/train/value_min": -0.169189453125,
149
+ "objective/train/value_reward_corr": 0.034202289497960975,
150
+ "objective/train/value_std": 0.01378631591796875,
151
+ "objective/train/weight_avg": 1.0013829469680786,
152
+ "objective/train/weighted_lm_loss": 4.732492446899414,
153
+ "objective/train/weights_max": 1.0164865255355835,
154
+ "objective/train/weights_min": 0.913129985332489,
155
+ "theoretical_loss": 3.9220858822757396,
156
+ "tokens_seen": 491520000
157
+ },
158
+ {
159
+ "epoch": 0.19,
160
+ "learning_rate": 0.0008210367007188801,
161
+ "loss": 2.2548,
162
+ "theoretical_loss": 3.8949869551339704,
163
+ "tokens_seen": 524288000
164
+ },
165
+ {
166
+ "epoch": 0.21,
167
+ "learning_rate": 0.000802118804388952,
168
+ "loss": 2.197,
169
+ "theoretical_loss": 3.855852403938689,
170
+ "tokens_seen": 576716800
171
+ },
172
+ {
173
+ "epoch": 0.22,
174
+ "learning_rate": 0.0007832009080590239,
175
+ "loss": 2.1036,
176
+ "theoretical_loss": 3.8210259233045254,
177
+ "tokens_seen": 629145600
178
+ },
179
+ {
180
+ "debugging/Self-BLEU-5": 0.5265375629586004,
181
+ "debugging/distinct-1-grams": 0.7435820408094715,
182
+ "debugging/distinct-2-grams": 0.9558103821233092,
183
+ "debugging/entropy-1-grams": 5.931434510687563,
184
+ "debugging/entropy-2-grams": 6.886416755326388,
185
+ "debugging/length": 521.9230769230769,
186
+ "debugging/num_segments": 13,
187
+ "debugging/raw_token_scores_avg": 0.022742915898561478,
188
+ "debugging/raw_token_scores_std": 0.07841178774833679,
189
+ "epoch": 0.23,
190
+ "objective/train/advantage_avg": 0.008544832468032837,
191
+ "objective/train/docs_used": 379091,
192
+ "objective/train/instantaneous_batch_size": 32,
193
+ "objective/train/instantaneous_microbatch_size": 32768,
194
+ "objective/train/lm_loss": 4.2809929847717285,
195
+ "objective/train/original_loss": 4.280993461608887,
196
+ "objective/train/theoretical_loss": 3.804976960695429,
197
+ "objective/train/tokens_used": 675820000,
198
+ "objective/train/value_avg": -0.031280517578125,
199
+ "objective/train/value_loss": 0.004552943632006645,
200
+ "objective/train/value_max": -0.0027370452880859375,
201
+ "objective/train/value_min": -0.306396484375,
202
+ "objective/train/value_reward_corr": 0.5422745268556184,
203
+ "objective/train/value_std": 0.03082275390625,
204
+ "objective/train/weight_avg": 1.000877022743225,
205
+ "objective/train/weighted_lm_loss": 4.284684181213379,
206
+ "objective/train/weights_max": 1.026648759841919,
207
+ "objective/train/weights_min": 0.9131191968917847,
208
+ "theoretical_loss": 3.804976960695429,
209
+ "tokens_seen": 655360000
210
+ },
211
+ {
212
+ "epoch": 0.24,
213
+ "learning_rate": 0.0007642830117290957,
214
+ "loss": 2.0198,
215
+ "theoretical_loss": 3.7897293654583164,
216
+ "tokens_seen": 681574400
217
+ },
218
+ {
219
+ "epoch": 0.26,
220
+ "learning_rate": 0.0007453651153991677,
221
+ "loss": 1.9114,
222
+ "theoretical_loss": 3.7613719997526367,
223
+ "tokens_seen": 734003200
224
+ },
225
+ {
226
+ "epoch": 0.28,
227
+ "learning_rate": 0.0007264472190692395,
228
+ "loss": 1.8379,
229
+ "theoretical_loss": 3.735495625147548,
230
+ "tokens_seen": 786432000
231
+ },
232
+ {
233
+ "epoch": 0.29,
234
+ "objective/train/advantage_avg": 0.005197666119784117,
235
+ "objective/train/docs_used": 471128,
236
+ "objective/train/instantaneous_batch_size": 32,
237
+ "objective/train/instantaneous_microbatch_size": 32768,
238
+ "objective/train/lm_loss": 3.469724655151367,
239
+ "objective/train/original_loss": 3.469724655151367,
240
+ "objective/train/theoretical_loss": 3.7204187214233073,
241
+ "objective/train/tokens_used": 839660000,
242
+ "objective/train/value_avg": -0.01427459716796875,
243
+ "objective/train/value_loss": 0.0032349335961043835,
244
+ "objective/train/value_max": -0.0009217262268066406,
245
+ "objective/train/value_min": -0.329833984375,
246
+ "objective/train/value_reward_corr": 0.23257723024354376,
247
+ "objective/train/value_std": 0.011383056640625,
248
+ "objective/train/weight_avg": 1.000535488128662,
249
+ "objective/train/weighted_lm_loss": 3.472010374069214,
250
+ "objective/train/weights_max": 1.0184398889541626,
251
+ "objective/train/weights_min": 0.9076024293899536,
252
+ "theoretical_loss": 3.7204187214233073,
253
+ "tokens_seen": 819200000
254
+ },
255
+ {
256
+ "epoch": 0.3,
257
+ "learning_rate": 0.0007075293227393113,
258
+ "loss": 1.7735,
259
+ "theoretical_loss": 3.7117382474521436,
260
+ "tokens_seen": 838860800
261
+ },
262
+ {
263
+ "epoch": 0.32,
264
+ "learning_rate": 0.0006886114264093834,
265
+ "loss": 1.7242,
266
+ "theoretical_loss": 3.689809300987042,
267
+ "tokens_seen": 891289600
268
+ },
269
+ {
270
+ "epoch": 0.34,
271
+ "learning_rate": 0.0006696935300794552,
272
+ "loss": 1.6889,
273
+ "theoretical_loss": 3.6694722975957066,
274
+ "tokens_seen": 943718400
275
+ },
276
+ {
277
+ "debugging/Self-BLEU-5": 0.4286046663919377,
278
+ "debugging/distinct-1-grams": 0.8147567798871364,
279
+ "debugging/distinct-2-grams": 0.9823269374342457,
280
+ "debugging/entropy-1-grams": 6.1671920556004824,
281
+ "debugging/entropy-2-grams": 6.947028138756313,
282
+ "debugging/length": 477.53333333333336,
283
+ "debugging/num_segments": 15,
284
+ "debugging/raw_token_scores_avg": 0.020611366257071495,
285
+ "debugging/raw_token_scores_std": 0.08496682345867157,
286
+ "epoch": 0.35,
287
+ "objective/train/advantage_avg": -0.002937063341960311,
288
+ "objective/train/docs_used": 560408,
289
+ "objective/train/instantaneous_batch_size": 32,
290
+ "objective/train/instantaneous_microbatch_size": 32768,
291
+ "objective/train/lm_loss": 3.587904214859009,
292
+ "objective/train/original_loss": 3.5879039764404297,
293
+ "objective/train/theoretical_loss": 3.6551457544283386,
294
+ "objective/train/tokens_used": 1003500000,
295
+ "objective/train/value_avg": -0.0176849365234375,
296
+ "objective/train/value_loss": 0.006396747659891844,
297
+ "objective/train/value_max": -0.0007319450378417969,
298
+ "objective/train/value_min": -0.62451171875,
299
+ "objective/train/value_reward_corr": 0.3547212443715962,
300
+ "objective/train/value_std": 0.0216827392578125,
301
+ "objective/train/weight_avg": 0.9997376203536987,
302
+ "objective/train/weighted_lm_loss": 3.5868186950683594,
303
+ "objective/train/weights_max": 1.0237751007080078,
304
+ "objective/train/weights_min": 0.9077049493789673,
305
+ "theoretical_loss": 3.6551457544283386,
306
+ "tokens_seen": 983040000
307
+ },
308
+ {
309
+ "epoch": 0.36,
310
+ "learning_rate": 0.0006507756337495271,
311
+ "loss": 1.7056,
312
+ "theoretical_loss": 3.6505323968108674,
313
+ "tokens_seen": 996147200
314
+ },
315
+ {
316
+ "epoch": 0.37,
317
+ "learning_rate": 0.000631857737419599,
318
+ "loss": 1.6782,
319
+ "theoretical_loss": 3.632827321456789,
320
+ "tokens_seen": 1048576000
321
+ },
322
+ {
323
+ "epoch": 0.39,
324
+ "learning_rate": 0.0006129398410896708,
325
+ "loss": 1.6393,
326
+ "theoretical_loss": 3.616220599546101,
327
+ "tokens_seen": 1101004800
328
+ },
329
+ {
330
+ "epoch": 0.41,
331
+ "objective/train/advantage_avg": -0.004092915914952755,
332
+ "objective/train/docs_used": 649861,
333
+ "objective/train/instantaneous_batch_size": 32,
334
+ "objective/train/instantaneous_microbatch_size": 32768,
335
+ "objective/train/lm_loss": 3.148904323577881,
336
+ "objective/train/original_loss": 3.148904323577881,
337
+ "objective/train/theoretical_loss": 3.6024992663141386,
338
+ "objective/train/tokens_used": 1167340000,
339
+ "objective/train/value_avg": -0.0187835693359375,
340
+ "objective/train/value_loss": 0.007057450246065855,
341
+ "objective/train/value_max": -0.0004076957702636719,
342
+ "objective/train/value_min": -0.76025390625,
343
+ "objective/train/value_reward_corr": 0.5218380949829392,
344
+ "objective/train/value_std": 0.02960205078125,
345
+ "objective/train/weight_avg": 0.9996253848075867,
346
+ "objective/train/weighted_lm_loss": 3.1456782817840576,
347
+ "objective/train/weights_max": 1.0473830699920654,
348
+ "objective/train/weights_min": 0.9076167345046997,
349
+ "theoretical_loss": 3.6024992663141386,
350
+ "tokens_seen": 1146880000
351
+ },
352
+ {
353
+ "epoch": 0.41,
354
+ "learning_rate": 0.0005940219447597427,
355
+ "loss": 1.6315,
356
+ "theoretical_loss": 3.6005964566275575,
357
+ "tokens_seen": 1153433600
358
+ },
359
+ {
360
+ "epoch": 0.43,
361
+ "learning_rate": 0.0005751040484298145,
362
+ "loss": 1.6112,
363
+ "theoretical_loss": 3.585855900316411,
364
+ "tokens_seen": 1205862400
365
+ },
366
+ {
367
+ "epoch": 0.45,
368
+ "learning_rate": 0.0005561861520998866,
369
+ "loss": 1.5989,
370
+ "theoretical_loss": 3.571913680014217,
371
+ "tokens_seen": 1258291200
372
+ },
373
+ {
374
+ "debugging/Self-BLEU-5": 0.49020908264157476,
375
+ "debugging/distinct-1-grams": 0.768901113497886,
376
+ "debugging/distinct-2-grams": 0.9428782333551957,
377
+ "debugging/entropy-1-grams": 6.085999550681761,
378
+ "debugging/entropy-2-grams": 7.0033060167714964,
379
+ "debugging/length": 490.2352941176471,
380
+ "debugging/num_segments": 17,
381
+ "debugging/raw_token_scores_avg": 0.02056093141436577,
382
+ "debugging/raw_token_scores_std": 0.10981010645627975,
383
+ "epoch": 0.47,
384
+ "objective/train/advantage_avg": -0.0010009908583015203,
385
+ "objective/train/docs_used": 741674,
386
+ "objective/train/instantaneous_batch_size": 32,
387
+ "objective/train/instantaneous_microbatch_size": 32768,
388
+ "objective/train/lm_loss": 3.083587408065796,
389
+ "objective/train/original_loss": 3.083587646484375,
390
+ "objective/train/theoretical_loss": 3.5586958985729016,
391
+ "objective/train/tokens_used": 1331180000,
392
+ "objective/train/value_avg": -0.0195770263671875,
393
+ "objective/train/value_loss": 0.007854425348341465,
394
+ "objective/train/value_max": -0.0004239082336425781,
395
+ "objective/train/value_min": -0.97705078125,
396
+ "objective/train/value_reward_corr": 0.5932239490084845,
397
+ "objective/train/value_std": 0.0595703125,
398
+ "objective/train/weight_avg": 0.9999384880065918,
399
+ "objective/train/weighted_lm_loss": 3.082087516784668,
400
+ "objective/train/weights_max": 1.0919088125228882,
401
+ "objective/train/weights_min": 0.9060803055763245,
402
+ "theoretical_loss": 3.5586958985729016,
403
+ "tokens_seen": 1310720000
404
+ },
405
+ {
406
+ "epoch": 0.47,
407
+ "learning_rate": 0.0005372682557699584,
408
+ "loss": 1.5642,
409
+ "theoretical_loss": 3.5586958985729016,
410
+ "tokens_seen": 1310720000
411
+ },
412
+ {
413
+ "epoch": 0.49,
414
+ "learning_rate": 0.0005183503594400303,
415
+ "loss": 1.5616,
416
+ "theoretical_loss": 3.5461381161006846,
417
+ "tokens_seen": 1363148800
418
+ }
419
+ ],
420
+ "max_steps": 2670,
421
+ "num_train_epochs": 9223372036854775807,
422
+ "total_flos": 7.138646384411935e+17,
423
+ "trial_name": null,
424
+ "trial_params": null
425
+ }
checkpoint-1334/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06649e6d12b7a4cd09a0a8cebe5d368d597d5443538b90e48182d55ad8737e4e
3
+ size 3451
checkpoint-1334/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.23.0",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18526a5d91cde5bfc2d5b87457cf22c4434382b1dffb1425c83b153e956c3c94
3
+ size 510401409
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06649e6d12b7a4cd09a0a8cebe5d368d597d5443538b90e48182d55ad8737e4e
3
+ size 3451
vocab.json ADDED
The diff for this file is too large to render. See raw diff