galthran commited on
Commit
8c5617f
1 Parent(s): c40ac9e

model with metrics

Browse files
config.json ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "5f009df2186c5841582ecd419409b56041180144",
3
+ "_name_or_path": "facebook/maskformer-swin-large-coco",
4
+ "architectures": [
5
+ "MaskFormerForInstanceSegmentation"
6
+ ],
7
+ "backbone_config": {
8
+ "_name_or_path": "",
9
+ "add_cross_attention": false,
10
+ "architectures": null,
11
+ "attention_probs_dropout_prob": 0.0,
12
+ "bad_words_ids": null,
13
+ "begin_suppress_tokens": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "decoder_start_token_id": null,
18
+ "depths": [
19
+ 2,
20
+ 2,
21
+ 18,
22
+ 2
23
+ ],
24
+ "diversity_penalty": 0.0,
25
+ "do_sample": false,
26
+ "drop_path_rate": 0.3,
27
+ "early_stopping": false,
28
+ "embed_dim": 192,
29
+ "encoder_no_repeat_ngram_size": 0,
30
+ "encoder_stride": 32,
31
+ "eos_token_id": null,
32
+ "exponential_decay_length_penalty": null,
33
+ "finetuning_task": null,
34
+ "forced_bos_token_id": null,
35
+ "forced_eos_token_id": null,
36
+ "hidden_act": "gelu",
37
+ "hidden_dropout_prob": 0.0,
38
+ "hidden_size": 1536,
39
+ "id2label": {
40
+ "0": "LABEL_0",
41
+ "1": "LABEL_1"
42
+ },
43
+ "image_size": 384,
44
+ "in_channels": 3,
45
+ "initializer_range": 0.02,
46
+ "is_decoder": false,
47
+ "is_encoder_decoder": false,
48
+ "label2id": {
49
+ "LABEL_0": 0,
50
+ "LABEL_1": 1
51
+ },
52
+ "layer_norm_eps": 1e-05,
53
+ "length_penalty": 1.0,
54
+ "max_length": 20,
55
+ "min_length": 0,
56
+ "mlp_ratio": 4.0,
57
+ "model_type": "swin",
58
+ "no_repeat_ngram_size": 0,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_channels": 3,
62
+ "num_heads": [
63
+ 6,
64
+ 12,
65
+ 24,
66
+ 48
67
+ ],
68
+ "num_layers": 4,
69
+ "num_return_sequences": 1,
70
+ "out_features": null,
71
+ "output_attentions": false,
72
+ "output_hidden_states": false,
73
+ "output_scores": false,
74
+ "pad_token_id": null,
75
+ "patch_size": 4,
76
+ "path_norm": true,
77
+ "prefix": null,
78
+ "pretrain_img_size": 384,
79
+ "problem_type": null,
80
+ "pruned_heads": {},
81
+ "qkv_bias": true,
82
+ "remove_invalid_values": false,
83
+ "repetition_penalty": 1.0,
84
+ "return_dict": true,
85
+ "return_dict_in_generate": false,
86
+ "sep_token_id": null,
87
+ "stage_names": [
88
+ "stem",
89
+ "stage1",
90
+ "stage2",
91
+ "stage3",
92
+ "stage4"
93
+ ],
94
+ "suppress_tokens": null,
95
+ "task_specific_params": null,
96
+ "temperature": 1.0,
97
+ "tf_legacy_loss": false,
98
+ "tie_encoder_decoder": false,
99
+ "tie_word_embeddings": true,
100
+ "tokenizer_class": null,
101
+ "top_k": 50,
102
+ "top_p": 1.0,
103
+ "torch_dtype": null,
104
+ "torchscript": false,
105
+ "transformers_version": "4.27.2",
106
+ "typical_p": 1.0,
107
+ "use_absolute_embeddings": false,
108
+ "use_bfloat16": false,
109
+ "window_size": 12
110
+ },
111
+ "ce_weight": 1.0,
112
+ "cross_entropy_weight": 1.0,
113
+ "decoder_config": {
114
+ "_commit_hash": null,
115
+ "_name_or_path": "",
116
+ "activation_dropout": 0.0,
117
+ "activation_function": "relu",
118
+ "add_cross_attention": false,
119
+ "architectures": null,
120
+ "attention_dropout": 0.0,
121
+ "auxiliary_loss": false,
122
+ "backbone": "resnet50",
123
+ "backbone_config": null,
124
+ "bad_words_ids": null,
125
+ "bbox_cost": 5,
126
+ "bbox_loss_coefficient": 5,
127
+ "begin_suppress_tokens": null,
128
+ "bos_token_id": null,
129
+ "chunk_size_feed_forward": 0,
130
+ "class_cost": 1,
131
+ "cross_attention_hidden_size": null,
132
+ "d_model": 256,
133
+ "decoder_attention_heads": 8,
134
+ "decoder_ffn_dim": 2048,
135
+ "decoder_layerdrop": 0.0,
136
+ "decoder_layers": 6,
137
+ "decoder_start_token_id": null,
138
+ "dice_loss_coefficient": 1,
139
+ "dilation": false,
140
+ "diversity_penalty": 0.0,
141
+ "do_sample": false,
142
+ "dropout": 0.1,
143
+ "early_stopping": false,
144
+ "encoder_attention_heads": 8,
145
+ "encoder_ffn_dim": 2048,
146
+ "encoder_layerdrop": 0.0,
147
+ "encoder_layers": 6,
148
+ "encoder_no_repeat_ngram_size": 0,
149
+ "eos_coefficient": 0.1,
150
+ "eos_token_id": null,
151
+ "exponential_decay_length_penalty": null,
152
+ "finetuning_task": null,
153
+ "forced_bos_token_id": null,
154
+ "forced_eos_token_id": null,
155
+ "giou_cost": 2,
156
+ "giou_loss_coefficient": 2,
157
+ "id2label": {
158
+ "0": "LABEL_0",
159
+ "1": "LABEL_1"
160
+ },
161
+ "init_std": 0.02,
162
+ "init_xavier_std": 1.0,
163
+ "is_decoder": false,
164
+ "is_encoder_decoder": true,
165
+ "label2id": {
166
+ "LABEL_0": 0,
167
+ "LABEL_1": 1
168
+ },
169
+ "length_penalty": 1.0,
170
+ "mask_loss_coefficient": 1,
171
+ "max_length": 20,
172
+ "max_position_embeddings": 1024,
173
+ "min_length": 0,
174
+ "model_type": "detr",
175
+ "no_repeat_ngram_size": 0,
176
+ "num_beam_groups": 1,
177
+ "num_beams": 1,
178
+ "num_channels": 3,
179
+ "num_hidden_layers": 6,
180
+ "num_queries": 100,
181
+ "num_return_sequences": 1,
182
+ "output_attentions": false,
183
+ "output_hidden_states": false,
184
+ "output_scores": false,
185
+ "pad_token_id": null,
186
+ "position_embedding_type": "sine",
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "scale_embedding": false,
195
+ "sep_token_id": null,
196
+ "suppress_tokens": null,
197
+ "task_specific_params": null,
198
+ "temperature": 1.0,
199
+ "tf_legacy_loss": false,
200
+ "tie_encoder_decoder": false,
201
+ "tie_word_embeddings": true,
202
+ "tokenizer_class": null,
203
+ "top_k": 50,
204
+ "top_p": 1.0,
205
+ "torch_dtype": null,
206
+ "torchscript": false,
207
+ "transformers_version": "4.17.0.dev0",
208
+ "typical_p": 1.0,
209
+ "use_bfloat16": false,
210
+ "use_pretrained_backbone": true,
211
+ "use_timm_backbone": true
212
+ },
213
+ "dice_weight": 1.0,
214
+ "fpn_feature_size": 256,
215
+ "id2label": {
216
+ "0": "background",
217
+ "1": "building",
218
+ "2": "door",
219
+ "3": "window"
220
+ },
221
+ "init_std": 0.02,
222
+ "init_xavier_std": 1.0,
223
+ "label2id": {
224
+ "airplane": 4,
225
+ "apple": 47,
226
+ "backpack": 24,
227
+ "banana": 46,
228
+ "banner": 80,
229
+ "baseball bat": 34,
230
+ "baseball glove": 35,
231
+ "bear": 21,
232
+ "bed": 59,
233
+ "bench": 13,
234
+ "bicycle": 1,
235
+ "bird": 14,
236
+ "blanket": 81,
237
+ "boat": 8,
238
+ "book": 73,
239
+ "bottle": 39,
240
+ "bowl": 45,
241
+ "bridge": 82,
242
+ "broccoli": 50,
243
+ "building-other-merged": 129,
244
+ "bus": 5,
245
+ "cabinet-merged": 120,
246
+ "cake": 55,
247
+ "car": 2,
248
+ "cardboard": 83,
249
+ "carrot": 51,
250
+ "cat": 15,
251
+ "ceiling-merged": 118,
252
+ "cell phone": 67,
253
+ "chair": 56,
254
+ "clock": 74,
255
+ "couch": 57,
256
+ "counter": 84,
257
+ "cow": 19,
258
+ "cup": 41,
259
+ "curtain": 85,
260
+ "dining table": 60,
261
+ "dirt-merged": 126,
262
+ "dog": 16,
263
+ "donut": 54,
264
+ "door-stuff": 86,
265
+ "elephant": 20,
266
+ "fence-merged": 117,
267
+ "fire hydrant": 10,
268
+ "floor-other-merged": 122,
269
+ "floor-wood": 87,
270
+ "flower": 88,
271
+ "food-other-merged": 128,
272
+ "fork": 42,
273
+ "frisbee": 29,
274
+ "fruit": 89,
275
+ "giraffe": 23,
276
+ "grass-merged": 125,
277
+ "gravel": 90,
278
+ "hair drier": 78,
279
+ "handbag": 26,
280
+ "horse": 17,
281
+ "hot dog": 52,
282
+ "house": 91,
283
+ "keyboard": 66,
284
+ "kite": 33,
285
+ "knife": 43,
286
+ "laptop": 63,
287
+ "light": 92,
288
+ "microwave": 68,
289
+ "mirror-stuff": 93,
290
+ "motorcycle": 3,
291
+ "mountain-merged": 124,
292
+ "mouse": 64,
293
+ "net": 94,
294
+ "orange": 49,
295
+ "oven": 69,
296
+ "paper-merged": 127,
297
+ "parking meter": 12,
298
+ "pavement-merged": 123,
299
+ "person": 0,
300
+ "pillow": 95,
301
+ "pizza": 53,
302
+ "platform": 96,
303
+ "playingfield": 97,
304
+ "potted plant": 58,
305
+ "railroad": 98,
306
+ "refrigerator": 72,
307
+ "remote": 65,
308
+ "river": 99,
309
+ "road": 100,
310
+ "rock-merged": 130,
311
+ "roof": 101,
312
+ "rug-merged": 132,
313
+ "sand": 102,
314
+ "sandwich": 48,
315
+ "scissors": 76,
316
+ "sea": 103,
317
+ "sheep": 18,
318
+ "shelf": 104,
319
+ "sink": 71,
320
+ "skateboard": 36,
321
+ "skis": 30,
322
+ "sky-other-merged": 119,
323
+ "snow": 105,
324
+ "snowboard": 31,
325
+ "spoon": 44,
326
+ "sports ball": 32,
327
+ "stairs": 106,
328
+ "stop sign": 11,
329
+ "suitcase": 28,
330
+ "surfboard": 37,
331
+ "table-merged": 121,
332
+ "teddy bear": 77,
333
+ "tennis racket": 38,
334
+ "tent": 107,
335
+ "tie": 27,
336
+ "toaster": 70,
337
+ "toilet": 61,
338
+ "toothbrush": 79,
339
+ "towel": 108,
340
+ "traffic light": 9,
341
+ "train": 6,
342
+ "tree-merged": 116,
343
+ "truck": 7,
344
+ "tv": 62,
345
+ "umbrella": 25,
346
+ "vase": 75,
347
+ "wall-brick": 109,
348
+ "wall-other-merged": 131,
349
+ "wall-stone": 110,
350
+ "wall-tile": 111,
351
+ "wall-wood": 112,
352
+ "water-other": 113,
353
+ "window-blind": 114,
354
+ "window-other": 115,
355
+ "wine glass": 40,
356
+ "zebra": 22
357
+ },
358
+ "mask_feature_size": 256,
359
+ "mask_weight": 20.0,
360
+ "model_type": "maskformer",
361
+ "no_object_weight": 0.1,
362
+ "num_attention_heads": 8,
363
+ "num_hidden_layers": 6,
364
+ "num_queries": 100,
365
+ "output_auxiliary_logits": null,
366
+ "torch_dtype": "float32",
367
+ "transformers_version": null,
368
+ "use_auxiliary_loss": false
369
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ee9972a13f0e672a5864dc6f42216529ca4fbb61a273d4014ae0c5cca88d3a1
3
+ size 1692669047
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59453a3e33287fd99b99fb746a23eaaacc7b92f2f676fba2d0a977189ee4dcfc
3
+ size 850389869
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a7adfe4697a47676df74267a3bab26c9f2c4ad4bbb113fff6e189c821c57eb1
3
+ size 14575
runs/Mar30_02-42-58_leo-pc/1680133386.6283898/events.out.tfevents.1680133386.leo-pc.3561.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfcab2bf044f21c8508678e394d1e4ed54357379aa1480ba4341a9436d1cbfc2
3
+ size 5810
runs/Mar30_02-42-58_leo-pc/events.out.tfevents.1680133386.leo-pc.3561.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9041c413d9d8e9cb9af9e8d1c48716613f648d4aa2adc89dc3f921c78b69482
3
+ size 11987
runs/Mar30_02-45-58_leo-pc/1680133560.1533623/events.out.tfevents.1680133560.leo-pc.3714.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e9cfffef788135d0e51a5067e8b789a74ca87b7c4f2d163cba2b2771c1e32b1
3
+ size 5810
runs/Mar30_02-45-58_leo-pc/events.out.tfevents.1680133560.leo-pc.3714.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71a392e713cf41578178d5e9dbcee7ff255e791922e9447a05837240fea40a2b
3
+ size 39599
runs/Mar30_11-29-54_leo-pc/1680165002.3051672/events.out.tfevents.1680165002.leo-pc.6504.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3504a77d3c8d5f8185f2547773487d989a0be9f2db27a0fd4c27691c9defdb7e
3
+ size 5810
runs/Mar30_11-29-54_leo-pc/events.out.tfevents.1680165002.leo-pc.6504.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9519343923cedc988c8c5ea4129195c8a052998028d6022f2dc7dea8d3e19df5
3
+ size 12148
runs/Mar31_02-25-47_leo-pc/1680218750.7461765/events.out.tfevents.1680218750.leo-pc.11713.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ce25e0f8bfd4b077904182104819d4371f1249fa7d8fc1e145cfd440fe8ec46
3
+ size 5810
runs/Mar31_02-25-47_leo-pc/events.out.tfevents.1680218750.leo-pc.11713.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37663bd2ea25ca96c8eb963bcb012e4792815ac390620ed8b22968a19645bfc0
3
+ size 12141
runs/Mar31_02-31-05_leo-pc/1680219069.4820511/events.out.tfevents.1680219069.leo-pc.12743.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90bc8d1d1345c59271c97d4c2fbf19d7a4a284a2380dfce0e6035785c681bcf4
3
+ size 5810
runs/Mar31_02-31-05_leo-pc/events.out.tfevents.1680219069.leo-pc.12743.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a229f78251c653da05f3ce90b58230f8b7d9a943c4ec4517e27db661c435f3c
3
+ size 11987
runs/Mar31_02-37-05_leo-pc/1680219429.14518/events.out.tfevents.1680219429.leo-pc.13564.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f100f9bac094f1450e919dc41750b0710c325b85646175279f0386d4390fbea
3
+ size 5810
runs/Mar31_02-37-05_leo-pc/events.out.tfevents.1680219429.leo-pc.13564.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21730403b5e51be71fc1fd710040c3f241f7f76ab6e0cc93c0f3dee8a732f19a
3
+ size 98037
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c8d47ab8f6a5a6f05dfe8bfa1057066f3e7216d49496acb6cca50b7a2fa5ecc
3
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,2104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 37.51465416178195,
5
+ "global_step": 32000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 5e-06,
13
+ "loss": 0.8323,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.23,
18
+ "learning_rate": 5e-06,
19
+ "loss": 0.793,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.35,
24
+ "learning_rate": 5e-06,
25
+ "loss": 0.7508,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.47,
30
+ "learning_rate": 5e-06,
31
+ "loss": 0.7777,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.59,
36
+ "learning_rate": 5e-06,
37
+ "loss": 0.7741,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.7,
42
+ "learning_rate": 5e-06,
43
+ "loss": 0.7435,
44
+ "step": 600
45
+ },
46
+ {
47
+ "epoch": 0.82,
48
+ "learning_rate": 5e-06,
49
+ "loss": 0.7681,
50
+ "step": 700
51
+ },
52
+ {
53
+ "epoch": 0.94,
54
+ "learning_rate": 5e-06,
55
+ "loss": 0.7505,
56
+ "step": 800
57
+ },
58
+ {
59
+ "epoch": 1.06,
60
+ "learning_rate": 5e-06,
61
+ "loss": 0.7285,
62
+ "step": 900
63
+ },
64
+ {
65
+ "epoch": 1.17,
66
+ "learning_rate": 5e-06,
67
+ "loss": 0.7243,
68
+ "step": 1000
69
+ },
70
+ {
71
+ "epoch": 1.29,
72
+ "learning_rate": 5e-06,
73
+ "loss": 0.7562,
74
+ "step": 1100
75
+ },
76
+ {
77
+ "epoch": 1.41,
78
+ "learning_rate": 5e-06,
79
+ "loss": 0.743,
80
+ "step": 1200
81
+ },
82
+ {
83
+ "epoch": 1.52,
84
+ "learning_rate": 5e-06,
85
+ "loss": 0.7086,
86
+ "step": 1300
87
+ },
88
+ {
89
+ "epoch": 1.64,
90
+ "learning_rate": 5e-06,
91
+ "loss": 0.7177,
92
+ "step": 1400
93
+ },
94
+ {
95
+ "epoch": 1.76,
96
+ "learning_rate": 5e-06,
97
+ "loss": 0.727,
98
+ "step": 1500
99
+ },
100
+ {
101
+ "epoch": 1.76,
102
+ "eval_loss": 0.6887246966362,
103
+ "eval_runtime": 70.0491,
104
+ "eval_samples_per_second": 3.041,
105
+ "eval_steps_per_second": 3.041,
106
+ "step": 1500
107
+ },
108
+ {
109
+ "epoch": 1.88,
110
+ "learning_rate": 5e-06,
111
+ "loss": 0.7781,
112
+ "step": 1600
113
+ },
114
+ {
115
+ "epoch": 1.99,
116
+ "learning_rate": 5e-06,
117
+ "loss": 0.7509,
118
+ "step": 1700
119
+ },
120
+ {
121
+ "epoch": 2.11,
122
+ "learning_rate": 5e-06,
123
+ "loss": 0.7057,
124
+ "step": 1800
125
+ },
126
+ {
127
+ "epoch": 2.23,
128
+ "learning_rate": 5e-06,
129
+ "loss": 0.7292,
130
+ "step": 1900
131
+ },
132
+ {
133
+ "epoch": 2.34,
134
+ "learning_rate": 5e-06,
135
+ "loss": 0.7037,
136
+ "step": 2000
137
+ },
138
+ {
139
+ "epoch": 2.46,
140
+ "learning_rate": 5e-06,
141
+ "loss": 0.7005,
142
+ "step": 2100
143
+ },
144
+ {
145
+ "epoch": 2.58,
146
+ "learning_rate": 5e-06,
147
+ "loss": 0.7501,
148
+ "step": 2200
149
+ },
150
+ {
151
+ "epoch": 2.7,
152
+ "learning_rate": 5e-06,
153
+ "loss": 0.7162,
154
+ "step": 2300
155
+ },
156
+ {
157
+ "epoch": 2.81,
158
+ "learning_rate": 5e-06,
159
+ "loss": 0.7428,
160
+ "step": 2400
161
+ },
162
+ {
163
+ "epoch": 2.93,
164
+ "learning_rate": 5e-06,
165
+ "loss": 0.7403,
166
+ "step": 2500
167
+ },
168
+ {
169
+ "epoch": 3.05,
170
+ "learning_rate": 5e-06,
171
+ "loss": 0.7286,
172
+ "step": 2600
173
+ },
174
+ {
175
+ "epoch": 3.17,
176
+ "learning_rate": 5e-06,
177
+ "loss": 0.72,
178
+ "step": 2700
179
+ },
180
+ {
181
+ "epoch": 3.28,
182
+ "learning_rate": 5e-06,
183
+ "loss": 0.6998,
184
+ "step": 2800
185
+ },
186
+ {
187
+ "epoch": 3.4,
188
+ "learning_rate": 5e-06,
189
+ "loss": 0.7515,
190
+ "step": 2900
191
+ },
192
+ {
193
+ "epoch": 3.52,
194
+ "learning_rate": 5e-06,
195
+ "loss": 0.7263,
196
+ "step": 3000
197
+ },
198
+ {
199
+ "epoch": 3.52,
200
+ "eval_loss": 0.6762834191322327,
201
+ "eval_runtime": 70.0592,
202
+ "eval_samples_per_second": 3.04,
203
+ "eval_steps_per_second": 3.04,
204
+ "step": 3000
205
+ },
206
+ {
207
+ "epoch": 3.63,
208
+ "learning_rate": 5e-06,
209
+ "loss": 0.6931,
210
+ "step": 3100
211
+ },
212
+ {
213
+ "epoch": 3.75,
214
+ "learning_rate": 5e-06,
215
+ "loss": 0.6907,
216
+ "step": 3200
217
+ },
218
+ {
219
+ "epoch": 3.87,
220
+ "learning_rate": 5e-06,
221
+ "loss": 0.6818,
222
+ "step": 3300
223
+ },
224
+ {
225
+ "epoch": 3.99,
226
+ "learning_rate": 5e-06,
227
+ "loss": 0.7417,
228
+ "step": 3400
229
+ },
230
+ {
231
+ "epoch": 4.1,
232
+ "learning_rate": 5e-06,
233
+ "loss": 0.6987,
234
+ "step": 3500
235
+ },
236
+ {
237
+ "epoch": 4.22,
238
+ "learning_rate": 5e-06,
239
+ "loss": 0.6854,
240
+ "step": 3600
241
+ },
242
+ {
243
+ "epoch": 4.34,
244
+ "learning_rate": 5e-06,
245
+ "loss": 0.7236,
246
+ "step": 3700
247
+ },
248
+ {
249
+ "epoch": 4.45,
250
+ "learning_rate": 5e-06,
251
+ "loss": 0.7045,
252
+ "step": 3800
253
+ },
254
+ {
255
+ "epoch": 4.57,
256
+ "learning_rate": 5e-06,
257
+ "loss": 0.699,
258
+ "step": 3900
259
+ },
260
+ {
261
+ "epoch": 4.69,
262
+ "learning_rate": 5e-06,
263
+ "loss": 0.7179,
264
+ "step": 4000
265
+ },
266
+ {
267
+ "epoch": 4.81,
268
+ "learning_rate": 5e-06,
269
+ "loss": 0.7066,
270
+ "step": 4100
271
+ },
272
+ {
273
+ "epoch": 4.92,
274
+ "learning_rate": 5e-06,
275
+ "loss": 0.713,
276
+ "step": 4200
277
+ },
278
+ {
279
+ "epoch": 5.04,
280
+ "learning_rate": 5e-06,
281
+ "loss": 0.6749,
282
+ "step": 4300
283
+ },
284
+ {
285
+ "epoch": 5.16,
286
+ "learning_rate": 5e-06,
287
+ "loss": 0.6882,
288
+ "step": 4400
289
+ },
290
+ {
291
+ "epoch": 5.28,
292
+ "learning_rate": 5e-06,
293
+ "loss": 0.6809,
294
+ "step": 4500
295
+ },
296
+ {
297
+ "epoch": 5.28,
298
+ "eval_loss": 0.6606600284576416,
299
+ "eval_runtime": 69.4571,
300
+ "eval_samples_per_second": 3.067,
301
+ "eval_steps_per_second": 3.067,
302
+ "step": 4500
303
+ },
304
+ {
305
+ "epoch": 5.39,
306
+ "learning_rate": 5e-06,
307
+ "loss": 0.7433,
308
+ "step": 4600
309
+ },
310
+ {
311
+ "epoch": 5.51,
312
+ "learning_rate": 5e-06,
313
+ "loss": 0.6881,
314
+ "step": 4700
315
+ },
316
+ {
317
+ "epoch": 5.63,
318
+ "learning_rate": 5e-06,
319
+ "loss": 0.7343,
320
+ "step": 4800
321
+ },
322
+ {
323
+ "epoch": 5.74,
324
+ "learning_rate": 5e-06,
325
+ "loss": 0.6675,
326
+ "step": 4900
327
+ },
328
+ {
329
+ "epoch": 5.86,
330
+ "learning_rate": 5e-06,
331
+ "loss": 0.7187,
332
+ "step": 5000
333
+ },
334
+ {
335
+ "epoch": 5.98,
336
+ "learning_rate": 5e-06,
337
+ "loss": 0.6837,
338
+ "step": 5100
339
+ },
340
+ {
341
+ "epoch": 6.1,
342
+ "learning_rate": 5e-06,
343
+ "loss": 0.6825,
344
+ "step": 5200
345
+ },
346
+ {
347
+ "epoch": 6.21,
348
+ "learning_rate": 5e-06,
349
+ "loss": 0.6976,
350
+ "step": 5300
351
+ },
352
+ {
353
+ "epoch": 6.33,
354
+ "learning_rate": 5e-06,
355
+ "loss": 0.7055,
356
+ "step": 5400
357
+ },
358
+ {
359
+ "epoch": 6.45,
360
+ "learning_rate": 5e-06,
361
+ "loss": 0.6584,
362
+ "step": 5500
363
+ },
364
+ {
365
+ "epoch": 6.57,
366
+ "learning_rate": 5e-06,
367
+ "loss": 0.6819,
368
+ "step": 5600
369
+ },
370
+ {
371
+ "epoch": 6.68,
372
+ "learning_rate": 5e-06,
373
+ "loss": 0.6652,
374
+ "step": 5700
375
+ },
376
+ {
377
+ "epoch": 6.8,
378
+ "learning_rate": 5e-06,
379
+ "loss": 0.6728,
380
+ "step": 5800
381
+ },
382
+ {
383
+ "epoch": 6.92,
384
+ "learning_rate": 5e-06,
385
+ "loss": 0.6916,
386
+ "step": 5900
387
+ },
388
+ {
389
+ "epoch": 7.03,
390
+ "learning_rate": 5e-06,
391
+ "loss": 0.657,
392
+ "step": 6000
393
+ },
394
+ {
395
+ "epoch": 7.03,
396
+ "eval_loss": 0.6467106938362122,
397
+ "eval_runtime": 69.8327,
398
+ "eval_samples_per_second": 3.05,
399
+ "eval_steps_per_second": 3.05,
400
+ "step": 6000
401
+ },
402
+ {
403
+ "epoch": 7.15,
404
+ "learning_rate": 5e-06,
405
+ "loss": 0.7331,
406
+ "step": 6100
407
+ },
408
+ {
409
+ "epoch": 7.27,
410
+ "learning_rate": 5e-06,
411
+ "loss": 0.6598,
412
+ "step": 6200
413
+ },
414
+ {
415
+ "epoch": 7.39,
416
+ "learning_rate": 5e-06,
417
+ "loss": 0.6602,
418
+ "step": 6300
419
+ },
420
+ {
421
+ "epoch": 7.5,
422
+ "learning_rate": 5e-06,
423
+ "loss": 0.6819,
424
+ "step": 6400
425
+ },
426
+ {
427
+ "epoch": 7.62,
428
+ "learning_rate": 5e-06,
429
+ "loss": 0.6764,
430
+ "step": 6500
431
+ },
432
+ {
433
+ "epoch": 7.74,
434
+ "learning_rate": 5e-06,
435
+ "loss": 0.6674,
436
+ "step": 6600
437
+ },
438
+ {
439
+ "epoch": 7.85,
440
+ "learning_rate": 5e-06,
441
+ "loss": 0.6848,
442
+ "step": 6700
443
+ },
444
+ {
445
+ "epoch": 7.97,
446
+ "learning_rate": 5e-06,
447
+ "loss": 0.6446,
448
+ "step": 6800
449
+ },
450
+ {
451
+ "epoch": 8.09,
452
+ "learning_rate": 5e-06,
453
+ "loss": 0.6601,
454
+ "step": 6900
455
+ },
456
+ {
457
+ "epoch": 8.21,
458
+ "learning_rate": 5e-06,
459
+ "loss": 0.6649,
460
+ "step": 7000
461
+ },
462
+ {
463
+ "epoch": 8.32,
464
+ "learning_rate": 5e-06,
465
+ "loss": 0.657,
466
+ "step": 7100
467
+ },
468
+ {
469
+ "epoch": 8.44,
470
+ "learning_rate": 5e-06,
471
+ "loss": 0.6326,
472
+ "step": 7200
473
+ },
474
+ {
475
+ "epoch": 8.56,
476
+ "learning_rate": 5e-06,
477
+ "loss": 0.6541,
478
+ "step": 7300
479
+ },
480
+ {
481
+ "epoch": 8.68,
482
+ "learning_rate": 5e-06,
483
+ "loss": 0.6579,
484
+ "step": 7400
485
+ },
486
+ {
487
+ "epoch": 8.79,
488
+ "learning_rate": 5e-06,
489
+ "loss": 0.6784,
490
+ "step": 7500
491
+ },
492
+ {
493
+ "epoch": 8.79,
494
+ "eval_loss": 0.6341073513031006,
495
+ "eval_runtime": 70.1746,
496
+ "eval_samples_per_second": 3.035,
497
+ "eval_steps_per_second": 3.035,
498
+ "step": 7500
499
+ },
500
+ {
501
+ "epoch": 8.91,
502
+ "learning_rate": 5e-06,
503
+ "loss": 0.6889,
504
+ "step": 7600
505
+ },
506
+ {
507
+ "epoch": 9.03,
508
+ "learning_rate": 5e-06,
509
+ "loss": 0.6752,
510
+ "step": 7700
511
+ },
512
+ {
513
+ "epoch": 9.14,
514
+ "learning_rate": 5e-06,
515
+ "loss": 0.6654,
516
+ "step": 7800
517
+ },
518
+ {
519
+ "epoch": 9.26,
520
+ "learning_rate": 5e-06,
521
+ "loss": 0.6516,
522
+ "step": 7900
523
+ },
524
+ {
525
+ "epoch": 9.38,
526
+ "learning_rate": 5e-06,
527
+ "loss": 0.6847,
528
+ "step": 8000
529
+ },
530
+ {
531
+ "epoch": 9.5,
532
+ "learning_rate": 5e-06,
533
+ "loss": 0.6396,
534
+ "step": 8100
535
+ },
536
+ {
537
+ "epoch": 9.61,
538
+ "learning_rate": 5e-06,
539
+ "loss": 0.6484,
540
+ "step": 8200
541
+ },
542
+ {
543
+ "epoch": 9.73,
544
+ "learning_rate": 5e-06,
545
+ "loss": 0.6396,
546
+ "step": 8300
547
+ },
548
+ {
549
+ "epoch": 9.85,
550
+ "learning_rate": 5e-06,
551
+ "loss": 0.6951,
552
+ "step": 8400
553
+ },
554
+ {
555
+ "epoch": 9.96,
556
+ "learning_rate": 5e-06,
557
+ "loss": 0.641,
558
+ "step": 8500
559
+ },
560
+ {
561
+ "epoch": 10.08,
562
+ "learning_rate": 5e-06,
563
+ "loss": 0.6379,
564
+ "step": 8600
565
+ },
566
+ {
567
+ "epoch": 10.2,
568
+ "learning_rate": 5e-06,
569
+ "loss": 0.6264,
570
+ "step": 8700
571
+ },
572
+ {
573
+ "epoch": 10.32,
574
+ "learning_rate": 5e-06,
575
+ "loss": 0.6364,
576
+ "step": 8800
577
+ },
578
+ {
579
+ "epoch": 10.43,
580
+ "learning_rate": 5e-06,
581
+ "loss": 0.676,
582
+ "step": 8900
583
+ },
584
+ {
585
+ "epoch": 10.55,
586
+ "learning_rate": 5e-06,
587
+ "loss": 0.6756,
588
+ "step": 9000
589
+ },
590
+ {
591
+ "epoch": 10.55,
592
+ "eval_loss": 0.6359875202178955,
593
+ "eval_runtime": 70.4519,
594
+ "eval_samples_per_second": 3.023,
595
+ "eval_steps_per_second": 3.023,
596
+ "step": 9000
597
+ },
598
+ {
599
+ "epoch": 10.67,
600
+ "learning_rate": 5e-06,
601
+ "loss": 0.6641,
602
+ "step": 9100
603
+ },
604
+ {
605
+ "epoch": 10.79,
606
+ "learning_rate": 5e-06,
607
+ "loss": 0.6126,
608
+ "step": 9200
609
+ },
610
+ {
611
+ "epoch": 10.9,
612
+ "learning_rate": 5e-06,
613
+ "loss": 0.6538,
614
+ "step": 9300
615
+ },
616
+ {
617
+ "epoch": 11.02,
618
+ "learning_rate": 5e-06,
619
+ "loss": 0.641,
620
+ "step": 9400
621
+ },
622
+ {
623
+ "epoch": 11.14,
624
+ "learning_rate": 5e-06,
625
+ "loss": 0.6501,
626
+ "step": 9500
627
+ },
628
+ {
629
+ "epoch": 11.25,
630
+ "learning_rate": 5e-06,
631
+ "loss": 0.647,
632
+ "step": 9600
633
+ },
634
+ {
635
+ "epoch": 11.37,
636
+ "learning_rate": 5e-06,
637
+ "loss": 0.6463,
638
+ "step": 9700
639
+ },
640
+ {
641
+ "epoch": 11.49,
642
+ "learning_rate": 5e-06,
643
+ "loss": 0.6507,
644
+ "step": 9800
645
+ },
646
+ {
647
+ "epoch": 11.61,
648
+ "learning_rate": 5e-06,
649
+ "loss": 0.6525,
650
+ "step": 9900
651
+ },
652
+ {
653
+ "epoch": 11.72,
654
+ "learning_rate": 5e-06,
655
+ "loss": 0.6194,
656
+ "step": 10000
657
+ },
658
+ {
659
+ "epoch": 11.84,
660
+ "learning_rate": 5e-06,
661
+ "loss": 0.6338,
662
+ "step": 10100
663
+ },
664
+ {
665
+ "epoch": 11.96,
666
+ "learning_rate": 5e-06,
667
+ "loss": 0.6492,
668
+ "step": 10200
669
+ },
670
+ {
671
+ "epoch": 12.08,
672
+ "learning_rate": 5e-06,
673
+ "loss": 0.6531,
674
+ "step": 10300
675
+ },
676
+ {
677
+ "epoch": 12.19,
678
+ "learning_rate": 5e-06,
679
+ "loss": 0.6073,
680
+ "step": 10400
681
+ },
682
+ {
683
+ "epoch": 12.31,
684
+ "learning_rate": 5e-06,
685
+ "loss": 0.6307,
686
+ "step": 10500
687
+ },
688
+ {
689
+ "epoch": 12.31,
690
+ "eval_loss": 0.6309817433357239,
691
+ "eval_runtime": 70.2077,
692
+ "eval_samples_per_second": 3.034,
693
+ "eval_steps_per_second": 3.034,
694
+ "step": 10500
695
+ },
696
+ {
697
+ "epoch": 12.43,
698
+ "learning_rate": 5e-06,
699
+ "loss": 0.6608,
700
+ "step": 10600
701
+ },
702
+ {
703
+ "epoch": 12.54,
704
+ "learning_rate": 5e-06,
705
+ "loss": 0.6252,
706
+ "step": 10700
707
+ },
708
+ {
709
+ "epoch": 12.66,
710
+ "learning_rate": 5e-06,
711
+ "loss": 0.6258,
712
+ "step": 10800
713
+ },
714
+ {
715
+ "epoch": 12.78,
716
+ "learning_rate": 5e-06,
717
+ "loss": 0.6504,
718
+ "step": 10900
719
+ },
720
+ {
721
+ "epoch": 12.9,
722
+ "learning_rate": 5e-06,
723
+ "loss": 0.6281,
724
+ "step": 11000
725
+ },
726
+ {
727
+ "epoch": 13.01,
728
+ "learning_rate": 5e-06,
729
+ "loss": 0.6398,
730
+ "step": 11100
731
+ },
732
+ {
733
+ "epoch": 13.13,
734
+ "learning_rate": 5e-06,
735
+ "loss": 0.6318,
736
+ "step": 11200
737
+ },
738
+ {
739
+ "epoch": 13.25,
740
+ "learning_rate": 5e-06,
741
+ "loss": 0.6162,
742
+ "step": 11300
743
+ },
744
+ {
745
+ "epoch": 13.36,
746
+ "learning_rate": 5e-06,
747
+ "loss": 0.6101,
748
+ "step": 11400
749
+ },
750
+ {
751
+ "epoch": 13.48,
752
+ "learning_rate": 5e-06,
753
+ "loss": 0.6124,
754
+ "step": 11500
755
+ },
756
+ {
757
+ "epoch": 13.6,
758
+ "learning_rate": 5e-06,
759
+ "loss": 0.5994,
760
+ "step": 11600
761
+ },
762
+ {
763
+ "epoch": 13.72,
764
+ "learning_rate": 5e-06,
765
+ "loss": 0.6599,
766
+ "step": 11700
767
+ },
768
+ {
769
+ "epoch": 13.83,
770
+ "learning_rate": 5e-06,
771
+ "loss": 0.6192,
772
+ "step": 11800
773
+ },
774
+ {
775
+ "epoch": 13.95,
776
+ "learning_rate": 5e-06,
777
+ "loss": 0.6341,
778
+ "step": 11900
779
+ },
780
+ {
781
+ "epoch": 14.07,
782
+ "learning_rate": 5e-06,
783
+ "loss": 0.6155,
784
+ "step": 12000
785
+ },
786
+ {
787
+ "epoch": 14.07,
788
+ "eval_loss": 0.63248211145401,
789
+ "eval_runtime": 70.8191,
790
+ "eval_samples_per_second": 3.008,
791
+ "eval_steps_per_second": 3.008,
792
+ "step": 12000
793
+ },
794
+ {
795
+ "epoch": 14.19,
796
+ "learning_rate": 5e-06,
797
+ "loss": 0.6562,
798
+ "step": 12100
799
+ },
800
+ {
801
+ "epoch": 14.3,
802
+ "learning_rate": 5e-06,
803
+ "loss": 0.633,
804
+ "step": 12200
805
+ },
806
+ {
807
+ "epoch": 14.42,
808
+ "learning_rate": 5e-06,
809
+ "loss": 0.6169,
810
+ "step": 12300
811
+ },
812
+ {
813
+ "epoch": 14.54,
814
+ "learning_rate": 5e-06,
815
+ "loss": 0.6312,
816
+ "step": 12400
817
+ },
818
+ {
819
+ "epoch": 14.65,
820
+ "learning_rate": 5e-06,
821
+ "loss": 0.6401,
822
+ "step": 12500
823
+ },
824
+ {
825
+ "epoch": 14.77,
826
+ "learning_rate": 5e-06,
827
+ "loss": 0.6365,
828
+ "step": 12600
829
+ },
830
+ {
831
+ "epoch": 14.89,
832
+ "learning_rate": 5e-06,
833
+ "loss": 0.6286,
834
+ "step": 12700
835
+ },
836
+ {
837
+ "epoch": 15.01,
838
+ "learning_rate": 5e-06,
839
+ "loss": 0.5877,
840
+ "step": 12800
841
+ },
842
+ {
843
+ "epoch": 15.12,
844
+ "learning_rate": 5e-06,
845
+ "loss": 0.6334,
846
+ "step": 12900
847
+ },
848
+ {
849
+ "epoch": 15.24,
850
+ "learning_rate": 5e-06,
851
+ "loss": 0.5785,
852
+ "step": 13000
853
+ },
854
+ {
855
+ "epoch": 15.36,
856
+ "learning_rate": 5e-06,
857
+ "loss": 0.6155,
858
+ "step": 13100
859
+ },
860
+ {
861
+ "epoch": 15.47,
862
+ "learning_rate": 5e-06,
863
+ "loss": 0.6404,
864
+ "step": 13200
865
+ },
866
+ {
867
+ "epoch": 15.59,
868
+ "learning_rate": 5e-06,
869
+ "loss": 0.6302,
870
+ "step": 13300
871
+ },
872
+ {
873
+ "epoch": 15.71,
874
+ "learning_rate": 5e-06,
875
+ "loss": 0.6154,
876
+ "step": 13400
877
+ },
878
+ {
879
+ "epoch": 15.83,
880
+ "learning_rate": 5e-06,
881
+ "loss": 0.6119,
882
+ "step": 13500
883
+ },
884
+ {
885
+ "epoch": 15.83,
886
+ "eval_loss": 0.6228322386741638,
887
+ "eval_runtime": 70.1345,
888
+ "eval_samples_per_second": 3.037,
889
+ "eval_steps_per_second": 3.037,
890
+ "step": 13500
891
+ },
892
+ {
893
+ "epoch": 15.94,
894
+ "learning_rate": 5e-06,
895
+ "loss": 0.6193,
896
+ "step": 13600
897
+ },
898
+ {
899
+ "epoch": 16.06,
900
+ "learning_rate": 5e-06,
901
+ "loss": 0.6161,
902
+ "step": 13700
903
+ },
904
+ {
905
+ "epoch": 16.18,
906
+ "learning_rate": 5e-06,
907
+ "loss": 0.5785,
908
+ "step": 13800
909
+ },
910
+ {
911
+ "epoch": 16.3,
912
+ "learning_rate": 5e-06,
913
+ "loss": 0.6043,
914
+ "step": 13900
915
+ },
916
+ {
917
+ "epoch": 16.41,
918
+ "learning_rate": 5e-06,
919
+ "loss": 0.6205,
920
+ "step": 14000
921
+ },
922
+ {
923
+ "epoch": 16.53,
924
+ "learning_rate": 5e-06,
925
+ "loss": 0.6321,
926
+ "step": 14100
927
+ },
928
+ {
929
+ "epoch": 16.65,
930
+ "learning_rate": 5e-06,
931
+ "loss": 0.5996,
932
+ "step": 14200
933
+ },
934
+ {
935
+ "epoch": 16.76,
936
+ "learning_rate": 5e-06,
937
+ "loss": 0.6232,
938
+ "step": 14300
939
+ },
940
+ {
941
+ "epoch": 16.88,
942
+ "learning_rate": 5e-06,
943
+ "loss": 0.6148,
944
+ "step": 14400
945
+ },
946
+ {
947
+ "epoch": 17.0,
948
+ "learning_rate": 5e-06,
949
+ "loss": 0.602,
950
+ "step": 14500
951
+ },
952
+ {
953
+ "epoch": 17.12,
954
+ "learning_rate": 5e-06,
955
+ "loss": 0.5716,
956
+ "step": 14600
957
+ },
958
+ {
959
+ "epoch": 17.23,
960
+ "learning_rate": 5e-06,
961
+ "loss": 0.629,
962
+ "step": 14700
963
+ },
964
+ {
965
+ "epoch": 17.35,
966
+ "learning_rate": 5e-06,
967
+ "loss": 0.6134,
968
+ "step": 14800
969
+ },
970
+ {
971
+ "epoch": 17.47,
972
+ "learning_rate": 5e-06,
973
+ "loss": 0.6023,
974
+ "step": 14900
975
+ },
976
+ {
977
+ "epoch": 17.58,
978
+ "learning_rate": 5e-06,
979
+ "loss": 0.5943,
980
+ "step": 15000
981
+ },
982
+ {
983
+ "epoch": 17.58,
984
+ "eval_loss": 0.623548686504364,
985
+ "eval_runtime": 70.4978,
986
+ "eval_samples_per_second": 3.021,
987
+ "eval_steps_per_second": 3.021,
988
+ "step": 15000
989
+ },
990
+ {
991
+ "epoch": 17.7,
992
+ "learning_rate": 5e-06,
993
+ "loss": 0.6092,
994
+ "step": 15100
995
+ },
996
+ {
997
+ "epoch": 17.82,
998
+ "learning_rate": 5e-06,
999
+ "loss": 0.6172,
1000
+ "step": 15200
1001
+ },
1002
+ {
1003
+ "epoch": 17.94,
1004
+ "learning_rate": 5e-06,
1005
+ "loss": 0.6247,
1006
+ "step": 15300
1007
+ },
1008
+ {
1009
+ "epoch": 18.05,
1010
+ "learning_rate": 5e-06,
1011
+ "loss": 0.6043,
1012
+ "step": 15400
1013
+ },
1014
+ {
1015
+ "epoch": 18.17,
1016
+ "learning_rate": 5e-06,
1017
+ "loss": 0.6186,
1018
+ "step": 15500
1019
+ },
1020
+ {
1021
+ "epoch": 18.29,
1022
+ "learning_rate": 5e-06,
1023
+ "loss": 0.5877,
1024
+ "step": 15600
1025
+ },
1026
+ {
1027
+ "epoch": 18.41,
1028
+ "learning_rate": 5e-06,
1029
+ "loss": 0.5993,
1030
+ "step": 15700
1031
+ },
1032
+ {
1033
+ "epoch": 18.52,
1034
+ "learning_rate": 5e-06,
1035
+ "loss": 0.5949,
1036
+ "step": 15800
1037
+ },
1038
+ {
1039
+ "epoch": 18.64,
1040
+ "learning_rate": 5e-06,
1041
+ "loss": 0.5775,
1042
+ "step": 15900
1043
+ },
1044
+ {
1045
+ "epoch": 18.76,
1046
+ "learning_rate": 5e-06,
1047
+ "loss": 0.6147,
1048
+ "step": 16000
1049
+ },
1050
+ {
1051
+ "epoch": 18.87,
1052
+ "learning_rate": 5e-06,
1053
+ "loss": 0.5973,
1054
+ "step": 16100
1055
+ },
1056
+ {
1057
+ "epoch": 18.99,
1058
+ "learning_rate": 5e-06,
1059
+ "loss": 0.6103,
1060
+ "step": 16200
1061
+ },
1062
+ {
1063
+ "epoch": 19.11,
1064
+ "learning_rate": 5e-06,
1065
+ "loss": 0.6024,
1066
+ "step": 16300
1067
+ },
1068
+ {
1069
+ "epoch": 19.23,
1070
+ "learning_rate": 5e-06,
1071
+ "loss": 0.5729,
1072
+ "step": 16400
1073
+ },
1074
+ {
1075
+ "epoch": 19.34,
1076
+ "learning_rate": 5e-06,
1077
+ "loss": 0.6012,
1078
+ "step": 16500
1079
+ },
1080
+ {
1081
+ "epoch": 19.34,
1082
+ "eval_loss": 0.6155942678451538,
1083
+ "eval_runtime": 70.5596,
1084
+ "eval_samples_per_second": 3.019,
1085
+ "eval_steps_per_second": 3.019,
1086
+ "step": 16500
1087
+ },
1088
+ {
1089
+ "epoch": 19.46,
1090
+ "learning_rate": 5e-06,
1091
+ "loss": 0.6123,
1092
+ "step": 16600
1093
+ },
1094
+ {
1095
+ "epoch": 19.58,
1096
+ "learning_rate": 5e-06,
1097
+ "loss": 0.5937,
1098
+ "step": 16700
1099
+ },
1100
+ {
1101
+ "epoch": 19.7,
1102
+ "learning_rate": 5e-06,
1103
+ "loss": 0.5824,
1104
+ "step": 16800
1105
+ },
1106
+ {
1107
+ "epoch": 19.81,
1108
+ "learning_rate": 5e-06,
1109
+ "loss": 0.6433,
1110
+ "step": 16900
1111
+ },
1112
+ {
1113
+ "epoch": 19.93,
1114
+ "learning_rate": 5e-06,
1115
+ "loss": 0.5799,
1116
+ "step": 17000
1117
+ },
1118
+ {
1119
+ "epoch": 20.05,
1120
+ "learning_rate": 5e-06,
1121
+ "loss": 0.593,
1122
+ "step": 17100
1123
+ },
1124
+ {
1125
+ "epoch": 20.16,
1126
+ "learning_rate": 5e-06,
1127
+ "loss": 0.5909,
1128
+ "step": 17200
1129
+ },
1130
+ {
1131
+ "epoch": 20.28,
1132
+ "learning_rate": 5e-06,
1133
+ "loss": 0.5918,
1134
+ "step": 17300
1135
+ },
1136
+ {
1137
+ "epoch": 20.4,
1138
+ "learning_rate": 5e-06,
1139
+ "loss": 0.5908,
1140
+ "step": 17400
1141
+ },
1142
+ {
1143
+ "epoch": 20.52,
1144
+ "learning_rate": 5e-06,
1145
+ "loss": 0.5932,
1146
+ "step": 17500
1147
+ },
1148
+ {
1149
+ "epoch": 20.63,
1150
+ "learning_rate": 5e-06,
1151
+ "loss": 0.6085,
1152
+ "step": 17600
1153
+ },
1154
+ {
1155
+ "epoch": 20.75,
1156
+ "learning_rate": 5e-06,
1157
+ "loss": 0.5737,
1158
+ "step": 17700
1159
+ },
1160
+ {
1161
+ "epoch": 20.87,
1162
+ "learning_rate": 5e-06,
1163
+ "loss": 0.5926,
1164
+ "step": 17800
1165
+ },
1166
+ {
1167
+ "epoch": 20.98,
1168
+ "learning_rate": 5e-06,
1169
+ "loss": 0.606,
1170
+ "step": 17900
1171
+ },
1172
+ {
1173
+ "epoch": 21.1,
1174
+ "learning_rate": 5e-06,
1175
+ "loss": 0.5834,
1176
+ "step": 18000
1177
+ },
1178
+ {
1179
+ "epoch": 21.1,
1180
+ "eval_loss": 0.6064698100090027,
1181
+ "eval_runtime": 70.551,
1182
+ "eval_samples_per_second": 3.019,
1183
+ "eval_steps_per_second": 3.019,
1184
+ "step": 18000
1185
+ },
1186
+ {
1187
+ "epoch": 21.22,
1188
+ "learning_rate": 5e-06,
1189
+ "loss": 0.57,
1190
+ "step": 18100
1191
+ },
1192
+ {
1193
+ "epoch": 21.34,
1194
+ "learning_rate": 5e-06,
1195
+ "loss": 0.5878,
1196
+ "step": 18200
1197
+ },
1198
+ {
1199
+ "epoch": 21.45,
1200
+ "learning_rate": 5e-06,
1201
+ "loss": 0.5623,
1202
+ "step": 18300
1203
+ },
1204
+ {
1205
+ "epoch": 21.57,
1206
+ "learning_rate": 5e-06,
1207
+ "loss": 0.5978,
1208
+ "step": 18400
1209
+ },
1210
+ {
1211
+ "epoch": 21.69,
1212
+ "learning_rate": 5e-06,
1213
+ "loss": 0.594,
1214
+ "step": 18500
1215
+ },
1216
+ {
1217
+ "epoch": 21.81,
1218
+ "learning_rate": 5e-06,
1219
+ "loss": 0.6013,
1220
+ "step": 18600
1221
+ },
1222
+ {
1223
+ "epoch": 21.92,
1224
+ "learning_rate": 5e-06,
1225
+ "loss": 0.5576,
1226
+ "step": 18700
1227
+ },
1228
+ {
1229
+ "epoch": 22.04,
1230
+ "learning_rate": 5e-06,
1231
+ "loss": 0.5794,
1232
+ "step": 18800
1233
+ },
1234
+ {
1235
+ "epoch": 22.16,
1236
+ "learning_rate": 5e-06,
1237
+ "loss": 0.5863,
1238
+ "step": 18900
1239
+ },
1240
+ {
1241
+ "epoch": 22.27,
1242
+ "learning_rate": 5e-06,
1243
+ "loss": 0.5956,
1244
+ "step": 19000
1245
+ },
1246
+ {
1247
+ "epoch": 22.39,
1248
+ "learning_rate": 5e-06,
1249
+ "loss": 0.5849,
1250
+ "step": 19100
1251
+ },
1252
+ {
1253
+ "epoch": 22.51,
1254
+ "learning_rate": 5e-06,
1255
+ "loss": 0.5705,
1256
+ "step": 19200
1257
+ },
1258
+ {
1259
+ "epoch": 22.63,
1260
+ "learning_rate": 5e-06,
1261
+ "loss": 0.5945,
1262
+ "step": 19300
1263
+ },
1264
+ {
1265
+ "epoch": 22.74,
1266
+ "learning_rate": 5e-06,
1267
+ "loss": 0.5673,
1268
+ "step": 19400
1269
+ },
1270
+ {
1271
+ "epoch": 22.86,
1272
+ "learning_rate": 5e-06,
1273
+ "loss": 0.5942,
1274
+ "step": 19500
1275
+ },
1276
+ {
1277
+ "epoch": 22.86,
1278
+ "eval_loss": 0.6019883155822754,
1279
+ "eval_runtime": 70.6262,
1280
+ "eval_samples_per_second": 3.016,
1281
+ "eval_steps_per_second": 3.016,
1282
+ "step": 19500
1283
+ },
1284
+ {
1285
+ "epoch": 22.98,
1286
+ "learning_rate": 5e-06,
1287
+ "loss": 0.556,
1288
+ "step": 19600
1289
+ },
1290
+ {
1291
+ "epoch": 23.09,
1292
+ "learning_rate": 5e-06,
1293
+ "loss": 0.5919,
1294
+ "step": 19700
1295
+ },
1296
+ {
1297
+ "epoch": 23.21,
1298
+ "learning_rate": 5e-06,
1299
+ "loss": 0.554,
1300
+ "step": 19800
1301
+ },
1302
+ {
1303
+ "epoch": 23.33,
1304
+ "learning_rate": 5e-06,
1305
+ "loss": 0.5708,
1306
+ "step": 19900
1307
+ },
1308
+ {
1309
+ "epoch": 23.45,
1310
+ "learning_rate": 5e-06,
1311
+ "loss": 0.5555,
1312
+ "step": 20000
1313
+ },
1314
+ {
1315
+ "epoch": 23.56,
1316
+ "learning_rate": 5e-06,
1317
+ "loss": 0.6004,
1318
+ "step": 20100
1319
+ },
1320
+ {
1321
+ "epoch": 23.68,
1322
+ "learning_rate": 5e-06,
1323
+ "loss": 0.5894,
1324
+ "step": 20200
1325
+ },
1326
+ {
1327
+ "epoch": 23.8,
1328
+ "learning_rate": 5e-06,
1329
+ "loss": 0.5718,
1330
+ "step": 20300
1331
+ },
1332
+ {
1333
+ "epoch": 23.92,
1334
+ "learning_rate": 5e-06,
1335
+ "loss": 0.5744,
1336
+ "step": 20400
1337
+ },
1338
+ {
1339
+ "epoch": 24.03,
1340
+ "learning_rate": 5e-06,
1341
+ "loss": 0.5602,
1342
+ "step": 20500
1343
+ },
1344
+ {
1345
+ "epoch": 24.15,
1346
+ "learning_rate": 5e-06,
1347
+ "loss": 0.5656,
1348
+ "step": 20600
1349
+ },
1350
+ {
1351
+ "epoch": 24.27,
1352
+ "learning_rate": 5e-06,
1353
+ "loss": 0.5657,
1354
+ "step": 20700
1355
+ },
1356
+ {
1357
+ "epoch": 24.38,
1358
+ "learning_rate": 5e-06,
1359
+ "loss": 0.5553,
1360
+ "step": 20800
1361
+ },
1362
+ {
1363
+ "epoch": 24.5,
1364
+ "learning_rate": 5e-06,
1365
+ "loss": 0.5962,
1366
+ "step": 20900
1367
+ },
1368
+ {
1369
+ "epoch": 24.62,
1370
+ "learning_rate": 5e-06,
1371
+ "loss": 0.5982,
1372
+ "step": 21000
1373
+ },
1374
+ {
1375
+ "epoch": 24.62,
1376
+ "eval_loss": 0.5987153053283691,
1377
+ "eval_runtime": 70.2465,
1378
+ "eval_samples_per_second": 3.032,
1379
+ "eval_steps_per_second": 3.032,
1380
+ "step": 21000
1381
+ },
1382
+ {
1383
+ "epoch": 24.74,
1384
+ "learning_rate": 5e-06,
1385
+ "loss": 0.5946,
1386
+ "step": 21100
1387
+ },
1388
+ {
1389
+ "epoch": 24.85,
1390
+ "learning_rate": 5e-06,
1391
+ "loss": 0.5473,
1392
+ "step": 21200
1393
+ },
1394
+ {
1395
+ "epoch": 24.97,
1396
+ "learning_rate": 5e-06,
1397
+ "loss": 0.5605,
1398
+ "step": 21300
1399
+ },
1400
+ {
1401
+ "epoch": 25.09,
1402
+ "learning_rate": 5e-06,
1403
+ "loss": 0.5953,
1404
+ "step": 21400
1405
+ },
1406
+ {
1407
+ "epoch": 25.21,
1408
+ "learning_rate": 5e-06,
1409
+ "loss": 0.5697,
1410
+ "step": 21500
1411
+ },
1412
+ {
1413
+ "epoch": 25.32,
1414
+ "learning_rate": 5e-06,
1415
+ "loss": 0.5627,
1416
+ "step": 21600
1417
+ },
1418
+ {
1419
+ "epoch": 25.44,
1420
+ "learning_rate": 5e-06,
1421
+ "loss": 0.567,
1422
+ "step": 21700
1423
+ },
1424
+ {
1425
+ "epoch": 25.56,
1426
+ "learning_rate": 5e-06,
1427
+ "loss": 0.5394,
1428
+ "step": 21800
1429
+ },
1430
+ {
1431
+ "epoch": 25.67,
1432
+ "learning_rate": 5e-06,
1433
+ "loss": 0.5461,
1434
+ "step": 21900
1435
+ },
1436
+ {
1437
+ "epoch": 25.79,
1438
+ "learning_rate": 5e-06,
1439
+ "loss": 0.5615,
1440
+ "step": 22000
1441
+ },
1442
+ {
1443
+ "epoch": 25.91,
1444
+ "learning_rate": 5e-06,
1445
+ "loss": 0.5547,
1446
+ "step": 22100
1447
+ },
1448
+ {
1449
+ "epoch": 26.03,
1450
+ "learning_rate": 5e-06,
1451
+ "loss": 0.5534,
1452
+ "step": 22200
1453
+ },
1454
+ {
1455
+ "epoch": 26.14,
1456
+ "learning_rate": 5e-06,
1457
+ "loss": 0.5494,
1458
+ "step": 22300
1459
+ },
1460
+ {
1461
+ "epoch": 26.26,
1462
+ "learning_rate": 5e-06,
1463
+ "loss": 0.569,
1464
+ "step": 22400
1465
+ },
1466
+ {
1467
+ "epoch": 26.38,
1468
+ "learning_rate": 5e-06,
1469
+ "loss": 0.5352,
1470
+ "step": 22500
1471
+ },
1472
+ {
1473
+ "epoch": 26.38,
1474
+ "eval_loss": 0.6042129397392273,
1475
+ "eval_runtime": 69.7858,
1476
+ "eval_samples_per_second": 3.052,
1477
+ "eval_steps_per_second": 3.052,
1478
+ "step": 22500
1479
+ },
1480
+ {
1481
+ "epoch": 26.49,
1482
+ "learning_rate": 5e-06,
1483
+ "loss": 0.5754,
1484
+ "step": 22600
1485
+ },
1486
+ {
1487
+ "epoch": 26.61,
1488
+ "learning_rate": 5e-06,
1489
+ "loss": 0.5443,
1490
+ "step": 22700
1491
+ },
1492
+ {
1493
+ "epoch": 26.73,
1494
+ "learning_rate": 5e-06,
1495
+ "loss": 0.5765,
1496
+ "step": 22800
1497
+ },
1498
+ {
1499
+ "epoch": 26.85,
1500
+ "learning_rate": 5e-06,
1501
+ "loss": 0.5494,
1502
+ "step": 22900
1503
+ },
1504
+ {
1505
+ "epoch": 26.96,
1506
+ "learning_rate": 5e-06,
1507
+ "loss": 0.5598,
1508
+ "step": 23000
1509
+ },
1510
+ {
1511
+ "epoch": 27.08,
1512
+ "learning_rate": 5e-06,
1513
+ "loss": 0.5634,
1514
+ "step": 23100
1515
+ },
1516
+ {
1517
+ "epoch": 27.2,
1518
+ "learning_rate": 5e-06,
1519
+ "loss": 0.534,
1520
+ "step": 23200
1521
+ },
1522
+ {
1523
+ "epoch": 27.32,
1524
+ "learning_rate": 5e-06,
1525
+ "loss": 0.5626,
1526
+ "step": 23300
1527
+ },
1528
+ {
1529
+ "epoch": 27.43,
1530
+ "learning_rate": 5e-06,
1531
+ "loss": 0.5681,
1532
+ "step": 23400
1533
+ },
1534
+ {
1535
+ "epoch": 27.55,
1536
+ "learning_rate": 5e-06,
1537
+ "loss": 0.5735,
1538
+ "step": 23500
1539
+ },
1540
+ {
1541
+ "epoch": 27.67,
1542
+ "learning_rate": 5e-06,
1543
+ "loss": 0.5464,
1544
+ "step": 23600
1545
+ },
1546
+ {
1547
+ "epoch": 27.78,
1548
+ "learning_rate": 5e-06,
1549
+ "loss": 0.529,
1550
+ "step": 23700
1551
+ },
1552
+ {
1553
+ "epoch": 27.9,
1554
+ "learning_rate": 5e-06,
1555
+ "loss": 0.548,
1556
+ "step": 23800
1557
+ },
1558
+ {
1559
+ "epoch": 28.02,
1560
+ "learning_rate": 5e-06,
1561
+ "loss": 0.5699,
1562
+ "step": 23900
1563
+ },
1564
+ {
1565
+ "epoch": 28.14,
1566
+ "learning_rate": 5e-06,
1567
+ "loss": 0.5746,
1568
+ "step": 24000
1569
+ },
1570
+ {
1571
+ "epoch": 28.14,
1572
+ "eval_loss": 0.5994372367858887,
1573
+ "eval_runtime": 70.1931,
1574
+ "eval_samples_per_second": 3.034,
1575
+ "eval_steps_per_second": 3.034,
1576
+ "step": 24000
1577
+ },
1578
+ {
1579
+ "epoch": 28.25,
1580
+ "learning_rate": 5e-06,
1581
+ "loss": 0.5537,
1582
+ "step": 24100
1583
+ },
1584
+ {
1585
+ "epoch": 28.37,
1586
+ "learning_rate": 5e-06,
1587
+ "loss": 0.5479,
1588
+ "step": 24200
1589
+ },
1590
+ {
1591
+ "epoch": 28.49,
1592
+ "learning_rate": 5e-06,
1593
+ "loss": 0.5643,
1594
+ "step": 24300
1595
+ },
1596
+ {
1597
+ "epoch": 28.6,
1598
+ "learning_rate": 5e-06,
1599
+ "loss": 0.5273,
1600
+ "step": 24400
1601
+ },
1602
+ {
1603
+ "epoch": 28.72,
1604
+ "learning_rate": 5e-06,
1605
+ "loss": 0.544,
1606
+ "step": 24500
1607
+ },
1608
+ {
1609
+ "epoch": 28.84,
1610
+ "learning_rate": 5e-06,
1611
+ "loss": 0.5172,
1612
+ "step": 24600
1613
+ },
1614
+ {
1615
+ "epoch": 28.96,
1616
+ "learning_rate": 5e-06,
1617
+ "loss": 0.5658,
1618
+ "step": 24700
1619
+ },
1620
+ {
1621
+ "epoch": 29.07,
1622
+ "learning_rate": 5e-06,
1623
+ "loss": 0.5343,
1624
+ "step": 24800
1625
+ },
1626
+ {
1627
+ "epoch": 29.19,
1628
+ "learning_rate": 5e-06,
1629
+ "loss": 0.5307,
1630
+ "step": 24900
1631
+ },
1632
+ {
1633
+ "epoch": 29.31,
1634
+ "learning_rate": 5e-06,
1635
+ "loss": 0.5386,
1636
+ "step": 25000
1637
+ },
1638
+ {
1639
+ "epoch": 29.43,
1640
+ "learning_rate": 5e-06,
1641
+ "loss": 0.5553,
1642
+ "step": 25100
1643
+ },
1644
+ {
1645
+ "epoch": 29.54,
1646
+ "learning_rate": 5e-06,
1647
+ "loss": 0.5309,
1648
+ "step": 25200
1649
+ },
1650
+ {
1651
+ "epoch": 29.66,
1652
+ "learning_rate": 5e-06,
1653
+ "loss": 0.5323,
1654
+ "step": 25300
1655
+ },
1656
+ {
1657
+ "epoch": 29.78,
1658
+ "learning_rate": 5e-06,
1659
+ "loss": 0.5477,
1660
+ "step": 25400
1661
+ },
1662
+ {
1663
+ "epoch": 29.89,
1664
+ "learning_rate": 5e-06,
1665
+ "loss": 0.5618,
1666
+ "step": 25500
1667
+ },
1668
+ {
1669
+ "epoch": 29.89,
1670
+ "eval_loss": 0.5992656350135803,
1671
+ "eval_runtime": 70.8526,
1672
+ "eval_samples_per_second": 3.006,
1673
+ "eval_steps_per_second": 3.006,
1674
+ "step": 25500
1675
+ },
1676
+ {
1677
+ "epoch": 30.01,
1678
+ "learning_rate": 5e-06,
1679
+ "loss": 0.5368,
1680
+ "step": 25600
1681
+ },
1682
+ {
1683
+ "epoch": 30.13,
1684
+ "learning_rate": 5e-06,
1685
+ "loss": 0.55,
1686
+ "step": 25700
1687
+ },
1688
+ {
1689
+ "epoch": 30.25,
1690
+ "learning_rate": 5e-06,
1691
+ "loss": 0.5138,
1692
+ "step": 25800
1693
+ },
1694
+ {
1695
+ "epoch": 30.36,
1696
+ "learning_rate": 5e-06,
1697
+ "loss": 0.5266,
1698
+ "step": 25900
1699
+ },
1700
+ {
1701
+ "epoch": 30.48,
1702
+ "learning_rate": 5e-06,
1703
+ "loss": 0.5539,
1704
+ "step": 26000
1705
+ },
1706
+ {
1707
+ "epoch": 30.6,
1708
+ "learning_rate": 5e-06,
1709
+ "loss": 0.536,
1710
+ "step": 26100
1711
+ },
1712
+ {
1713
+ "epoch": 30.72,
1714
+ "learning_rate": 5e-06,
1715
+ "loss": 0.5427,
1716
+ "step": 26200
1717
+ },
1718
+ {
1719
+ "epoch": 30.83,
1720
+ "learning_rate": 5e-06,
1721
+ "loss": 0.5496,
1722
+ "step": 26300
1723
+ },
1724
+ {
1725
+ "epoch": 30.95,
1726
+ "learning_rate": 5e-06,
1727
+ "loss": 0.5127,
1728
+ "step": 26400
1729
+ },
1730
+ {
1731
+ "epoch": 31.07,
1732
+ "learning_rate": 5e-06,
1733
+ "loss": 0.5569,
1734
+ "step": 26500
1735
+ },
1736
+ {
1737
+ "epoch": 31.18,
1738
+ "learning_rate": 5e-06,
1739
+ "loss": 0.5196,
1740
+ "step": 26600
1741
+ },
1742
+ {
1743
+ "epoch": 31.3,
1744
+ "learning_rate": 5e-06,
1745
+ "loss": 0.5268,
1746
+ "step": 26700
1747
+ },
1748
+ {
1749
+ "epoch": 31.42,
1750
+ "learning_rate": 5e-06,
1751
+ "loss": 0.5419,
1752
+ "step": 26800
1753
+ },
1754
+ {
1755
+ "epoch": 31.54,
1756
+ "learning_rate": 5e-06,
1757
+ "loss": 0.5087,
1758
+ "step": 26900
1759
+ },
1760
+ {
1761
+ "epoch": 31.65,
1762
+ "learning_rate": 5e-06,
1763
+ "loss": 0.5254,
1764
+ "step": 27000
1765
+ },
1766
+ {
1767
+ "epoch": 31.65,
1768
+ "eval_loss": 0.5909192562103271,
1769
+ "eval_runtime": 70.5089,
1770
+ "eval_samples_per_second": 3.021,
1771
+ "eval_steps_per_second": 3.021,
1772
+ "step": 27000
1773
+ },
1774
+ {
1775
+ "epoch": 31.77,
1776
+ "learning_rate": 5e-06,
1777
+ "loss": 0.5346,
1778
+ "step": 27100
1779
+ },
1780
+ {
1781
+ "epoch": 31.89,
1782
+ "learning_rate": 5e-06,
1783
+ "loss": 0.5279,
1784
+ "step": 27200
1785
+ },
1786
+ {
1787
+ "epoch": 32.0,
1788
+ "learning_rate": 5e-06,
1789
+ "loss": 0.5711,
1790
+ "step": 27300
1791
+ },
1792
+ {
1793
+ "epoch": 32.12,
1794
+ "learning_rate": 5e-06,
1795
+ "loss": 0.5079,
1796
+ "step": 27400
1797
+ },
1798
+ {
1799
+ "epoch": 32.24,
1800
+ "learning_rate": 5e-06,
1801
+ "loss": 0.5303,
1802
+ "step": 27500
1803
+ },
1804
+ {
1805
+ "epoch": 32.36,
1806
+ "learning_rate": 5e-06,
1807
+ "loss": 0.5347,
1808
+ "step": 27600
1809
+ },
1810
+ {
1811
+ "epoch": 32.47,
1812
+ "learning_rate": 5e-06,
1813
+ "loss": 0.4936,
1814
+ "step": 27700
1815
+ },
1816
+ {
1817
+ "epoch": 32.59,
1818
+ "learning_rate": 5e-06,
1819
+ "loss": 0.5303,
1820
+ "step": 27800
1821
+ },
1822
+ {
1823
+ "epoch": 32.71,
1824
+ "learning_rate": 5e-06,
1825
+ "loss": 0.5543,
1826
+ "step": 27900
1827
+ },
1828
+ {
1829
+ "epoch": 32.83,
1830
+ "learning_rate": 5e-06,
1831
+ "loss": 0.5266,
1832
+ "step": 28000
1833
+ },
1834
+ {
1835
+ "epoch": 32.94,
1836
+ "learning_rate": 5e-06,
1837
+ "loss": 0.5258,
1838
+ "step": 28100
1839
+ },
1840
+ {
1841
+ "epoch": 33.06,
1842
+ "learning_rate": 5e-06,
1843
+ "loss": 0.5559,
1844
+ "step": 28200
1845
+ },
1846
+ {
1847
+ "epoch": 33.18,
1848
+ "learning_rate": 5e-06,
1849
+ "loss": 0.5096,
1850
+ "step": 28300
1851
+ },
1852
+ {
1853
+ "epoch": 33.29,
1854
+ "learning_rate": 5e-06,
1855
+ "loss": 0.5427,
1856
+ "step": 28400
1857
+ },
1858
+ {
1859
+ "epoch": 33.41,
1860
+ "learning_rate": 5e-06,
1861
+ "loss": 0.5336,
1862
+ "step": 28500
1863
+ },
1864
+ {
1865
+ "epoch": 33.41,
1866
+ "eval_loss": 0.587517261505127,
1867
+ "eval_runtime": 70.6475,
1868
+ "eval_samples_per_second": 3.015,
1869
+ "eval_steps_per_second": 3.015,
1870
+ "step": 28500
1871
+ },
1872
+ {
1873
+ "epoch": 33.53,
1874
+ "learning_rate": 5e-06,
1875
+ "loss": 0.5419,
1876
+ "step": 28600
1877
+ },
1878
+ {
1879
+ "epoch": 33.65,
1880
+ "learning_rate": 5e-06,
1881
+ "loss": 0.5235,
1882
+ "step": 28700
1883
+ },
1884
+ {
1885
+ "epoch": 33.76,
1886
+ "learning_rate": 5e-06,
1887
+ "loss": 0.5266,
1888
+ "step": 28800
1889
+ },
1890
+ {
1891
+ "epoch": 33.88,
1892
+ "learning_rate": 5e-06,
1893
+ "loss": 0.5308,
1894
+ "step": 28900
1895
+ },
1896
+ {
1897
+ "epoch": 34.0,
1898
+ "learning_rate": 5e-06,
1899
+ "loss": 0.5203,
1900
+ "step": 29000
1901
+ },
1902
+ {
1903
+ "epoch": 34.11,
1904
+ "learning_rate": 5e-06,
1905
+ "loss": 0.4988,
1906
+ "step": 29100
1907
+ },
1908
+ {
1909
+ "epoch": 34.23,
1910
+ "learning_rate": 5e-06,
1911
+ "loss": 0.502,
1912
+ "step": 29200
1913
+ },
1914
+ {
1915
+ "epoch": 34.35,
1916
+ "learning_rate": 5e-06,
1917
+ "loss": 0.5111,
1918
+ "step": 29300
1919
+ },
1920
+ {
1921
+ "epoch": 34.47,
1922
+ "learning_rate": 5e-06,
1923
+ "loss": 0.539,
1924
+ "step": 29400
1925
+ },
1926
+ {
1927
+ "epoch": 34.58,
1928
+ "learning_rate": 5e-06,
1929
+ "loss": 0.5086,
1930
+ "step": 29500
1931
+ },
1932
+ {
1933
+ "epoch": 34.7,
1934
+ "learning_rate": 5e-06,
1935
+ "loss": 0.5285,
1936
+ "step": 29600
1937
+ },
1938
+ {
1939
+ "epoch": 34.82,
1940
+ "learning_rate": 5e-06,
1941
+ "loss": 0.5153,
1942
+ "step": 29700
1943
+ },
1944
+ {
1945
+ "epoch": 34.94,
1946
+ "learning_rate": 5e-06,
1947
+ "loss": 0.5366,
1948
+ "step": 29800
1949
+ },
1950
+ {
1951
+ "epoch": 35.05,
1952
+ "learning_rate": 5e-06,
1953
+ "loss": 0.5307,
1954
+ "step": 29900
1955
+ },
1956
+ {
1957
+ "epoch": 35.17,
1958
+ "learning_rate": 5e-06,
1959
+ "loss": 0.5677,
1960
+ "step": 30000
1961
+ },
1962
+ {
1963
+ "epoch": 35.17,
1964
+ "eval_loss": 0.591385543346405,
1965
+ "eval_runtime": 70.1376,
1966
+ "eval_samples_per_second": 3.037,
1967
+ "eval_steps_per_second": 3.037,
1968
+ "step": 30000
1969
+ },
1970
+ {
1971
+ "epoch": 35.29,
1972
+ "learning_rate": 5e-06,
1973
+ "loss": 0.5283,
1974
+ "step": 30100
1975
+ },
1976
+ {
1977
+ "epoch": 35.4,
1978
+ "learning_rate": 5e-06,
1979
+ "loss": 0.5281,
1980
+ "step": 30200
1981
+ },
1982
+ {
1983
+ "epoch": 35.52,
1984
+ "learning_rate": 5e-06,
1985
+ "loss": 0.5087,
1986
+ "step": 30300
1987
+ },
1988
+ {
1989
+ "epoch": 35.64,
1990
+ "learning_rate": 5e-06,
1991
+ "loss": 0.4965,
1992
+ "step": 30400
1993
+ },
1994
+ {
1995
+ "epoch": 35.76,
1996
+ "learning_rate": 5e-06,
1997
+ "loss": 0.5085,
1998
+ "step": 30500
1999
+ },
2000
+ {
2001
+ "epoch": 35.87,
2002
+ "learning_rate": 5e-06,
2003
+ "loss": 0.5159,
2004
+ "step": 30600
2005
+ },
2006
+ {
2007
+ "epoch": 35.99,
2008
+ "learning_rate": 5e-06,
2009
+ "loss": 0.5149,
2010
+ "step": 30700
2011
+ },
2012
+ {
2013
+ "epoch": 36.11,
2014
+ "learning_rate": 5e-06,
2015
+ "loss": 0.5281,
2016
+ "step": 30800
2017
+ },
2018
+ {
2019
+ "epoch": 36.23,
2020
+ "learning_rate": 5e-06,
2021
+ "loss": 0.511,
2022
+ "step": 30900
2023
+ },
2024
+ {
2025
+ "epoch": 36.34,
2026
+ "learning_rate": 5e-06,
2027
+ "loss": 0.5327,
2028
+ "step": 31000
2029
+ },
2030
+ {
2031
+ "epoch": 36.46,
2032
+ "learning_rate": 5e-06,
2033
+ "loss": 0.5267,
2034
+ "step": 31100
2035
+ },
2036
+ {
2037
+ "epoch": 36.58,
2038
+ "learning_rate": 5e-06,
2039
+ "loss": 0.5124,
2040
+ "step": 31200
2041
+ },
2042
+ {
2043
+ "epoch": 36.69,
2044
+ "learning_rate": 5e-06,
2045
+ "loss": 0.5069,
2046
+ "step": 31300
2047
+ },
2048
+ {
2049
+ "epoch": 36.81,
2050
+ "learning_rate": 5e-06,
2051
+ "loss": 0.4839,
2052
+ "step": 31400
2053
+ },
2054
+ {
2055
+ "epoch": 36.93,
2056
+ "learning_rate": 5e-06,
2057
+ "loss": 0.5009,
2058
+ "step": 31500
2059
+ },
2060
+ {
2061
+ "epoch": 36.93,
2062
+ "eval_loss": 0.6007654070854187,
2063
+ "eval_runtime": 70.1373,
2064
+ "eval_samples_per_second": 3.037,
2065
+ "eval_steps_per_second": 3.037,
2066
+ "step": 31500
2067
+ },
2068
+ {
2069
+ "epoch": 37.05,
2070
+ "learning_rate": 5e-06,
2071
+ "loss": 0.5169,
2072
+ "step": 31600
2073
+ },
2074
+ {
2075
+ "epoch": 37.16,
2076
+ "learning_rate": 5e-06,
2077
+ "loss": 0.5003,
2078
+ "step": 31700
2079
+ },
2080
+ {
2081
+ "epoch": 37.28,
2082
+ "learning_rate": 5e-06,
2083
+ "loss": 0.502,
2084
+ "step": 31800
2085
+ },
2086
+ {
2087
+ "epoch": 37.4,
2088
+ "learning_rate": 5e-06,
2089
+ "loss": 0.5149,
2090
+ "step": 31900
2091
+ },
2092
+ {
2093
+ "epoch": 37.51,
2094
+ "learning_rate": 5e-06,
2095
+ "loss": 0.4903,
2096
+ "step": 32000
2097
+ }
2098
+ ],
2099
+ "max_steps": 255900,
2100
+ "num_train_epochs": 300,
2101
+ "total_flos": 3.193778270424269e+19,
2102
+ "trial_name": null,
2103
+ "trial_params": null
2104
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d735c637c0acc0c7892e4e8dfe1b777e8c1ca3b71a525d3777a0d09bdedf3b02
3
+ size 3515