laverdes commited on
Commit
eae8886
1 Parent(s): 0d20dba

Upload checkpoints with huggingface_hub

Browse files
checkpoints/config.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "3bb6803e5ea104ff51596e7066792ef0db0317f3",
3
+ "_name_or_path": "laverdes/donut-commoncrawl-mid",
4
+ "architectures": [
5
+ "VisionEncoderDecoderModel"
6
+ ],
7
+ "decoder": {
8
+ "_name_or_path": "",
9
+ "activation_dropout": 0.0,
10
+ "activation_function": "gelu",
11
+ "add_cross_attention": true,
12
+ "add_final_layer_norm": true,
13
+ "architectures": null,
14
+ "attention_dropout": 0.0,
15
+ "bad_words_ids": null,
16
+ "begin_suppress_tokens": null,
17
+ "bos_token_id": 0,
18
+ "chunk_size_feed_forward": 0,
19
+ "classifier_dropout": 0.0,
20
+ "cross_attention_hidden_size": null,
21
+ "d_model": 1024,
22
+ "decoder_attention_heads": 16,
23
+ "decoder_ffn_dim": 4096,
24
+ "decoder_layerdrop": 0.0,
25
+ "decoder_layers": 4,
26
+ "decoder_start_token_id": null,
27
+ "diversity_penalty": 0.0,
28
+ "do_sample": false,
29
+ "dropout": 0.1,
30
+ "early_stopping": false,
31
+ "encoder_attention_heads": 16,
32
+ "encoder_ffn_dim": 4096,
33
+ "encoder_layerdrop": 0.0,
34
+ "encoder_layers": 12,
35
+ "encoder_no_repeat_ngram_size": 0,
36
+ "eos_token_id": 2,
37
+ "exponential_decay_length_penalty": null,
38
+ "finetuning_task": null,
39
+ "forced_bos_token_id": null,
40
+ "forced_eos_token_id": 2,
41
+ "id2label": {
42
+ "0": "LABEL_0",
43
+ "1": "LABEL_1"
44
+ },
45
+ "init_std": 0.02,
46
+ "is_decoder": true,
47
+ "is_encoder_decoder": false,
48
+ "label2id": {
49
+ "LABEL_0": 0,
50
+ "LABEL_1": 1
51
+ },
52
+ "length_penalty": 1.0,
53
+ "max_length": 1024,
54
+ "max_position_embeddings": 1536,
55
+ "min_length": 0,
56
+ "model_type": "mbart",
57
+ "no_repeat_ngram_size": 0,
58
+ "num_beam_groups": 1,
59
+ "num_beams": 1,
60
+ "num_hidden_layers": 12,
61
+ "num_return_sequences": 1,
62
+ "output_attentions": false,
63
+ "output_hidden_states": false,
64
+ "output_scores": false,
65
+ "pad_token_id": 1,
66
+ "prefix": null,
67
+ "problem_type": null,
68
+ "pruned_heads": {},
69
+ "remove_invalid_values": false,
70
+ "repetition_penalty": 1.0,
71
+ "return_dict": true,
72
+ "return_dict_in_generate": false,
73
+ "scale_embedding": true,
74
+ "sep_token_id": null,
75
+ "suppress_tokens": null,
76
+ "task_specific_params": null,
77
+ "temperature": 1.0,
78
+ "tf_legacy_loss": false,
79
+ "tie_encoder_decoder": false,
80
+ "tie_word_embeddings": true,
81
+ "tokenizer_class": null,
82
+ "top_k": 50,
83
+ "top_p": 1.0,
84
+ "torch_dtype": null,
85
+ "torchscript": false,
86
+ "transformers_version": "4.28.1",
87
+ "typical_p": 1.0,
88
+ "use_bfloat16": false,
89
+ "use_cache": true,
90
+ "vocab_size": 57569
91
+ },
92
+ "decoder_start_token_id": 0,
93
+ "encoder": {
94
+ "_name_or_path": "",
95
+ "add_cross_attention": false,
96
+ "architectures": null,
97
+ "attention_probs_dropout_prob": 0.0,
98
+ "bad_words_ids": null,
99
+ "begin_suppress_tokens": null,
100
+ "bos_token_id": null,
101
+ "chunk_size_feed_forward": 0,
102
+ "cross_attention_hidden_size": null,
103
+ "decoder_start_token_id": null,
104
+ "depths": [
105
+ 2,
106
+ 2,
107
+ 14,
108
+ 2
109
+ ],
110
+ "diversity_penalty": 0.0,
111
+ "do_sample": false,
112
+ "drop_path_rate": 0.1,
113
+ "early_stopping": false,
114
+ "embed_dim": 128,
115
+ "encoder_no_repeat_ngram_size": 0,
116
+ "eos_token_id": null,
117
+ "exponential_decay_length_penalty": null,
118
+ "finetuning_task": null,
119
+ "forced_bos_token_id": null,
120
+ "forced_eos_token_id": null,
121
+ "hidden_act": "gelu",
122
+ "hidden_dropout_prob": 0.0,
123
+ "hidden_size": 1024,
124
+ "id2label": {
125
+ "0": "LABEL_0",
126
+ "1": "LABEL_1"
127
+ },
128
+ "image_size": [
129
+ 960,
130
+ 720
131
+ ],
132
+ "initializer_range": 0.02,
133
+ "is_decoder": false,
134
+ "is_encoder_decoder": false,
135
+ "label2id": {
136
+ "LABEL_0": 0,
137
+ "LABEL_1": 1
138
+ },
139
+ "layer_norm_eps": 1e-05,
140
+ "length_penalty": 1.0,
141
+ "max_length": 20,
142
+ "min_length": 0,
143
+ "mlp_ratio": 4.0,
144
+ "model_type": "donut-swin",
145
+ "no_repeat_ngram_size": 0,
146
+ "num_beam_groups": 1,
147
+ "num_beams": 1,
148
+ "num_channels": 3,
149
+ "num_heads": [
150
+ 4,
151
+ 8,
152
+ 16,
153
+ 32
154
+ ],
155
+ "num_layers": 4,
156
+ "num_return_sequences": 1,
157
+ "output_attentions": false,
158
+ "output_hidden_states": false,
159
+ "output_scores": false,
160
+ "pad_token_id": null,
161
+ "patch_size": 4,
162
+ "path_norm": true,
163
+ "prefix": null,
164
+ "problem_type": null,
165
+ "pruned_heads": {},
166
+ "qkv_bias": true,
167
+ "remove_invalid_values": false,
168
+ "repetition_penalty": 1.0,
169
+ "return_dict": true,
170
+ "return_dict_in_generate": false,
171
+ "sep_token_id": null,
172
+ "suppress_tokens": null,
173
+ "task_specific_params": null,
174
+ "temperature": 1.0,
175
+ "tf_legacy_loss": false,
176
+ "tie_encoder_decoder": false,
177
+ "tie_word_embeddings": true,
178
+ "tokenizer_class": null,
179
+ "top_k": 50,
180
+ "top_p": 1.0,
181
+ "torch_dtype": null,
182
+ "torchscript": false,
183
+ "transformers_version": "4.28.1",
184
+ "typical_p": 1.0,
185
+ "use_absolute_embeddings": false,
186
+ "use_bfloat16": false,
187
+ "window_size": 10
188
+ },
189
+ "is_encoder_decoder": true,
190
+ "model_type": "vision-encoder-decoder",
191
+ "pad_token_id": 1,
192
+ "tie_word_embeddings": false,
193
+ "torch_dtype": "float32",
194
+ "transformers_version": null
195
+ }
checkpoints/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 0,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "max_length": 1024,
8
+ "pad_token_id": 1,
9
+ "transformers_version": "4.28.1"
10
+ }
checkpoints/trainer_state.json ADDED
@@ -0,0 +1,1822 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 11.0,
5
+ "global_step": 12914,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.09,
12
+ "learning_rate": 1.9914821124361162e-05,
13
+ "loss": 4.8491,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.09,
18
+ "eval_loss": 4.696099281311035,
19
+ "eval_runtime": 907.3534,
20
+ "eval_samples_per_second": 0.323,
21
+ "eval_steps_per_second": 0.041,
22
+ "step": 100
23
+ },
24
+ {
25
+ "epoch": 0.17,
26
+ "learning_rate": 1.982964224872232e-05,
27
+ "loss": 4.88,
28
+ "step": 200
29
+ },
30
+ {
31
+ "epoch": 0.17,
32
+ "eval_loss": 4.71148681640625,
33
+ "eval_runtime": 901.0601,
34
+ "eval_samples_per_second": 0.325,
35
+ "eval_steps_per_second": 0.041,
36
+ "step": 200
37
+ },
38
+ {
39
+ "epoch": 0.26,
40
+ "learning_rate": 1.9744463373083477e-05,
41
+ "loss": 4.6409,
42
+ "step": 300
43
+ },
44
+ {
45
+ "epoch": 0.26,
46
+ "eval_loss": 4.671314239501953,
47
+ "eval_runtime": 906.18,
48
+ "eval_samples_per_second": 0.323,
49
+ "eval_steps_per_second": 0.041,
50
+ "step": 300
51
+ },
52
+ {
53
+ "epoch": 0.34,
54
+ "learning_rate": 1.9659284497444635e-05,
55
+ "loss": 4.7222,
56
+ "step": 400
57
+ },
58
+ {
59
+ "epoch": 0.34,
60
+ "eval_loss": 4.637036323547363,
61
+ "eval_runtime": 903.4161,
62
+ "eval_samples_per_second": 0.324,
63
+ "eval_steps_per_second": 0.041,
64
+ "step": 400
65
+ },
66
+ {
67
+ "epoch": 0.43,
68
+ "learning_rate": 1.9574105621805795e-05,
69
+ "loss": 4.6357,
70
+ "step": 500
71
+ },
72
+ {
73
+ "epoch": 0.43,
74
+ "eval_loss": 4.6376471519470215,
75
+ "eval_runtime": 912.9746,
76
+ "eval_samples_per_second": 0.321,
77
+ "eval_steps_per_second": 0.041,
78
+ "step": 500
79
+ },
80
+ {
81
+ "epoch": 0.51,
82
+ "learning_rate": 1.9488926746166953e-05,
83
+ "loss": 4.6993,
84
+ "step": 600
85
+ },
86
+ {
87
+ "epoch": 0.51,
88
+ "eval_loss": 4.662974834442139,
89
+ "eval_runtime": 908.9893,
90
+ "eval_samples_per_second": 0.322,
91
+ "eval_steps_per_second": 0.041,
92
+ "step": 600
93
+ },
94
+ {
95
+ "epoch": 0.6,
96
+ "learning_rate": 1.940374787052811e-05,
97
+ "loss": 4.7104,
98
+ "step": 700
99
+ },
100
+ {
101
+ "epoch": 0.6,
102
+ "eval_loss": 4.587018966674805,
103
+ "eval_runtime": 913.6781,
104
+ "eval_samples_per_second": 0.321,
105
+ "eval_steps_per_second": 0.04,
106
+ "step": 700
107
+ },
108
+ {
109
+ "epoch": 0.68,
110
+ "learning_rate": 1.9318568994889268e-05,
111
+ "loss": 4.5864,
112
+ "step": 800
113
+ },
114
+ {
115
+ "epoch": 0.68,
116
+ "eval_loss": 4.565457344055176,
117
+ "eval_runtime": 900.1349,
118
+ "eval_samples_per_second": 0.326,
119
+ "eval_steps_per_second": 0.041,
120
+ "step": 800
121
+ },
122
+ {
123
+ "epoch": 0.77,
124
+ "learning_rate": 1.923339011925043e-05,
125
+ "loss": 4.6003,
126
+ "step": 900
127
+ },
128
+ {
129
+ "epoch": 0.77,
130
+ "eval_loss": 4.560632705688477,
131
+ "eval_runtime": 913.1928,
132
+ "eval_samples_per_second": 0.321,
133
+ "eval_steps_per_second": 0.041,
134
+ "step": 900
135
+ },
136
+ {
137
+ "epoch": 0.85,
138
+ "learning_rate": 1.9148211243611586e-05,
139
+ "loss": 4.6166,
140
+ "step": 1000
141
+ },
142
+ {
143
+ "epoch": 0.85,
144
+ "eval_loss": 4.5305914878845215,
145
+ "eval_runtime": 898.0505,
146
+ "eval_samples_per_second": 0.326,
147
+ "eval_steps_per_second": 0.041,
148
+ "step": 1000
149
+ },
150
+ {
151
+ "epoch": 0.94,
152
+ "learning_rate": 1.9063032367972743e-05,
153
+ "loss": 4.6019,
154
+ "step": 1100
155
+ },
156
+ {
157
+ "epoch": 0.94,
158
+ "eval_loss": 4.547730922698975,
159
+ "eval_runtime": 916.6743,
160
+ "eval_samples_per_second": 0.32,
161
+ "eval_steps_per_second": 0.04,
162
+ "step": 1100
163
+ },
164
+ {
165
+ "epoch": 1.02,
166
+ "learning_rate": 1.89778534923339e-05,
167
+ "loss": 4.4833,
168
+ "step": 1200
169
+ },
170
+ {
171
+ "epoch": 1.02,
172
+ "eval_loss": 4.527022838592529,
173
+ "eval_runtime": 905.1447,
174
+ "eval_samples_per_second": 0.324,
175
+ "eval_steps_per_second": 0.041,
176
+ "step": 1200
177
+ },
178
+ {
179
+ "epoch": 1.11,
180
+ "learning_rate": 1.889267461669506e-05,
181
+ "loss": 4.4664,
182
+ "step": 1300
183
+ },
184
+ {
185
+ "epoch": 1.11,
186
+ "eval_loss": 4.533600807189941,
187
+ "eval_runtime": 900.1154,
188
+ "eval_samples_per_second": 0.326,
189
+ "eval_steps_per_second": 0.041,
190
+ "step": 1300
191
+ },
192
+ {
193
+ "epoch": 1.19,
194
+ "learning_rate": 1.880749574105622e-05,
195
+ "loss": 4.4533,
196
+ "step": 1400
197
+ },
198
+ {
199
+ "epoch": 1.19,
200
+ "eval_loss": 4.512392044067383,
201
+ "eval_runtime": 897.9913,
202
+ "eval_samples_per_second": 0.326,
203
+ "eval_steps_per_second": 0.041,
204
+ "step": 1400
205
+ },
206
+ {
207
+ "epoch": 1.28,
208
+ "learning_rate": 1.872231686541738e-05,
209
+ "loss": 4.2969,
210
+ "step": 1500
211
+ },
212
+ {
213
+ "epoch": 1.28,
214
+ "eval_loss": 4.5288238525390625,
215
+ "eval_runtime": 902.8396,
216
+ "eval_samples_per_second": 0.325,
217
+ "eval_steps_per_second": 0.041,
218
+ "step": 1500
219
+ },
220
+ {
221
+ "epoch": 1.36,
222
+ "learning_rate": 1.8637137989778534e-05,
223
+ "loss": 4.4203,
224
+ "step": 1600
225
+ },
226
+ {
227
+ "epoch": 1.36,
228
+ "eval_loss": 4.510149002075195,
229
+ "eval_runtime": 896.5776,
230
+ "eval_samples_per_second": 0.327,
231
+ "eval_steps_per_second": 0.041,
232
+ "step": 1600
233
+ },
234
+ {
235
+ "epoch": 1.45,
236
+ "learning_rate": 1.8551959114139694e-05,
237
+ "loss": 4.4828,
238
+ "step": 1700
239
+ },
240
+ {
241
+ "epoch": 1.45,
242
+ "eval_loss": 4.4747633934021,
243
+ "eval_runtime": 900.0595,
244
+ "eval_samples_per_second": 0.326,
245
+ "eval_steps_per_second": 0.041,
246
+ "step": 1700
247
+ },
248
+ {
249
+ "epoch": 1.53,
250
+ "learning_rate": 1.8466780238500855e-05,
251
+ "loss": 4.5045,
252
+ "step": 1800
253
+ },
254
+ {
255
+ "epoch": 1.53,
256
+ "eval_loss": 4.477636337280273,
257
+ "eval_runtime": 930.3322,
258
+ "eval_samples_per_second": 0.315,
259
+ "eval_steps_per_second": 0.04,
260
+ "step": 1800
261
+ },
262
+ {
263
+ "epoch": 1.62,
264
+ "learning_rate": 1.8381601362862013e-05,
265
+ "loss": 4.2977,
266
+ "step": 1900
267
+ },
268
+ {
269
+ "epoch": 1.62,
270
+ "eval_loss": 4.475375175476074,
271
+ "eval_runtime": 897.0451,
272
+ "eval_samples_per_second": 0.327,
273
+ "eval_steps_per_second": 0.041,
274
+ "step": 1900
275
+ },
276
+ {
277
+ "epoch": 1.7,
278
+ "learning_rate": 1.829642248722317e-05,
279
+ "loss": 4.4022,
280
+ "step": 2000
281
+ },
282
+ {
283
+ "epoch": 1.7,
284
+ "eval_loss": 4.451622486114502,
285
+ "eval_runtime": 924.228,
286
+ "eval_samples_per_second": 0.317,
287
+ "eval_steps_per_second": 0.04,
288
+ "step": 2000
289
+ },
290
+ {
291
+ "epoch": 1.79,
292
+ "learning_rate": 1.8211243611584328e-05,
293
+ "loss": 4.3429,
294
+ "step": 2100
295
+ },
296
+ {
297
+ "epoch": 1.79,
298
+ "eval_loss": 4.43123722076416,
299
+ "eval_runtime": 903.4516,
300
+ "eval_samples_per_second": 0.324,
301
+ "eval_steps_per_second": 0.041,
302
+ "step": 2100
303
+ },
304
+ {
305
+ "epoch": 1.87,
306
+ "learning_rate": 1.812606473594549e-05,
307
+ "loss": 4.3748,
308
+ "step": 2200
309
+ },
310
+ {
311
+ "epoch": 1.87,
312
+ "eval_loss": 4.475191116333008,
313
+ "eval_runtime": 901.3458,
314
+ "eval_samples_per_second": 0.325,
315
+ "eval_steps_per_second": 0.041,
316
+ "step": 2200
317
+ },
318
+ {
319
+ "epoch": 1.96,
320
+ "learning_rate": 1.8040885860306646e-05,
321
+ "loss": 4.343,
322
+ "step": 2300
323
+ },
324
+ {
325
+ "epoch": 1.96,
326
+ "eval_loss": 4.455069541931152,
327
+ "eval_runtime": 919.5227,
328
+ "eval_samples_per_second": 0.319,
329
+ "eval_steps_per_second": 0.04,
330
+ "step": 2300
331
+ },
332
+ {
333
+ "epoch": 2.04,
334
+ "learning_rate": 1.7955706984667803e-05,
335
+ "loss": 4.1796,
336
+ "step": 2400
337
+ },
338
+ {
339
+ "epoch": 2.04,
340
+ "eval_loss": 4.424516201019287,
341
+ "eval_runtime": 912.0297,
342
+ "eval_samples_per_second": 0.321,
343
+ "eval_steps_per_second": 0.041,
344
+ "step": 2400
345
+ },
346
+ {
347
+ "epoch": 2.13,
348
+ "learning_rate": 1.787052810902896e-05,
349
+ "loss": 4.2325,
350
+ "step": 2500
351
+ },
352
+ {
353
+ "epoch": 2.13,
354
+ "eval_loss": 4.425054550170898,
355
+ "eval_runtime": 900.4656,
356
+ "eval_samples_per_second": 0.325,
357
+ "eval_steps_per_second": 0.041,
358
+ "step": 2500
359
+ },
360
+ {
361
+ "epoch": 2.21,
362
+ "learning_rate": 1.778534923339012e-05,
363
+ "loss": 4.2419,
364
+ "step": 2600
365
+ },
366
+ {
367
+ "epoch": 2.21,
368
+ "eval_loss": 4.426344871520996,
369
+ "eval_runtime": 907.7136,
370
+ "eval_samples_per_second": 0.323,
371
+ "eval_steps_per_second": 0.041,
372
+ "step": 2600
373
+ },
374
+ {
375
+ "epoch": 2.3,
376
+ "learning_rate": 1.770017035775128e-05,
377
+ "loss": 4.2556,
378
+ "step": 2700
379
+ },
380
+ {
381
+ "epoch": 2.3,
382
+ "eval_loss": 4.392487525939941,
383
+ "eval_runtime": 904.6208,
384
+ "eval_samples_per_second": 0.324,
385
+ "eval_steps_per_second": 0.041,
386
+ "step": 2700
387
+ },
388
+ {
389
+ "epoch": 2.39,
390
+ "learning_rate": 1.761499148211244e-05,
391
+ "loss": 4.2501,
392
+ "step": 2800
393
+ },
394
+ {
395
+ "epoch": 2.39,
396
+ "eval_loss": 4.414584636688232,
397
+ "eval_runtime": 904.1877,
398
+ "eval_samples_per_second": 0.324,
399
+ "eval_steps_per_second": 0.041,
400
+ "step": 2800
401
+ },
402
+ {
403
+ "epoch": 2.47,
404
+ "learning_rate": 1.7529812606473594e-05,
405
+ "loss": 4.2199,
406
+ "step": 2900
407
+ },
408
+ {
409
+ "epoch": 2.47,
410
+ "eval_loss": 4.391025543212891,
411
+ "eval_runtime": 904.8782,
412
+ "eval_samples_per_second": 0.324,
413
+ "eval_steps_per_second": 0.041,
414
+ "step": 2900
415
+ },
416
+ {
417
+ "epoch": 2.56,
418
+ "learning_rate": 1.7444633730834754e-05,
419
+ "loss": 4.1842,
420
+ "step": 3000
421
+ },
422
+ {
423
+ "epoch": 2.56,
424
+ "eval_loss": 4.3928704261779785,
425
+ "eval_runtime": 911.7639,
426
+ "eval_samples_per_second": 0.321,
427
+ "eval_steps_per_second": 0.041,
428
+ "step": 3000
429
+ },
430
+ {
431
+ "epoch": 2.64,
432
+ "learning_rate": 1.7359454855195912e-05,
433
+ "loss": 4.1789,
434
+ "step": 3100
435
+ },
436
+ {
437
+ "epoch": 2.64,
438
+ "eval_loss": 4.373032569885254,
439
+ "eval_runtime": 903.6693,
440
+ "eval_samples_per_second": 0.324,
441
+ "eval_steps_per_second": 0.041,
442
+ "step": 3100
443
+ },
444
+ {
445
+ "epoch": 2.73,
446
+ "learning_rate": 1.7274275979557073e-05,
447
+ "loss": 4.1993,
448
+ "step": 3200
449
+ },
450
+ {
451
+ "epoch": 2.73,
452
+ "eval_loss": 4.401707172393799,
453
+ "eval_runtime": 906.6931,
454
+ "eval_samples_per_second": 0.323,
455
+ "eval_steps_per_second": 0.041,
456
+ "step": 3200
457
+ },
458
+ {
459
+ "epoch": 2.81,
460
+ "learning_rate": 1.718909710391823e-05,
461
+ "loss": 4.1348,
462
+ "step": 3300
463
+ },
464
+ {
465
+ "epoch": 2.81,
466
+ "eval_loss": 4.366961479187012,
467
+ "eval_runtime": 909.5878,
468
+ "eval_samples_per_second": 0.322,
469
+ "eval_steps_per_second": 0.041,
470
+ "step": 3300
471
+ },
472
+ {
473
+ "epoch": 2.9,
474
+ "learning_rate": 1.7103918228279387e-05,
475
+ "loss": 4.196,
476
+ "step": 3400
477
+ },
478
+ {
479
+ "epoch": 2.9,
480
+ "eval_loss": 4.354998588562012,
481
+ "eval_runtime": 914.5578,
482
+ "eval_samples_per_second": 0.32,
483
+ "eval_steps_per_second": 0.04,
484
+ "step": 3400
485
+ },
486
+ {
487
+ "epoch": 2.98,
488
+ "learning_rate": 1.7018739352640548e-05,
489
+ "loss": 4.1906,
490
+ "step": 3500
491
+ },
492
+ {
493
+ "epoch": 2.98,
494
+ "eval_loss": 4.349732875823975,
495
+ "eval_runtime": 898.3438,
496
+ "eval_samples_per_second": 0.326,
497
+ "eval_steps_per_second": 0.041,
498
+ "step": 3500
499
+ },
500
+ {
501
+ "epoch": 3.07,
502
+ "learning_rate": 1.6933560477001706e-05,
503
+ "loss": 4.0583,
504
+ "step": 3600
505
+ },
506
+ {
507
+ "epoch": 3.07,
508
+ "eval_loss": 4.3627471923828125,
509
+ "eval_runtime": 897.909,
510
+ "eval_samples_per_second": 0.326,
511
+ "eval_steps_per_second": 0.041,
512
+ "step": 3600
513
+ },
514
+ {
515
+ "epoch": 3.15,
516
+ "learning_rate": 1.6848381601362863e-05,
517
+ "loss": 4.1298,
518
+ "step": 3700
519
+ },
520
+ {
521
+ "epoch": 3.15,
522
+ "eval_loss": 4.362400531768799,
523
+ "eval_runtime": 912.2819,
524
+ "eval_samples_per_second": 0.321,
525
+ "eval_steps_per_second": 0.041,
526
+ "step": 3700
527
+ },
528
+ {
529
+ "epoch": 3.24,
530
+ "learning_rate": 1.676320272572402e-05,
531
+ "loss": 4.0244,
532
+ "step": 3800
533
+ },
534
+ {
535
+ "epoch": 3.24,
536
+ "eval_loss": 4.338736534118652,
537
+ "eval_runtime": 896.0396,
538
+ "eval_samples_per_second": 0.327,
539
+ "eval_steps_per_second": 0.041,
540
+ "step": 3800
541
+ },
542
+ {
543
+ "epoch": 3.32,
544
+ "learning_rate": 1.667802385008518e-05,
545
+ "loss": 4.0549,
546
+ "step": 3900
547
+ },
548
+ {
549
+ "epoch": 3.32,
550
+ "eval_loss": 4.341971397399902,
551
+ "eval_runtime": 901.737,
552
+ "eval_samples_per_second": 0.325,
553
+ "eval_steps_per_second": 0.041,
554
+ "step": 3900
555
+ },
556
+ {
557
+ "epoch": 3.41,
558
+ "learning_rate": 1.659284497444634e-05,
559
+ "loss": 4.0006,
560
+ "step": 4000
561
+ },
562
+ {
563
+ "epoch": 3.41,
564
+ "eval_loss": 4.326377868652344,
565
+ "eval_runtime": 898.1346,
566
+ "eval_samples_per_second": 0.326,
567
+ "eval_steps_per_second": 0.041,
568
+ "step": 4000
569
+ },
570
+ {
571
+ "epoch": 3.49,
572
+ "learning_rate": 1.65076660988075e-05,
573
+ "loss": 4.0435,
574
+ "step": 4100
575
+ },
576
+ {
577
+ "epoch": 3.49,
578
+ "eval_loss": 4.331975936889648,
579
+ "eval_runtime": 901.1995,
580
+ "eval_samples_per_second": 0.325,
581
+ "eval_steps_per_second": 0.041,
582
+ "step": 4100
583
+ },
584
+ {
585
+ "epoch": 3.58,
586
+ "learning_rate": 1.6422487223168653e-05,
587
+ "loss": 4.0948,
588
+ "step": 4200
589
+ },
590
+ {
591
+ "epoch": 3.58,
592
+ "eval_loss": 4.32388973236084,
593
+ "eval_runtime": 904.6488,
594
+ "eval_samples_per_second": 0.324,
595
+ "eval_steps_per_second": 0.041,
596
+ "step": 4200
597
+ },
598
+ {
599
+ "epoch": 3.66,
600
+ "learning_rate": 1.6337308347529814e-05,
601
+ "loss": 3.933,
602
+ "step": 4300
603
+ },
604
+ {
605
+ "epoch": 3.66,
606
+ "eval_loss": 4.314174175262451,
607
+ "eval_runtime": 902.3829,
608
+ "eval_samples_per_second": 0.325,
609
+ "eval_steps_per_second": 0.041,
610
+ "step": 4300
611
+ },
612
+ {
613
+ "epoch": 3.75,
614
+ "learning_rate": 1.625212947189097e-05,
615
+ "loss": 3.9773,
616
+ "step": 4400
617
+ },
618
+ {
619
+ "epoch": 3.75,
620
+ "eval_loss": 4.353682041168213,
621
+ "eval_runtime": 901.0754,
622
+ "eval_samples_per_second": 0.325,
623
+ "eval_steps_per_second": 0.041,
624
+ "step": 4400
625
+ },
626
+ {
627
+ "epoch": 3.83,
628
+ "learning_rate": 1.6166950596252132e-05,
629
+ "loss": 4.1143,
630
+ "step": 4500
631
+ },
632
+ {
633
+ "epoch": 3.83,
634
+ "eval_loss": 4.3104963302612305,
635
+ "eval_runtime": 903.7164,
636
+ "eval_samples_per_second": 0.324,
637
+ "eval_steps_per_second": 0.041,
638
+ "step": 4500
639
+ },
640
+ {
641
+ "epoch": 3.92,
642
+ "learning_rate": 1.608177172061329e-05,
643
+ "loss": 3.961,
644
+ "step": 4600
645
+ },
646
+ {
647
+ "epoch": 3.92,
648
+ "eval_loss": 4.292612552642822,
649
+ "eval_runtime": 900.6329,
650
+ "eval_samples_per_second": 0.325,
651
+ "eval_steps_per_second": 0.041,
652
+ "step": 4600
653
+ },
654
+ {
655
+ "epoch": 4.0,
656
+ "learning_rate": 1.5996592844974447e-05,
657
+ "loss": 4.0047,
658
+ "step": 4700
659
+ },
660
+ {
661
+ "epoch": 4.0,
662
+ "eval_loss": 4.3063883781433105,
663
+ "eval_runtime": 900.2789,
664
+ "eval_samples_per_second": 0.325,
665
+ "eval_steps_per_second": 0.041,
666
+ "step": 4700
667
+ },
668
+ {
669
+ "epoch": 4.09,
670
+ "learning_rate": 1.5911413969335605e-05,
671
+ "loss": 3.9058,
672
+ "step": 4800
673
+ },
674
+ {
675
+ "epoch": 4.09,
676
+ "eval_loss": 4.303318023681641,
677
+ "eval_runtime": 906.0598,
678
+ "eval_samples_per_second": 0.323,
679
+ "eval_steps_per_second": 0.041,
680
+ "step": 4800
681
+ },
682
+ {
683
+ "epoch": 4.17,
684
+ "learning_rate": 1.5826235093696765e-05,
685
+ "loss": 3.7936,
686
+ "step": 4900
687
+ },
688
+ {
689
+ "epoch": 4.17,
690
+ "eval_loss": 4.301506996154785,
691
+ "eval_runtime": 906.1238,
692
+ "eval_samples_per_second": 0.323,
693
+ "eval_steps_per_second": 0.041,
694
+ "step": 4900
695
+ },
696
+ {
697
+ "epoch": 4.26,
698
+ "learning_rate": 1.5741056218057923e-05,
699
+ "loss": 3.8733,
700
+ "step": 5000
701
+ },
702
+ {
703
+ "epoch": 4.26,
704
+ "eval_loss": 4.344919681549072,
705
+ "eval_runtime": 916.0111,
706
+ "eval_samples_per_second": 0.32,
707
+ "eval_steps_per_second": 0.04,
708
+ "step": 5000
709
+ },
710
+ {
711
+ "epoch": 4.34,
712
+ "learning_rate": 1.565587734241908e-05,
713
+ "loss": 3.8625,
714
+ "step": 5100
715
+ },
716
+ {
717
+ "epoch": 4.34,
718
+ "eval_loss": 4.274160385131836,
719
+ "eval_runtime": 906.5215,
720
+ "eval_samples_per_second": 0.323,
721
+ "eval_steps_per_second": 0.041,
722
+ "step": 5100
723
+ },
724
+ {
725
+ "epoch": 4.43,
726
+ "learning_rate": 1.557069846678024e-05,
727
+ "loss": 4.0015,
728
+ "step": 5200
729
+ },
730
+ {
731
+ "epoch": 4.43,
732
+ "eval_loss": 4.2957258224487305,
733
+ "eval_runtime": 920.9763,
734
+ "eval_samples_per_second": 0.318,
735
+ "eval_steps_per_second": 0.04,
736
+ "step": 5200
737
+ },
738
+ {
739
+ "epoch": 4.51,
740
+ "learning_rate": 1.54855195911414e-05,
741
+ "loss": 4.0173,
742
+ "step": 5300
743
+ },
744
+ {
745
+ "epoch": 4.51,
746
+ "eval_loss": 4.30204439163208,
747
+ "eval_runtime": 900.2302,
748
+ "eval_samples_per_second": 0.325,
749
+ "eval_steps_per_second": 0.041,
750
+ "step": 5300
751
+ },
752
+ {
753
+ "epoch": 4.6,
754
+ "learning_rate": 1.5400340715502556e-05,
755
+ "loss": 3.8079,
756
+ "step": 5400
757
+ },
758
+ {
759
+ "epoch": 4.6,
760
+ "eval_loss": 4.274052619934082,
761
+ "eval_runtime": 902.0347,
762
+ "eval_samples_per_second": 0.325,
763
+ "eval_steps_per_second": 0.041,
764
+ "step": 5400
765
+ },
766
+ {
767
+ "epoch": 4.68,
768
+ "learning_rate": 1.5315161839863713e-05,
769
+ "loss": 4.0081,
770
+ "step": 5500
771
+ },
772
+ {
773
+ "epoch": 4.68,
774
+ "eval_loss": 4.2950968742370605,
775
+ "eval_runtime": 910.5366,
776
+ "eval_samples_per_second": 0.322,
777
+ "eval_steps_per_second": 0.041,
778
+ "step": 5500
779
+ },
780
+ {
781
+ "epoch": 4.77,
782
+ "learning_rate": 1.5229982964224874e-05,
783
+ "loss": 3.8823,
784
+ "step": 5600
785
+ },
786
+ {
787
+ "epoch": 4.77,
788
+ "eval_loss": 4.277224540710449,
789
+ "eval_runtime": 901.0907,
790
+ "eval_samples_per_second": 0.325,
791
+ "eval_steps_per_second": 0.041,
792
+ "step": 5600
793
+ },
794
+ {
795
+ "epoch": 4.86,
796
+ "learning_rate": 1.5144804088586031e-05,
797
+ "loss": 3.9056,
798
+ "step": 5700
799
+ },
800
+ {
801
+ "epoch": 4.86,
802
+ "eval_loss": 4.272303581237793,
803
+ "eval_runtime": 911.2022,
804
+ "eval_samples_per_second": 0.322,
805
+ "eval_steps_per_second": 0.041,
806
+ "step": 5700
807
+ },
808
+ {
809
+ "epoch": 4.94,
810
+ "learning_rate": 1.505962521294719e-05,
811
+ "loss": 3.813,
812
+ "step": 5800
813
+ },
814
+ {
815
+ "epoch": 4.94,
816
+ "eval_loss": 4.275490760803223,
817
+ "eval_runtime": 913.2487,
818
+ "eval_samples_per_second": 0.321,
819
+ "eval_steps_per_second": 0.041,
820
+ "step": 5800
821
+ },
822
+ {
823
+ "epoch": 5.03,
824
+ "learning_rate": 1.4974446337308348e-05,
825
+ "loss": 3.9263,
826
+ "step": 5900
827
+ },
828
+ {
829
+ "epoch": 5.03,
830
+ "eval_loss": 4.2567267417907715,
831
+ "eval_runtime": 922.3278,
832
+ "eval_samples_per_second": 0.318,
833
+ "eval_steps_per_second": 0.04,
834
+ "step": 5900
835
+ },
836
+ {
837
+ "epoch": 5.11,
838
+ "learning_rate": 1.4889267461669507e-05,
839
+ "loss": 3.7405,
840
+ "step": 6000
841
+ },
842
+ {
843
+ "epoch": 5.11,
844
+ "eval_loss": 4.279594421386719,
845
+ "eval_runtime": 902.112,
846
+ "eval_samples_per_second": 0.325,
847
+ "eval_steps_per_second": 0.041,
848
+ "step": 6000
849
+ },
850
+ {
851
+ "epoch": 5.2,
852
+ "learning_rate": 1.4804088586030664e-05,
853
+ "loss": 3.8432,
854
+ "step": 6100
855
+ },
856
+ {
857
+ "epoch": 5.2,
858
+ "eval_loss": 4.277362823486328,
859
+ "eval_runtime": 909.7644,
860
+ "eval_samples_per_second": 0.322,
861
+ "eval_steps_per_second": 0.041,
862
+ "step": 6100
863
+ },
864
+ {
865
+ "epoch": 5.28,
866
+ "learning_rate": 1.4718909710391824e-05,
867
+ "loss": 3.797,
868
+ "step": 6200
869
+ },
870
+ {
871
+ "epoch": 5.28,
872
+ "eval_loss": 4.2573161125183105,
873
+ "eval_runtime": 900.2164,
874
+ "eval_samples_per_second": 0.325,
875
+ "eval_steps_per_second": 0.041,
876
+ "step": 6200
877
+ },
878
+ {
879
+ "epoch": 5.37,
880
+ "learning_rate": 1.4633730834752981e-05,
881
+ "loss": 3.7348,
882
+ "step": 6300
883
+ },
884
+ {
885
+ "epoch": 5.37,
886
+ "eval_loss": 4.261054515838623,
887
+ "eval_runtime": 899.5894,
888
+ "eval_samples_per_second": 0.326,
889
+ "eval_steps_per_second": 0.041,
890
+ "step": 6300
891
+ },
892
+ {
893
+ "epoch": 5.45,
894
+ "learning_rate": 1.4548551959114142e-05,
895
+ "loss": 3.7363,
896
+ "step": 6400
897
+ },
898
+ {
899
+ "epoch": 5.45,
900
+ "eval_loss": 4.2909698486328125,
901
+ "eval_runtime": 914.0387,
902
+ "eval_samples_per_second": 0.321,
903
+ "eval_steps_per_second": 0.04,
904
+ "step": 6400
905
+ },
906
+ {
907
+ "epoch": 5.54,
908
+ "learning_rate": 1.44633730834753e-05,
909
+ "loss": 3.8448,
910
+ "step": 6500
911
+ },
912
+ {
913
+ "epoch": 5.54,
914
+ "eval_loss": 4.257021903991699,
915
+ "eval_runtime": 906.6331,
916
+ "eval_samples_per_second": 0.323,
917
+ "eval_steps_per_second": 0.041,
918
+ "step": 6500
919
+ },
920
+ {
921
+ "epoch": 5.62,
922
+ "learning_rate": 1.4378194207836458e-05,
923
+ "loss": 3.8023,
924
+ "step": 6600
925
+ },
926
+ {
927
+ "epoch": 5.62,
928
+ "eval_loss": 4.263571262359619,
929
+ "eval_runtime": 903.3527,
930
+ "eval_samples_per_second": 0.324,
931
+ "eval_steps_per_second": 0.041,
932
+ "step": 6600
933
+ },
934
+ {
935
+ "epoch": 5.71,
936
+ "learning_rate": 1.4293015332197616e-05,
937
+ "loss": 3.8202,
938
+ "step": 6700
939
+ },
940
+ {
941
+ "epoch": 5.71,
942
+ "eval_loss": 4.226749897003174,
943
+ "eval_runtime": 913.4737,
944
+ "eval_samples_per_second": 0.321,
945
+ "eval_steps_per_second": 0.041,
946
+ "step": 6700
947
+ },
948
+ {
949
+ "epoch": 5.79,
950
+ "learning_rate": 1.4207836456558775e-05,
951
+ "loss": 3.731,
952
+ "step": 6800
953
+ },
954
+ {
955
+ "epoch": 5.79,
956
+ "eval_loss": 4.232919216156006,
957
+ "eval_runtime": 905.4134,
958
+ "eval_samples_per_second": 0.324,
959
+ "eval_steps_per_second": 0.041,
960
+ "step": 6800
961
+ },
962
+ {
963
+ "epoch": 5.88,
964
+ "learning_rate": 1.4122657580919934e-05,
965
+ "loss": 3.7507,
966
+ "step": 6900
967
+ },
968
+ {
969
+ "epoch": 5.88,
970
+ "eval_loss": 4.2412004470825195,
971
+ "eval_runtime": 917.4159,
972
+ "eval_samples_per_second": 0.319,
973
+ "eval_steps_per_second": 0.04,
974
+ "step": 6900
975
+ },
976
+ {
977
+ "epoch": 5.96,
978
+ "learning_rate": 1.4037478705281091e-05,
979
+ "loss": 3.8812,
980
+ "step": 7000
981
+ },
982
+ {
983
+ "epoch": 5.96,
984
+ "eval_loss": 4.239901542663574,
985
+ "eval_runtime": 904.0296,
986
+ "eval_samples_per_second": 0.324,
987
+ "eval_steps_per_second": 0.041,
988
+ "step": 7000
989
+ },
990
+ {
991
+ "epoch": 6.05,
992
+ "learning_rate": 1.395229982964225e-05,
993
+ "loss": 3.6882,
994
+ "step": 7100
995
+ },
996
+ {
997
+ "epoch": 6.05,
998
+ "eval_loss": 4.252227306365967,
999
+ "eval_runtime": 909.685,
1000
+ "eval_samples_per_second": 0.322,
1001
+ "eval_steps_per_second": 0.041,
1002
+ "step": 7100
1003
+ },
1004
+ {
1005
+ "epoch": 6.13,
1006
+ "learning_rate": 1.3867120954003408e-05,
1007
+ "loss": 3.6485,
1008
+ "step": 7200
1009
+ },
1010
+ {
1011
+ "epoch": 6.13,
1012
+ "eval_loss": 4.240085601806641,
1013
+ "eval_runtime": 917.8845,
1014
+ "eval_samples_per_second": 0.319,
1015
+ "eval_steps_per_second": 0.04,
1016
+ "step": 7200
1017
+ },
1018
+ {
1019
+ "epoch": 6.22,
1020
+ "learning_rate": 1.3781942078364567e-05,
1021
+ "loss": 3.6561,
1022
+ "step": 7300
1023
+ },
1024
+ {
1025
+ "epoch": 6.22,
1026
+ "eval_loss": 4.261867046356201,
1027
+ "eval_runtime": 912.6991,
1028
+ "eval_samples_per_second": 0.321,
1029
+ "eval_steps_per_second": 0.041,
1030
+ "step": 7300
1031
+ },
1032
+ {
1033
+ "epoch": 6.3,
1034
+ "learning_rate": 1.3696763202725724e-05,
1035
+ "loss": 3.6851,
1036
+ "step": 7400
1037
+ },
1038
+ {
1039
+ "epoch": 6.3,
1040
+ "eval_loss": 4.274002552032471,
1041
+ "eval_runtime": 911.5799,
1042
+ "eval_samples_per_second": 0.321,
1043
+ "eval_steps_per_second": 0.041,
1044
+ "step": 7400
1045
+ },
1046
+ {
1047
+ "epoch": 6.39,
1048
+ "learning_rate": 1.3611584327086883e-05,
1049
+ "loss": 3.7842,
1050
+ "step": 7500
1051
+ },
1052
+ {
1053
+ "epoch": 6.39,
1054
+ "eval_loss": 4.243954658508301,
1055
+ "eval_runtime": 906.9325,
1056
+ "eval_samples_per_second": 0.323,
1057
+ "eval_steps_per_second": 0.041,
1058
+ "step": 7500
1059
+ },
1060
+ {
1061
+ "epoch": 6.47,
1062
+ "learning_rate": 1.352640545144804e-05,
1063
+ "loss": 3.6484,
1064
+ "step": 7600
1065
+ },
1066
+ {
1067
+ "epoch": 6.47,
1068
+ "eval_loss": 4.238761901855469,
1069
+ "eval_runtime": 910.5622,
1070
+ "eval_samples_per_second": 0.322,
1071
+ "eval_steps_per_second": 0.041,
1072
+ "step": 7600
1073
+ },
1074
+ {
1075
+ "epoch": 6.56,
1076
+ "learning_rate": 1.3441226575809202e-05,
1077
+ "loss": 3.7509,
1078
+ "step": 7700
1079
+ },
1080
+ {
1081
+ "epoch": 6.56,
1082
+ "eval_loss": 4.240095138549805,
1083
+ "eval_runtime": 935.7714,
1084
+ "eval_samples_per_second": 0.313,
1085
+ "eval_steps_per_second": 0.04,
1086
+ "step": 7700
1087
+ },
1088
+ {
1089
+ "epoch": 6.64,
1090
+ "learning_rate": 1.3356047700170357e-05,
1091
+ "loss": 3.7597,
1092
+ "step": 7800
1093
+ },
1094
+ {
1095
+ "epoch": 6.64,
1096
+ "eval_loss": 4.256958484649658,
1097
+ "eval_runtime": 915.2208,
1098
+ "eval_samples_per_second": 0.32,
1099
+ "eval_steps_per_second": 0.04,
1100
+ "step": 7800
1101
+ },
1102
+ {
1103
+ "epoch": 6.73,
1104
+ "learning_rate": 1.3270868824531518e-05,
1105
+ "loss": 3.7253,
1106
+ "step": 7900
1107
+ },
1108
+ {
1109
+ "epoch": 6.73,
1110
+ "eval_loss": 4.291894435882568,
1111
+ "eval_runtime": 915.8862,
1112
+ "eval_samples_per_second": 0.32,
1113
+ "eval_steps_per_second": 0.04,
1114
+ "step": 7900
1115
+ },
1116
+ {
1117
+ "epoch": 6.81,
1118
+ "learning_rate": 1.3185689948892676e-05,
1119
+ "loss": 3.7335,
1120
+ "step": 8000
1121
+ },
1122
+ {
1123
+ "epoch": 6.81,
1124
+ "eval_loss": 4.206986427307129,
1125
+ "eval_runtime": 916.8897,
1126
+ "eval_samples_per_second": 0.32,
1127
+ "eval_steps_per_second": 0.04,
1128
+ "step": 8000
1129
+ },
1130
+ {
1131
+ "epoch": 6.9,
1132
+ "learning_rate": 1.3100511073253835e-05,
1133
+ "loss": 3.6213,
1134
+ "step": 8100
1135
+ },
1136
+ {
1137
+ "epoch": 6.9,
1138
+ "eval_loss": 4.203628063201904,
1139
+ "eval_runtime": 910.5741,
1140
+ "eval_samples_per_second": 0.322,
1141
+ "eval_steps_per_second": 0.041,
1142
+ "step": 8100
1143
+ },
1144
+ {
1145
+ "epoch": 6.98,
1146
+ "learning_rate": 1.3015332197614992e-05,
1147
+ "loss": 3.588,
1148
+ "step": 8200
1149
+ },
1150
+ {
1151
+ "epoch": 6.98,
1152
+ "eval_loss": 4.219061851501465,
1153
+ "eval_runtime": 910.3103,
1154
+ "eval_samples_per_second": 0.322,
1155
+ "eval_steps_per_second": 0.041,
1156
+ "step": 8200
1157
+ },
1158
+ {
1159
+ "epoch": 7.07,
1160
+ "learning_rate": 1.2930153321976151e-05,
1161
+ "loss": 3.6381,
1162
+ "step": 8300
1163
+ },
1164
+ {
1165
+ "epoch": 7.07,
1166
+ "eval_loss": 4.232492923736572,
1167
+ "eval_runtime": 903.8691,
1168
+ "eval_samples_per_second": 0.324,
1169
+ "eval_steps_per_second": 0.041,
1170
+ "step": 8300
1171
+ },
1172
+ {
1173
+ "epoch": 7.16,
1174
+ "learning_rate": 1.284497444633731e-05,
1175
+ "loss": 3.6023,
1176
+ "step": 8400
1177
+ },
1178
+ {
1179
+ "epoch": 7.16,
1180
+ "eval_loss": 4.247186183929443,
1181
+ "eval_runtime": 910.7714,
1182
+ "eval_samples_per_second": 0.322,
1183
+ "eval_steps_per_second": 0.041,
1184
+ "step": 8400
1185
+ },
1186
+ {
1187
+ "epoch": 7.24,
1188
+ "learning_rate": 1.2759795570698468e-05,
1189
+ "loss": 3.5601,
1190
+ "step": 8500
1191
+ },
1192
+ {
1193
+ "epoch": 7.24,
1194
+ "eval_loss": 4.2428107261657715,
1195
+ "eval_runtime": 926.441,
1196
+ "eval_samples_per_second": 0.316,
1197
+ "eval_steps_per_second": 0.04,
1198
+ "step": 8500
1199
+ },
1200
+ {
1201
+ "epoch": 7.33,
1202
+ "learning_rate": 1.2674616695059627e-05,
1203
+ "loss": 3.6893,
1204
+ "step": 8600
1205
+ },
1206
+ {
1207
+ "epoch": 7.33,
1208
+ "eval_loss": 4.235583305358887,
1209
+ "eval_runtime": 905.7471,
1210
+ "eval_samples_per_second": 0.323,
1211
+ "eval_steps_per_second": 0.041,
1212
+ "step": 8600
1213
+ },
1214
+ {
1215
+ "epoch": 7.41,
1216
+ "learning_rate": 1.2589437819420784e-05,
1217
+ "loss": 3.4977,
1218
+ "step": 8700
1219
+ },
1220
+ {
1221
+ "epoch": 7.41,
1222
+ "eval_loss": 4.210697650909424,
1223
+ "eval_runtime": 909.4116,
1224
+ "eval_samples_per_second": 0.322,
1225
+ "eval_steps_per_second": 0.041,
1226
+ "step": 8700
1227
+ },
1228
+ {
1229
+ "epoch": 7.5,
1230
+ "learning_rate": 1.2504258943781943e-05,
1231
+ "loss": 3.5489,
1232
+ "step": 8800
1233
+ },
1234
+ {
1235
+ "epoch": 7.5,
1236
+ "eval_loss": 4.2333455085754395,
1237
+ "eval_runtime": 899.4646,
1238
+ "eval_samples_per_second": 0.326,
1239
+ "eval_steps_per_second": 0.041,
1240
+ "step": 8800
1241
+ },
1242
+ {
1243
+ "epoch": 7.58,
1244
+ "learning_rate": 1.24190800681431e-05,
1245
+ "loss": 3.5786,
1246
+ "step": 8900
1247
+ },
1248
+ {
1249
+ "epoch": 7.58,
1250
+ "eval_loss": 4.2256317138671875,
1251
+ "eval_runtime": 907.6718,
1252
+ "eval_samples_per_second": 0.323,
1253
+ "eval_steps_per_second": 0.041,
1254
+ "step": 8900
1255
+ },
1256
+ {
1257
+ "epoch": 7.67,
1258
+ "learning_rate": 1.2333901192504261e-05,
1259
+ "loss": 3.5991,
1260
+ "step": 9000
1261
+ },
1262
+ {
1263
+ "epoch": 7.67,
1264
+ "eval_loss": 4.206646919250488,
1265
+ "eval_runtime": 928.2897,
1266
+ "eval_samples_per_second": 0.316,
1267
+ "eval_steps_per_second": 0.04,
1268
+ "step": 9000
1269
+ },
1270
+ {
1271
+ "epoch": 7.75,
1272
+ "learning_rate": 1.2248722316865417e-05,
1273
+ "loss": 3.6571,
1274
+ "step": 9100
1275
+ },
1276
+ {
1277
+ "epoch": 7.75,
1278
+ "eval_loss": 4.213482856750488,
1279
+ "eval_runtime": 913.2222,
1280
+ "eval_samples_per_second": 0.321,
1281
+ "eval_steps_per_second": 0.041,
1282
+ "step": 9100
1283
+ },
1284
+ {
1285
+ "epoch": 7.84,
1286
+ "learning_rate": 1.2163543441226578e-05,
1287
+ "loss": 3.5465,
1288
+ "step": 9200
1289
+ },
1290
+ {
1291
+ "epoch": 7.84,
1292
+ "eval_loss": 4.197275638580322,
1293
+ "eval_runtime": 906.8727,
1294
+ "eval_samples_per_second": 0.323,
1295
+ "eval_steps_per_second": 0.041,
1296
+ "step": 9200
1297
+ },
1298
+ {
1299
+ "epoch": 7.92,
1300
+ "learning_rate": 1.2078364565587735e-05,
1301
+ "loss": 3.6476,
1302
+ "step": 9300
1303
+ },
1304
+ {
1305
+ "epoch": 7.92,
1306
+ "eval_loss": 4.191103935241699,
1307
+ "eval_runtime": 910.3991,
1308
+ "eval_samples_per_second": 0.322,
1309
+ "eval_steps_per_second": 0.041,
1310
+ "step": 9300
1311
+ },
1312
+ {
1313
+ "epoch": 8.01,
1314
+ "learning_rate": 1.1993185689948894e-05,
1315
+ "loss": 3.6669,
1316
+ "step": 9400
1317
+ },
1318
+ {
1319
+ "epoch": 8.01,
1320
+ "eval_loss": 4.2001953125,
1321
+ "eval_runtime": 908.1025,
1322
+ "eval_samples_per_second": 0.323,
1323
+ "eval_steps_per_second": 0.041,
1324
+ "step": 9400
1325
+ },
1326
+ {
1327
+ "epoch": 8.09,
1328
+ "learning_rate": 1.1908006814310052e-05,
1329
+ "loss": 3.4399,
1330
+ "step": 9500
1331
+ },
1332
+ {
1333
+ "epoch": 8.09,
1334
+ "eval_loss": 4.230240345001221,
1335
+ "eval_runtime": 914.9264,
1336
+ "eval_samples_per_second": 0.32,
1337
+ "eval_steps_per_second": 0.04,
1338
+ "step": 9500
1339
+ },
1340
+ {
1341
+ "epoch": 8.18,
1342
+ "learning_rate": 1.1822827938671211e-05,
1343
+ "loss": 3.5381,
1344
+ "step": 9600
1345
+ },
1346
+ {
1347
+ "epoch": 8.18,
1348
+ "eval_loss": 4.2084150314331055,
1349
+ "eval_runtime": 928.1244,
1350
+ "eval_samples_per_second": 0.316,
1351
+ "eval_steps_per_second": 0.04,
1352
+ "step": 9600
1353
+ },
1354
+ {
1355
+ "epoch": 8.26,
1356
+ "learning_rate": 1.1737649063032368e-05,
1357
+ "loss": 3.5008,
1358
+ "step": 9700
1359
+ },
1360
+ {
1361
+ "epoch": 8.26,
1362
+ "eval_loss": 4.216128826141357,
1363
+ "eval_runtime": 913.9443,
1364
+ "eval_samples_per_second": 0.321,
1365
+ "eval_steps_per_second": 0.04,
1366
+ "step": 9700
1367
+ },
1368
+ {
1369
+ "epoch": 8.35,
1370
+ "learning_rate": 1.1652470187393527e-05,
1371
+ "loss": 3.6199,
1372
+ "step": 9800
1373
+ },
1374
+ {
1375
+ "epoch": 8.35,
1376
+ "eval_loss": 4.218649387359619,
1377
+ "eval_runtime": 916.6151,
1378
+ "eval_samples_per_second": 0.32,
1379
+ "eval_steps_per_second": 0.04,
1380
+ "step": 9800
1381
+ },
1382
+ {
1383
+ "epoch": 8.43,
1384
+ "learning_rate": 1.1567291311754685e-05,
1385
+ "loss": 3.5997,
1386
+ "step": 9900
1387
+ },
1388
+ {
1389
+ "epoch": 8.43,
1390
+ "eval_loss": 4.269123554229736,
1391
+ "eval_runtime": 902.9645,
1392
+ "eval_samples_per_second": 0.324,
1393
+ "eval_steps_per_second": 0.041,
1394
+ "step": 9900
1395
+ },
1396
+ {
1397
+ "epoch": 8.52,
1398
+ "learning_rate": 1.1482112436115844e-05,
1399
+ "loss": 3.5075,
1400
+ "step": 10000
1401
+ },
1402
+ {
1403
+ "epoch": 8.52,
1404
+ "eval_loss": 4.2026848793029785,
1405
+ "eval_runtime": 903.7139,
1406
+ "eval_samples_per_second": 0.324,
1407
+ "eval_steps_per_second": 0.041,
1408
+ "step": 10000
1409
+ },
1410
+ {
1411
+ "epoch": 8.6,
1412
+ "learning_rate": 1.1396933560477003e-05,
1413
+ "loss": 3.5163,
1414
+ "step": 10100
1415
+ },
1416
+ {
1417
+ "epoch": 8.6,
1418
+ "eval_loss": 4.2565717697143555,
1419
+ "eval_runtime": 909.1873,
1420
+ "eval_samples_per_second": 0.322,
1421
+ "eval_steps_per_second": 0.041,
1422
+ "step": 10100
1423
+ },
1424
+ {
1425
+ "epoch": 8.69,
1426
+ "learning_rate": 1.131175468483816e-05,
1427
+ "loss": 3.4902,
1428
+ "step": 10200
1429
+ },
1430
+ {
1431
+ "epoch": 8.69,
1432
+ "eval_loss": 4.208179473876953,
1433
+ "eval_runtime": 901.8843,
1434
+ "eval_samples_per_second": 0.325,
1435
+ "eval_steps_per_second": 0.041,
1436
+ "step": 10200
1437
+ },
1438
+ {
1439
+ "epoch": 8.77,
1440
+ "learning_rate": 1.1226575809199321e-05,
1441
+ "loss": 3.4829,
1442
+ "step": 10300
1443
+ },
1444
+ {
1445
+ "epoch": 8.77,
1446
+ "eval_loss": 4.217052459716797,
1447
+ "eval_runtime": 913.5755,
1448
+ "eval_samples_per_second": 0.321,
1449
+ "eval_steps_per_second": 0.041,
1450
+ "step": 10300
1451
+ },
1452
+ {
1453
+ "epoch": 8.86,
1454
+ "learning_rate": 1.1141396933560477e-05,
1455
+ "loss": 3.599,
1456
+ "step": 10400
1457
+ },
1458
+ {
1459
+ "epoch": 8.86,
1460
+ "eval_loss": 4.216422080993652,
1461
+ "eval_runtime": 904.6206,
1462
+ "eval_samples_per_second": 0.324,
1463
+ "eval_steps_per_second": 0.041,
1464
+ "step": 10400
1465
+ },
1466
+ {
1467
+ "epoch": 8.94,
1468
+ "learning_rate": 1.1056218057921638e-05,
1469
+ "loss": 3.5058,
1470
+ "step": 10500
1471
+ },
1472
+ {
1473
+ "epoch": 8.94,
1474
+ "eval_loss": 4.200262069702148,
1475
+ "eval_runtime": 908.4763,
1476
+ "eval_samples_per_second": 0.323,
1477
+ "eval_steps_per_second": 0.041,
1478
+ "step": 10500
1479
+ },
1480
+ {
1481
+ "epoch": 9.03,
1482
+ "learning_rate": 1.0971039182282795e-05,
1483
+ "loss": 3.4622,
1484
+ "step": 10600
1485
+ },
1486
+ {
1487
+ "epoch": 9.03,
1488
+ "eval_loss": 4.201136112213135,
1489
+ "eval_runtime": 918.5088,
1490
+ "eval_samples_per_second": 0.319,
1491
+ "eval_steps_per_second": 0.04,
1492
+ "step": 10600
1493
+ },
1494
+ {
1495
+ "epoch": 9.11,
1496
+ "learning_rate": 1.0885860306643954e-05,
1497
+ "loss": 3.4836,
1498
+ "step": 10700
1499
+ },
1500
+ {
1501
+ "epoch": 9.11,
1502
+ "eval_loss": 4.196112632751465,
1503
+ "eval_runtime": 907.3086,
1504
+ "eval_samples_per_second": 0.323,
1505
+ "eval_steps_per_second": 0.041,
1506
+ "step": 10700
1507
+ },
1508
+ {
1509
+ "epoch": 9.2,
1510
+ "learning_rate": 1.0800681431005112e-05,
1511
+ "loss": 3.4177,
1512
+ "step": 10800
1513
+ },
1514
+ {
1515
+ "epoch": 9.2,
1516
+ "eval_loss": 4.2310028076171875,
1517
+ "eval_runtime": 904.8795,
1518
+ "eval_samples_per_second": 0.324,
1519
+ "eval_steps_per_second": 0.041,
1520
+ "step": 10800
1521
+ },
1522
+ {
1523
+ "epoch": 9.28,
1524
+ "learning_rate": 1.071550255536627e-05,
1525
+ "loss": 3.4407,
1526
+ "step": 10900
1527
+ },
1528
+ {
1529
+ "epoch": 9.28,
1530
+ "eval_loss": 4.2216362953186035,
1531
+ "eval_runtime": 910.613,
1532
+ "eval_samples_per_second": 0.322,
1533
+ "eval_steps_per_second": 0.041,
1534
+ "step": 10900
1535
+ },
1536
+ {
1537
+ "epoch": 9.37,
1538
+ "learning_rate": 1.0630323679727428e-05,
1539
+ "loss": 3.44,
1540
+ "step": 11000
1541
+ },
1542
+ {
1543
+ "epoch": 9.37,
1544
+ "eval_loss": 4.2364935874938965,
1545
+ "eval_runtime": 907.4387,
1546
+ "eval_samples_per_second": 0.323,
1547
+ "eval_steps_per_second": 0.041,
1548
+ "step": 11000
1549
+ },
1550
+ {
1551
+ "epoch": 9.45,
1552
+ "learning_rate": 1.0545144804088587e-05,
1553
+ "loss": 3.5116,
1554
+ "step": 11100
1555
+ },
1556
+ {
1557
+ "epoch": 9.45,
1558
+ "eval_loss": 4.201255798339844,
1559
+ "eval_runtime": 905.3716,
1560
+ "eval_samples_per_second": 0.324,
1561
+ "eval_steps_per_second": 0.041,
1562
+ "step": 11100
1563
+ },
1564
+ {
1565
+ "epoch": 9.54,
1566
+ "learning_rate": 1.0459965928449745e-05,
1567
+ "loss": 3.4793,
1568
+ "step": 11200
1569
+ },
1570
+ {
1571
+ "epoch": 9.54,
1572
+ "eval_loss": 4.203760147094727,
1573
+ "eval_runtime": 903.5367,
1574
+ "eval_samples_per_second": 0.324,
1575
+ "eval_steps_per_second": 0.041,
1576
+ "step": 11200
1577
+ },
1578
+ {
1579
+ "epoch": 9.63,
1580
+ "learning_rate": 1.0374787052810904e-05,
1581
+ "loss": 3.4414,
1582
+ "step": 11300
1583
+ },
1584
+ {
1585
+ "epoch": 9.63,
1586
+ "eval_loss": 4.177506923675537,
1587
+ "eval_runtime": 904.4055,
1588
+ "eval_samples_per_second": 0.324,
1589
+ "eval_steps_per_second": 0.041,
1590
+ "step": 11300
1591
+ },
1592
+ {
1593
+ "epoch": 9.71,
1594
+ "learning_rate": 1.0289608177172061e-05,
1595
+ "loss": 3.509,
1596
+ "step": 11400
1597
+ },
1598
+ {
1599
+ "epoch": 9.71,
1600
+ "eval_loss": 4.204129695892334,
1601
+ "eval_runtime": 904.4565,
1602
+ "eval_samples_per_second": 0.324,
1603
+ "eval_steps_per_second": 0.041,
1604
+ "step": 11400
1605
+ },
1606
+ {
1607
+ "epoch": 9.8,
1608
+ "learning_rate": 1.020442930153322e-05,
1609
+ "loss": 3.464,
1610
+ "step": 11500
1611
+ },
1612
+ {
1613
+ "epoch": 9.8,
1614
+ "eval_loss": 4.2182393074035645,
1615
+ "eval_runtime": 915.8218,
1616
+ "eval_samples_per_second": 0.32,
1617
+ "eval_steps_per_second": 0.04,
1618
+ "step": 11500
1619
+ },
1620
+ {
1621
+ "epoch": 9.88,
1622
+ "learning_rate": 1.0119250425894378e-05,
1623
+ "loss": 3.3574,
1624
+ "step": 11600
1625
+ },
1626
+ {
1627
+ "epoch": 9.88,
1628
+ "eval_loss": 4.222841262817383,
1629
+ "eval_runtime": 916.715,
1630
+ "eval_samples_per_second": 0.32,
1631
+ "eval_steps_per_second": 0.04,
1632
+ "step": 11600
1633
+ },
1634
+ {
1635
+ "epoch": 9.97,
1636
+ "learning_rate": 1.0034071550255537e-05,
1637
+ "loss": 3.4134,
1638
+ "step": 11700
1639
+ },
1640
+ {
1641
+ "epoch": 9.97,
1642
+ "eval_loss": 4.198949813842773,
1643
+ "eval_runtime": 911.1975,
1644
+ "eval_samples_per_second": 0.322,
1645
+ "eval_steps_per_second": 0.041,
1646
+ "step": 11700
1647
+ },
1648
+ {
1649
+ "epoch": 10.05,
1650
+ "learning_rate": 9.948892674616696e-06,
1651
+ "loss": 3.4876,
1652
+ "step": 11800
1653
+ },
1654
+ {
1655
+ "epoch": 10.05,
1656
+ "eval_loss": 4.1779093742370605,
1657
+ "eval_runtime": 905.3949,
1658
+ "eval_samples_per_second": 0.324,
1659
+ "eval_steps_per_second": 0.041,
1660
+ "step": 11800
1661
+ },
1662
+ {
1663
+ "epoch": 10.14,
1664
+ "learning_rate": 9.863713798977853e-06,
1665
+ "loss": 3.3188,
1666
+ "step": 11900
1667
+ },
1668
+ {
1669
+ "epoch": 10.14,
1670
+ "eval_loss": 4.199155330657959,
1671
+ "eval_runtime": 906.4403,
1672
+ "eval_samples_per_second": 0.323,
1673
+ "eval_steps_per_second": 0.041,
1674
+ "step": 11900
1675
+ },
1676
+ {
1677
+ "epoch": 10.22,
1678
+ "learning_rate": 9.778534923339012e-06,
1679
+ "loss": 3.4336,
1680
+ "step": 12000
1681
+ },
1682
+ {
1683
+ "epoch": 10.22,
1684
+ "eval_loss": 4.215114593505859,
1685
+ "eval_runtime": 918.8261,
1686
+ "eval_samples_per_second": 0.319,
1687
+ "eval_steps_per_second": 0.04,
1688
+ "step": 12000
1689
+ },
1690
+ {
1691
+ "epoch": 10.31,
1692
+ "learning_rate": 9.693356047700172e-06,
1693
+ "loss": 3.3774,
1694
+ "step": 12100
1695
+ },
1696
+ {
1697
+ "epoch": 10.31,
1698
+ "eval_loss": 4.194676876068115,
1699
+ "eval_runtime": 912.8562,
1700
+ "eval_samples_per_second": 0.321,
1701
+ "eval_steps_per_second": 0.041,
1702
+ "step": 12100
1703
+ },
1704
+ {
1705
+ "epoch": 10.39,
1706
+ "learning_rate": 9.608177172061329e-06,
1707
+ "loss": 3.2942,
1708
+ "step": 12200
1709
+ },
1710
+ {
1711
+ "epoch": 10.39,
1712
+ "eval_loss": 4.212912082672119,
1713
+ "eval_runtime": 905.2405,
1714
+ "eval_samples_per_second": 0.324,
1715
+ "eval_steps_per_second": 0.041,
1716
+ "step": 12200
1717
+ },
1718
+ {
1719
+ "epoch": 10.48,
1720
+ "learning_rate": 9.522998296422488e-06,
1721
+ "loss": 3.4431,
1722
+ "step": 12300
1723
+ },
1724
+ {
1725
+ "epoch": 10.48,
1726
+ "eval_loss": 4.1929144859313965,
1727
+ "eval_runtime": 908.9828,
1728
+ "eval_samples_per_second": 0.322,
1729
+ "eval_steps_per_second": 0.041,
1730
+ "step": 12300
1731
+ },
1732
+ {
1733
+ "epoch": 10.56,
1734
+ "learning_rate": 9.437819420783645e-06,
1735
+ "loss": 3.3895,
1736
+ "step": 12400
1737
+ },
1738
+ {
1739
+ "epoch": 10.56,
1740
+ "eval_loss": 4.221463203430176,
1741
+ "eval_runtime": 905.8419,
1742
+ "eval_samples_per_second": 0.323,
1743
+ "eval_steps_per_second": 0.041,
1744
+ "step": 12400
1745
+ },
1746
+ {
1747
+ "epoch": 10.65,
1748
+ "learning_rate": 9.352640545144805e-06,
1749
+ "loss": 3.4624,
1750
+ "step": 12500
1751
+ },
1752
+ {
1753
+ "epoch": 10.65,
1754
+ "eval_loss": 4.192135334014893,
1755
+ "eval_runtime": 915.7075,
1756
+ "eval_samples_per_second": 0.32,
1757
+ "eval_steps_per_second": 0.04,
1758
+ "step": 12500
1759
+ },
1760
+ {
1761
+ "epoch": 10.73,
1762
+ "learning_rate": 9.267461669505964e-06,
1763
+ "loss": 3.3823,
1764
+ "step": 12600
1765
+ },
1766
+ {
1767
+ "epoch": 10.73,
1768
+ "eval_loss": 4.197402477264404,
1769
+ "eval_runtime": 902.8153,
1770
+ "eval_samples_per_second": 0.325,
1771
+ "eval_steps_per_second": 0.041,
1772
+ "step": 12600
1773
+ },
1774
+ {
1775
+ "epoch": 10.82,
1776
+ "learning_rate": 9.182282793867123e-06,
1777
+ "loss": 3.3671,
1778
+ "step": 12700
1779
+ },
1780
+ {
1781
+ "epoch": 10.82,
1782
+ "eval_loss": 4.152112007141113,
1783
+ "eval_runtime": 903.9495,
1784
+ "eval_samples_per_second": 0.324,
1785
+ "eval_steps_per_second": 0.041,
1786
+ "step": 12700
1787
+ },
1788
+ {
1789
+ "epoch": 10.9,
1790
+ "learning_rate": 9.09710391822828e-06,
1791
+ "loss": 3.2883,
1792
+ "step": 12800
1793
+ },
1794
+ {
1795
+ "epoch": 10.9,
1796
+ "eval_loss": 4.178409576416016,
1797
+ "eval_runtime": 904.0493,
1798
+ "eval_samples_per_second": 0.324,
1799
+ "eval_steps_per_second": 0.041,
1800
+ "step": 12800
1801
+ },
1802
+ {
1803
+ "epoch": 10.99,
1804
+ "learning_rate": 9.01192504258944e-06,
1805
+ "loss": 3.4145,
1806
+ "step": 12900
1807
+ },
1808
+ {
1809
+ "epoch": 10.99,
1810
+ "eval_loss": 4.208373546600342,
1811
+ "eval_runtime": 903.9191,
1812
+ "eval_samples_per_second": 0.324,
1813
+ "eval_steps_per_second": 0.041,
1814
+ "step": 12900
1815
+ }
1816
+ ],
1817
+ "max_steps": 23480,
1818
+ "num_train_epochs": 20,
1819
+ "total_flos": 4.542855419394294e+19,
1820
+ "trial_name": null,
1821
+ "trial_params": null
1822
+ }