chansung commited on
Commit
9b94627
1 Parent(s): 28319a1

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,12 @@
2
  license: gemma
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  base_model: google/gemma-7b
10
  datasets:
11
- - llama-duo/synth_summarize_dataset
12
  model-index:
13
  - name: gemma7b-summarize-gpt4o-30k
14
  results: []
@@ -17,12 +16,12 @@ model-index:
17
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
  should probably proofread and complete it, then remove this comment. -->
19
 
20
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/gg6giqaz)
21
  # gemma7b-summarize-gpt4o-30k
22
 
23
- This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 2.3412
26
 
27
  ## Model description
28
 
@@ -53,13 +52,17 @@ The following hyperparameters were used during training:
53
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
54
  - lr_scheduler_type: cosine
55
  - lr_scheduler_warmup_ratio: 0.1
56
- - num_epochs: 1
57
 
58
  ### Training results
59
 
60
  | Training Loss | Epoch | Step | Validation Loss |
61
  |:-------------:|:-----:|:----:|:---------------:|
62
- | 0.961 | 1.0 | 137 | 2.3412 |
 
 
 
 
63
 
64
 
65
  ### Framework versions
 
2
  license: gemma
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  base_model: google/gemma-7b
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: gemma7b-summarize-gpt4o-30k
13
  results: []
 
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
19
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/ddvw2m8z)
20
  # gemma7b-summarize-gpt4o-30k
21
 
22
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 2.3811
25
 
26
  ## Model description
27
 
 
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
+ - num_epochs: 5
56
 
57
  ### Training results
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
+ | 0.9712 | 1.0 | 137 | 2.3077 |
62
+ | 0.8675 | 2.0 | 274 | 2.2479 |
63
+ | 0.7623 | 3.0 | 411 | 2.2756 |
64
+ | 0.709 | 4.0 | 548 | 2.3417 |
65
+ | 0.6601 | 5.0 | 685 | 2.3811 |
66
 
67
 
68
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5d08bade1b09675cae192cd2ae9dfc9b1209c9ac40ebb4ccf06ae69c64b21ea
3
  size 50056096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dead167941940b2975197a1f402c334b3606ea3d8e60a06c79db262f8105d00
3
  size 50056096
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 2.341205358505249,
4
- "eval_runtime": 1.0267,
5
- "eval_samples": 25,
6
- "eval_samples_per_second": 4.87,
7
- "eval_steps_per_second": 1.948,
8
- "total_flos": 2.0945562398778982e+17,
9
- "train_loss": 4.772963228887015,
10
- "train_runtime": 1080.9502,
11
  "train_samples": 29787,
12
- "train_samples_per_second": 2.028,
13
- "train_steps_per_second": 0.127
14
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 1.0472781231601746e+18,
4
+ "train_loss": 2.151051264783762,
5
+ "train_runtime": 5341.9856,
 
 
 
 
 
6
  "train_samples": 29787,
7
+ "train_samples_per_second": 2.052,
8
+ "train_steps_per_second": 0.128
9
  }
runs/May19_11-54-28_deep-diver-main-lucky-mouse-1-0-0/events.out.tfevents.1716134219.deep-diver-main-lucky-mouse-1-0-0.385.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b758f0942c1305f23836da55e9b5dd0158c5186be84c9830fcea14dacd4bc0c
3
- size 32101
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b9bd7ceaaf98ed46022543b9806d0f0f89b6e1bbda57604b1783852001cedbc
3
+ size 36313
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 2.0945562398778982e+17,
4
- "train_loss": 4.772963228887015,
5
- "train_runtime": 1080.9502,
6
  "train_samples": 29787,
7
- "train_samples_per_second": 2.028,
8
- "train_steps_per_second": 0.127
9
  }
 
1
  {
2
+ "epoch": 5.0,
3
+ "total_flos": 1.0472781231601746e+18,
4
+ "train_loss": 2.151051264783762,
5
+ "train_runtime": 5341.9856,
6
  "train_samples": 29787,
7
+ "train_samples_per_second": 2.052,
8
+ "train_steps_per_second": 0.128
9
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 137,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,221 +11,1023 @@
11
  {
12
  "epoch": 0.0072992700729927005,
13
  "grad_norm": 708.0,
14
- "learning_rate": 1.4285714285714285e-05,
15
  "loss": 56.8346,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0364963503649635,
20
- "grad_norm": 368.0,
21
- "learning_rate": 7.142857142857143e-05,
22
- "loss": 45.8547,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.072992700729927,
27
- "grad_norm": 30.875,
28
- "learning_rate": 0.00014285714285714287,
29
- "loss": 19.792,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.10948905109489052,
34
- "grad_norm": 6.875,
35
- "learning_rate": 0.00019996738360808565,
36
- "loss": 15.0745,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.145985401459854,
41
- "grad_norm": 17.75,
42
- "learning_rate": 0.00019882804237803488,
43
- "loss": 13.3169,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.18248175182481752,
48
- "grad_norm": 117.0,
49
- "learning_rate": 0.00019607909582962477,
50
- "loss": 8.4463,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.21897810218978103,
55
- "grad_norm": 5.09375,
56
- "learning_rate": 0.0001917653158603628,
57
- "loss": 2.1599,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.25547445255474455,
62
- "grad_norm": 3.921875,
63
- "learning_rate": 0.00018595696069872013,
64
- "loss": 1.6643,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.291970802919708,
69
- "grad_norm": 2.21875,
70
- "learning_rate": 0.00017874863061334657,
71
- "loss": 1.4899,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.3284671532846715,
76
- "grad_norm": 2.1875,
77
- "learning_rate": 0.00017025772716520323,
78
- "loss": 1.3695,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.36496350364963503,
83
- "grad_norm": 2.0,
84
- "learning_rate": 0.0001606225410966638,
85
- "loss": 1.2794,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.40145985401459855,
90
- "grad_norm": 0.953125,
91
- "learning_rate": 0.00015000000000000001,
92
- "loss": 1.2388,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.43795620437956206,
97
- "grad_norm": 3.53125,
98
- "learning_rate": 0.0001385631124488136,
99
- "loss": 1.2269,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.4744525547445255,
104
- "grad_norm": 1.453125,
105
- "learning_rate": 0.0001264981502196662,
106
- "loss": 1.1434,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.5109489051094891,
111
- "grad_norm": 0.921875,
112
- "learning_rate": 0.00011400161449686293,
113
- "loss": 1.1033,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5474452554744526,
118
- "grad_norm": 1.5703125,
119
- "learning_rate": 0.00010127703547159739,
120
- "loss": 1.0812,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.583941605839416,
125
- "grad_norm": 2.828125,
126
- "learning_rate": 8.853165746015997e-05,
127
- "loss": 1.061,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.6204379562043796,
132
- "grad_norm": 7.4375,
133
- "learning_rate": 7.597306353045393e-05,
134
- "loss": 1.0362,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.656934306569343,
139
- "grad_norm": 0.734375,
140
- "learning_rate": 6.380579461128819e-05,
141
- "loss": 1.0233,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.6934306569343066,
146
- "grad_norm": 1.296875,
147
- "learning_rate": 5.222801814877369e-05,
148
- "loss": 1.0499,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.7299270072992701,
153
- "grad_norm": 1.3125,
154
- "learning_rate": 4.142830056718052e-05,
155
- "loss": 0.995,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.7664233576642335,
160
- "grad_norm": 1.015625,
161
- "learning_rate": 3.158253610095697e-05,
162
- "loss": 0.9839,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.8029197080291971,
167
- "grad_norm": 0.765625,
168
- "learning_rate": 2.2851082017805703e-05,
169
- "loss": 0.9762,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.8394160583941606,
174
- "grad_norm": 0.63671875,
175
- "learning_rate": 1.5376146891235598e-05,
176
- "loss": 0.9773,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.8759124087591241,
181
- "grad_norm": 0.62890625,
182
- "learning_rate": 9.279474459608805e-06,
183
- "loss": 0.9756,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.9124087591240876,
188
- "grad_norm": 0.80859375,
189
- "learning_rate": 4.660360794506946e-06,
190
- "loss": 0.9625,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.948905109489051,
195
- "grad_norm": 0.57421875,
196
- "learning_rate": 1.5940370726542863e-06,
197
- "loss": 0.9625,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.9854014598540146,
202
- "grad_norm": 0.5625,
203
- "learning_rate": 1.3044429107700318e-07,
204
- "loss": 0.961,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 1.0,
209
- "eval_loss": 2.341205358505249,
210
- "eval_runtime": 1.0018,
211
- "eval_samples_per_second": 4.991,
212
- "eval_steps_per_second": 1.996,
213
  "step": 137
214
  },
215
  {
216
- "epoch": 1.0,
217
- "step": 137,
218
- "total_flos": 2.0945562398778982e+17,
219
- "train_loss": 4.772963228887015,
220
- "train_runtime": 1080.9502,
221
- "train_samples_per_second": 2.028,
222
- "train_steps_per_second": 0.127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  }
224
  ],
225
  "logging_steps": 5,
226
- "max_steps": 137,
227
  "num_input_tokens_seen": 0,
228
- "num_train_epochs": 1,
229
  "save_steps": 100,
230
  "stateful_callbacks": {
231
  "TrainerControl": {
@@ -239,7 +1041,7 @@
239
  "attributes": {}
240
  }
241
  },
242
- "total_flos": 2.0945562398778982e+17,
243
  "train_batch_size": 4,
244
  "trial_name": null,
245
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 685,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.0072992700729927005,
13
  "grad_norm": 708.0,
14
+ "learning_rate": 2.898550724637681e-06,
15
  "loss": 56.8346,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0364963503649635,
20
+ "grad_norm": 604.0,
21
+ "learning_rate": 1.4492753623188407e-05,
22
+ "loss": 52.9742,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.072992700729927,
27
+ "grad_norm": 340.0,
28
+ "learning_rate": 2.8985507246376814e-05,
29
+ "loss": 39.0746,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.10948905109489052,
34
+ "grad_norm": 40.25,
35
+ "learning_rate": 4.347826086956522e-05,
36
+ "loss": 20.8099,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.145985401459854,
41
+ "grad_norm": 25.5,
42
+ "learning_rate": 5.797101449275363e-05,
43
+ "loss": 17.6144,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.18248175182481752,
48
+ "grad_norm": 7.78125,
49
+ "learning_rate": 7.246376811594203e-05,
50
+ "loss": 15.3803,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.21897810218978103,
55
+ "grad_norm": 6.40625,
56
+ "learning_rate": 8.695652173913044e-05,
57
+ "loss": 14.0798,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.25547445255474455,
62
+ "grad_norm": 13.4375,
63
+ "learning_rate": 0.00010144927536231885,
64
+ "loss": 13.4032,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.291970802919708,
69
+ "grad_norm": 41.0,
70
+ "learning_rate": 0.00011594202898550725,
71
+ "loss": 10.8827,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.3284671532846715,
76
+ "grad_norm": 13.1875,
77
+ "learning_rate": 0.00013043478260869567,
78
+ "loss": 4.5915,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.36496350364963503,
83
+ "grad_norm": 4.09375,
84
+ "learning_rate": 0.00014492753623188405,
85
+ "loss": 1.9,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.40145985401459855,
90
+ "grad_norm": 2.28125,
91
+ "learning_rate": 0.00015942028985507247,
92
+ "loss": 1.6474,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.43795620437956206,
97
+ "grad_norm": 3.5,
98
+ "learning_rate": 0.00017391304347826088,
99
+ "loss": 1.477,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.4744525547445255,
104
+ "grad_norm": 2.28125,
105
+ "learning_rate": 0.00018840579710144927,
106
+ "loss": 1.3309,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.5109489051094891,
111
+ "grad_norm": 1.6171875,
112
+ "learning_rate": 0.00019999869950890106,
113
+ "loss": 1.2538,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5474452554744526,
118
+ "grad_norm": 5.9375,
119
+ "learning_rate": 0.0001999531858720213,
120
+ "loss": 1.224,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.583941605839416,
125
+ "grad_norm": 2.25,
126
+ "learning_rate": 0.00019984268150178167,
127
+ "loss": 1.1823,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.6204379562043796,
132
+ "grad_norm": 2.078125,
133
+ "learning_rate": 0.00019966725824941932,
134
+ "loss": 1.1279,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.656934306569343,
139
+ "grad_norm": 3.0625,
140
+ "learning_rate": 0.00019942703017718975,
141
+ "loss": 1.127,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.6934306569343066,
146
+ "grad_norm": 1.75,
147
+ "learning_rate": 0.000199122153484202,
148
+ "loss": 1.1284,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.7299270072992701,
153
+ "grad_norm": 1.5625,
154
+ "learning_rate": 0.00019875282640485645,
155
+ "loss": 1.0566,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.7664233576642335,
160
+ "grad_norm": 4.53125,
161
+ "learning_rate": 0.0001983192890799503,
162
+ "loss": 1.0361,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.8029197080291971,
167
+ "grad_norm": 2.5,
168
+ "learning_rate": 0.0001978218234005352,
169
+ "loss": 1.0371,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.8394160583941606,
174
+ "grad_norm": 1.890625,
175
+ "learning_rate": 0.00019726075282462845,
176
+ "loss": 1.0235,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.8759124087591241,
181
+ "grad_norm": 0.67578125,
182
+ "learning_rate": 0.00019663644216689683,
183
+ "loss": 0.996,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.9124087591240876,
188
+ "grad_norm": 1.2421875,
189
+ "learning_rate": 0.00019594929736144976,
190
+ "loss": 0.9734,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.948905109489051,
195
+ "grad_norm": 1.5625,
196
+ "learning_rate": 0.00019519976519789616,
197
+ "loss": 0.978,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.9854014598540146,
202
+ "grad_norm": 0.95703125,
203
+ "learning_rate": 0.00019438833303083678,
204
+ "loss": 0.9712,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 1.0,
209
+ "eval_loss": 2.307734489440918,
210
+ "eval_runtime": 0.9962,
211
+ "eval_samples_per_second": 5.019,
212
+ "eval_steps_per_second": 2.008,
213
  "step": 137
214
  },
215
  {
216
+ "epoch": 1.0218978102189782,
217
+ "grad_norm": 2.125,
218
+ "learning_rate": 0.00019351552846298025,
219
+ "loss": 0.9374,
220
+ "step": 140
221
+ },
222
+ {
223
+ "epoch": 1.0583941605839415,
224
+ "grad_norm": 2.265625,
225
+ "learning_rate": 0.0001925819190020898,
226
+ "loss": 0.9173,
227
+ "step": 145
228
+ },
229
+ {
230
+ "epoch": 1.094890510948905,
231
+ "grad_norm": 0.828125,
232
+ "learning_rate": 0.00019158811169198313,
233
+ "loss": 0.8916,
234
+ "step": 150
235
+ },
236
+ {
237
+ "epoch": 1.1313868613138687,
238
+ "grad_norm": 1.0703125,
239
+ "learning_rate": 0.0001905347527178252,
240
+ "loss": 0.9418,
241
+ "step": 155
242
+ },
243
+ {
244
+ "epoch": 1.167883211678832,
245
+ "grad_norm": 0.9140625,
246
+ "learning_rate": 0.00018942252698597113,
247
+ "loss": 0.9054,
248
+ "step": 160
249
+ },
250
+ {
251
+ "epoch": 1.2043795620437956,
252
+ "grad_norm": 2.0625,
253
+ "learning_rate": 0.00018825215767863214,
254
+ "loss": 0.9039,
255
+ "step": 165
256
+ },
257
+ {
258
+ "epoch": 1.2408759124087592,
259
+ "grad_norm": 1.5859375,
260
+ "learning_rate": 0.00018702440578365387,
261
+ "loss": 0.9146,
262
+ "step": 170
263
+ },
264
+ {
265
+ "epoch": 1.2773722627737225,
266
+ "grad_norm": 1.3515625,
267
+ "learning_rate": 0.00018574006959971333,
268
+ "loss": 0.8896,
269
+ "step": 175
270
+ },
271
+ {
272
+ "epoch": 1.313868613138686,
273
+ "grad_norm": 2.09375,
274
+ "learning_rate": 0.00018439998421725554,
275
+ "loss": 0.8947,
276
+ "step": 180
277
+ },
278
+ {
279
+ "epoch": 1.3503649635036497,
280
+ "grad_norm": 0.80078125,
281
+ "learning_rate": 0.00018300502097550806,
282
+ "loss": 0.881,
283
+ "step": 185
284
+ },
285
+ {
286
+ "epoch": 1.3868613138686132,
287
+ "grad_norm": 0.80078125,
288
+ "learning_rate": 0.00018155608689592604,
289
+ "loss": 0.8906,
290
+ "step": 190
291
+ },
292
+ {
293
+ "epoch": 1.4233576642335766,
294
+ "grad_norm": 0.80859375,
295
+ "learning_rate": 0.00018005412409243606,
296
+ "loss": 0.8939,
297
+ "step": 195
298
+ },
299
+ {
300
+ "epoch": 1.4598540145985401,
301
+ "grad_norm": 1.0234375,
302
+ "learning_rate": 0.0001785001091588628,
303
+ "loss": 0.9016,
304
+ "step": 200
305
+ },
306
+ {
307
+ "epoch": 1.4963503649635037,
308
+ "grad_norm": 0.70703125,
309
+ "learning_rate": 0.0001768950525339362,
310
+ "loss": 0.8943,
311
+ "step": 205
312
+ },
313
+ {
314
+ "epoch": 1.5328467153284673,
315
+ "grad_norm": 1.2109375,
316
+ "learning_rate": 0.00017523999784429238,
317
+ "loss": 0.8614,
318
+ "step": 210
319
+ },
320
+ {
321
+ "epoch": 1.5693430656934306,
322
+ "grad_norm": 0.7734375,
323
+ "learning_rate": 0.00017353602122589527,
324
+ "loss": 0.8788,
325
+ "step": 215
326
+ },
327
+ {
328
+ "epoch": 1.6058394160583942,
329
+ "grad_norm": 0.82421875,
330
+ "learning_rate": 0.0001717842306243205,
331
+ "loss": 0.8833,
332
+ "step": 220
333
+ },
334
+ {
335
+ "epoch": 1.6423357664233578,
336
+ "grad_norm": 0.84765625,
337
+ "learning_rate": 0.00016998576507435618,
338
+ "loss": 0.8713,
339
+ "step": 225
340
+ },
341
+ {
342
+ "epoch": 1.6788321167883211,
343
+ "grad_norm": 1.234375,
344
+ "learning_rate": 0.00016814179395938913,
345
+ "loss": 0.8661,
346
+ "step": 230
347
+ },
348
+ {
349
+ "epoch": 1.7153284671532847,
350
+ "grad_norm": 0.91015625,
351
+ "learning_rate": 0.00016625351625105796,
352
+ "loss": 0.8413,
353
+ "step": 235
354
+ },
355
+ {
356
+ "epoch": 1.7518248175182483,
357
+ "grad_norm": 0.63671875,
358
+ "learning_rate": 0.0001643221597296679,
359
+ "loss": 0.8741,
360
+ "step": 240
361
+ },
362
+ {
363
+ "epoch": 1.7883211678832116,
364
+ "grad_norm": 0.73046875,
365
+ "learning_rate": 0.00016234898018587337,
366
+ "loss": 0.8744,
367
+ "step": 245
368
+ },
369
+ {
370
+ "epoch": 1.8248175182481752,
371
+ "grad_norm": 0.671875,
372
+ "learning_rate": 0.00016033526060414842,
373
+ "loss": 0.8517,
374
+ "step": 250
375
+ },
376
+ {
377
+ "epoch": 1.8613138686131387,
378
+ "grad_norm": 1.0234375,
379
+ "learning_rate": 0.00015828231032857503,
380
+ "loss": 0.8899,
381
+ "step": 255
382
+ },
383
+ {
384
+ "epoch": 1.897810218978102,
385
+ "grad_norm": 0.66796875,
386
+ "learning_rate": 0.00015619146421149232,
387
+ "loss": 0.8537,
388
+ "step": 260
389
+ },
390
+ {
391
+ "epoch": 1.9343065693430657,
392
+ "grad_norm": 0.7109375,
393
+ "learning_rate": 0.00015406408174555976,
394
+ "loss": 0.8329,
395
+ "step": 265
396
+ },
397
+ {
398
+ "epoch": 1.9708029197080292,
399
+ "grad_norm": 0.71875,
400
+ "learning_rate": 0.00015190154617979938,
401
+ "loss": 0.8675,
402
+ "step": 270
403
+ },
404
+ {
405
+ "epoch": 2.0,
406
+ "eval_loss": 2.247941017150879,
407
+ "eval_runtime": 0.9979,
408
+ "eval_samples_per_second": 5.01,
409
+ "eval_steps_per_second": 2.004,
410
+ "step": 274
411
+ },
412
+ {
413
+ "epoch": 2.0072992700729926,
414
+ "grad_norm": 0.80859375,
415
+ "learning_rate": 0.00014970526362019079,
416
+ "loss": 0.8435,
417
+ "step": 275
418
+ },
419
+ {
420
+ "epoch": 2.0437956204379564,
421
+ "grad_norm": 1.515625,
422
+ "learning_rate": 0.00014747666211540459,
423
+ "loss": 0.7774,
424
+ "step": 280
425
+ },
426
+ {
427
+ "epoch": 2.0802919708029197,
428
+ "grad_norm": 1.0859375,
429
+ "learning_rate": 0.00014521719072826858,
430
+ "loss": 0.79,
431
+ "step": 285
432
+ },
433
+ {
434
+ "epoch": 2.116788321167883,
435
+ "grad_norm": 0.498046875,
436
+ "learning_rate": 0.00014292831859356997,
437
+ "loss": 0.7929,
438
+ "step": 290
439
+ },
440
+ {
441
+ "epoch": 2.153284671532847,
442
+ "grad_norm": 1.59375,
443
+ "learning_rate": 0.00014061153396280674,
444
+ "loss": 0.8032,
445
+ "step": 295
446
+ },
447
+ {
448
+ "epoch": 2.18978102189781,
449
+ "grad_norm": 0.83203125,
450
+ "learning_rate": 0.000138268343236509,
451
+ "loss": 0.7932,
452
+ "step": 300
453
+ },
454
+ {
455
+ "epoch": 2.2262773722627736,
456
+ "grad_norm": 0.734375,
457
+ "learning_rate": 0.00013590026998475986,
458
+ "loss": 0.7657,
459
+ "step": 305
460
+ },
461
+ {
462
+ "epoch": 2.2627737226277373,
463
+ "grad_norm": 0.609375,
464
+ "learning_rate": 0.0001335088539565523,
465
+ "loss": 0.783,
466
+ "step": 310
467
+ },
468
+ {
469
+ "epoch": 2.2992700729927007,
470
+ "grad_norm": 0.71484375,
471
+ "learning_rate": 0.00013109565007862596,
472
+ "loss": 0.7755,
473
+ "step": 315
474
+ },
475
+ {
476
+ "epoch": 2.335766423357664,
477
+ "grad_norm": 0.609375,
478
+ "learning_rate": 0.0001286622274444361,
479
+ "loss": 0.7723,
480
+ "step": 320
481
+ },
482
+ {
483
+ "epoch": 2.372262773722628,
484
+ "grad_norm": 1.3359375,
485
+ "learning_rate": 0.00012621016829391022,
486
+ "loss": 0.7739,
487
+ "step": 325
488
+ },
489
+ {
490
+ "epoch": 2.408759124087591,
491
+ "grad_norm": 1.1328125,
492
+ "learning_rate": 0.00012374106698465732,
493
+ "loss": 0.7821,
494
+ "step": 330
495
+ },
496
+ {
497
+ "epoch": 2.445255474452555,
498
+ "grad_norm": 0.91015625,
499
+ "learning_rate": 0.00012125652895529766,
500
+ "loss": 0.7852,
501
+ "step": 335
502
+ },
503
+ {
504
+ "epoch": 2.4817518248175183,
505
+ "grad_norm": 0.74609375,
506
+ "learning_rate": 0.00011875816968158815,
507
+ "loss": 0.7792,
508
+ "step": 340
509
+ },
510
+ {
511
+ "epoch": 2.5182481751824817,
512
+ "grad_norm": 0.625,
513
+ "learning_rate": 0.00011624761362602061,
514
+ "loss": 0.7799,
515
+ "step": 345
516
+ },
517
+ {
518
+ "epoch": 2.554744525547445,
519
+ "grad_norm": 0.81640625,
520
+ "learning_rate": 0.00011372649318157749,
521
+ "loss": 0.7914,
522
+ "step": 350
523
+ },
524
+ {
525
+ "epoch": 2.591240875912409,
526
+ "grad_norm": 0.80078125,
527
+ "learning_rate": 0.00011119644761033078,
528
+ "loss": 0.7847,
529
+ "step": 355
530
+ },
531
+ {
532
+ "epoch": 2.627737226277372,
533
+ "grad_norm": 0.984375,
534
+ "learning_rate": 0.0001086591219775746,
535
+ "loss": 0.8049,
536
+ "step": 360
537
+ },
538
+ {
539
+ "epoch": 2.664233576642336,
540
+ "grad_norm": 0.81640625,
541
+ "learning_rate": 0.00010611616608218429,
542
+ "loss": 0.7865,
543
+ "step": 365
544
+ },
545
+ {
546
+ "epoch": 2.7007299270072993,
547
+ "grad_norm": 0.51953125,
548
+ "learning_rate": 0.00010356923338389806,
549
+ "loss": 0.7908,
550
+ "step": 370
551
+ },
552
+ {
553
+ "epoch": 2.7372262773722627,
554
+ "grad_norm": 0.53125,
555
+ "learning_rate": 0.00010101997992821797,
556
+ "loss": 0.7925,
557
+ "step": 375
558
+ },
559
+ {
560
+ "epoch": 2.7737226277372264,
561
+ "grad_norm": 0.49609375,
562
+ "learning_rate": 9.847006326962974e-05,
563
+ "loss": 0.799,
564
+ "step": 380
565
+ },
566
+ {
567
+ "epoch": 2.81021897810219,
568
+ "grad_norm": 0.51171875,
569
+ "learning_rate": 9.592114139384145e-05,
570
+ "loss": 0.7832,
571
+ "step": 385
572
+ },
573
+ {
574
+ "epoch": 2.846715328467153,
575
+ "grad_norm": 0.7109375,
576
+ "learning_rate": 9.337487163974164e-05,
577
+ "loss": 0.7796,
578
+ "step": 390
579
+ },
580
+ {
581
+ "epoch": 2.883211678832117,
582
+ "grad_norm": 0.6328125,
583
+ "learning_rate": 9.083290962177828e-05,
584
+ "loss": 0.7839,
585
+ "step": 395
586
+ },
587
+ {
588
+ "epoch": 2.9197080291970803,
589
+ "grad_norm": 0.59765625,
590
+ "learning_rate": 8.829690815345886e-05,
591
+ "loss": 0.7781,
592
+ "step": 400
593
+ },
594
+ {
595
+ "epoch": 2.9562043795620436,
596
+ "grad_norm": 0.58203125,
597
+ "learning_rate": 8.57685161726715e-05,
598
+ "loss": 0.7457,
599
+ "step": 405
600
+ },
601
+ {
602
+ "epoch": 2.9927007299270074,
603
+ "grad_norm": 0.6171875,
604
+ "learning_rate": 8.324937766952638e-05,
605
+ "loss": 0.7623,
606
+ "step": 410
607
+ },
608
+ {
609
+ "epoch": 3.0,
610
+ "eval_loss": 2.275648355484009,
611
+ "eval_runtime": 0.9945,
612
+ "eval_samples_per_second": 5.028,
613
+ "eval_steps_per_second": 2.011,
614
+ "step": 411
615
+ },
616
+ {
617
+ "epoch": 3.0291970802919708,
618
+ "grad_norm": 0.8359375,
619
+ "learning_rate": 8.074113061741397e-05,
620
+ "loss": 0.7329,
621
+ "step": 415
622
+ },
623
+ {
624
+ "epoch": 3.065693430656934,
625
+ "grad_norm": 0.50390625,
626
+ "learning_rate": 7.824540590797568e-05,
627
+ "loss": 0.7052,
628
+ "step": 420
629
+ },
630
+ {
631
+ "epoch": 3.102189781021898,
632
+ "grad_norm": 0.5703125,
633
+ "learning_rate": 7.576382629067877e-05,
634
+ "loss": 0.7015,
635
+ "step": 425
636
+ },
637
+ {
638
+ "epoch": 3.1386861313868613,
639
+ "grad_norm": 0.6015625,
640
+ "learning_rate": 7.329800531768584e-05,
641
+ "loss": 0.696,
642
+ "step": 430
643
+ },
644
+ {
645
+ "epoch": 3.1751824817518246,
646
+ "grad_norm": 0.55078125,
647
+ "learning_rate": 7.084954629470417e-05,
648
+ "loss": 0.7154,
649
+ "step": 435
650
+ },
651
+ {
652
+ "epoch": 3.2116788321167884,
653
+ "grad_norm": 0.59765625,
654
+ "learning_rate": 6.842004123849752e-05,
655
+ "loss": 0.7113,
656
+ "step": 440
657
+ },
658
+ {
659
+ "epoch": 3.2481751824817517,
660
+ "grad_norm": 0.5625,
661
+ "learning_rate": 6.601106984173835e-05,
662
+ "loss": 0.7139,
663
+ "step": 445
664
+ },
665
+ {
666
+ "epoch": 3.2846715328467155,
667
+ "grad_norm": 0.59765625,
668
+ "learning_rate": 6.362419844587287e-05,
669
+ "loss": 0.6967,
670
+ "step": 450
671
+ },
672
+ {
673
+ "epoch": 3.321167883211679,
674
+ "grad_norm": 0.52734375,
675
+ "learning_rate": 6.126097902266772e-05,
676
+ "loss": 0.7073,
677
+ "step": 455
678
+ },
679
+ {
680
+ "epoch": 3.3576642335766422,
681
+ "grad_norm": 0.5625,
682
+ "learning_rate": 5.8922948165099524e-05,
683
+ "loss": 0.6857,
684
+ "step": 460
685
+ },
686
+ {
687
+ "epoch": 3.394160583941606,
688
+ "grad_norm": 0.55859375,
689
+ "learning_rate": 5.6611626088244194e-05,
690
+ "loss": 0.7199,
691
+ "step": 465
692
+ },
693
+ {
694
+ "epoch": 3.4306569343065694,
695
+ "grad_norm": 0.58203125,
696
+ "learning_rate": 5.432851564081534e-05,
697
+ "loss": 0.7075,
698
+ "step": 470
699
+ },
700
+ {
701
+ "epoch": 3.4671532846715327,
702
+ "grad_norm": 0.52734375,
703
+ "learning_rate": 5.207510132799436e-05,
704
+ "loss": 0.7006,
705
+ "step": 475
706
+ },
707
+ {
708
+ "epoch": 3.5036496350364965,
709
+ "grad_norm": 0.53515625,
710
+ "learning_rate": 4.9852848346187566e-05,
711
+ "loss": 0.7151,
712
+ "step": 480
713
+ },
714
+ {
715
+ "epoch": 3.54014598540146,
716
+ "grad_norm": 0.546875,
717
+ "learning_rate": 4.7663201630338816e-05,
718
+ "loss": 0.7129,
719
+ "step": 485
720
+ },
721
+ {
722
+ "epoch": 3.576642335766423,
723
+ "grad_norm": 0.5859375,
724
+ "learning_rate": 4.550758491441526e-05,
725
+ "loss": 0.7139,
726
+ "step": 490
727
+ },
728
+ {
729
+ "epoch": 3.613138686131387,
730
+ "grad_norm": 0.51953125,
731
+ "learning_rate": 4.3387399805679255e-05,
732
+ "loss": 0.7162,
733
+ "step": 495
734
+ },
735
+ {
736
+ "epoch": 3.6496350364963503,
737
+ "grad_norm": 0.55859375,
738
+ "learning_rate": 4.1304024873346705e-05,
739
+ "loss": 0.7132,
740
+ "step": 500
741
+ },
742
+ {
743
+ "epoch": 3.686131386861314,
744
+ "grad_norm": 0.57421875,
745
+ "learning_rate": 3.9258814752225284e-05,
746
+ "loss": 0.7007,
747
+ "step": 505
748
+ },
749
+ {
750
+ "epoch": 3.7226277372262775,
751
+ "grad_norm": 0.546875,
752
+ "learning_rate": 3.725309926191479e-05,
753
+ "loss": 0.7037,
754
+ "step": 510
755
+ },
756
+ {
757
+ "epoch": 3.759124087591241,
758
+ "grad_norm": 0.73828125,
759
+ "learning_rate": 3.528818254214329e-05,
760
+ "loss": 0.7255,
761
+ "step": 515
762
+ },
763
+ {
764
+ "epoch": 3.795620437956204,
765
+ "grad_norm": 0.52734375,
766
+ "learning_rate": 3.336534220479961e-05,
767
+ "loss": 0.6966,
768
+ "step": 520
769
+ },
770
+ {
771
+ "epoch": 3.832116788321168,
772
+ "grad_norm": 0.5078125,
773
+ "learning_rate": 3.1485828503215585e-05,
774
+ "loss": 0.7143,
775
+ "step": 525
776
+ },
777
+ {
778
+ "epoch": 3.8686131386861313,
779
+ "grad_norm": 0.6328125,
780
+ "learning_rate": 2.9650863519236418e-05,
781
+ "loss": 0.7005,
782
+ "step": 530
783
+ },
784
+ {
785
+ "epoch": 3.905109489051095,
786
+ "grad_norm": 0.5703125,
787
+ "learning_rate": 2.7861640368608844e-05,
788
+ "loss": 0.7005,
789
+ "step": 535
790
+ },
791
+ {
792
+ "epoch": 3.9416058394160585,
793
+ "grad_norm": 0.53125,
794
+ "learning_rate": 2.6119322425203197e-05,
795
+ "loss": 0.7139,
796
+ "step": 540
797
+ },
798
+ {
799
+ "epoch": 3.978102189781022,
800
+ "grad_norm": 0.51953125,
801
+ "learning_rate": 2.4425042564574184e-05,
802
+ "loss": 0.709,
803
+ "step": 545
804
+ },
805
+ {
806
+ "epoch": 4.0,
807
+ "eval_loss": 2.341665267944336,
808
+ "eval_runtime": 0.9977,
809
+ "eval_samples_per_second": 5.012,
810
+ "eval_steps_per_second": 2.005,
811
+ "step": 548
812
+ },
813
+ {
814
+ "epoch": 4.014598540145985,
815
+ "grad_norm": 0.53515625,
816
+ "learning_rate": 2.277990242735185e-05,
817
+ "loss": 0.6801,
818
+ "step": 550
819
+ },
820
+ {
821
+ "epoch": 4.0510948905109485,
822
+ "grad_norm": 0.52734375,
823
+ "learning_rate": 2.118497170294195e-05,
824
+ "loss": 0.6495,
825
+ "step": 555
826
+ },
827
+ {
828
+ "epoch": 4.087591240875913,
829
+ "grad_norm": 0.5625,
830
+ "learning_rate": 1.9641287434001355e-05,
831
+ "loss": 0.672,
832
+ "step": 560
833
+ },
834
+ {
835
+ "epoch": 4.124087591240876,
836
+ "grad_norm": 0.55078125,
837
+ "learning_rate": 1.8149853342140645e-05,
838
+ "loss": 0.6611,
839
+ "step": 565
840
+ },
841
+ {
842
+ "epoch": 4.160583941605839,
843
+ "grad_norm": 0.59375,
844
+ "learning_rate": 1.671163917529285e-05,
845
+ "loss": 0.662,
846
+ "step": 570
847
+ },
848
+ {
849
+ "epoch": 4.197080291970803,
850
+ "grad_norm": 0.51171875,
851
+ "learning_rate": 1.5327580077171587e-05,
852
+ "loss": 0.6635,
853
+ "step": 575
854
+ },
855
+ {
856
+ "epoch": 4.233576642335766,
857
+ "grad_norm": 0.54296875,
858
+ "learning_rate": 1.3998575979229944e-05,
859
+ "loss": 0.6624,
860
+ "step": 580
861
+ },
862
+ {
863
+ "epoch": 4.2700729927007295,
864
+ "grad_norm": 0.50390625,
865
+ "learning_rate": 1.272549101551438e-05,
866
+ "loss": 0.6523,
867
+ "step": 585
868
+ },
869
+ {
870
+ "epoch": 4.306569343065694,
871
+ "grad_norm": 0.51171875,
872
+ "learning_rate": 1.1509152960794666e-05,
873
+ "loss": 0.6607,
874
+ "step": 590
875
+ },
876
+ {
877
+ "epoch": 4.343065693430657,
878
+ "grad_norm": 0.546875,
879
+ "learning_rate": 1.035035269233493e-05,
880
+ "loss": 0.6626,
881
+ "step": 595
882
+ },
883
+ {
884
+ "epoch": 4.37956204379562,
885
+ "grad_norm": 0.54296875,
886
+ "learning_rate": 9.249843675656212e-06,
887
+ "loss": 0.678,
888
+ "step": 600
889
+ },
890
+ {
891
+ "epoch": 4.416058394160584,
892
+ "grad_norm": 0.5234375,
893
+ "learning_rate": 8.208341474624071e-06,
894
+ "loss": 0.6783,
895
+ "step": 605
896
+ },
897
+ {
898
+ "epoch": 4.452554744525547,
899
+ "grad_norm": 0.53515625,
900
+ "learning_rate": 7.226523286180776e-06,
901
+ "loss": 0.6699,
902
+ "step": 610
903
+ },
904
+ {
905
+ "epoch": 4.489051094890511,
906
+ "grad_norm": 0.5703125,
907
+ "learning_rate": 6.3050275000238414e-06,
908
+ "loss": 0.6607,
909
+ "step": 615
910
+ },
911
+ {
912
+ "epoch": 4.525547445255475,
913
+ "grad_norm": 0.5234375,
914
+ "learning_rate": 5.4444532835175144e-06,
915
+ "loss": 0.6702,
916
+ "step": 620
917
+ },
918
+ {
919
+ "epoch": 4.562043795620438,
920
+ "grad_norm": 0.5234375,
921
+ "learning_rate": 4.6453601921072395e-06,
922
+ "loss": 0.6793,
923
+ "step": 625
924
+ },
925
+ {
926
+ "epoch": 4.598540145985401,
927
+ "grad_norm": 0.5234375,
928
+ "learning_rate": 3.908267805490051e-06,
929
+ "loss": 0.6622,
930
+ "step": 630
931
+ },
932
+ {
933
+ "epoch": 4.635036496350365,
934
+ "grad_norm": 0.54296875,
935
+ "learning_rate": 3.233655389777801e-06,
936
+ "loss": 0.677,
937
+ "step": 635
938
+ },
939
+ {
940
+ "epoch": 4.671532846715328,
941
+ "grad_norm": 0.5234375,
942
+ "learning_rate": 2.62196158587269e-06,
943
+ "loss": 0.6588,
944
+ "step": 640
945
+ },
946
+ {
947
+ "epoch": 4.708029197080292,
948
+ "grad_norm": 0.5234375,
949
+ "learning_rate": 2.073584124257899e-06,
950
+ "loss": 0.6621,
951
+ "step": 645
952
+ },
953
+ {
954
+ "epoch": 4.744525547445256,
955
+ "grad_norm": 0.53515625,
956
+ "learning_rate": 1.5888795663883904e-06,
957
+ "loss": 0.6655,
958
+ "step": 650
959
+ },
960
+ {
961
+ "epoch": 4.781021897810219,
962
+ "grad_norm": 0.515625,
963
+ "learning_rate": 1.1681630728506699e-06,
964
+ "loss": 0.6653,
965
+ "step": 655
966
+ },
967
+ {
968
+ "epoch": 4.817518248175182,
969
+ "grad_norm": 0.52734375,
970
+ "learning_rate": 8.117081984415298e-07,
971
+ "loss": 0.6734,
972
+ "step": 660
973
+ },
974
+ {
975
+ "epoch": 4.854014598540146,
976
+ "grad_norm": 0.5390625,
977
+ "learning_rate": 5.19746714299596e-07,
978
+ "loss": 0.6541,
979
+ "step": 665
980
+ },
981
+ {
982
+ "epoch": 4.89051094890511,
983
+ "grad_norm": 0.5390625,
984
+ "learning_rate": 2.9246845720496407e-07,
985
+ "loss": 0.6722,
986
+ "step": 670
987
+ },
988
+ {
989
+ "epoch": 4.927007299270073,
990
+ "grad_norm": 0.55859375,
991
+ "learning_rate": 1.300212061451367e-07,
992
+ "loss": 0.6472,
993
+ "step": 675
994
+ },
995
+ {
996
+ "epoch": 4.963503649635037,
997
+ "grad_norm": 0.51953125,
998
+ "learning_rate": 3.251058622737446e-08,
999
+ "loss": 0.667,
1000
+ "step": 680
1001
+ },
1002
+ {
1003
+ "epoch": 5.0,
1004
+ "grad_norm": 0.52734375,
1005
+ "learning_rate": 0.0,
1006
+ "loss": 0.6601,
1007
+ "step": 685
1008
+ },
1009
+ {
1010
+ "epoch": 5.0,
1011
+ "eval_loss": 2.3811252117156982,
1012
+ "eval_runtime": 0.9953,
1013
+ "eval_samples_per_second": 5.024,
1014
+ "eval_steps_per_second": 2.01,
1015
+ "step": 685
1016
+ },
1017
+ {
1018
+ "epoch": 5.0,
1019
+ "step": 685,
1020
+ "total_flos": 1.0472781231601746e+18,
1021
+ "train_loss": 2.151051264783762,
1022
+ "train_runtime": 5341.9856,
1023
+ "train_samples_per_second": 2.052,
1024
+ "train_steps_per_second": 0.128
1025
  }
1026
  ],
1027
  "logging_steps": 5,
1028
+ "max_steps": 685,
1029
  "num_input_tokens_seen": 0,
1030
+ "num_train_epochs": 5,
1031
  "save_steps": 100,
1032
  "stateful_callbacks": {
1033
  "TrainerControl": {
 
1041
  "attributes": {}
1042
  }
1043
  },
1044
+ "total_flos": 1.0472781231601746e+18,
1045
  "train_batch_size": 4,
1046
  "trial_name": null,
1047
  "trial_params": null