chansung committed on
Commit
30a59c4
1 Parent(s): 134f598

Model save

Browse files
Files changed (4) hide show
  1. README.md +8 -9
  2. all_results.json +5 -10
  3. train_results.json +5 -5
  4. trainer_state.json +312 -123
README.md CHANGED
@@ -1,11 +1,10 @@
1
  ---
2
  base_model: mistralai/Mistral-7B-v0.3
3
  datasets:
4
- - llama-duo/synth_classification_dataset_dedup
5
  library_name: peft
6
  license: apache-2.0
7
  tags:
8
- - alignment-handbook
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # mistral-7b-0.3-gpt4o_100k_classification-lora
21
 
22
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the llama-duo/synth_classification_dataset_dedup dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.4816
25
 
26
  ## Model description
27
 
@@ -41,14 +40,14 @@ More information needed
41
 
42
  The following hyperparameters were used during training:
43
  - learning_rate: 0.0002
44
- - train_batch_size: 4
45
- - eval_batch_size: 4
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
  - num_devices: 8
49
  - gradient_accumulation_steps: 2
50
- - total_train_batch_size: 64
51
- - total_eval_batch_size: 32
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
@@ -58,7 +57,7 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 0.9141 | 1.0 | 134 | 1.4816 |
62
 
63
 
64
  ### Framework versions
 
1
  ---
2
  base_model: mistralai/Mistral-7B-v0.3
3
  datasets:
4
+ - generator
5
  library_name: peft
6
  license: apache-2.0
7
  tags:
 
8
  - trl
9
  - sft
10
  - generated_from_trainer
 
18
 
19
  # mistral-7b-0.3-gpt4o_100k_classification-lora
20
 
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.4824
24
 
25
  ## Model description
26
 
 
40
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 0.0002
43
+ - train_batch_size: 2
44
+ - eval_batch_size: 2
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
  - num_devices: 8
48
  - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 32
50
+ - total_eval_batch_size: 16
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.8838 | 1.0 | 268 | 1.4824 |
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.4816477298736572,
4
- "eval_runtime": 1.519,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 1.317,
7
- "eval_steps_per_second": 0.658,
8
- "total_flos": 3.750131265144095e+17,
9
- "train_loss": 1.0418888739685515,
10
- "train_runtime": 1659.9836,
11
  "train_samples": 92634,
12
- "train_samples_per_second": 5.16,
13
- "train_steps_per_second": 0.081
14
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 3.7501312597753856e+17,
4
+ "train_loss": 0.975214945736216,
5
+ "train_runtime": 1874.6509,
 
 
 
 
 
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 4.569,
8
+ "train_steps_per_second": 0.143
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "total_flos": 3.750131265144095e+17,
4
- "train_loss": 1.0418888739685515,
5
- "train_runtime": 1659.9836,
6
  "train_samples": 92634,
7
- "train_samples_per_second": 5.16,
8
- "train_steps_per_second": 0.081
9
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "total_flos": 3.7501312597753856e+17,
4
+ "train_loss": 0.975214945736216,
5
+ "train_runtime": 1874.6509,
6
  "train_samples": 92634,
7
+ "train_samples_per_second": 4.569,
8
+ "train_steps_per_second": 0.143
9
  }
trainer_state.json CHANGED
@@ -3,220 +3,409 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 134,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.007462686567164179,
13
- "grad_norm": 7.811283588409424,
14
- "learning_rate": 1.4285714285714285e-05,
15
- "loss": 2.0521,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.03731343283582089,
20
- "grad_norm": 6.445086479187012,
21
- "learning_rate": 7.142857142857143e-05,
22
- "loss": 2.0103,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.07462686567164178,
27
- "grad_norm": 4.3372368812561035,
28
- "learning_rate": 0.00014285714285714287,
29
- "loss": 1.6079,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.11194029850746269,
34
- "grad_norm": 1.0932683944702148,
35
- "learning_rate": 0.00019996573249755572,
36
- "loss": 1.3371,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.14925373134328357,
41
- "grad_norm": 0.9909504055976868,
42
- "learning_rate": 0.00019876883405951377,
43
- "loss": 1.1968,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.1865671641791045,
48
- "grad_norm": 0.8857837915420532,
49
- "learning_rate": 0.0001958819734868193,
50
- "loss": 1.107,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.22388059701492538,
55
- "grad_norm": 0.8879585266113281,
56
- "learning_rate": 0.0001913545457642601,
57
- "loss": 1.0563,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.26119402985074625,
62
- "grad_norm": 0.5094943642616272,
63
- "learning_rate": 0.00018526401643540922,
64
- "loss": 1.0143,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.29850746268656714,
69
- "grad_norm": 0.39295411109924316,
70
- "learning_rate": 0.0001777145961456971,
71
- "loss": 0.9871,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.3358208955223881,
76
- "grad_norm": 0.5487385988235474,
77
- "learning_rate": 0.0001688354575693754,
78
- "loss": 0.9945,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.373134328358209,
83
- "grad_norm": 0.48006948828697205,
84
- "learning_rate": 0.00015877852522924732,
85
- "loss": 0.9775,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.41044776119402987,
90
- "grad_norm": 0.32778072357177734,
91
- "learning_rate": 0.00014771587602596084,
92
- "loss": 0.9614,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.44776119402985076,
97
- "grad_norm": 0.6229733824729919,
98
- "learning_rate": 0.00013583679495453,
99
- "loss": 0.9528,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.48507462686567165,
104
- "grad_norm": 0.37184789776802063,
105
- "learning_rate": 0.00012334453638559057,
106
- "loss": 0.9527,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.5223880597014925,
111
- "grad_norm": 0.8213186264038086,
112
- "learning_rate": 0.00011045284632676536,
113
- "loss": 0.9422,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.5597014925373134,
118
- "grad_norm": 0.5482317209243774,
119
- "learning_rate": 9.73823051692127e-05,
120
- "loss": 0.9439,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.5970149253731343,
125
- "grad_norm": 0.535866379737854,
126
- "learning_rate": 8.435655349597689e-05,
127
- "loss": 0.9348,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.6343283582089553,
132
- "grad_norm": 0.4782055914402008,
133
- "learning_rate": 7.159846552960774e-05,
134
- "loss": 0.932,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.6716417910447762,
139
- "grad_norm": 0.44723376631736755,
140
- "learning_rate": 5.9326335692419995e-05,
141
- "loss": 0.9185,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.7089552238805971,
146
- "grad_norm": 0.3611028790473938,
147
- "learning_rate": 4.7750143528405126e-05,
148
- "loss": 0.9224,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.746268656716418,
153
- "grad_norm": 0.3897744119167328,
154
- "learning_rate": 3.7067960895016275e-05,
155
- "loss": 0.9175,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.7835820895522388,
160
- "grad_norm": 0.3638187646865845,
161
- "learning_rate": 2.746256289877126e-05,
162
- "loss": 0.9237,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.8208955223880597,
167
- "grad_norm": 0.36973854899406433,
168
- "learning_rate": 1.9098300562505266e-05,
169
- "loss": 0.9264,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.8582089552238806,
174
- "grad_norm": 0.3612724244594574,
175
- "learning_rate": 1.2118288733803473e-05,
176
- "loss": 0.9275,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.8955223880597015,
181
- "grad_norm": 0.3824576139450073,
182
- "learning_rate": 6.6419573502798374e-06,
183
- "loss": 0.9104,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.9328358208955224,
188
- "grad_norm": 0.3507688343524933,
189
- "learning_rate": 2.7630079602323442e-06,
190
- "loss": 0.929,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.9701492537313433,
195
- "grad_norm": 0.3513280153274536,
196
- "learning_rate": 5.478104631726711e-07,
197
- "loss": 0.9141,
198
  "step": 130
199
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  {
201
  "epoch": 1.0,
202
- "eval_loss": 1.4816477298736572,
203
- "eval_runtime": 1.5129,
204
- "eval_samples_per_second": 1.322,
205
- "eval_steps_per_second": 0.661,
206
- "step": 134
207
  },
208
  {
209
  "epoch": 1.0,
210
- "step": 134,
211
- "total_flos": 3.750131265144095e+17,
212
- "train_loss": 1.0418888739685515,
213
- "train_runtime": 1659.9836,
214
- "train_samples_per_second": 5.16,
215
- "train_steps_per_second": 0.081
216
  }
217
  ],
218
  "logging_steps": 5,
219
- "max_steps": 134,
220
  "num_input_tokens_seen": 0,
221
  "num_train_epochs": 1,
222
  "save_steps": 100,
@@ -232,8 +421,8 @@
232
  "attributes": {}
233
  }
234
  },
235
- "total_flos": 3.750131265144095e+17,
236
- "train_batch_size": 4,
237
  "trial_name": null,
238
  "trial_params": null
239
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 268,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0037313432835820895,
13
+ "grad_norm": 8.229718208312988,
14
+ "learning_rate": 7.4074074074074075e-06,
15
+ "loss": 2.0524,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.018656716417910446,
20
+ "grad_norm": 8.02374267578125,
21
+ "learning_rate": 3.7037037037037037e-05,
22
+ "loss": 2.0234,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.03731343283582089,
27
+ "grad_norm": 3.9405264854431152,
28
+ "learning_rate": 7.407407407407407e-05,
29
+ "loss": 1.762,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.055970149253731345,
34
+ "grad_norm": 2.4330053329467773,
35
+ "learning_rate": 0.00011111111111111112,
36
+ "loss": 1.4793,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.07462686567164178,
41
+ "grad_norm": 1.2707175016403198,
42
+ "learning_rate": 0.00014814814814814815,
43
+ "loss": 1.3111,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.09328358208955224,
48
+ "grad_norm": 1.004278540611267,
49
+ "learning_rate": 0.0001851851851851852,
50
+ "loss": 1.2074,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.11194029850746269,
55
+ "grad_norm": 0.7126164436340332,
56
+ "learning_rate": 0.00019992354201925428,
57
+ "loss": 1.1125,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.13059701492537312,
62
+ "grad_norm": 0.5630809664726257,
63
+ "learning_rate": 0.0001994567221375987,
64
+ "loss": 1.0635,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.14925373134328357,
69
+ "grad_norm": 0.5791023373603821,
70
+ "learning_rate": 0.00019856753906964686,
71
+ "loss": 1.0219,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.16791044776119404,
76
+ "grad_norm": 1.550959587097168,
77
+ "learning_rate": 0.00019725976891203376,
78
+ "loss": 0.9986,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.1865671641791045,
83
+ "grad_norm": 0.5787221193313599,
84
+ "learning_rate": 0.00019553896537655318,
85
+ "loss": 0.9727,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.20522388059701493,
90
+ "grad_norm": 0.5211277604103088,
91
+ "learning_rate": 0.0001934124362051919,
92
+ "loss": 0.9605,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.22388059701492538,
97
+ "grad_norm": 0.46695733070373535,
98
+ "learning_rate": 0.000190889212136318,
99
+ "loss": 0.9695,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.24253731343283583,
104
+ "grad_norm": 0.48995310068130493,
105
+ "learning_rate": 0.0001879800085538147,
106
+ "loss": 0.9485,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.26119402985074625,
111
+ "grad_norm": 0.8782228231430054,
112
+ "learning_rate": 0.00018469717998202462,
113
+ "loss": 0.9413,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.2798507462686567,
118
+ "grad_norm": 0.7073250412940979,
119
+ "learning_rate": 0.00018105466761975109,
120
+ "loss": 0.9221,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.29850746268656714,
125
+ "grad_norm": 0.8044834136962891,
126
+ "learning_rate": 0.00017706794013612364,
127
+ "loss": 0.9363,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.31716417910447764,
132
+ "grad_norm": 0.6230499744415283,
133
+ "learning_rate": 0.00017275392797975032,
134
+ "loss": 0.9444,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.3358208955223881,
139
+ "grad_norm": 0.4618063271045685,
140
+ "learning_rate": 0.0001681309514801265,
141
+ "loss": 0.9384,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.35447761194029853,
146
+ "grad_norm": 0.4851289689540863,
147
+ "learning_rate": 0.00016321864304663173,
148
+ "loss": 0.9228,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.373134328358209,
153
+ "grad_norm": 0.5840283036231995,
154
+ "learning_rate": 0.0001580378637955128,
155
+ "loss": 0.9308,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.3917910447761194,
160
+ "grad_norm": 1.0067591667175293,
161
+ "learning_rate": 0.00015261061495891345,
162
+ "loss": 0.9239,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.41044776119402987,
167
+ "grad_norm": 0.5790657997131348,
168
+ "learning_rate": 0.00014695994445216985,
169
+ "loss": 0.909,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.4291044776119403,
174
+ "grad_norm": 0.7979658246040344,
175
+ "learning_rate": 0.00014110984899615367,
176
+ "loss": 0.9051,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.44776119402985076,
181
+ "grad_norm": 0.9141942262649536,
182
+ "learning_rate": 0.000135085172210319,
183
+ "loss": 0.9136,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.4664179104477612,
188
+ "grad_norm": 0.527861475944519,
189
+ "learning_rate": 0.00012891149910922267,
190
+ "loss": 0.9185,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.48507462686567165,
195
+ "grad_norm": 0.6267244815826416,
196
+ "learning_rate": 0.00012261504745055964,
197
+ "loss": 0.9016,
198
  "step": 130
199
  },
200
+ {
201
+ "epoch": 0.503731343283582,
202
+ "grad_norm": 0.5699637532234192,
203
+ "learning_rate": 0.00011622255639612554,
204
+ "loss": 0.8951,
205
+ "step": 135
206
+ },
207
+ {
208
+ "epoch": 0.5223880597014925,
209
+ "grad_norm": 1.0421922206878662,
210
+ "learning_rate": 0.00010976117295853154,
211
+ "loss": 0.9031,
212
+ "step": 140
213
+ },
214
+ {
215
+ "epoch": 0.5410447761194029,
216
+ "grad_norm": 0.5656252503395081,
217
+ "learning_rate": 0.00010325833671589687,
218
+ "loss": 0.9076,
219
+ "step": 145
220
+ },
221
+ {
222
+ "epoch": 0.5597014925373134,
223
+ "grad_norm": 0.5329589247703552,
224
+ "learning_rate": 9.674166328410318e-05,
225
+ "loss": 0.8981,
226
+ "step": 150
227
+ },
228
+ {
229
+ "epoch": 0.5783582089552238,
230
+ "grad_norm": 0.628094494342804,
231
+ "learning_rate": 9.023882704146848e-05,
232
+ "loss": 0.8999,
233
+ "step": 155
234
+ },
235
+ {
236
+ "epoch": 0.5970149253731343,
237
+ "grad_norm": 0.5785563588142395,
238
+ "learning_rate": 8.377744360387447e-05,
239
+ "loss": 0.8872,
240
+ "step": 160
241
+ },
242
+ {
243
+ "epoch": 0.6156716417910447,
244
+ "grad_norm": 0.5400257706642151,
245
+ "learning_rate": 7.738495254944042e-05,
246
+ "loss": 0.8889,
247
+ "step": 165
248
+ },
249
+ {
250
+ "epoch": 0.6343283582089553,
251
+ "grad_norm": 0.5926158428192139,
252
+ "learning_rate": 7.108850089077735e-05,
253
+ "loss": 0.8956,
254
+ "step": 170
255
+ },
256
+ {
257
+ "epoch": 0.6529850746268657,
258
+ "grad_norm": 0.8232181668281555,
259
+ "learning_rate": 6.491482778968104e-05,
260
+ "loss": 0.8931,
261
+ "step": 175
262
+ },
263
+ {
264
+ "epoch": 0.6716417910447762,
265
+ "grad_norm": 0.5512446761131287,
266
+ "learning_rate": 5.889015100384636e-05,
267
+ "loss": 0.8651,
268
+ "step": 180
269
+ },
270
+ {
271
+ "epoch": 0.6902985074626866,
272
+ "grad_norm": 0.5381608009338379,
273
+ "learning_rate": 5.304005554783015e-05,
274
+ "loss": 0.8817,
275
+ "step": 185
276
+ },
277
+ {
278
+ "epoch": 0.7089552238805971,
279
+ "grad_norm": 0.5687859058380127,
280
+ "learning_rate": 4.738938504108659e-05,
281
+ "loss": 0.8818,
282
+ "step": 190
283
+ },
284
+ {
285
+ "epoch": 0.7276119402985075,
286
+ "grad_norm": 0.5281744599342346,
287
+ "learning_rate": 4.196213620448723e-05,
288
+ "loss": 0.8795,
289
+ "step": 195
290
+ },
291
+ {
292
+ "epoch": 0.746268656716418,
293
+ "grad_norm": 0.5830202102661133,
294
+ "learning_rate": 3.6781356953368284e-05,
295
+ "loss": 0.8769,
296
+ "step": 200
297
+ },
298
+ {
299
+ "epoch": 0.7649253731343284,
300
+ "grad_norm": 0.6224981546401978,
301
+ "learning_rate": 3.186904851987351e-05,
302
+ "loss": 0.8821,
303
+ "step": 205
304
+ },
305
+ {
306
+ "epoch": 0.7835820895522388,
307
+ "grad_norm": 0.5855913758277893,
308
+ "learning_rate": 2.724607202024969e-05,
309
+ "loss": 0.883,
310
+ "step": 210
311
+ },
312
+ {
313
+ "epoch": 0.8022388059701493,
314
+ "grad_norm": 0.5719298720359802,
315
+ "learning_rate": 2.2932059863876365e-05,
316
+ "loss": 0.8909,
317
+ "step": 215
318
+ },
319
+ {
320
+ "epoch": 0.8208955223880597,
321
+ "grad_norm": 0.5183894038200378,
322
+ "learning_rate": 1.8945332380248913e-05,
323
+ "loss": 0.8782,
324
+ "step": 220
325
+ },
326
+ {
327
+ "epoch": 0.8395522388059702,
328
+ "grad_norm": 0.5395970344543457,
329
+ "learning_rate": 1.5302820017975394e-05,
330
+ "loss": 0.8847,
331
+ "step": 225
332
+ },
333
+ {
334
+ "epoch": 0.8582089552238806,
335
+ "grad_norm": 0.5199528336524963,
336
+ "learning_rate": 1.2019991446185309e-05,
337
+ "loss": 0.889,
338
+ "step": 230
339
+ },
340
+ {
341
+ "epoch": 0.8768656716417911,
342
+ "grad_norm": 0.5345931053161621,
343
+ "learning_rate": 9.110787863682002e-06,
344
+ "loss": 0.8732,
345
+ "step": 235
346
+ },
347
+ {
348
+ "epoch": 0.8955223880597015,
349
+ "grad_norm": 0.5364944338798523,
350
+ "learning_rate": 6.587563794808127e-06,
351
+ "loss": 0.8651,
352
+ "step": 240
353
+ },
354
+ {
355
+ "epoch": 0.914179104477612,
356
+ "grad_norm": 0.5282942056655884,
357
+ "learning_rate": 4.461034623446847e-06,
358
+ "loss": 0.895,
359
+ "step": 245
360
+ },
361
+ {
362
+ "epoch": 0.9328358208955224,
363
+ "grad_norm": 0.5484575033187866,
364
+ "learning_rate": 2.7402310879662497e-06,
365
+ "loss": 0.8827,
366
+ "step": 250
367
+ },
368
+ {
369
+ "epoch": 0.9514925373134329,
370
+ "grad_norm": 0.5638120770454407,
371
+ "learning_rate": 1.43246093035313e-06,
372
+ "loss": 0.8804,
373
+ "step": 255
374
+ },
375
+ {
376
+ "epoch": 0.9701492537313433,
377
+ "grad_norm": 0.5108746290206909,
378
+ "learning_rate": 5.432778624013257e-07,
379
+ "loss": 0.8699,
380
+ "step": 260
381
+ },
382
+ {
383
+ "epoch": 0.9888059701492538,
384
+ "grad_norm": 0.52601557970047,
385
+ "learning_rate": 7.645798074572552e-08,
386
+ "loss": 0.8838,
387
+ "step": 265
388
+ },
389
  {
390
  "epoch": 1.0,
391
+ "eval_loss": 1.4824345111846924,
392
+ "eval_runtime": 0.929,
393
+ "eval_samples_per_second": 2.153,
394
+ "eval_steps_per_second": 1.076,
395
+ "step": 268
396
  },
397
  {
398
  "epoch": 1.0,
399
+ "step": 268,
400
+ "total_flos": 3.7501312597753856e+17,
401
+ "train_loss": 0.975214945736216,
402
+ "train_runtime": 1874.6509,
403
+ "train_samples_per_second": 4.569,
404
+ "train_steps_per_second": 0.143
405
  }
406
  ],
407
  "logging_steps": 5,
408
+ "max_steps": 268,
409
  "num_input_tokens_seen": 0,
410
  "num_train_epochs": 1,
411
  "save_steps": 100,
 
421
  "attributes": {}
422
  }
423
  },
424
+ "total_flos": 3.7501312597753856e+17,
425
+ "train_batch_size": 2,
426
  "trial_name": null,
427
  "trial_params": null
428
  }