chansung committed
Commit 3361fa2
1 Parent(s): 690c516

Model save

Files changed (4)
  1. README.md +2 -2
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +64 -64
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
  It achieves the following results on the evaluation set:
- - Loss: 1.6480
+ - Loss: 1.6455
 
  ## Model description
 
@@ -57,7 +57,7 @@ The following hyperparameters were used during training:
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 1.1497 | 1.0 | 137 | 1.6480 |
+ | 1.1473 | 1.0 | 137 | 1.6455 |
 
 
  ### Framework versions
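The card above records only training metrics. As a rough illustration (the repository id is not shown in this commit, so `chansung/<repo-name>` below is a placeholder), loading a fine-tuned checkpoint like this with the standard transformers API would look roughly as follows:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id: the actual repository name is not part of this diff.
repo_id = "chansung/<repo-name>"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```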
all_results.json CHANGED
@@ -1,14 +1,9 @@
  {
  "epoch": 1.0,
- "eval_loss": 1.6479942798614502,
- "eval_runtime": 1.1327,
- "eval_samples": 16,
- "eval_samples_per_second": 7.945,
- "eval_steps_per_second": 0.883,
  "total_flos": 8.08957492854784e+17,
- "train_loss": 1.2461711733880705,
- "train_runtime": 2338.1563,
+ "train_loss": 1.2450406925521629,
+ "train_runtime": 683.5001,
  "train_samples": 116368,
- "train_samples_per_second": 14.952,
- "train_steps_per_second": 0.059
+ "train_samples_per_second": 51.148,
+ "train_steps_per_second": 0.2
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 1.0,
  "total_flos": 8.08957492854784e+17,
- "train_loss": 1.2461711733880705,
- "train_runtime": 2338.1563,
+ "train_loss": 1.2450406925521629,
+ "train_runtime": 683.5001,
  "train_samples": 116368,
- "train_samples_per_second": 14.952,
- "train_steps_per_second": 0.059
+ "train_samples_per_second": 51.148,
+ "train_steps_per_second": 0.2
  }
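The updated summary numbers are self-consistent. As a minimal sketch (plain Python, assuming the train_results.json layout shown above), the rounded steps-per-second figure multiplied by the runtime comes back to roughly the 137 optimizer steps recorded in trainer_state.json:

```python
import json

# Load the training summary written by the Trainer (layout as in the diff above).
with open("train_results.json") as f:
    results = json.load(f)

runtime = results["train_runtime"]                 # 683.5001 seconds
steps_per_sec = results["train_steps_per_second"]  # 0.2 (rounded)

# 0.2 steps/s * ~683.5 s ~= 137 steps, matching the step count in trainer_state.json.
print(f"approx. total steps: {steps_per_sec * runtime:.0f}")
```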
trainer_state.json CHANGED
@@ -10,216 +10,216 @@
  "log_history": [
  {
  "epoch": 0.0072992700729927005,
- "grad_norm": 0.43487387895584106,
+ "grad_norm": 0.8994209170341492,
  "learning_rate": 1.4285714285714285e-05,
- "loss": 1.7554,
+ "loss": 1.7629,
  "step": 1
  },
  {
  "epoch": 0.0364963503649635,
- "grad_norm": 0.4084155857563019,
+ "grad_norm": 0.7910200953483582,
  "learning_rate": 7.142857142857143e-05,
- "loss": 1.7248,
+ "loss": 1.7329,
  "step": 5
  },
  {
  "epoch": 0.072992700729927,
- "grad_norm": 0.4045219123363495,
+ "grad_norm": 0.47016409039497375,
  "learning_rate": 0.00014285714285714287,
- "loss": 1.6877,
+ "loss": 1.6931,
  "step": 10
  },
  {
  "epoch": 0.10948905109489052,
- "grad_norm": 0.5852366089820862,
+ "grad_norm": 0.6418158411979675,
  "learning_rate": 0.00019996738360808565,
- "loss": 1.5983,
+ "loss": 1.5991,
  "step": 15
  },
  {
  "epoch": 0.145985401459854,
- "grad_norm": 0.5415384769439697,
+ "grad_norm": 0.5754856467247009,
  "learning_rate": 0.00019882804237803488,
- "loss": 1.4422,
+ "loss": 1.4449,
  "step": 20
  },
  {
  "epoch": 0.18248175182481752,
- "grad_norm": 0.49430617690086365,
+ "grad_norm": 0.5607455968856812,
  "learning_rate": 0.00019607909582962477,
- "loss": 1.3509,
+ "loss": 1.3519,
  "step": 25
  },
  {
  "epoch": 0.21897810218978103,
- "grad_norm": 0.3028700649738312,
+ "grad_norm": 0.31966158747673035,
  "learning_rate": 0.0001917653158603628,
- "loss": 1.2905,
+ "loss": 1.2901,
  "step": 30
  },
  {
  "epoch": 0.25547445255474455,
- "grad_norm": 0.21788553893566132,
+ "grad_norm": 0.21750766038894653,
  "learning_rate": 0.00018595696069872013,
- "loss": 1.2481,
+ "loss": 1.2469,
  "step": 35
  },
  {
  "epoch": 0.291970802919708,
- "grad_norm": 0.21770533919334412,
+ "grad_norm": 0.21939656138420105,
  "learning_rate": 0.00017874863061334657,
- "loss": 1.2255,
+ "loss": 1.223,
  "step": 40
  },
  {
  "epoch": 0.3284671532846715,
- "grad_norm": 0.19103731215000153,
+ "grad_norm": 0.18782173097133636,
  "learning_rate": 0.00017025772716520323,
- "loss": 1.2213,
+ "loss": 1.2186,
  "step": 45
  },
  {
  "epoch": 0.36496350364963503,
- "grad_norm": 0.18975041806697845,
+ "grad_norm": 0.19173678755760193,
  "learning_rate": 0.0001606225410966638,
- "loss": 1.2069,
+ "loss": 1.2035,
  "step": 50
  },
  {
  "epoch": 0.40145985401459855,
- "grad_norm": 0.21415986120700836,
+ "grad_norm": 0.2181948572397232,
  "learning_rate": 0.00015000000000000001,
- "loss": 1.1938,
+ "loss": 1.191,
  "step": 55
  },
  {
  "epoch": 0.43795620437956206,
- "grad_norm": 0.20551565289497375,
+ "grad_norm": 0.20430037379264832,
  "learning_rate": 0.0001385631124488136,
- "loss": 1.1837,
+ "loss": 1.1811,
  "step": 60
  },
  {
  "epoch": 0.4744525547445255,
- "grad_norm": 0.20549848675727844,
+ "grad_norm": 0.19574333727359772,
  "learning_rate": 0.0001264981502196662,
- "loss": 1.1724,
+ "loss": 1.1705,
  "step": 65
  },
  {
  "epoch": 0.5109489051094891,
- "grad_norm": 0.19810882210731506,
+ "grad_norm": 0.1992381513118744,
  "learning_rate": 0.00011400161449686293,
- "loss": 1.1713,
+ "loss": 1.1695,
  "step": 70
  },
  {
  "epoch": 0.5474452554744526,
- "grad_norm": 0.18478353321552277,
+ "grad_norm": 0.18511532247066498,
  "learning_rate": 0.00010127703547159739,
- "loss": 1.1571,
+ "loss": 1.1559,
  "step": 75
  },
  {
  "epoch": 0.583941605839416,
- "grad_norm": 0.19806450605392456,
+ "grad_norm": 0.19379611313343048,
  "learning_rate": 8.853165746015997e-05,
- "loss": 1.1539,
+ "loss": 1.1526,
  "step": 80
  },
  {
  "epoch": 0.6204379562043796,
- "grad_norm": 0.20397868752479553,
+ "grad_norm": 0.19651520252227783,
  "learning_rate": 7.597306353045393e-05,
- "loss": 1.1457,
+ "loss": 1.1439,
  "step": 85
  },
  {
  "epoch": 0.656934306569343,
- "grad_norm": 0.18514488637447357,
+ "grad_norm": 0.18215572834014893,
  "learning_rate": 6.380579461128819e-05,
- "loss": 1.1613,
+ "loss": 1.1592,
  "step": 90
  },
  {
  "epoch": 0.6934306569343066,
- "grad_norm": 0.18412043154239655,
+ "grad_norm": 0.19182553887367249,
  "learning_rate": 5.222801814877369e-05,
- "loss": 1.1449,
+ "loss": 1.1425,
  "step": 95
  },
  {
  "epoch": 0.7299270072992701,
- "grad_norm": 0.19028052687644958,
+ "grad_norm": 0.19180874526500702,
  "learning_rate": 4.142830056718052e-05,
- "loss": 1.1511,
+ "loss": 1.1488,
  "step": 100
  },
  {
  "epoch": 0.7664233576642335,
- "grad_norm": 0.1979902684688568,
+ "grad_norm": 0.19942091405391693,
  "learning_rate": 3.158253610095697e-05,
- "loss": 1.1457,
+ "loss": 1.1437,
  "step": 105
  },
  {
  "epoch": 0.8029197080291971,
- "grad_norm": 0.19014957547187805,
+ "grad_norm": 0.19039174914360046,
  "learning_rate": 2.2851082017805703e-05,
- "loss": 1.1423,
+ "loss": 1.1397,
  "step": 110
  },
  {
  "epoch": 0.8394160583941606,
- "grad_norm": 0.186836376786232,
+ "grad_norm": 0.18815937638282776,
  "learning_rate": 1.5376146891235598e-05,
- "loss": 1.1477,
+ "loss": 1.1451,
  "step": 115
  },
  {
  "epoch": 0.8759124087591241,
- "grad_norm": 0.20176666975021362,
+ "grad_norm": 0.19718731939792633,
  "learning_rate": 9.279474459608805e-06,
- "loss": 1.153,
+ "loss": 1.1502,
  "step": 120
  },
  {
  "epoch": 0.9124087591240876,
- "grad_norm": 0.19549883902072906,
+ "grad_norm": 0.19630739092826843,
  "learning_rate": 4.660360794506946e-06,
- "loss": 1.1444,
+ "loss": 1.1419,
  "step": 125
  },
  {
  "epoch": 0.948905109489051,
- "grad_norm": 0.17397421598434448,
+ "grad_norm": 0.1804933249950409,
  "learning_rate": 1.5940370726542863e-06,
- "loss": 1.1508,
+ "loss": 1.1483,
  "step": 130
  },
  {
  "epoch": 0.9854014598540146,
- "grad_norm": 0.1897565871477127,
+ "grad_norm": 0.1939323991537094,
  "learning_rate": 1.3044429107700318e-07,
- "loss": 1.1497,
+ "loss": 1.1473,
  "step": 135
  },
  {
  "epoch": 1.0,
- "eval_loss": 1.6479942798614502,
- "eval_runtime": 1.1173,
- "eval_samples_per_second": 8.055,
- "eval_steps_per_second": 0.895,
+ "eval_loss": 1.645493507385254,
+ "eval_runtime": 0.792,
+ "eval_samples_per_second": 11.363,
+ "eval_steps_per_second": 1.263,
  "step": 137
  },
  {
  "epoch": 1.0,
  "step": 137,
  "total_flos": 8.08957492854784e+17,
- "train_loss": 1.2461711733880705,
- "train_runtime": 2338.1563,
- "train_samples_per_second": 14.952,
- "train_steps_per_second": 0.059
+ "train_loss": 1.2450406925521629,
+ "train_runtime": 683.5001,
+ "train_samples_per_second": 51.148,
+ "train_steps_per_second": 0.2
  }
  ],
  "logging_steps": 5,