scottsuk0306 committed on
Commit
5d3b6f9
1 Parent(s): 3540389

Model save

Browse files
README.md ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: alignment-handbook/zephyr-7b-sft-full
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: zephyr-7b-math-case-4
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # zephyr-7b-math-case-4
20
+
21
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 0.0151
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 1e-05
43
+ - train_batch_size: 8
44
+ - eval_batch_size: 8
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 4
48
+ - total_train_batch_size: 32
49
+ - total_eval_batch_size: 32
50
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
+ - lr_scheduler_type: cosine
52
+ - lr_scheduler_warmup_ratio: 0.03
53
+ - num_epochs: 10
54
+
55
+ ### Training results
56
+
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:-----:|:----:|:---------------:|
59
+ | 0.9831 | 1.0 | 9 | 0.6522 |
60
+ | 0.601 | 2.0 | 18 | 0.4056 |
61
+ | 0.3464 | 3.0 | 27 | 0.1887 |
62
+ | 0.1559 | 4.0 | 36 | 0.0765 |
63
+ | 0.0714 | 5.0 | 45 | 0.0478 |
64
+ | 0.0477 | 6.0 | 54 | 0.0351 |
65
+ | 0.0345 | 7.0 | 63 | 0.0256 |
66
+ | 0.0252 | 8.0 | 72 | 0.0192 |
67
+ | 0.0181 | 9.0 | 81 | 0.0158 |
68
+ | 0.0153 | 10.0 | 90 | 0.0151 |
69
+
70
+
71
+ ### Framework versions
72
+
73
+ - Transformers 4.44.2
74
+ - PyTorch 2.4.1+cu121
75
+ - Datasets 3.0.0
76
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 18844169011200.0,
4
+ "train_loss": 0.21927920257051786,
5
+ "train_runtime": 1273.3046,
6
+ "train_samples": 4000,
7
+ "train_samples_per_second": 2.034,
8
+ "train_steps_per_second": 0.071
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.44.2"
6
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 18844169011200.0,
4
+ "train_loss": 0.21927920257051786,
5
+ "train_runtime": 1273.3046,
6
+ "train_samples": 4000,
7
+ "train_samples_per_second": 2.034,
8
+ "train_steps_per_second": 0.071
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 90,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1111111111111111,
13
+ "grad_norm": 25.66914681799565,
14
+ "learning_rate": 3.3333333333333333e-06,
15
+ "loss": 1.0048,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.5555555555555556,
20
+ "grad_norm": 12.156020748334926,
21
+ "learning_rate": 9.986966157589751e-06,
22
+ "loss": 0.9831,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 1.0,
27
+ "eval_loss": 0.652240514755249,
28
+ "eval_runtime": 24.1162,
29
+ "eval_samples_per_second": 10.74,
30
+ "eval_steps_per_second": 0.373,
31
+ "step": 9
32
+ },
33
+ {
34
+ "epoch": 1.1111111111111112,
35
+ "grad_norm": 3.8285863774272446,
36
+ "learning_rate": 9.841114703012817e-06,
37
+ "loss": 0.7569,
38
+ "step": 10
39
+ },
40
+ {
41
+ "epoch": 1.6666666666666665,
42
+ "grad_norm": 2.607435400324957,
43
+ "learning_rate": 9.537877098354787e-06,
44
+ "loss": 0.601,
45
+ "step": 15
46
+ },
47
+ {
48
+ "epoch": 2.0,
49
+ "eval_loss": 0.4056125581264496,
50
+ "eval_runtime": 24.0279,
51
+ "eval_samples_per_second": 10.779,
52
+ "eval_steps_per_second": 0.375,
53
+ "step": 18
54
+ },
55
+ {
56
+ "epoch": 2.2222222222222223,
57
+ "grad_norm": 3.788253489544647,
58
+ "learning_rate": 9.08711169279446e-06,
59
+ "loss": 0.4813,
60
+ "step": 20
61
+ },
62
+ {
63
+ "epoch": 2.7777777777777777,
64
+ "grad_norm": 2.392342802419934,
65
+ "learning_rate": 8.503473010366713e-06,
66
+ "loss": 0.3464,
67
+ "step": 25
68
+ },
69
+ {
70
+ "epoch": 3.0,
71
+ "eval_loss": 0.18872837722301483,
72
+ "eval_runtime": 24.0055,
73
+ "eval_samples_per_second": 10.789,
74
+ "eval_steps_per_second": 0.375,
75
+ "step": 27
76
+ },
77
+ {
78
+ "epoch": 3.3333333333333335,
79
+ "grad_norm": 5.610647726544286,
80
+ "learning_rate": 7.805935326811913e-06,
81
+ "loss": 0.2162,
82
+ "step": 30
83
+ },
84
+ {
85
+ "epoch": 3.888888888888889,
86
+ "grad_norm": 2.516026593621024,
87
+ "learning_rate": 7.017175809949044e-06,
88
+ "loss": 0.1559,
89
+ "step": 35
90
+ },
91
+ {
92
+ "epoch": 4.0,
93
+ "eval_loss": 0.07645933330059052,
94
+ "eval_runtime": 24.0333,
95
+ "eval_samples_per_second": 10.777,
96
+ "eval_steps_per_second": 0.374,
97
+ "step": 36
98
+ },
99
+ {
100
+ "epoch": 4.444444444444445,
101
+ "grad_norm": 3.496241200147731,
102
+ "learning_rate": 6.162837277871553e-06,
103
+ "loss": 0.0797,
104
+ "step": 40
105
+ },
106
+ {
107
+ "epoch": 5.0,
108
+ "grad_norm": 1.562094511651351,
109
+ "learning_rate": 5.270694542927089e-06,
110
+ "loss": 0.0714,
111
+ "step": 45
112
+ },
113
+ {
114
+ "epoch": 5.0,
115
+ "eval_loss": 0.047790270298719406,
116
+ "eval_runtime": 24.1049,
117
+ "eval_samples_per_second": 10.745,
118
+ "eval_steps_per_second": 0.373,
119
+ "step": 45
120
+ },
121
+ {
122
+ "epoch": 5.555555555555555,
123
+ "grad_norm": 2.038884810074137,
124
+ "learning_rate": 4.369751443898554e-06,
125
+ "loss": 0.0477,
126
+ "step": 50
127
+ },
128
+ {
129
+ "epoch": 6.0,
130
+ "eval_loss": 0.035123735666275024,
131
+ "eval_runtime": 24.0485,
132
+ "eval_samples_per_second": 10.77,
133
+ "eval_steps_per_second": 0.374,
134
+ "step": 54
135
+ },
136
+ {
137
+ "epoch": 6.111111111111111,
138
+ "grad_norm": 1.2084030161409793,
139
+ "learning_rate": 3.489297922152136e-06,
140
+ "loss": 0.0452,
141
+ "step": 55
142
+ },
143
+ {
144
+ "epoch": 6.666666666666667,
145
+ "grad_norm": 1.5950605501326323,
146
+ "learning_rate": 2.65795779650105e-06,
147
+ "loss": 0.0345,
148
+ "step": 60
149
+ },
150
+ {
151
+ "epoch": 7.0,
152
+ "eval_loss": 0.02559623494744301,
153
+ "eval_runtime": 24.0003,
154
+ "eval_samples_per_second": 10.792,
155
+ "eval_steps_per_second": 0.375,
156
+ "step": 63
157
+ },
158
+ {
159
+ "epoch": 7.222222222222222,
160
+ "grad_norm": 0.7529091490914854,
161
+ "learning_rate": 1.9027581939213852e-06,
162
+ "loss": 0.0295,
163
+ "step": 65
164
+ },
165
+ {
166
+ "epoch": 7.777777777777778,
167
+ "grad_norm": 1.569990631190196,
168
+ "learning_rate": 1.2482508892179884e-06,
169
+ "loss": 0.0252,
170
+ "step": 70
171
+ },
172
+ {
173
+ "epoch": 8.0,
174
+ "eval_loss": 0.019161375239491463,
175
+ "eval_runtime": 24.061,
176
+ "eval_samples_per_second": 10.764,
177
+ "eval_steps_per_second": 0.374,
178
+ "step": 72
179
+ },
180
+ {
181
+ "epoch": 8.333333333333334,
182
+ "grad_norm": 0.674086772310731,
183
+ "learning_rate": 7.157141191620548e-07,
184
+ "loss": 0.0198,
185
+ "step": 75
186
+ },
187
+ {
188
+ "epoch": 8.88888888888889,
189
+ "grad_norm": 0.5564084869849035,
190
+ "learning_rate": 3.224608203719953e-07,
191
+ "loss": 0.0181,
192
+ "step": 80
193
+ },
194
+ {
195
+ "epoch": 9.0,
196
+ "eval_loss": 0.015782972797751427,
197
+ "eval_runtime": 23.9574,
198
+ "eval_samples_per_second": 10.811,
199
+ "eval_steps_per_second": 0.376,
200
+ "step": 81
201
+ },
202
+ {
203
+ "epoch": 9.444444444444445,
204
+ "grad_norm": 0.402959494753,
205
+ "learning_rate": 8.127578033998663e-08,
206
+ "loss": 0.0156,
207
+ "step": 85
208
+ },
209
+ {
210
+ "epoch": 10.0,
211
+ "grad_norm": 0.3564826816641896,
212
+ "learning_rate": 0.0,
213
+ "loss": 0.0153,
214
+ "step": 90
215
+ },
216
+ {
217
+ "epoch": 10.0,
218
+ "eval_loss": 0.015113583765923977,
219
+ "eval_runtime": 23.9569,
220
+ "eval_samples_per_second": 10.811,
221
+ "eval_steps_per_second": 0.376,
222
+ "step": 90
223
+ },
224
+ {
225
+ "epoch": 10.0,
226
+ "step": 90,
227
+ "total_flos": 18844169011200.0,
228
+ "train_loss": 0.21927920257051786,
229
+ "train_runtime": 1273.3046,
230
+ "train_samples_per_second": 2.034,
231
+ "train_steps_per_second": 0.071
232
+ }
233
+ ],
234
+ "logging_steps": 5,
235
+ "max_steps": 90,
236
+ "num_input_tokens_seen": 0,
237
+ "num_train_epochs": 10,
238
+ "save_steps": 100,
239
+ "stateful_callbacks": {
240
+ "TrainerControl": {
241
+ "args": {
242
+ "should_epoch_stop": false,
243
+ "should_evaluate": false,
244
+ "should_log": false,
245
+ "should_save": true,
246
+ "should_training_stop": true
247
+ },
248
+ "attributes": {}
249
+ }
250
+ },
251
+ "total_flos": 18844169011200.0,
252
+ "train_batch_size": 8,
253
+ "trial_name": null,
254
+ "trial_params": null
255
+ }