hllj committed on
Commit
c5f3ba1
1 Parent(s): 7e29eeb

Model save

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 0.5087
18
 
19
  ## Model description
20
 
@@ -33,7 +33,7 @@ More information needed
33
  ### Training hyperparameters
34
 
35
  The following hyperparameters were used during training:
36
- - learning_rate: 5e-05
37
  - train_batch_size: 4
38
  - eval_batch_size: 4
39
  - seed: 42
@@ -48,7 +48,7 @@ The following hyperparameters were used during training:
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
- | 0.3103 | 1.27 | 200 | 0.5224 |
52
 
53
 
54
  ### Framework versions
 
14
 
15
  This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
+ - Loss: 0.5062
18
 
19
  ## Model description
20
 
 
33
  ### Training hyperparameters
34
 
35
  The following hyperparameters were used during training:
36
+ - learning_rate: 3e-05
37
  - train_batch_size: 4
38
  - eval_batch_size: 4
39
  - seed: 42
 
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
+ | 0.3374 | 1.27 | 200 | 0.5180 |
52
 
53
 
54
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c25760b21a9d3724c95a2655f4e712bc426ea7d6cf3771f3a4c590d72513ee16
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e53a9e6b7e21140982d001c013b6966e19608a7186e08561bc84d41e56f14f9e
3
  size 872450448
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.39,
3
- "eval_loss": 0.508748471736908,
4
- "eval_runtime": 6.855,
5
  "eval_samples": 140,
6
- "eval_samples_per_second": 20.423,
7
- "eval_steps_per_second": 5.106,
8
- "train_loss": 0.40742398091291976,
9
- "train_runtime": 500.0224,
10
  "train_samples": 1196,
11
- "train_samples_per_second": 4.784,
12
- "train_steps_per_second": 1.196
13
  }
 
1
  {
2
  "epoch": 1.39,
3
+ "eval_loss": 0.5062369704246521,
4
+ "eval_runtime": 6.8908,
5
  "eval_samples": 140,
6
+ "eval_samples_per_second": 20.317,
7
+ "eval_steps_per_second": 5.079,
8
+ "train_loss": 0.4328786596908408,
9
+ "train_runtime": 504.1662,
10
  "train_samples": 1196,
11
+ "train_samples_per_second": 4.744,
12
+ "train_steps_per_second": 1.186
13
  }
config_argument.yaml CHANGED
@@ -13,7 +13,7 @@ gradient_checkpointing_kwargs:
13
  use_reentrant: false
14
  hub_model_id: hllj/sft-mistral-v1-original-data
15
  hub_strategy: every_save
16
- learning_rate: 5.0e-05
17
  log_level: info
18
  logging_first_step: true
19
  logging_steps: 10
 
13
  use_reentrant: false
14
  hub_model_id: hllj/sft-mistral-v1-original-data
15
  hub_strategy: every_save
16
+ learning_rate: 3.0e-05
17
  log_level: info
18
  logging_first_step: true
19
  logging_steps: 10
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.39,
3
- "eval_loss": 0.508748471736908,
4
- "eval_runtime": 6.855,
5
  "eval_samples": 140,
6
- "eval_samples_per_second": 20.423,
7
- "eval_steps_per_second": 5.106
8
  }
 
1
  {
2
  "epoch": 1.39,
3
+ "eval_loss": 0.5062369704246521,
4
+ "eval_runtime": 6.8908,
5
  "eval_samples": 140,
6
+ "eval_samples_per_second": 20.317,
7
+ "eval_steps_per_second": 5.079
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.39,
3
- "train_loss": 0.40742398091291976,
4
- "train_runtime": 500.0224,
5
  "train_samples": 1196,
6
- "train_samples_per_second": 4.784,
7
- "train_steps_per_second": 1.196
8
  }
 
1
  {
2
  "epoch": 1.39,
3
+ "train_loss": 0.4328786596908408,
4
+ "train_runtime": 504.1662,
5
  "train_samples": 1196,
6
+ "train_samples_per_second": 4.744,
7
+ "train_steps_per_second": 1.186
8
  }
trainer_state.json CHANGED
@@ -10,164 +10,164 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.6666666666666667e-06,
14
  "loss": 0.8728,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.03,
19
- "learning_rate": 1.6666666666666667e-05,
20
- "loss": 0.8168,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.07,
25
- "learning_rate": 3.3333333333333335e-05,
26
- "loss": 0.6667,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.1,
31
- "learning_rate": 5e-05,
32
- "loss": 0.5432,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.13,
37
- "learning_rate": 4.996177016978633e-05,
38
- "loss": 0.4616,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.17,
43
- "learning_rate": 4.984719760073877e-05,
44
- "loss": 0.4572,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.2,
49
- "learning_rate": 4.9656632700046265e-05,
50
- "loss": 0.4327,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.23,
55
- "learning_rate": 4.9390658288812675e-05,
56
- "loss": 0.401,
57
  "step": 70
58
  },
59
  {
60
  "epoch": 0.27,
61
- "learning_rate": 4.90500878195646e-05,
62
- "loss": 0.4179,
63
  "step": 80
64
  },
65
  {
66
  "epoch": 0.3,
67
- "learning_rate": 4.8635962888399254e-05,
68
- "loss": 0.4091,
69
  "step": 90
70
  },
71
  {
72
  "epoch": 0.33,
73
- "learning_rate": 4.820140360457198e-05,
74
- "loss": 0.4178,
75
  "step": 100
76
  },
77
  {
78
  "epoch": 0.37,
79
- "learning_rate": 4.7651197369406566e-05,
80
- "loss": 0.4046,
81
  "step": 110
82
  },
83
  {
84
  "epoch": 1.01,
85
- "learning_rate": 4.703171501987564e-05,
86
- "loss": 0.396,
87
  "step": 120
88
  },
89
  {
90
  "epoch": 1.04,
91
- "learning_rate": 4.6344851172382647e-05,
92
- "loss": 0.3232,
93
  "step": 130
94
  },
95
  {
96
  "epoch": 1.07,
97
- "learning_rate": 4.5592706521989154e-05,
98
- "loss": 0.3301,
99
  "step": 140
100
  },
101
  {
102
  "epoch": 1.11,
103
- "learning_rate": 4.477758141767761e-05,
104
- "loss": 0.333,
105
  "step": 150
106
  },
107
  {
108
  "epoch": 1.14,
109
- "learning_rate": 4.390196882699528e-05,
110
- "loss": 0.3361,
111
  "step": 160
112
  },
113
  {
114
  "epoch": 1.17,
115
- "learning_rate": 4.296854671159614e-05,
116
- "loss": 0.3169,
117
  "step": 170
118
  },
119
  {
120
  "epoch": 1.21,
121
- "learning_rate": 4.198016983699933e-05,
122
- "loss": 0.3168,
123
  "step": 180
124
  },
125
  {
126
  "epoch": 1.24,
127
- "learning_rate": 4.0939861041613107e-05,
128
- "loss": 0.3351,
129
  "step": 190
130
  },
131
  {
132
  "epoch": 1.27,
133
- "learning_rate": 3.9850801991726846e-05,
134
- "loss": 0.3103,
135
  "step": 200
136
  },
137
  {
138
  "epoch": 1.27,
139
- "eval_loss": 0.5224232077598572,
140
- "eval_runtime": 6.8718,
141
- "eval_samples_per_second": 20.373,
142
- "eval_steps_per_second": 5.093,
143
  "step": 200
144
  },
145
  {
146
  "epoch": 1.31,
147
- "learning_rate": 3.871632345074615e-05,
148
- "loss": 0.3372,
149
  "step": 210
150
  },
151
  {
152
  "epoch": 1.34,
153
- "learning_rate": 3.753989509243122e-05,
154
- "loss": 0.3065,
155
  "step": 220
156
  },
157
  {
158
  "epoch": 1.37,
159
- "learning_rate": 3.632511488929382e-05,
160
- "loss": 0.3254,
161
  "step": 230
162
  },
163
  {
164
  "epoch": 1.39,
165
  "step": 236,
166
  "total_flos": 4.241630717752115e+16,
167
- "train_loss": 0.40742398091291976,
168
- "train_runtime": 500.0224,
169
- "train_samples_per_second": 4.784,
170
- "train_steps_per_second": 1.196
171
  }
172
  ],
173
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 1e-06,
14
  "loss": 0.8728,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.03,
19
+ "learning_rate": 9.999999999999999e-06,
20
+ "loss": 0.8297,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.07,
25
+ "learning_rate": 1.9999999999999998e-05,
26
+ "loss": 0.7256,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.1,
31
+ "learning_rate": 3e-05,
32
+ "loss": 0.6035,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.13,
37
+ "learning_rate": 2.99770621018718e-05,
38
+ "loss": 0.4872,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.17,
43
+ "learning_rate": 2.990831856044326e-05,
44
+ "loss": 0.476,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.2,
49
+ "learning_rate": 2.979397962002776e-05,
50
+ "loss": 0.4522,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.23,
55
+ "learning_rate": 2.9634394973287605e-05,
56
+ "loss": 0.4198,
57
  "step": 70
58
  },
59
  {
60
  "epoch": 0.27,
61
+ "learning_rate": 2.943005269173876e-05,
62
+ "loss": 0.434,
63
  "step": 80
64
  },
65
  {
66
  "epoch": 0.3,
67
+ "learning_rate": 2.9181577733039554e-05,
68
+ "loss": 0.4245,
69
  "step": 90
70
  },
71
  {
72
  "epoch": 0.33,
73
+ "learning_rate": 2.8889730029628665e-05,
74
+ "loss": 0.4332,
75
  "step": 100
76
  },
77
  {
78
  "epoch": 0.37,
79
+ "learning_rate": 2.8555402164558058e-05,
80
+ "loss": 0.418,
81
  "step": 110
82
  },
83
  {
84
  "epoch": 1.01,
85
+ "learning_rate": 2.8179616641629125e-05,
86
+ "loss": 0.4121,
87
  "step": 120
88
  },
89
  {
90
  "epoch": 1.04,
91
+ "learning_rate": 2.776352275818093e-05,
92
+ "loss": 0.3505,
93
  "step": 130
94
  },
95
  {
96
  "epoch": 1.07,
97
+ "learning_rate": 2.730839309009485e-05,
98
+ "loss": 0.3558,
99
  "step": 140
100
  },
101
  {
102
  "epoch": 1.11,
103
+ "learning_rate": 2.6815619599765775e-05,
104
+ "loss": 0.3613,
105
  "step": 150
106
  },
107
  {
108
  "epoch": 1.14,
109
+ "learning_rate": 2.628670937894323e-05,
110
+ "loss": 0.3646,
111
  "step": 160
112
  },
113
  {
114
  "epoch": 1.17,
115
+ "learning_rate": 2.572328003946244e-05,
116
+ "loss": 0.3456,
117
  "step": 170
118
  },
119
  {
120
  "epoch": 1.21,
121
+ "learning_rate": 2.512705476596226e-05,
122
+ "loss": 0.3435,
123
  "step": 180
124
  },
125
  {
126
  "epoch": 1.24,
127
+ "learning_rate": 2.4499857045720705e-05,
128
+ "loss": 0.3614,
129
  "step": 190
130
  },
131
  {
132
  "epoch": 1.27,
133
+ "learning_rate": 2.3843605091726184e-05,
134
+ "loss": 0.3374,
135
  "step": 200
136
  },
137
  {
138
  "epoch": 1.27,
139
+ "eval_loss": 0.5180116295814514,
140
+ "eval_runtime": 6.8175,
141
+ "eval_samples_per_second": 20.535,
142
+ "eval_steps_per_second": 5.134,
143
  "step": 200
144
  },
145
  {
146
  "epoch": 1.31,
147
+ "learning_rate": 2.3160305976040984e-05,
148
+ "loss": 0.3647,
149
  "step": 210
150
  },
151
  {
152
  "epoch": 1.34,
153
+ "learning_rate": 2.2452049491399336e-05,
154
+ "loss": 0.3271,
155
  "step": 220
156
  },
157
  {
158
  "epoch": 1.37,
159
+ "learning_rate": 2.1721001759813677e-05,
160
+ "loss": 0.3519,
161
  "step": 230
162
  },
163
  {
164
  "epoch": 1.39,
165
  "step": 236,
166
  "total_flos": 4.241630717752115e+16,
167
+ "train_loss": 0.4328786596908408,
168
+ "train_runtime": 504.1662,
169
+ "train_samples_per_second": 4.744,
170
+ "train_steps_per_second": 1.186
171
  }
172
  ],
173
  "logging_steps": 10,