hllj committed
Commit
c523564
Parent: a8f7c0a

Model save

README.md ADDED
@@ -0,0 +1,59 @@
+ ---
+ base_model: hllj/mistral-vi-math
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: sft-mistral-v1-original-data
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # sft-mistral-v1-original-data
+
+ This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.5087
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.05
+ - num_epochs: 2
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 0.3103        | 1.27  | 200  | 0.5224          |
+
+
+ ### Framework versions
+
+ - Transformers 4.35.2
+ - Pytorch 2.1.0
+ - Datasets 2.15.0
+ - Tokenizers 0.15.0
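The card above omits usage. Since config_argument.yaml further down sets use_peft: true and this commit updates adapter_model.safetensors, the published weights are presumably a LoRA adapter on the base model; a minimal loading sketch under that assumption (the prompt and generation settings are illustrative, not documented by the card):

```python
# Minimal inference sketch (not this repo's training code): loads the
# checkpoint as a PEFT/LoRA adapter, per use_peft: true in the config below.
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

model_id = "hllj/sft-mistral-v1-original-data"  # hub_model_id from the config

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # training ran in fp16 (torch_dtype: float16)
    device_map="auto",
)

# Illustrative Vietnamese math prompt; the card documents no prompt format.
prompt = "Giải phương trình: 2x + 3 = 7"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```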
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f9f098cde9492e4e08a9e4630d9b6655df2c1a4d8159595ebc4007e34f209afb
+ oid sha256:28e27032630142d5d3a5633ce86247205605fe887fce465f29e6f88d604023e2
  size 872450448
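What changed here is only the Git LFS pointer, not the file itself: LFS stores the weights by content hash, and the pointer records the expected sha256 and byte size. A small sketch for checking a locally downloaded adapter_model.safetensors against the new pointer (the local path is hypothetical):

```python
# Verify a downloaded adapter_model.safetensors against the LFS pointer above.
import hashlib

EXPECTED_OID = "28e27032630142d5d3a5633ce86247205605fe887fce465f29e6f88d604023e2"
EXPECTED_SIZE = 872450448  # bytes, from the pointer's "size" line

path = "adapter_model.safetensors"  # hypothetical local path
h = hashlib.sha256()
size = 0
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size}"
assert h.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("adapter matches the LFS pointer")
```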
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "epoch": 1.39,
+     "eval_loss": 0.508748471736908,
+     "eval_runtime": 6.855,
+     "eval_samples": 140,
+     "eval_samples_per_second": 20.423,
+     "eval_steps_per_second": 5.106,
+     "train_loss": 0.40742398091291976,
+     "train_runtime": 500.0224,
+     "train_samples": 1196,
+     "train_samples_per_second": 4.784,
+     "train_steps_per_second": 1.196
+ }
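The throughput fields are derived quantities, so they can be cross-checked. On the eval side, 140 samples over 6.855 s gives 20.423 samples/s, and with per_device_eval_batch_size 4 that is 35 batches, i.e. 5.106 steps/s. The train-side figures match the configured totals (2 epochs × 1196 samples, 598 max steps per trainer_state.json below) over the measured runtime; since the log below stops at step 236, this reads like an interrupted or resumed run, though the files do not say so explicitly. A quick check:

```python
# Cross-check the derived throughput fields in all_results.json.
eval_runtime, eval_samples, eval_batch = 6.855, 140, 4
print(round(eval_samples / eval_runtime, 3))               # 20.423 samples/s
print(round(eval_samples / eval_batch / eval_runtime, 3))  # 5.106 steps/s

# Train side: configured totals over the measured runtime (an inference
# from the numbers, not something the files state explicitly).
train_runtime, train_samples, epochs, max_steps = 500.0224, 1196, 2, 598
print(round(train_samples * epochs / train_runtime, 3))    # 4.784 samples/s
print(round(max_steps / train_runtime, 3))                 # 1.196 steps/s
```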
config_argument.yaml ADDED
@@ -0,0 +1,52 @@
+ cache_dir: ./cache
+ ddp_find_unused_parameters: false
+ ddp_timeout: 30000
+ device_map: auto
+ do_eval: true
+ do_train: true
+ eval_steps: 200
+ evaluation_strategy: steps
+ fp16: true
+ gradient_accumulation_steps: 1
+ gradient_checkpointing: true
+ gradient_checkpointing_kwargs:
+   use_reentrant: false
+ hub_model_id: hllj/sft-mistral-v1-original-data
+ hub_strategy: every_save
+ learning_rate: 5.0e-05
+ log_level: info
+ logging_first_step: true
+ logging_steps: 10
+ logging_strategy: steps
+ lora_alpha: 128
+ lora_dropout: 0.05
+ lora_r: 256
+ lora_target_modules:
+ - q_proj
+ - k_proj
+ - v_proj
+ - o_proj
+ lr_scheduler_type: cosine
+ max_seq_length: 1024
+ model_name_or_path: hllj/mistral-vi-math
+ model_type: auto
+ num_train_epochs: 2
+ output_dir: outputs-sft-mistral-v1-original-data
+ overwrite_output_dir: true
+ per_device_eval_batch_size: 4
+ per_device_train_batch_size: 4
+ preprocessing_num_workers: 4
+ push_to_hub: true
+ report_to: wandb
+ run_name: sft-mistral-v1-original-data
+ save_steps: 200
+ save_strategy: steps
+ save_total_limit: 13
+ seed: 42
+ token: <redacted>
+ torch_dtype: float16
+ train_file_dir: datasets/finetune_original
+ use_peft: true
+ validation_file_dir: datasets/validation
+ warmup_ratio: 0.05
+ weight_decay: 0.05
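The lora_* keys translate directly into a peft LoraConfig; a sketch of the equivalent object (the task_type is an assumption from the causal-LM setup, and this is not necessarily how the training script constructs it):

```python
# Rough peft equivalent of the lora_* keys above (a sketch, not the repo's
# actual training code).
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,                  # lora_r: rank of the update matrices
    lora_alpha=128,         # lora_alpha: scaling numerator
    lora_dropout=0.05,      # lora_dropout
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # attention projections
    task_type="CAUSAL_LM",  # assumed: causal-LM fine-tune of Mistral
)
```

Note the r > alpha choice: the effective LoRA scaling alpha/r is 0.5, so the adapter's update is damped rather than amplified relative to the base weights.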
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 1.39,
+     "eval_loss": 0.508748471736908,
+     "eval_runtime": 6.855,
+     "eval_samples": 140,
+     "eval_samples_per_second": 20.423,
+     "eval_steps_per_second": 5.106
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 1.39,
+     "train_loss": 0.40742398091291976,
+     "train_runtime": 500.0224,
+     "train_samples": 1196,
+     "train_samples_per_second": 4.784,
+     "train_steps_per_second": 1.196
+ }
trainer_state.json ADDED
@@ -0,0 +1,180 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.394648829431438,
+   "eval_steps": 200,
+   "global_step": 236,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 1.6666666666666667e-06,
+       "loss": 0.8728,
+       "step": 1
+     },
+     {
+       "epoch": 0.03,
+       "learning_rate": 1.6666666666666667e-05,
+       "loss": 0.8168,
+       "step": 10
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 3.3333333333333335e-05,
+       "loss": 0.6667,
+       "step": 20
+     },
+     {
+       "epoch": 0.1,
+       "learning_rate": 5e-05,
+       "loss": 0.5432,
+       "step": 30
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 4.996177016978633e-05,
+       "loss": 0.4616,
+       "step": 40
+     },
+     {
+       "epoch": 0.17,
+       "learning_rate": 4.984719760073877e-05,
+       "loss": 0.4572,
+       "step": 50
+     },
+     {
+       "epoch": 0.2,
+       "learning_rate": 4.9656632700046265e-05,
+       "loss": 0.4327,
+       "step": 60
+     },
+     {
+       "epoch": 0.23,
+       "learning_rate": 4.9390658288812675e-05,
+       "loss": 0.401,
+       "step": 70
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 4.90500878195646e-05,
+       "loss": 0.4179,
+       "step": 80
+     },
+     {
+       "epoch": 0.3,
+       "learning_rate": 4.8635962888399254e-05,
+       "loss": 0.4091,
+       "step": 90
+     },
+     {
+       "epoch": 0.33,
+       "learning_rate": 4.820140360457198e-05,
+       "loss": 0.4178,
+       "step": 100
+     },
+     {
+       "epoch": 0.37,
+       "learning_rate": 4.7651197369406566e-05,
+       "loss": 0.4046,
+       "step": 110
+     },
+     {
+       "epoch": 1.01,
+       "learning_rate": 4.703171501987564e-05,
+       "loss": 0.396,
+       "step": 120
+     },
+     {
+       "epoch": 1.04,
+       "learning_rate": 4.6344851172382647e-05,
+       "loss": 0.3232,
+       "step": 130
+     },
+     {
+       "epoch": 1.07,
+       "learning_rate": 4.5592706521989154e-05,
+       "loss": 0.3301,
+       "step": 140
+     },
+     {
+       "epoch": 1.11,
+       "learning_rate": 4.477758141767761e-05,
+       "loss": 0.333,
+       "step": 150
+     },
+     {
+       "epoch": 1.14,
+       "learning_rate": 4.390196882699528e-05,
+       "loss": 0.3361,
+       "step": 160
+     },
+     {
+       "epoch": 1.17,
+       "learning_rate": 4.296854671159614e-05,
+       "loss": 0.3169,
+       "step": 170
+     },
+     {
+       "epoch": 1.21,
+       "learning_rate": 4.198016983699933e-05,
+       "loss": 0.3168,
+       "step": 180
+     },
+     {
+       "epoch": 1.24,
+       "learning_rate": 4.0939861041613107e-05,
+       "loss": 0.3351,
+       "step": 190
+     },
+     {
+       "epoch": 1.27,
+       "learning_rate": 3.9850801991726846e-05,
+       "loss": 0.3103,
+       "step": 200
+     },
+     {
+       "epoch": 1.27,
+       "eval_loss": 0.5224232077598572,
+       "eval_runtime": 6.8718,
+       "eval_samples_per_second": 20.373,
+       "eval_steps_per_second": 5.093,
+       "step": 200
+     },
+     {
+       "epoch": 1.31,
+       "learning_rate": 3.871632345074615e-05,
+       "loss": 0.3372,
+       "step": 210
+     },
+     {
+       "epoch": 1.34,
+       "learning_rate": 3.753989509243122e-05,
+       "loss": 0.3065,
+       "step": 220
+     },
+     {
+       "epoch": 1.37,
+       "learning_rate": 3.632511488929382e-05,
+       "loss": 0.3254,
+       "step": 230
+     },
+     {
+       "epoch": 1.39,
+       "step": 236,
+       "total_flos": 4.241630717752115e+16,
+       "train_loss": 0.40742398091291976,
+       "train_runtime": 500.0224,
+       "train_samples_per_second": 4.784,
+       "train_steps_per_second": 1.196
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 598,
+   "num_train_epochs": 2,
+   "save_steps": 200,
+   "total_flos": 4.241630717752115e+16,
+   "trial_name": null,
+   "trial_params": null
+ }
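The learning_rate column in log_history follows linear warmup plus cosine decay: warmup_ratio 0.05 of max_steps 598 gives 30 warmup steps (rounded up), which is exactly where the log peaks at 5e-05. A sketch reproducing the early logged values, using the same shape as transformers' get_cosine_schedule_with_warmup; entries from roughly step 100 onward sit one scheduler step behind this formula, which is consistent with a checkpoint resume (note the epoch jump from 0.37 to 1.01 mid-log), though the file does not state that:

```python
# Reproduce the learning_rate column above: linear warmup to the peak LR,
# then half-cosine decay to zero over the remaining steps.
import math

MAX_STEPS = 598  # trainer_state.json: max_steps
PEAK_LR = 5e-05  # config: learning_rate
WARMUP = 30      # warmup_ratio 0.05 * 598, rounded up

def lr_at(step: int) -> float:
    if step < WARMUP:
        return PEAK_LR * step / WARMUP
    progress = (step - WARMUP) / (MAX_STEPS - WARMUP)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

for step in (1, 10, 30, 40, 50):
    print(step, lr_at(step))
# 1  -> 1.6666666666666667e-06   (matches log_history)
# 10 -> 1.6666666666666667e-05
# 30 -> 5e-05
# 40 -> 4.996177016978633e-05
# 50 -> 4.984719760073877e-05
```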