ChenWu98 committed
Commit 40945e6
Parent: b08c468

Model save

Files changed (5)
  1. README.md +68 -0
  2. all_results.json +13 -0
  3. eval_results.json +8 -0
  4. train_results.json +8 -0
  5. trainer_state.json +152 -0
README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ license: mit
+ library_name: peft
+ tags:
+ - trl
+ - sft
+ - generated_from_trainer
+ base_model: HuggingFaceH4/zephyr-7b-beta
+ model-index:
+ - name: skills_red_herring_chat-lora
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # skills_red_herring_chat-lora
+
+ This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) on the None dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.2122
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 4
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 16
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 4.0
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 0.3285        | 0.96  | 18   | 0.2505          |
+ | 0.1944        | 1.97  | 37   | 0.2189          |
+ | 0.1767        | 2.99  | 56   | 0.2127          |
+ | 0.1591        | 3.84  | 72   | 0.2122          |
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
+ - Transformers 4.37.2
+ - Pytorch 2.1.2+cu121
+ - Datasets 2.14.6
+ - Tokenizers 0.15.1
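
This commit adds only a PEFT LoRA adapter (note `library_name: peft` and the PEFT 0.7.1 pin), so inference means attaching the adapter to the zephyr-7b-beta base model. A minimal usage sketch, assuming the adapter is published as `ChenWu98/skills_red_herring_chat-lora` (a hypothetical repo id inferred from the committer and model name; substitute the real path if it differs):

```python
# Sketch: load the LoRA adapter on top of its base model with PEFT.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_id = "ChenWu98/skills_red_herring_chat-lora"  # hypothetical repo id

# AutoPeftModelForCausalLM reads adapter_config.json, downloads the base
# model (HuggingFaceH4/zephyr-7b-beta), and attaches the LoRA weights.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Zephyr is a chat model, so format prompts with its chat template.
messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```
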
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "epoch": 3.84,
+     "eval_loss": 0.21221186220645905,
+     "eval_runtime": 4.0792,
+     "eval_samples": 100,
+     "eval_samples_per_second": 24.515,
+     "eval_steps_per_second": 3.187,
+     "train_loss": 0.4049788423710399,
+     "train_runtime": 399.0811,
+     "train_samples": 300,
+     "train_samples_per_second": 3.007,
+     "train_steps_per_second": 0.18
+ }
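
The throughput fields above are derived values and can be sanity-checked from the raw counts, assuming the usual Trainer conventions: eval throughput is eval_samples / eval_runtime, while the train figures use the scheduled 4 epochs and 72 optimizer steps (eval_batch_size is 8 per the model card). A quick check:

```python
# Reproduce the derived throughput numbers in all_results.json.
import math

eval_samples, eval_runtime = 100, 4.0792
train_samples, train_runtime = 300, 399.0811
num_epochs, max_steps, eval_batch_size = 4, 72, 8

print(round(eval_samples / eval_runtime, 3))                  # 24.515 samples/s
eval_batches = math.ceil(eval_samples / eval_batch_size)      # 13 eval steps
print(round(eval_batches / eval_runtime, 3))                  # 3.187 steps/s
print(round(train_samples * num_epochs / train_runtime, 3))   # 3.007 samples/s
print(round(max_steps / train_runtime, 3))                    # 0.18 steps/s
```

All four reproduce the reported figures, which also shows that train_samples_per_second reflects the scheduled 4 epochs rather than the 3.84 actually logged.
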
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 3.84,
+     "eval_loss": 0.21221186220645905,
+     "eval_runtime": 4.0792,
+     "eval_samples": 100,
+     "eval_samples_per_second": 24.515,
+     "eval_steps_per_second": 3.187
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 3.84,
+     "train_loss": 0.4049788423710399,
+     "train_runtime": 399.0811,
+     "train_samples": 300,
+     "train_samples_per_second": 3.007,
+     "train_steps_per_second": 0.18
+ }
trainer_state.json ADDED
@@ -0,0 +1,152 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 3.84,
+   "eval_steps": 500,
+   "global_step": 72,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.05,
+       "learning_rate": 2.5e-05,
+       "loss": 2.4616,
+       "step": 1
+     },
+     {
+       "epoch": 0.27,
+       "learning_rate": 0.000125,
+       "loss": 2.2926,
+       "step": 5
+     },
+     {
+       "epoch": 0.53,
+       "learning_rate": 0.0001995184726672197,
+       "loss": 1.0788,
+       "step": 10
+     },
+     {
+       "epoch": 0.8,
+       "learning_rate": 0.00019415440651830208,
+       "loss": 0.3285,
+       "step": 15
+     },
+     {
+       "epoch": 0.96,
+       "eval_loss": 0.25046542286872864,
+       "eval_runtime": 4.9708,
+       "eval_samples_per_second": 20.118,
+       "eval_steps_per_second": 2.615,
+       "step": 18
+     },
+     {
+       "epoch": 1.07,
+       "learning_rate": 0.00018314696123025454,
+       "loss": 0.2382,
+       "step": 20
+     },
+     {
+       "epoch": 1.33,
+       "learning_rate": 0.00016715589548470185,
+       "loss": 0.2235,
+       "step": 25
+     },
+     {
+       "epoch": 1.6,
+       "learning_rate": 0.0001471396736825998,
+       "loss": 0.2044,
+       "step": 30
+     },
+     {
+       "epoch": 1.87,
+       "learning_rate": 0.0001242980179903264,
+       "loss": 0.1944,
+       "step": 35
+     },
+     {
+       "epoch": 1.97,
+       "eval_loss": 0.21891021728515625,
+       "eval_runtime": 4.0841,
+       "eval_samples_per_second": 24.485,
+       "eval_steps_per_second": 3.183,
+       "step": 37
+     },
+     {
+       "epoch": 2.13,
+       "learning_rate": 0.0001,
+       "loss": 0.1881,
+       "step": 40
+     },
+     {
+       "epoch": 2.4,
+       "learning_rate": 7.570198200967362e-05,
+       "loss": 0.1768,
+       "step": 45
+     },
+     {
+       "epoch": 2.67,
+       "learning_rate": 5.286032631740023e-05,
+       "loss": 0.1628,
+       "step": 50
+     },
+     {
+       "epoch": 2.93,
+       "learning_rate": 3.2844104515298155e-05,
+       "loss": 0.1767,
+       "step": 55
+     },
+     {
+       "epoch": 2.99,
+       "eval_loss": 0.2127072662115097,
+       "eval_runtime": 4.0548,
+       "eval_samples_per_second": 24.662,
+       "eval_steps_per_second": 3.206,
+       "step": 56
+     },
+     {
+       "epoch": 3.2,
+       "learning_rate": 1.6853038769745467e-05,
+       "loss": 0.1663,
+       "step": 60
+     },
+     {
+       "epoch": 3.47,
+       "learning_rate": 5.8455934816979305e-06,
+       "loss": 0.1468,
+       "step": 65
+     },
+     {
+       "epoch": 3.73,
+       "learning_rate": 4.815273327803182e-07,
+       "loss": 0.1591,
+       "step": 70
+     },
+     {
+       "epoch": 3.84,
+       "eval_loss": 0.21221186220645905,
+       "eval_runtime": 4.0724,
+       "eval_samples_per_second": 24.556,
+       "eval_steps_per_second": 3.192,
+       "step": 72
+     },
+     {
+       "epoch": 3.84,
+       "step": 72,
+       "total_flos": 40669301473280.0,
+       "train_loss": 0.4049788423710399,
+       "train_runtime": 399.0811,
+       "train_samples_per_second": 3.007,
+       "train_steps_per_second": 0.18
+     }
+   ],
+   "logging_steps": 5,
+   "max_steps": 72,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 4,
+   "save_steps": 500,
+   "total_flos": 40669301473280.0,
+   "train_batch_size": 4,
+   "trial_name": null,
+   "trial_params": null
+ }
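
The `learning_rate` trace in `log_history` is consistent with the cosine-with-warmup schedule declared in the README: base lr 2e-4 and warmup_ratio 0.1 of max_steps 72, i.e. ceil(7.2) = 8 warmup steps. A minimal sketch mirroring the standard transformers cosine lambda (an illustration, not the actual training code) reproduces the logged values up to float rounding:

```python
# Reproduce the learning_rate values logged in trainer_state.json:
# linear warmup for 8 steps, then a half-cosine decay to step 72.
import math

BASE_LR, MAX_STEPS = 2e-4, 72
WARMUP_STEPS = math.ceil(0.1 * MAX_STEPS)  # ceil(7.2) = 8

def lr_at(step: int) -> float:
    if step < WARMUP_STEPS:
        return BASE_LR * step / WARMUP_STEPS
    progress = (step - WARMUP_STEPS) / (MAX_STEPS - WARMUP_STEPS)
    return BASE_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

for step in (1, 5, 10, 40, 70):
    print(step, lr_at(step))
# 1  2.5e-05
# 5  0.000125
# 10 0.0001995184726672197
# 40 0.0001
# 70 4.815273327803182e-07
```

Warmup climbs 2.5e-05 per step through step 8, and the decay passes through lr/2 = 1e-4 at step 40, the midpoint of the cosine segment, matching the log entry there.
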