nlpguy committed on
Commit
279c2b8
1 Parent(s): 6274349

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,60 @@
1
+ ---
2
+ license: other
3
+ library_name: peft
4
+ tags:
5
+ - llama-factory
6
+ - lora
7
+ - unsloth
8
+ - generated_from_trainer
9
+ base_model: chujiezheng/Starling-LM-7B-alpha-ExPO
10
+ model-index:
11
+ - name: train_2024-05-08-19-49-29
12
+ results: []
13
+ ---
14
+
15
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
+ should probably proofread and complete it, then remove this comment. -->
17
+
18
+ # train_2024-05-08-19-49-29
19
+
20
+ This model is a fine-tuned version of [chujiezheng/Starling-LM-7B-alpha-ExPO](https://huggingface.co/chujiezheng/Starling-LM-7B-alpha-ExPO) on the no_robots dataset.
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
+ More information needed
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 5e-05
40
+ - train_batch_size: 2
41
+ - eval_batch_size: 8
42
+ - seed: 42
43
+ - gradient_accumulation_steps: 8
44
+ - total_train_batch_size: 16
45
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: cosine
47
+ - num_epochs: 0.1
48
+ - mixed_precision_training: Native AMP
49
+
50
+ ### Training results
51
+
52
+
53
+
54
+ ### Framework versions
55
+
56
+ - PEFT 0.10.0
57
+ - Transformers 4.40.1
58
+ - Pytorch 2.2.1+cu121
59
+ - Datasets 2.19.1
60
+ - Tokenizers 0.19.1
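A minimal sketch of running the adapter on top of the base model (assumes `peft` and `accelerate` are installed and that the adapter files from this folder are available locally; the prompt string follows the OpenChat-style `chat_template` committed in `tokenizer_config.json` below):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "chujiezheng/Starling-LM-7B-alpha-ExPO"
adapter_path = "."  # this folder, or the Hub repo id the adapter is published under

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_path)  # attach the LoRA weights

# OpenChat / "GPT4 Correct" prompt format used during training
prompt = "GPT4 Correct User: Give me three tips for writing clear emails.<|end_of_turn|>GPT4 Correct Assistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```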
adapter_config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "chujiezheng/Starling-LM-7B-alpha-ExPO",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": "unsloth",
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
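The same adapter shape can be written out with PEFT's `LoraConfig`; a sketch mirroring the fields above (the committed JSON stays the source of truth):

```python
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                  # "r": 8
    lora_alpha=16,                        # "lora_alpha": 16
    lora_dropout=0.0,                     # "lora_dropout": 0
    bias="none",                          # "bias": "none"
    target_modules=["q_proj", "v_proj"],  # attention query/value projections only
)
```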
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e868c6f310cde5819d461685833d5c4ba914a779ab400a10df62f05a7ba90bc
3
+ size 13648432
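The file size is consistent with the trainable-parameter count reported later in `running_log.txt`, assuming the LoRA tensors are stored in float32:

```python
trainable_params = 3_407_872       # from the running log below
raw_bytes = trainable_params * 4   # float32 storage
print(raw_bytes)                   # 13631488
print(13_648_432 - raw_bytes)      # ~17 KB left over for the safetensors header/metadata
```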
added_tokens.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "<sep>": 32002,
3
+ "<|end_of_turn|>": 32000,
4
+ "<|pad_0|>": 32001
5
+ }
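A quick check (sketch, assuming the tokenizer files committed in this folder are loaded from the working directory) that the added tokens resolve to these ids:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
for token in ["<|end_of_turn|>", "<|pad_0|>", "<sep>"]:
    print(token, tok.convert_tokens_to_ids(token))  # expect 32000, 32001, 32002
```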
all_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 0.10105263157894737,
3
+ "total_flos": 1.783322464223232e+16,
4
+ "train_loss": 1.624589498837789,
5
+ "train_runtime": 968.353,
6
+ "train_samples_per_second": 0.981,
7
+ "train_steps_per_second": 0.062
8
+ }
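The aggregate numbers are internally consistent; a small sanity check:

```python
runtime_s = 968.353          # train_runtime above
total_steps = 60             # from trainer_state.json below
effective_batch = 2 * 8      # per-device batch size x gradient accumulation steps

print(round(total_steps / runtime_s, 3))  # 0.062 -> matches train_steps_per_second
print(total_steps * effective_batch)      # 960 examples seen, i.e. ~0.101 epochs of 9,500
```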
running_log.txt ADDED
@@ -0,0 +1,282 @@
1
+ 05/08/2024 19:50:37 - INFO - transformers.tokenization_utils_base - loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer.model
2
+
3
+ 05/08/2024 19:50:37 - INFO - transformers.tokenization_utils_base - loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer.json
4
+
5
+ 05/08/2024 19:50:37 - INFO - transformers.tokenization_utils_base - loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/added_tokens.json
6
+
7
+ 05/08/2024 19:50:37 - INFO - transformers.tokenization_utils_base - loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/special_tokens_map.json
8
+
9
+ 05/08/2024 19:50:37 - INFO - transformers.tokenization_utils_base - loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer_config.json
10
+
11
+ 05/08/2024 19:50:38 - WARNING - transformers.tokenization_utils_base - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
12
+
13
+ 05/08/2024 19:50:38 - INFO - llmtuner.data.loader - Loading dataset realign_no_robots.jsonl...
14
+
15
+ 05/08/2024 19:50:39 - INFO - transformers.configuration_utils - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/config.json
16
+
17
+ 05/08/2024 19:50:39 - INFO - transformers.configuration_utils - Model config MistralConfig {
18
+ "_name_or_path": "chujiezheng/Starling-LM-7B-alpha-ExPO",
19
+ "architectures": [
20
+ "MistralForCausalLM"
21
+ ],
22
+ "attention_dropout": 0.0,
23
+ "bos_token_id": 1,
24
+ "eos_token_id": 32000,
25
+ "hidden_act": "silu",
26
+ "hidden_size": 4096,
27
+ "initializer_range": 0.02,
28
+ "intermediate_size": 14336,
29
+ "max_position_embeddings": 8192,
30
+ "model_type": "mistral",
31
+ "num_attention_heads": 32,
32
+ "num_hidden_layers": 32,
33
+ "num_key_value_heads": 8,
34
+ "rms_norm_eps": 1e-05,
35
+ "rope_theta": 10000.0,
36
+ "sliding_window": 4096,
37
+ "tie_word_embeddings": false,
38
+ "torch_dtype": "bfloat16",
39
+ "transformers_version": "4.40.1",
40
+ "use_cache": true,
41
+ "vocab_size": 32002
42
+ }
43
+
44
+
45
+ 05/08/2024 19:50:39 - INFO - transformers.configuration_utils - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/config.json
46
+
47
+ 05/08/2024 19:50:39 - INFO - transformers.configuration_utils - Model config MistralConfig {
48
+ "_name_or_path": "chujiezheng/Starling-LM-7B-alpha-ExPO",
49
+ "architectures": [
50
+ "MistralForCausalLM"
51
+ ],
52
+ "attention_dropout": 0.0,
53
+ "bos_token_id": 1,
54
+ "eos_token_id": 32000,
55
+ "hidden_act": "silu",
56
+ "hidden_size": 4096,
57
+ "initializer_range": 0.02,
58
+ "intermediate_size": 14336,
59
+ "max_position_embeddings": 8192,
60
+ "model_type": "mistral",
61
+ "num_attention_heads": 32,
62
+ "num_hidden_layers": 32,
63
+ "num_key_value_heads": 8,
64
+ "rms_norm_eps": 1e-05,
65
+ "rope_theta": 10000.0,
66
+ "sliding_window": 4096,
67
+ "tie_word_embeddings": false,
68
+ "torch_dtype": "bfloat16",
69
+ "transformers_version": "4.40.1",
70
+ "use_cache": true,
71
+ "vocab_size": 32002
72
+ }
73
+
74
+
75
+ 05/08/2024 19:50:39 - INFO - transformers.configuration_utils - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/config.json
76
+
77
+ 05/08/2024 19:50:39 - INFO - transformers.configuration_utils - Model config MistralConfig {
78
+ "_name_or_path": "chujiezheng/Starling-LM-7B-alpha-ExPO",
79
+ "architectures": [
80
+ "MistralForCausalLM"
81
+ ],
82
+ "attention_dropout": 0.0,
83
+ "bos_token_id": 1,
84
+ "eos_token_id": 32000,
85
+ "hidden_act": "silu",
86
+ "hidden_size": 4096,
87
+ "initializer_range": 0.02,
88
+ "intermediate_size": 14336,
89
+ "max_position_embeddings": 8192,
90
+ "model_type": "mistral",
91
+ "num_attention_heads": 32,
92
+ "num_hidden_layers": 32,
93
+ "num_key_value_heads": 8,
94
+ "rms_norm_eps": 1e-05,
95
+ "rope_theta": 10000.0,
96
+ "sliding_window": 4096,
97
+ "tie_word_embeddings": false,
98
+ "torch_dtype": "bfloat16",
99
+ "transformers_version": "4.40.1",
100
+ "use_cache": true,
101
+ "vocab_size": 32002
102
+ }
103
+
104
+
105
+ 05/08/2024 19:50:40 - INFO - transformers.configuration_utils - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/config.json
106
+
107
+ 05/08/2024 19:50:40 - INFO - transformers.configuration_utils - Model config MistralConfig {
108
+ "_name_or_path": "chujiezheng/Starling-LM-7B-alpha-ExPO",
109
+ "architectures": [
110
+ "MistralForCausalLM"
111
+ ],
112
+ "attention_dropout": 0.0,
113
+ "bos_token_id": 1,
114
+ "eos_token_id": 32000,
115
+ "hidden_act": "silu",
116
+ "hidden_size": 4096,
117
+ "initializer_range": 0.02,
118
+ "intermediate_size": 14336,
119
+ "max_position_embeddings": 8192,
120
+ "model_type": "mistral",
121
+ "num_attention_heads": 32,
122
+ "num_hidden_layers": 32,
123
+ "num_key_value_heads": 8,
124
+ "rms_norm_eps": 1e-05,
125
+ "rope_theta": 10000.0,
126
+ "sliding_window": 4096,
127
+ "tie_word_embeddings": false,
128
+ "torch_dtype": "float16",
129
+ "transformers_version": "4.40.1",
130
+ "use_cache": true,
131
+ "vocab_size": 32002
132
+ }
133
+
134
+
135
+ 05/08/2024 19:50:40 - INFO - transformers.modeling_utils - loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/model.safetensors.index.json
136
+
137
+ 05/08/2024 19:50:40 - INFO - transformers.modeling_utils - Instantiating MistralForCausalLM model under default dtype torch.float16.
138
+
139
+ 05/08/2024 19:50:40 - INFO - transformers.generation.configuration_utils - Generate config GenerationConfig {
140
+ "bos_token_id": 1,
141
+ "eos_token_id": 32000
142
+ }
143
+
144
+
145
+ 05/08/2024 19:51:52 - INFO - transformers.modeling_utils - All model checkpoint weights were used when initializing MistralForCausalLM.
146
+
147
+
148
+ 05/08/2024 19:51:52 - INFO - transformers.modeling_utils - All the weights of MistralForCausalLM were initialized from the model checkpoint at chujiezheng/Starling-LM-7B-alpha-ExPO.
149
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
150
+
151
+ 05/08/2024 19:51:52 - INFO - transformers.generation.configuration_utils - loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/generation_config.json
152
+
153
+ 05/08/2024 19:51:52 - INFO - transformers.generation.configuration_utils - Generate config GenerationConfig {
154
+ "bos_token_id": 1,
155
+ "do_sample": true,
156
+ "eos_token_id": 32000,
157
+ "max_length": 8192,
158
+ "pad_token_id": 0,
159
+ "temperature": 0.5
160
+ }
161
+
162
+
163
+ 05/08/2024 19:51:52 - INFO - transformers.tokenization_utils_base - loading file tokenizer.model from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer.model
164
+
165
+ 05/08/2024 19:51:52 - INFO - transformers.tokenization_utils_base - loading file added_tokens.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/added_tokens.json
166
+
167
+ 05/08/2024 19:51:52 - INFO - transformers.tokenization_utils_base - loading file special_tokens_map.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/special_tokens_map.json
168
+
169
+ 05/08/2024 19:51:52 - INFO - transformers.tokenization_utils_base - loading file tokenizer_config.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer_config.json
170
+
171
+ 05/08/2024 19:51:52 - INFO - transformers.tokenization_utils_base - loading file tokenizer.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer.json
172
+
173
+ 05/08/2024 19:51:52 - WARNING - transformers.tokenization_utils_base - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
174
+
175
+ 05/08/2024 19:51:53 - INFO - transformers.tokenization_utils_base - loading file tokenizer.model from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer.model
176
+
177
+ 05/08/2024 19:51:53 - INFO - transformers.tokenization_utils_base - loading file tokenizer.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer.json
178
+
179
+ 05/08/2024 19:51:53 - INFO - transformers.tokenization_utils_base - loading file added_tokens.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/added_tokens.json
180
+
181
+ 05/08/2024 19:51:53 - INFO - transformers.tokenization_utils_base - loading file special_tokens_map.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/special_tokens_map.json
182
+
183
+ 05/08/2024 19:51:53 - INFO - transformers.tokenization_utils_base - loading file tokenizer_config.json from cache at huggingface_tokenizers_cache/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/tokenizer_config.json
184
+
185
+ 05/08/2024 19:51:53 - WARNING - transformers.tokenization_utils_base - Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
186
+
187
+ 05/08/2024 19:51:53 - WARNING - transformers.models.llama.modeling_llama - chujiezheng/Starling-LM-7B-alpha-ExPO does not have a padding token! Will use pad_token = <unk>.
188
+
189
+ 05/08/2024 19:51:54 - INFO - llmtuner.model.utils.checkpointing - Gradient checkpointing enabled.
190
+
191
+ 05/08/2024 19:51:54 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
192
+
193
+ 05/08/2024 19:51:54 - WARNING - transformers.models.llama.modeling_llama - Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
194
+ are not enabled or a bias term (like in Qwen) is used.
195
+
196
+ 05/08/2024 19:51:54 - WARNING - transformers.models.llama.modeling_llama - Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
197
+ are not enabled or a bias term (like in Qwen) is used.
198
+
199
+ 05/08/2024 19:51:54 - WARNING - transformers.models.llama.modeling_llama - Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
200
+ are not enabled or a bias term (like in Qwen) is used.
201
+
202
+ 05/08/2024 19:51:54 - WARNING - transformers.models.llama.modeling_llama - Unsloth 2024.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
203
+
204
+ 05/08/2024 19:51:54 - INFO - llmtuner.model.loader - trainable params: 3407872 || all params: 7245156352 || trainable%: 0.0470
205
+
206
+ 05/08/2024 19:51:54 - INFO - transformers.trainer - Using auto half precision backend
207
+
208
+ 05/08/2024 19:51:54 - WARNING - transformers.trainer - ==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1
209
+ \\ /| Num examples = 9,500 | Num Epochs = 1
210
+ O^O/ \_/ \ Batch size per device = 2 | Gradient Accumulation steps = 8
211
+ \ / Total batch size = 16 | Total steps = 60
212
+ "-____-" Number of trainable parameters = 3,407,872
213
+
214
+ 05/08/2024 19:53:17 - INFO - llmtuner.extras.callbacks - {'loss': 1.8162, 'learning_rate': 4.9148e-05, 'epoch': 0.01}
215
+
216
+ 05/08/2024 19:54:37 - INFO - llmtuner.extras.callbacks - {'loss': 1.7065, 'learning_rate': 4.6651e-05, 'epoch': 0.02}
217
+
218
+ 05/08/2024 19:55:55 - INFO - llmtuner.extras.callbacks - {'loss': 1.6809, 'learning_rate': 4.2678e-05, 'epoch': 0.03}
219
+
220
+ 05/08/2024 19:57:22 - INFO - llmtuner.extras.callbacks - {'loss': 1.7278, 'learning_rate': 3.7500e-05, 'epoch': 0.03}
221
+
222
+ 05/08/2024 19:58:49 - INFO - llmtuner.extras.callbacks - {'loss': 1.5573, 'learning_rate': 3.1470e-05, 'epoch': 0.04}
223
+
224
+ 05/08/2024 20:00:11 - INFO - llmtuner.extras.callbacks - {'loss': 1.5858, 'learning_rate': 2.5000e-05, 'epoch': 0.05}
225
+
226
+ 05/08/2024 20:01:39 - INFO - llmtuner.extras.callbacks - {'loss': 1.4219, 'learning_rate': 1.8530e-05, 'epoch': 0.06}
227
+
228
+ 05/08/2024 20:02:51 - INFO - llmtuner.extras.callbacks - {'loss': 1.6545, 'learning_rate': 1.2500e-05, 'epoch': 0.07}
229
+
230
+ 05/08/2024 20:04:11 - INFO - llmtuner.extras.callbacks - {'loss': 1.5896, 'learning_rate': 7.3223e-06, 'epoch': 0.08}
231
+
232
+ 05/08/2024 20:05:24 - INFO - llmtuner.extras.callbacks - {'loss': 1.4727, 'learning_rate': 3.3494e-06, 'epoch': 0.08}
233
+
234
+ 05/08/2024 20:06:42 - INFO - llmtuner.extras.callbacks - {'loss': 1.6785, 'learning_rate': 8.5185e-07, 'epoch': 0.09}
235
+
236
+ 05/08/2024 20:08:03 - INFO - llmtuner.extras.callbacks - {'loss': 1.6034, 'learning_rate': 0.0000e+00, 'epoch': 0.10}
237
+
238
+ 05/08/2024 20:08:03 - INFO - transformers.trainer -
239
+
240
+ Training completed. Do not forget to share your model on huggingface.co/models =)
241
+
242
+
243
+
244
+ 05/08/2024 20:08:03 - INFO - transformers.trainer - Saving model checkpoint to saves/OpenChat3.5-7B-Chat/lora/train_2024-05-08-19-49-29
245
+
246
+ 05/08/2024 20:08:03 - INFO - transformers.configuration_utils - loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--chujiezheng--Starling-LM-7B-alpha-ExPO/snapshots/fc53fe6883ff3ea00a89be453a6cc586b3c369c3/config.json
247
+
248
+ 05/08/2024 20:08:03 - INFO - transformers.configuration_utils - Model config MistralConfig {
249
+ "_name_or_path": "openchat/openchat_3.5",
250
+ "architectures": [
251
+ "MistralForCausalLM"
252
+ ],
253
+ "attention_dropout": 0.0,
254
+ "bos_token_id": 1,
255
+ "eos_token_id": 32000,
256
+ "hidden_act": "silu",
257
+ "hidden_size": 4096,
258
+ "initializer_range": 0.02,
259
+ "intermediate_size": 14336,
260
+ "max_position_embeddings": 8192,
261
+ "model_type": "mistral",
262
+ "num_attention_heads": 32,
263
+ "num_hidden_layers": 32,
264
+ "num_key_value_heads": 8,
265
+ "rms_norm_eps": 1e-05,
266
+ "rope_theta": 10000.0,
267
+ "sliding_window": 4096,
268
+ "tie_word_embeddings": false,
269
+ "torch_dtype": "bfloat16",
270
+ "transformers_version": "4.40.1",
271
+ "use_cache": true,
272
+ "vocab_size": 32002
273
+ }
274
+
275
+
276
+ 05/08/2024 20:08:03 - INFO - transformers.tokenization_utils_base - tokenizer config file saved in saves/OpenChat3.5-7B-Chat/lora/train_2024-05-08-19-49-29/tokenizer_config.json
277
+
278
+ 05/08/2024 20:08:03 - INFO - transformers.tokenization_utils_base - Special tokens file saved in saves/OpenChat3.5-7B-Chat/lora/train_2024-05-08-19-49-29/special_tokens_map.json
279
+
280
+ 05/08/2024 20:08:03 - INFO - transformers.modelcard - Dropping the following result as it does not have all the necessary fields:
281
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
282
+
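The `trainable params: 3407872` line in the log follows from the model geometry printed above (hidden size 4096, 32 attention heads, 8 KV heads, 32 layers) and the LoRA setup (rank 8 on `q_proj` and `v_proj`); a sketch of the arithmetic:

```python
hidden = 4096
n_layers = 32
head_dim = hidden // 32          # 128
kv_out = 8 * head_dim            # v_proj output dim under grouped-query attention: 1024
r = 8

q_proj = r * hidden + hidden * r     # LoRA A (r x in) + LoRA B (out x r) for q_proj
v_proj = r * hidden + kv_out * r     # same for v_proj, with the smaller output dim
print(n_layers * (q_proj + v_proj))  # 3407872
```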
special_tokens_map.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|end_of_turn|>",
4
+ "<|pad_0|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<s>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|end_of_turn|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|end_of_turn|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "sep_token": {
28
+ "content": "<sep>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "unk_token": {
35
+ "content": "<unk>",
36
+ "lstrip": false,
37
+ "normalized": false,
38
+ "rstrip": false,
39
+ "single_word": false
40
+ }
41
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "32000": {
30
+ "content": "<|end_of_turn|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|pad_0|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<sep>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<|end_of_turn|>",
56
+ "<|pad_0|>"
57
+ ],
58
+ "bos_token": "<s>",
59
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'GPT4 Correct User: ' + content + '<|end_of_turn|>' + 'GPT4 Correct Assistant:' }}{% elif message['role'] == 'assistant' %}{{ content + '<|end_of_turn|>' }}{% endif %}{% endfor %}",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "<|end_of_turn|>",
62
+ "legacy": true,
63
+ "model_max_length": 1000000000000000019884624838656,
64
+ "pad_token": "<|end_of_turn|>",
65
+ "padding_side": "right",
66
+ "sep_token": "<sep>",
67
+ "sp_model_kwargs": {},
68
+ "spaces_between_special_tokens": false,
69
+ "split_special_tokens": false,
70
+ "tokenizer_class": "LlamaTokenizer",
71
+ "unk_token": "<unk>",
72
+ "use_default_system_prompt": true
73
+ }
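The `chat_template` above renders conversations in the OpenChat "GPT4 Correct" format; a sketch (assumes the tokenizer committed in this folder; note the template reads the system message from the first turn):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is LoRA?"},
]
print(tok.apply_chat_template(messages, tokenize=False))
# <s>You are a helpful assistant.GPT4 Correct User: What is LoRA?<|end_of_turn|>GPT4 Correct Assistant:
```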
train_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 0.10105263157894737,
3
+ "total_flos": 1.783322464223232e+16,
4
+ "train_loss": 1.624589498837789,
5
+ "train_runtime": 968.353,
6
+ "train_samples_per_second": 0.981,
7
+ "train_steps_per_second": 0.062
8
+ }
trainer_config.yaml ADDED
@@ -0,0 +1,29 @@
1
+ cutoff_len: 1024
2
+ dataset: no_robots
3
+ dataset_dir: data
4
+ do_train: true
5
+ finetuning_type: lora
6
+ flash_attn: auto
7
+ fp16: true
8
+ gradient_accumulation_steps: 8
9
+ learning_rate: 5.0e-05
10
+ logging_steps: 5
11
+ lora_alpha: 16
12
+ lora_dropout: 0
13
+ lora_rank: 8
14
+ lora_target: q_proj,v_proj
15
+ lr_scheduler_type: cosine
16
+ max_grad_norm: 1.0
17
+ max_samples: 100000
18
+ model_name_or_path: chujiezheng/Starling-LM-7B-alpha-ExPO
19
+ num_train_epochs: 0.1
20
+ optim: adamw_torch
21
+ output_dir: saves/OpenChat3.5-7B-Chat/lora/train_2024-05-08-19-49-29
22
+ packing: false
23
+ per_device_train_batch_size: 2
24
+ report_to: none
25
+ save_steps: 100
26
+ stage: sft
27
+ template: openchat
28
+ use_unsloth: true
29
+ warmup_steps: 0
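The 60 optimizer steps seen in the logs follow from this configuration together with the 9,500 no_robots examples reported in `running_log.txt`; roughly:

```python
import math

num_examples = 9_500                 # from the running log
num_train_epochs = 0.1
per_device_train_batch_size = 2
gradient_accumulation_steps = 8

updates_per_epoch = math.ceil(num_examples / per_device_train_batch_size / gradient_accumulation_steps)
total_steps = math.ceil(updates_per_epoch * num_train_epochs)
print(updates_per_epoch, total_steps)  # 594 updates per full epoch -> 60 steps for 0.1 epochs
```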
trainer_log.jsonl ADDED
@@ -0,0 +1,13 @@
1
+ {"current_steps": 5, "total_steps": 60, "loss": 1.8162, "learning_rate": 4.914814565722671e-05, "epoch": 0.008421052631578947, "percentage": 8.33, "elapsed_time": "0:01:22", "remaining_time": "0:15:10"}
2
+ {"current_steps": 10, "total_steps": 60, "loss": 1.7065, "learning_rate": 4.665063509461097e-05, "epoch": 0.016842105263157894, "percentage": 16.67, "elapsed_time": "0:02:42", "remaining_time": "0:13:32"}
3
+ {"current_steps": 15, "total_steps": 60, "loss": 1.6809, "learning_rate": 4.267766952966369e-05, "epoch": 0.02526315789473684, "percentage": 25.0, "elapsed_time": "0:04:00", "remaining_time": "0:12:01"}
4
+ {"current_steps": 20, "total_steps": 60, "loss": 1.7278, "learning_rate": 3.7500000000000003e-05, "epoch": 0.03368421052631579, "percentage": 33.33, "elapsed_time": "0:05:27", "remaining_time": "0:10:54"}
5
+ {"current_steps": 25, "total_steps": 60, "loss": 1.5573, "learning_rate": 3.147047612756302e-05, "epoch": 0.042105263157894736, "percentage": 41.67, "elapsed_time": "0:06:54", "remaining_time": "0:09:40"}
6
+ {"current_steps": 30, "total_steps": 60, "loss": 1.5858, "learning_rate": 2.5e-05, "epoch": 0.05052631578947368, "percentage": 50.0, "elapsed_time": "0:08:16", "remaining_time": "0:08:16"}
7
+ {"current_steps": 35, "total_steps": 60, "loss": 1.4219, "learning_rate": 1.852952387243698e-05, "epoch": 0.05894736842105263, "percentage": 58.33, "elapsed_time": "0:09:44", "remaining_time": "0:06:57"}
8
+ {"current_steps": 40, "total_steps": 60, "loss": 1.6545, "learning_rate": 1.2500000000000006e-05, "epoch": 0.06736842105263158, "percentage": 66.67, "elapsed_time": "0:10:57", "remaining_time": "0:05:28"}
9
+ {"current_steps": 45, "total_steps": 60, "loss": 1.5896, "learning_rate": 7.3223304703363135e-06, "epoch": 0.07578947368421053, "percentage": 75.0, "elapsed_time": "0:12:16", "remaining_time": "0:04:05"}
10
+ {"current_steps": 50, "total_steps": 60, "loss": 1.4727, "learning_rate": 3.3493649053890326e-06, "epoch": 0.08421052631578947, "percentage": 83.33, "elapsed_time": "0:13:29", "remaining_time": "0:02:41"}
11
+ {"current_steps": 55, "total_steps": 60, "loss": 1.6785, "learning_rate": 8.51854342773295e-07, "epoch": 0.09263157894736843, "percentage": 91.67, "elapsed_time": "0:14:47", "remaining_time": "0:01:20"}
12
+ {"current_steps": 60, "total_steps": 60, "loss": 1.6034, "learning_rate": 0.0, "epoch": 0.10105263157894737, "percentage": 100.0, "elapsed_time": "0:16:08", "remaining_time": "0:00:00"}
13
+ {"current_steps": 60, "total_steps": 60, "epoch": 0.10105263157894737, "percentage": 100.0, "elapsed_time": "0:16:08", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,114 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.10105263157894737,
5
+ "eval_steps": 500,
6
+ "global_step": 60,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.008421052631578947,
13
+ "grad_norm": 1.9215331077575684,
14
+ "learning_rate": 4.914814565722671e-05,
15
+ "loss": 1.8162,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.016842105263157894,
20
+ "grad_norm": 1.746415376663208,
21
+ "learning_rate": 4.665063509461097e-05,
22
+ "loss": 1.7065,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02526315789473684,
27
+ "grad_norm": 1.2568161487579346,
28
+ "learning_rate": 4.267766952966369e-05,
29
+ "loss": 1.6809,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.03368421052631579,
34
+ "grad_norm": 1.087218165397644,
35
+ "learning_rate": 3.7500000000000003e-05,
36
+ "loss": 1.7278,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.042105263157894736,
41
+ "grad_norm": 0.935480535030365,
42
+ "learning_rate": 3.147047612756302e-05,
43
+ "loss": 1.5573,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.05052631578947368,
48
+ "grad_norm": 1.06438148021698,
49
+ "learning_rate": 2.5e-05,
50
+ "loss": 1.5858,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.05894736842105263,
55
+ "grad_norm": 1.0331830978393555,
56
+ "learning_rate": 1.852952387243698e-05,
57
+ "loss": 1.4219,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.06736842105263158,
62
+ "grad_norm": 0.7302573919296265,
63
+ "learning_rate": 1.2500000000000006e-05,
64
+ "loss": 1.6545,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.07578947368421053,
69
+ "grad_norm": 0.8324851989746094,
70
+ "learning_rate": 7.3223304703363135e-06,
71
+ "loss": 1.5896,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.08421052631578947,
76
+ "grad_norm": 1.0173665285110474,
77
+ "learning_rate": 3.3493649053890326e-06,
78
+ "loss": 1.4727,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.09263157894736843,
83
+ "grad_norm": 0.763109564781189,
84
+ "learning_rate": 8.51854342773295e-07,
85
+ "loss": 1.6785,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.10105263157894737,
90
+ "grad_norm": 0.8254455327987671,
91
+ "learning_rate": 0.0,
92
+ "loss": 1.6034,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.10105263157894737,
97
+ "step": 60,
98
+ "total_flos": 1.783322464223232e+16,
99
+ "train_loss": 1.624589498837789,
100
+ "train_runtime": 968.353,
101
+ "train_samples_per_second": 0.981,
102
+ "train_steps_per_second": 0.062
103
+ }
104
+ ],
105
+ "logging_steps": 5,
106
+ "max_steps": 60,
107
+ "num_input_tokens_seen": 0,
108
+ "num_train_epochs": 1,
109
+ "save_steps": 100,
110
+ "total_flos": 1.783322464223232e+16,
111
+ "train_batch_size": 2,
112
+ "trial_name": null,
113
+ "trial_params": null
114
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d7813b595d6dc62dc0132620651454ef332f506a8191f64fde55da56335e1af
3
+ size 5176