ping98k commited on
Commit
f549f59
1 Parent(s): 080cdca

Upload 7 files

Browse files
README.md ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ tags:
4
+ - generated_from_trainer
5
+ base_model: scb10x/typhoon-7b
6
+ model-index:
7
+ - name: work/out
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
15
+ <details><summary>See axolotl config</summary>
16
+
17
+ axolotl version: `0.4.0`
18
+ ```yaml
19
+ base_model: ./models/scb10x_typhoon-7b
20
+ model_type: MistralForCausalLM
21
+ tokenizer_type: LlamaTokenizer
22
+ is_mistral_derived_model: true
23
+
24
+ load_in_8bit: false
25
+ load_in_4bit: true
26
+ strict: false
27
+
28
+ # datasets:
29
+ # - path: ./work/finetune-data-kbank.jsonl
30
+ # - path: ./work/finetune-data.jsonl
31
+ # - path: mhenrichsen/alpaca_2k_test
32
+ # type: alpaca
33
+
34
+ datasets:
35
+ - path: ./work/scb-mt-en-th-2020/apdf.csv
36
+ type:
37
+ system_prompt: ""
38
+ field_system: system
39
+ field_instruction: en_text
40
+ field_output: th_text
41
+ format: "{instruction}<translate>"
42
+ - path: ./work/scb-mt-en-th-2020/assorted_government.csv
43
+ type:
44
+ system_prompt: ""
45
+ field_system: system
46
+ field_instruction: en_text
47
+ field_output: th_text
48
+ format: "{instruction}<translate>"
49
+ - path: ./work/scb-mt-en-th-2020/generated_reviews_crowd.csv
50
+ type:
51
+ system_prompt: ""
52
+ field_system: system
53
+ field_instruction: en_text
54
+ field_output: th_text
55
+ format: "{instruction}<translate>"
56
+ - path: ./work/scb-mt-en-th-2020/generated_reviews_translator.csv
57
+ type:
58
+ system_prompt: ""
59
+ field_system: system
60
+ field_instruction: en_text
61
+ field_output: th_text
62
+ format: "{instruction}<translate>"
63
+ - path: ./work/scb-mt-en-th-2020/generated_reviews_yn.csv
64
+ type:
65
+ system_prompt: ""
66
+ field_system: system
67
+ field_instruction: en_text
68
+ field_output: th_text
69
+ format: "{instruction}<translate>"
70
+ - path: ./work/scb-mt-en-th-2020/mozilla_common_voice.csv
71
+ type:
72
+ system_prompt: ""
73
+ field_system: system
74
+ field_instruction: en_text
75
+ field_output: th_text
76
+ format: "{instruction}<translate>"
77
+ - path: ./work/scb-mt-en-th-2020/msr_paraphrase.csv
78
+ type:
79
+ system_prompt: ""
80
+ field_system: system
81
+ field_instruction: en_text
82
+ field_output: th_text
83
+ format: "{instruction}<translate>"
84
+ - path: ./work/scb-mt-en-th-2020/nus_sms.csv
85
+ type:
86
+ system_prompt: ""
87
+ field_system: system
88
+ field_instruction: en_text
89
+ field_output: th_text
90
+ format: "{instruction}<translate>"
91
+ - path: ./work/scb-mt-en-th-2020/paracrawl.csv
92
+ type:
93
+ system_prompt: ""
94
+ field_system: system
95
+ field_instruction: en_text
96
+ field_output: th_text
97
+ format: "{instruction}<translate>"
98
+ - path: ./work/scb-mt-en-th-2020/task_master_1.csv
99
+ type:
100
+ system_prompt: ""
101
+ field_system: system
102
+ field_instruction: en_text
103
+ field_output: th_text
104
+ format: "{instruction}<translate>"
105
+ - path: ./work/scb-mt-en-th-2020/thai_websites.csv
106
+ type:
107
+ system_prompt: ""
108
+ field_system: system
109
+ field_instruction: en_text
110
+ field_output: th_text
111
+ format: "{instruction}<translate>"
112
+ - path: ./work/scb-mt-en-th-2020/wikipedia.csv
113
+ type:
114
+ system_prompt: ""
115
+ field_system: system
116
+ field_instruction: en_text
117
+ field_output: th_text
118
+ format: "{instruction}<translate>"
119
+
120
+ dataset_prepared_path: ./work/last_run_prepared
121
+ val_set_size: 0.02
122
+ output_dir: ./work/out
123
+
124
+ # lora_modules_to_save:
125
+ # - embed_tokens
126
+ # - lm_head
127
+
128
+
129
+ adapter: qlora
130
+ lora_model_dir:
131
+
132
+ sequence_len: 4096
133
+ sample_packing: true
134
+ pad_to_sequence_len: true
135
+
136
+ gpu_memory_limit: 20
137
+
138
+ lora_r: 32
139
+ lora_alpha: 16
140
+ lora_dropout: 0.05
141
+ lora_target_linear: true
142
+ lora_fan_in_fan_out:
143
+
144
+
145
+ wandb_project: typhoon-7b
146
+ wandb_entity:
147
+ wandb_watch:
148
+ wandb_name:
149
+ wandb_log_model:
150
+
151
+ gradient_accumulation_steps: 8
152
+ micro_batch_size: 2
153
+ num_epochs: 1
154
+ optimizer: paged_adamw_8bit
155
+ lr_scheduler: cosine
156
+ learning_rate: 0.0004
157
+
158
+ train_on_inputs: false
159
+ group_by_length: false
160
+ bf16: true
161
+ fp16: false
162
+ tf32: false
163
+
164
+ gradient_checkpointing: true
165
+ # early_stopping_patience: 3
166
+ resume_from_checkpoint: true
167
+ local_rank:
168
+ logging_steps: 1
169
+ xformers_attention:
170
+ flash_attention: true
171
+
172
+ # loss_watchdog_threshold: 5.0
173
+ # loss_watchdog_patience: 3
174
+
175
+ warmup_ratio: 0.01
176
+ # evals_per_epoch: 5
177
+ eval_steps: 10
178
+ eval_table_size:
179
+ eval_table_max_new_tokens: 128
180
+ # saves_per_epoch: 5
181
+ save_steps: 10
182
+ save_total_limit: 10
183
+ debug:
184
+ deepspeed:
185
+ weight_decay: 0.0
186
+ fsdp:
187
+ fsdp_config:
188
+ # special_tokens:
189
+ # tokens: # these are delimiters
190
+ # - "=======START OF DOCUMENT======="
191
+ # - "=======NEXT DOCUMENT======="
192
+ # - "=======END OF DOCUMENT======="
193
+ # - "=======NO ANSWER======="
194
+ ```
195
+
196
+ </details><br>
197
+
198
+ # work/out
199
+
200
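+ This model is a QLoRA (4-bit) fine-tuned adapter for [scb10x/typhoon-7b](https://huggingface.co/scb10x/typhoon-7b), trained on the English–Thai CSV subsets of scb-mt-en-th-2020 listed in the axolotl config above.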
201
+ It achieves the following results on the evaluation set:
202
+ - Loss: 0.8657
203
+
204
+ ## Model description
205
+
206
+ More information needed
207
+
208
+ ## Intended uses & limitations
209
+
210
+ More information needed
211
+
212
+ ## Training and evaluation data
213
+
214
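+ The adapter was trained on twelve English–Thai CSV subsets of scb-mt-en-th-2020 (see the axolotl config above), using `en_text` as the instruction and `th_text` as the output with the prompt format `{instruction}<translate>`; 2% of the data (`val_set_size: 0.02`) was held out as the evaluation set.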
215
+
216
+ ## Training procedure
217
+
218
+ ### Training hyperparameters
219
+
220
+ The following hyperparameters were used during training:
221
+ - learning_rate: 0.0004
222
+ - train_batch_size: 2
223
+ - eval_batch_size: 2
224
+ - seed: 42
225
+ - gradient_accumulation_steps: 8
226
+ - total_train_batch_size: 16
227
+ - optimizer: paged_adamw_8bit (as set in the axolotl config), with betas=(0.9,0.999) and epsilon=1e-08
228
+ - lr_scheduler_type: cosine
229
+ - lr_scheduler_warmup_steps: 90
230
+ - num_epochs: 1
231
+
232
+ ### Training results
233
+
234
+ | Training Loss | Epoch | Step | Validation Loss |
235
+ |:-------------:|:-----:|:----:|:---------------:|
236
+ | 2.8002 | 0.01 | 10 | 2.7164 |
237
+ | 2.1186 | 0.02 | 20 | 2.0709 |
238
+ | 1.717 | 0.03 | 30 | 1.6999 |
239
+ | 1.5327 | 0.04 | 40 | 1.5332 |
240
+ | 1.3684 | 0.04 | 50 | 1.4293 |
241
+ | 1.3992 | 0.05 | 60 | 1.3651 |
242
+ | 1.3031 | 0.06 | 70 | 1.3198 |
243
+ | 1.3067 | 0.07 | 80 | 1.2831 |
244
+ | 1.2685 | 0.08 | 90 | 1.2542 |
245
+ | 1.2469 | 0.09 | 100 | 1.2293 |
246
+ | 1.2067 | 0.1 | 110 | 1.2096 |
247
+ | 1.1458 | 0.11 | 120 | 1.1942 |
248
+ | 1.1679 | 0.11 | 130 | 1.1732 |
249
+ | 1.1914 | 0.12 | 140 | 1.1609 |
250
+ | 1.2329 | 0.13 | 150 | 1.1491 |
251
+ | 1.1151 | 0.14 | 160 | 1.1365 |
252
+ | 1.1138 | 0.15 | 170 | 1.1252 |
253
+ | 1.1607 | 0.16 | 180 | 1.1188 |
254
+ | 1.083 | 0.17 | 190 | 1.1095 |
255
+ | 1.1068 | 0.18 | 200 | 1.1016 |
256
+ | 1.1214 | 0.18 | 210 | 1.0921 |
257
+ | 1.061 | 0.19 | 220 | 1.0862 |
258
+ | 1.1072 | 0.2 | 230 | 1.0792 |
259
+ | 1.0275 | 0.21 | 240 | 1.0739 |
260
+ | 1.0735 | 0.22 | 250 | 1.0666 |
261
+ | 1.0549 | 0.23 | 260 | 1.0634 |
262
+ | 1.0336 | 0.24 | 270 | 1.0561 |
263
+ | 1.0784 | 0.25 | 280 | 1.0519 |
264
+ | 1.0313 | 0.26 | 290 | 1.0459 |
265
+ | 1.0459 | 0.26 | 300 | 1.0415 |
266
+ | 1.0824 | 0.27 | 310 | 1.0390 |
267
+ | 1.0543 | 0.28 | 320 | 1.0327 |
268
+ | 1.0732 | 0.29 | 330 | 1.0287 |
269
+ | 1.0071 | 0.3 | 340 | 1.0237 |
270
+ | 1.0336 | 0.31 | 350 | 1.0200 |
271
+ | 1.0694 | 0.32 | 360 | 1.0155 |
272
+ | 0.9799 | 0.33 | 370 | 1.0111 |
273
+ | 1.0025 | 0.33 | 380 | 1.0073 |
274
+ | 0.9805 | 0.34 | 390 | 1.0044 |
275
+ | 0.9398 | 0.35 | 400 | 1.0011 |
276
+ | 1.0133 | 0.36 | 410 | 0.9957 |
277
+ | 1.0465 | 0.37 | 420 | 0.9916 |
278
+ | 0.9711 | 0.38 | 430 | 0.9887 |
279
+ | 0.9786 | 0.39 | 440 | 0.9858 |
280
+ | 0.9687 | 0.4 | 450 | 0.9835 |
281
+ | 0.988 | 0.4 | 460 | 0.9810 |
282
+ | 1.021 | 0.41 | 470 | 0.9770 |
283
+ | 0.9754 | 0.42 | 480 | 0.9734 |
284
+ | 0.9677 | 0.43 | 490 | 0.9705 |
285
+ | 1.0114 | 0.44 | 500 | 0.9667 |
286
+ | 0.978 | 0.45 | 510 | 0.9643 |
287
+ | 0.9762 | 0.46 | 520 | 0.9611 |
288
+ | 0.9795 | 0.47 | 530 | 0.9597 |
289
+ | 0.9419 | 0.48 | 540 | 0.9558 |
290
+ | 0.9403 | 0.48 | 550 | 0.9519 |
291
+ | 0.9408 | 0.49 | 560 | 0.9495 |
292
+ | 0.9704 | 0.5 | 570 | 0.9460 |
293
+ | 0.9426 | 0.51 | 580 | 0.9447 |
294
+ | 0.9288 | 0.52 | 590 | 0.9406 |
295
+ | 0.9986 | 0.53 | 600 | 0.9394 |
296
+ | 0.9129 | 0.54 | 610 | 0.9374 |
297
+ | 0.9797 | 0.55 | 620 | 0.9349 |
298
+ | 0.9269 | 0.55 | 630 | 0.9317 |
299
+ | 0.9258 | 0.56 | 640 | 0.9296 |
300
+ | 0.9041 | 0.57 | 650 | 0.9268 |
301
+ | 0.9383 | 0.58 | 660 | 0.9240 |
302
+ | 0.9289 | 0.59 | 670 | 0.9220 |
303
+ | 0.8906 | 0.6 | 680 | 0.9201 |
304
+ | 0.9275 | 0.61 | 690 | 0.9171 |
305
+ | 0.99 | 0.62 | 700 | 0.9150 |
306
+ | 0.9063 | 0.62 | 710 | 0.9124 |
307
+ | 0.8757 | 0.63 | 720 | 0.9107 |
308
+ | 0.9276 | 0.64 | 730 | 0.9087 |
309
+ | 0.9315 | 0.65 | 740 | 0.9064 |
310
+ | 0.9442 | 0.66 | 750 | 0.9037 |
311
+ | 0.8848 | 0.67 | 760 | 0.9015 |
312
+ | 0.8901 | 0.68 | 770 | 0.8993 |
313
+ | 0.8714 | 0.69 | 780 | 0.8973 |
314
+ | 0.8641 | 0.7 | 790 | 0.8956 |
315
+ | 0.8915 | 0.7 | 800 | 0.8938 |
316
+ | 0.9069 | 0.71 | 810 | 0.8921 |
317
+ | 0.8798 | 0.72 | 820 | 0.8901 |
318
+ | 0.9195 | 0.73 | 830 | 0.8884 |
319
+ | 0.8936 | 0.74 | 840 | 0.8868 |
320
+ | 0.8284 | 0.75 | 850 | 0.8851 |
321
+ | 0.9469 | 0.76 | 860 | 0.8833 |
322
+ | 0.8854 | 0.77 | 870 | 0.8820 |
323
+ | 0.8865 | 0.77 | 880 | 0.8809 |
324
+ | 0.8982 | 0.78 | 890 | 0.8799 |
325
+ | 0.8683 | 0.79 | 900 | 0.8786 |
326
+ | 0.9326 | 0.8 | 910 | 0.8773 |
327
+ | 0.8937 | 0.81 | 920 | 0.8758 |
328
+ | 0.8995 | 0.82 | 930 | 0.8746 |
329
+ | 0.9263 | 0.83 | 940 | 0.8735 |
330
+ | 0.907 | 0.84 | 950 | 0.8725 |
331
+ | 0.8467 | 0.84 | 960 | 0.8715 |
332
+ | 0.9037 | 0.85 | 970 | 0.8708 |
333
+ | 0.833 | 0.86 | 980 | 0.8699 |
334
+ | 0.878 | 0.87 | 990 | 0.8693 |
335
+ | 0.8897 | 0.88 | 1000 | 0.8686 |
336
+ | 0.8931 | 0.89 | 1010 | 0.8681 |
337
+ | 0.8766 | 0.9 | 1020 | 0.8676 |
338
+ | 0.839 | 0.91 | 1030 | 0.8672 |
339
+ | 0.8973 | 0.92 | 1040 | 0.8669 |
340
+ | 0.8806 | 0.92 | 1050 | 0.8666 |
341
+ | 0.8683 | 0.93 | 1060 | 0.8664 |
342
+ | 0.8736 | 0.94 | 1070 | 0.8662 |
343
+ | 0.8495 | 0.95 | 1080 | 0.8660 |
344
+ | 0.8364 | 0.96 | 1090 | 0.8659 |
345
+ | 0.8934 | 0.97 | 1100 | 0.8658 |
346
+ | 0.8954 | 0.98 | 1110 | 0.8658 |
347
+ | 0.8783 | 0.99 | 1120 | 0.8657 |
348
+ | 0.8678 | 0.99 | 1130 | 0.8657 |
349
+
350
+
351
+ ### Framework versions
352
+
353
+ - PEFT 0.7.1
354
+ - Transformers 4.37.0
355
+ - Pytorch 2.0.1+cu118
356
+ - Datasets 2.16.1
357
+ - Tokenizers 0.15.0
adapter_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "scb10x/typhoon-7b",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 32,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "q_proj",
23
+ "gate_proj",
24
+ "up_proj",
25
+ "o_proj",
26
+ "v_proj",
27
+ "down_proj",
28
+ "k_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM"
31
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4beca2ab0d5f121a88a43790726dcf7c0cfd22c212b969e1c7d71effedfdea1b
3
+ size 335705741
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "scb10x/typhoon-7b",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "max_position_embeddings": 32768,
14
+ "model_type": "mistral",
15
+ "num_attention_heads": 32,
16
+ "num_hidden_layers": 32,
17
+ "num_key_value_heads": 8,
18
+ "quantization_config": {
19
+ "bnb_4bit_compute_dtype": "bfloat16",
20
+ "bnb_4bit_quant_type": "nf4",
21
+ "bnb_4bit_use_double_quant": true,
22
+ "llm_int8_enable_fp32_cpu_offload": false,
23
+ "llm_int8_has_fp16_weight": false,
24
+ "llm_int8_skip_modules": null,
25
+ "llm_int8_threshold": 6.0,
26
+ "load_in_4bit": true,
27
+ "load_in_8bit": false,
28
+ "quant_method": "bitsandbytes"
29
+ },
30
+ "rms_norm_eps": 1e-05,
31
+ "rope_theta": 10000.0,
32
+ "sliding_window": 4096,
33
+ "tie_word_embeddings": false,
34
+ "torch_dtype": "bfloat16",
35
+ "transformers_version": "4.37.0",
36
+ "use_cache": false,
37
+ "vocab_size": 35219
38
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0260fe22b9efe79df479f2619890767ab9c44912142f21648f1980c32297ed
3
+ size 562945
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": true,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
+ "sp_model_kwargs": {},
38
+ "spaces_between_special_tokens": false,
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "trust_remote_code": false,
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": true,
43
+ "use_fast": true
44
+ }