diabolic6045 committed
Commit e1c5269
1 parent: 1fb2577

End of training

Files changed (2)
  1. README.md +157 -3
  2. adapter_model.bin +3 -0
README.md CHANGED
@@ -1,3 +1,157 @@
- ---
- license: llama3
- ---
+ ---
+ license: llama3
+ library_name: peft
+ tags:
+ - axolotl
+ - generated_from_trainer
+ base_model: meta-llama/Meta-Llama-3-8B
+ model-index:
+ - name: Sanskrit-llama
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.4.1`
+ ```yaml
+
+ base_model: meta-llama/Meta-Llama-3-8B
+ model_type: AutoModelForCausalLM
+ tokenizer_type: AutoTokenizer
+ max_steps: 2
+ bnb_config_kwargs:
+   llm_int8_has_fp16_weight: false
+   bnb_4bit_quant_type: nf4
+   bnb_4bit_use_double_quant: true
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ datasets:
+ - path: diabolic6045/Sanskrit-llama
+   type: alpaca
+ dataset_prepared_path:
+ val_set_size: 0
+ output_dir: ./outputs/qlora-out
+ chat_template: chatml
+ hub_model_id: diabolic6045/Sanskrit-llama
+ hf_use_auth_token: true
+ adapter: qlora
+ lora_model_dir:
+
+ sequence_len: 1024
+ sample_packing: true
+ eval_sample_packing: false
+ pad_to_sequence_len: true
+
+ lora_r: 32
+ lora_alpha: 16
+ lora_dropout: 0.05
+ lora_target_modules:
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project: संस्कृतम्-llama
+ wandb_entity:
+ wandb_watch: all
+ wandb_name: संस्कृतम्-llama
+ wandb_log_model:
+
+ gradient_accumulation_steps: 4
+ micro_batch_size: 1
+ num_epochs: 1
+ optimizer: paged_adamw_8bit
+ lr_scheduler: cosine
+ cosine_min_lr_ratio: 0.2
+ learning_rate: 2e-5
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: false
+ fp16:
+ tf32: false
+
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: false
+
+ warmup_steps: 10
+ evals_per_epoch: 4
+ saves_per_epoch: 1
+ debug:
+ deepspeed:
+ weight_decay: 0.0
+ #fsdp:
+ # - full_shard
+ # - auto_wrap
+ #fsdp_config:
+ # fsdp_limit_all_gathers: true
+ # fsdp_sync_module_states: true
+ # fsdp_offload_params: true
+ # fsdp_use_orig_params: false
+ # fsdp_cpu_ram_efficient_loading: true
+ # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ # fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+ # fsdp_state_dict_type: FULL_STATE_DICT
+ special_tokens:
+   pad_token: "<|end_of_text|>"
+
+ ```
+
+ </details><br>
+
+ # Sanskrit-llama
+
+ This model is a QLoRA adapter fine-tuned from [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the diabolic6045/Sanskrit-llama dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ The adapter was trained on the diabolic6045/Sanskrit-llama dataset in alpaca format, as listed in the axolotl config above; no validation split was held out (`val_set_size: 0`).
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 2e-05
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 2
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 8
+ - total_eval_batch_size: 2
+ - optimizer: paged AdamW 8-bit (paged_adamw_8bit) with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 10
+ - training_steps: 2
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.11.1
+ - Transformers 4.41.1
+ - Pytorch 2.1.2
+ - Datasets 2.19.1
+ - Tokenizers 0.19.1
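
As a usage note, here is a minimal sketch of how this QLoRA adapter might be loaded for inference with Transformers, bitsandbytes, and PEFT, mirroring the 4-bit settings from the config above (nf4 with double quantization). The alpaca-style prompt and the float16 compute dtype are assumptions rather than values recorded in the card.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "meta-llama/Meta-Llama-3-8B"       # gated repo; requires accepted license/auth
adapter_id = "diabolic6045/Sanskrit-llama"   # hub_model_id from the config above

# Mirror the training-time 4-bit settings: nf4 quantization with double quantization.
# The compute dtype (float16) is an assumption, not taken from the card.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=bnb_config,
    device_map="auto",
)
# Attach the LoRA weights on top of the quantized base model.
model = PeftModel.from_pretrained(base, adapter_id)

# Hypothetical alpaca-style prompt, matching the dataset type in the config.
prompt = "### Instruction:\nTranslate to Sanskrit: Knowledge is wealth.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```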
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89d88f847c0e0b16a88c3c027e4ca023c4aa4cb1db074e327121b9c858e90079
+ size 167843194
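
The adapter weights themselves are stored via Git LFS; the pointer above records the SHA-256 of the real file contents (oid) and its length in bytes (size). A small sketch, assuming the file has already been downloaded locally as `adapter_model.bin`, of checking a copy against those values:

```python
import hashlib
import os

# Values copied from the LFS pointer above.
EXPECTED_OID = "89d88f847c0e0b16a88c3c027e4ca023c4aa4cb1db074e327121b9c858e90079"
EXPECTED_SIZE = 167843194  # bytes


def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 so large weight files need not fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


def matches_pointer(path: str) -> bool:
    """Return True if the local file matches the oid and size from the LFS pointer."""
    return os.path.getsize(path) == EXPECTED_SIZE and sha256_of(path) == EXPECTED_OID


# "adapter_model.bin" is a hypothetical local path; point it at the downloaded file.
print(matches_pointer("adapter_model.bin"))
```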