rombodawg committed on
Commit
8fc73c0
1 Parent(s): a4be7cc

Update README.md

Files changed (1)
  1. README.md +141 -14
README.md CHANGED
@@ -2,22 +2,149 @@
  language:
  - en
  license: apache-2.0
- tags:
- - text-generation-inference
- - transformers
- - unsloth
- - llama
- - trl
- - sft
- base_model: unsloth/llama-3-8b-Instruct-bnb-4bit
  ---
 
- # Uploaded model

- - **Developed by:** rombodawg
- - **License:** apache-2.0
- - **Finetuned from model :** unsloth/llama-3-8b-Instruct-bnb-4bit

- This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

- [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
 
+ This is unsloth/llama-3-8b-Instruct trained on the Replete-AI/code-test-dataset using the code below, with Unsloth and Google Colab, in under 15 GB of VRAM. The full training run took about 40 minutes.
+
+ ```Python
+ %%capture
+ import torch
+ major_version, minor_version = torch.cuda.get_device_capability()
+ # Must install separately since Colab has torch 2.2.1, which breaks packages
+ !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+ if major_version >= 8:
+     # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
+     !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
+ else:
+     # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
+     !pip install --no-deps xformers trl peft accelerate bitsandbytes
+ pass
+ ```
+
+ ```Python
+ !pip install galore_torch
+ ```
+
+ ```Python
+ from unsloth import FastLanguageModel
+ import torch
+ max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+ # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+ fourbit_models = [
+     "unsloth/mistral-7b-bnb-4bit",
+     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
+     "unsloth/llama-2-7b-bnb-4bit",
+     "unsloth/gemma-7b-bnb-4bit",
+     "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
+     "unsloth/gemma-2b-bnb-4bit",
+     "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
+     "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
+ ] # More models at https://huggingface.co/unsloth
+
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name = "unsloth/llama-3-8b-Instruct",
+     max_seq_length = max_seq_length,
+     dtype = dtype,
+     load_in_4bit = load_in_4bit,
+     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+ )
+ ```
+
+ ```Python
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                       "gate_proj", "up_proj", "down_proj",],
+     lora_alpha = 16,
+     lora_dropout = 0, # Supports any, but = 0 is optimized
+     bias = "none", # Supports any, but = "none" is optimized
+     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+     use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+     random_state = 3407,
+     use_rslora = False, # We support rank stabilized LoRA
+     loftq_config = None, # And LoftQ
+ )
+ ```
+
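As a quick sanity check (an optional addition, not part of the original notebook), the PEFT-wrapped model can report how many parameters the LoRA adapters actually train:

```Python
# Optional check: show trainable vs. total parameter counts for the LoRA adapters.
# Assumes `model` is the PEFT model returned by FastLanguageModel.get_peft_model above.
model.print_trainable_parameters()
```
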
+ ```Python
+ alpaca_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+ Below is an instruction that describes a task. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+ {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{}"""
+
+ EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
+ def formatting_prompts_func(examples):
+     inputs = examples["human"]
+     outputs = examples["assistant"]
+     texts = []
+     for input, output in zip(inputs, outputs):
+         # Must add EOS_TOKEN, otherwise your generation will go on forever!
+         text = alpaca_prompt.format(input, output) + EOS_TOKEN
+         texts.append(text)
+     return { "text" : texts, }
+ pass
+
+ from datasets import load_dataset
+ dataset = load_dataset("Replete-AI/code-test-dataset", split = "train")
+ dataset = dataset.map(formatting_prompts_func, batched = True,)
+ ```
+
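It can help to print one formatted record to confirm that the `human`/`assistant` columns were folded into the Llama 3 chat template as intended (a small optional check, not from the original notebook):

```Python
# Optional check: inspect the first formatted training example produced by the map above.
print(dataset[0]["text"][:500])
```
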
+ ```Python
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from galore_torch import GaLoreAdamW8bit
+ import torch.nn as nn
+ galore_params = []
+ target_modules_list = ["attn", "mlp"]
+ for module_name, module in model.named_modules():
+     if not isinstance(module, nn.Linear):
+         continue
+
+     if not any(target_key in module_name for target_key in target_modules_list):
+         continue
+
+     print('mod ', module_name)
+     galore_params.append(module.weight)
+ id_galore_params = [id(p) for p in galore_params]
+ regular_params = [p for p in model.parameters() if id(p) not in id_galore_params]
+
+ param_groups = [{'params': regular_params},
+                 {'params': galore_params, 'rank': 64, 'update_proj_gap': 200, 'scale': 0.25, 'proj_type': 'std'}]
+ optimizer = GaLoreAdamW8bit(param_groups, lr=2e-5)
+
+ trainer = SFTTrainer(
+     model = model,
+     tokenizer = tokenizer,
+     train_dataset = dataset,
+     optimizers=(optimizer, None), # use the GaLore optimizer defined above instead of the trainer's default
+     dataset_text_field = "text",
+     max_seq_length = max_seq_length,
+     dataset_num_proc = 2,
+     packing = True, # Can make training 5x faster for short sequences.
+     args = TrainingArguments(
+         per_device_train_batch_size = 1,
+         gradient_accumulation_steps = 4,
+         warmup_steps = 5,
+         learning_rate = 2e-4,
+         fp16 = not torch.cuda.is_bf16_supported(),
+         bf16 = torch.cuda.is_bf16_supported(),
+         logging_steps = 1,
+         weight_decay = 0.01,
+         lr_scheduler_type = "linear",
+         seed = 3407,
+         output_dir = "outputs",
+     ),
+ )
+ ```
+
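Since the whole point of this recipe is fitting in under 15 GB of VRAM, it is worth recording GPU memory right before training and comparing it with the peak afterwards. This is an optional sketch using standard torch APIs, not part of the original notebook:

```Python
import torch

# Optional check: record reserved GPU memory before training; compare it with
# torch.cuda.max_memory_reserved() again after trainer.train() finishes.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
```
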
+ ```Python
+ trainer_stats = trainer.train()
+ model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
+ model.push_to_hub_merged("rombodawg/test_dataset_Codellama-3-8B", tokenizer, save_method = "merged_16bit", token = "")
+ ```
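
Once the merged 16-bit weights are on the Hub they can be loaded like any other Llama 3 checkpoint. Below is a minimal inference sketch with plain transformers; it assumes the push above succeeded, the repo name is unchanged, and the saved tokenizer kept the Llama 3 chat template:

```Python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Sketch: load the merged model pushed above and run a quick generation.
repo = "rombodawg/test_dataset_Codellama-3-8B"  # repo name used in push_to_hub_merged
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, device_map = "auto", torch_dtype = "auto")

messages = [{"role": "user", "content": "Write a Python function that reverses a string."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt = True, return_tensors = "pt").to(model.device)
outputs = model.generate(inputs, max_new_tokens = 256)
print(tokenizer.decode(outputs[0], skip_special_tokens = True))
```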