robkaandorp committed
Commit aeddf48
1 Parent(s): ccebcb5

Add phi-2-super training script

.gitattributes CHANGED
@@ -25,6 +25,7 @@
  *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  results/**/* filter=lfs diff=lfs merge=lfs -text
+ results_phi-2-super/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
  *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
train_csv_dataset_phi-2-super.py ADDED
@@ -0,0 +1,97 @@
+ import time
+ import torch
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
+ from trl import SFTTrainer
+ from peft import LoraConfig, prepare_model_for_kbit_training
+
+ dataset = load_dataset()  # expects the training CSV; pass data_files/split arguments here
+
+ if torch.cuda.is_available():
+     print("Cuda is available")
+
+ base_model_id = "abacaj/phi-2-super"
+ output_dir = "./results_phi-2-super"
+
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+     print("pad_token was missing and has been set to eos_token")
+
+ # Configuration to load the model in 4-bit quantization
+ bnb_config = BitsAndBytesConfig(load_in_4bit=True,
+                                 bnb_4bit_quant_type='nf4',
+                                 # bnb_4bit_compute_dtype='float16',
+                                 bnb_4bit_compute_dtype=torch.bfloat16,
+                                 bnb_4bit_use_double_quant=False)
+
+ model = AutoModelForCausalLM.from_pretrained(base_model_id, attn_implementation="flash_attention_2", quantization_config=bnb_config, torch_dtype="auto")
+ print(model)
+
+ # Gradient checkpointing to save memory
+ model.gradient_checkpointing_enable()
+
+ # Freeze base model layers and cast layernorms to fp32
+ model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+
+ peft_config = LoraConfig(
+     r=64,
+     lora_alpha=64,
+     target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc2", "fc1"],
+     bias="none",
+     lora_dropout=0.05,
+     task_type="CAUSAL_LM",
+ )
+
+ training_args = TrainingArguments(
+     output_dir=output_dir,                           # Output directory for checkpoints and predictions
+     overwrite_output_dir=True,                       # Overwrite the content of the output directory
+     per_device_train_batch_size=2,                   # Batch size for training
+     per_device_eval_batch_size=2,                    # Batch size for evaluation
+     gradient_accumulation_steps=5,                   # Number of steps before optimizing
+     gradient_checkpointing=True,                     # Enable gradient checkpointing
+     gradient_checkpointing_kwargs={"use_reentrant": False},
+     warmup_steps=10,                                 # Number of warmup steps
+     # max_steps=1000,                                # Total number of training steps
+     num_train_epochs=100,                            # Number of training epochs
+     learning_rate=5e-5,                              # Learning rate
+     weight_decay=0.01,                               # Weight decay
+     optim="paged_adamw_8bit",                        # Keep the optimizer state paged and quantized
+     bf16=True,                                       # Use mixed-precision training
+     # For logging and saving
+     logging_dir='./logs',
+     logging_strategy="epoch",
+     logging_steps=10,
+     save_strategy="epoch",
+     save_steps=10,
+     save_total_limit=2,                              # Limit the total number of checkpoints
+     evaluation_strategy="epoch",
+     eval_steps=10,
+     load_best_model_at_end=True,                     # Load the best model at the end of training
+     lr_scheduler_type="linear",
+ )
+
+ def formatting_func(data):
+     return f"[INST] {data['prompt']} [/INST]{data['completion']}{tokenizer.eos_token} "
+
+ trainer = SFTTrainer(
+     model=model,
+     train_dataset=dataset,
+     eval_dataset=dataset,
+     peft_config=peft_config,
+     args=training_args,
+     max_seq_length=1024,
+     packing=True,
+     formatting_func=formatting_func
+ )
+
+ model.config.use_cache = False  # silence the warnings; please re-enable for inference!
+
+ start_time = time.time()  # Record the start time
+ trainer.train()
+ end_time = time.time()    # Record the end time
+
+ training_time = end_time - start_time  # Calculate the total training time
+
+ trainer.save_model(output_dir)
+ print(f"Training completed in {training_time} seconds.")
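
For reference (not part of this commit): a minimal sketch, under assumed peft/transformers versions, of how the adapter that trainer.save_model(output_dir) writes to ./results_phi-2-super could be loaded back for inference. The prompt string mirrors the [INST] template from formatting_func above; the example prompt and generation settings are illustrative assumptions, not values from the repository.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model_id = "abacaj/phi-2-super"
adapter_dir = "./results_phi-2-super"   # output_dir used by the training script above

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_dir)   # attach the trained LoRA adapter
model.config.use_cache = True                           # re-enable the KV cache for generation
model.eval()

# Prompt format follows the [INST] ... [/INST] template used during training (hypothetical prompt)
prompt = "[INST] Summarize what this model was fine-tuned on. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))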
train_dataset.py CHANGED
@@ -21,12 +21,9 @@ docs = db._collection.peek(db._collection.count())
  dataset = docs['documents']

  if torch.cuda.is_available():
-     # torch.set_default_device("cuda")
      print("Cuda is available")

  base_model_id = "microsoft/phi-2"
- # base_model_id = "abacaj/phi-2-super"
- # base_model_id = "./results"

  tokenizer = AutoTokenizer.from_pretrained(base_model_id)
  if tokenizer.pad_token is None: