Add phi-2-super training script

Browse files

Files changed (3) hide show

.gitattributes +1 -0
train_csv_dataset_phi-2-super.py +96 -0
train_dataset.py +0 -3

.gitattributes CHANGED Viewed

@@ -25,6 +25,7 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 results/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 results/**/* filter=lfs diff=lfs merge=lfs -text
+results_phi-2-super/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text

train_csv_dataset_phi-2-super.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import sys
+import time
+import torch
+from datasets import load_dataset
+from peft import LoraConfig, prepare_model_for_kbit_training
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    DataCollatorForLanguageModeling,
+    TrainingArguments,
+)
+from trl import SFTTrainer
+# Training data: a CSV file whose "prompt" and "completion" columns are read by
+# formatting_func. load_dataset() needs an explicit builder and data file, so
+# the path is taken from the command line instead of being guessed.
+if len(sys.argv) != 2:
+    sys.exit(f"usage: {sys.argv[0]} <training_data.csv>")
+# split="train" returns a single Dataset rather than a DatasetDict.
+dataset = load_dataset("csv", data_files=sys.argv[1], split="train")
+if torch.cuda.is_available():
+    print("Cuda is available")
+base_model_id = "abacaj/phi-2-super"
+output_dir = "./results_phi-2-super"
+tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+if tokenizer.pad_token is None:
+    # Reuse the EOS token for padding when the tokenizer defines no pad token.
+    tokenizer.pad_token = tokenizer.eos_token
+    print("pad_token was missing and has been set to eos_token")
+# Load the base model with 4-bit NF4 quantization and bfloat16 compute.
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=False,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    base_model_id,
+    attn_implementation="flash_attention_2",
+    quantization_config=bnb_config,
+    torch_dtype="auto",
+)
+print(model)
+# Gradient checkpointing to save memory
+model.gradient_checkpointing_enable()
+# Freeze base model layers and cast layernorm in fp32
+model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+# LoRA adapters on the attention projections and the MLP layers.
+peft_config = LoraConfig(
+    r=64,
+    lora_alpha=64,
+    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc2", "fc1"],
+    bias="none",
+    lora_dropout=0.05,
+    task_type="CAUSAL_LM",
+)
+# Logging, checkpointing and evaluation all run once per epoch. The *_steps
+# settings only apply to the "steps" strategies, so they are not set here.
+training_args = TrainingArguments(
+    output_dir=output_dir,  # Output directory for checkpoints and predictions
+    overwrite_output_dir=True,  # Overwrite the content of the output directory
+    per_device_train_batch_size=2,  # Batch size for training
+    per_device_eval_batch_size=2,  # Batch size for evaluation
+    gradient_accumulation_steps=5,  # Batches accumulated per optimizer step
+    gradient_checkpointing=True,  # Enable gradient checkpointing
+    gradient_checkpointing_kwargs={"use_reentrant": False},
+    warmup_steps=10,  # Number of warmup steps
+    num_train_epochs=100,  # Number of training epochs
+    learning_rate=5e-5,  # Learning rate
+    weight_decay=0.01,  # Weight decay
+    optim="paged_adamw_8bit",  # Paged AdamW with 8-bit optimizer state
+    bf16=True,  # bfloat16 mixed-precision training
+    logging_dir='./logs',
+    logging_strategy="epoch",
+    save_strategy="epoch",
+    save_total_limit=2,  # Limit the total number of checkpoints
+    evaluation_strategy="epoch",
+    load_best_model_at_end=True,  # Load the best model at the end of training
+    lr_scheduler_type="linear",
+)
+def formatting_func(data):
+    """Render one dataset row as a single training string.
+
+    Wraps data['prompt'] in [INST] ... [/INST], appends data['completion'],
+    then the tokenizer's EOS token followed by one trailing space.
+    """
+    prompt = data['prompt']
+    completion = data['completion']
+    eos = tokenizer.eos_token
+    return f"[INST] {prompt} [/INST]{completion}{eos} "
+# Supervised fine-tuning with LoRA; formatted rows are packed into sequences of
+# up to max_seq_length tokens.
+# NOTE(review): eval_dataset is the same object as train_dataset, so the
+# evaluation used by load_best_model_at_end is measured on the training data.
+# Confirm this is intended, or pass a held-out split here.
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=dataset,
+    eval_dataset=dataset,
+    peft_config=peft_config,
+    args=training_args,
+    max_seq_length=1024,
+    packing=True,
+    formatting_func=formatting_func,
+)
+model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+# perf_counter() is monotonic, so the elapsed time cannot be skewed by
+# system clock adjustments during a long training run.
+start_time = time.perf_counter()
+trainer.train()
+training_time = time.perf_counter() - start_time  # Total training time in seconds
+trainer.save_model(output_dir)
+print(f"Training completed in {training_time} seconds.")

train_dataset.py CHANGED Viewed

@@ -21,12 +21,9 @@ docs = db._collection.peek(db._collection.count())
 dataset = docs['documents']
 if torch.cuda.is_available():
-    # torch.set_default_device("cuda")
     print("Cuda is available")
 base_model_id = "microsoft/phi-2"
-# base_model_id = "abacaj/phi-2-super"
-# base_model_id = "./results"
 tokenizer = AutoTokenizer.from_pretrained(base_model_id)
 if tokenizer.pad_token is None:

 dataset = docs['documents']
 if torch.cuda.is_available():
     print("Cuda is available")
 base_model_id = "microsoft/phi-2"
 tokenizer = AutoTokenizer.from_pretrained(base_model_id)
 if tokenizer.pad_token is None: