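# Fine-tunes a local causal LM on the Crystalcareai/Orca-Reka dataset with
# TRL's SFTTrainer, swapping the default optimizer for SophiaG.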
import signal
import sys
import torch
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer
# Importing Sophia optimizer
from sophia import SophiaG
# Signal handler function
def signal_handler(sig, frame):
    print('You pressed Ctrl+C! Exiting...')
    sys.exit(0)
# Register signal handler
signal.signal(signal.SIGINT, signal_handler)
# Load the dataset
dataset = load_dataset("Crystalcareai/Orca-Reka", split="train")
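# Note (assumption): the formatting step below expects Alpaca-style records,
# i.e. "instruction", "input", and "output" columns in this dataset.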
model_id = "./outkannn"
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right'
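# Not in the original script: some causal LMs ship without a pad token, and
# right-padding then fails inside SFTTrainer; reusing EOS is a common fallback.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token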
# Formatting function for the dataset
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    EOS_TOKEN = tokenizer.eos_token
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Append EOS so the model learns where a response ends
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}
# Process and map the formatting function
dataset = dataset.map(formatting_prompts_func, batched=True)
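# Optional sanity check (not in the original): confirm the Alpaca template and
# EOS token were applied before training starts.
print(dataset[0]["text"][:200])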
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
# Define training arguments
args = TrainingArguments(
    output_dir="./out",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_checkpointing=True,
    logging_steps=2,
    save_strategy="steps",
    save_steps=300,
    bf16=True,   # bf16/tf32 require an Ampere-class or newer GPU
    tf32=True,
    learning_rate=1e-4,
    max_grad_norm=0.1,
    warmup_ratio=0.00,
    lr_scheduler_type="cosine",
    push_to_hub=False
)
max_seq_length = 2048
# Custom Trainer Class
class CustomTrainer(SFTTrainer):
    def create_optimizer(self):
        # Override to use the SophiaG optimizer instead of the default AdamW
        self.optimizer = SophiaG(self.model.parameters(), lr=self.args.learning_rate,
                                 betas=(0.965, 0.99), rho=0.01, weight_decay=0.1)
        # Trainer expects create_optimizer to return the optimizer it built
        return self.optimizer
# Trainer configuration
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    dataset_text_field="text",  # train on the formatted prompts built above, not the raw "output" column
    packing=False,
)
# Start training
trainer.train()
# Save model
trainer.save_model()
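# Not in the original: save the tokenizer alongside the model so the output
# directory can be loaded directly with from_pretrained.
tokenizer.save_pretrained(args.output_dir)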