File size: 2,572 Bytes
0b25161
 
0b98bc4
0b25161
0b98bc4
0b25161
0b98bc4
 
 
0b25161
 
 
 
 
 
 
 
 
0b98bc4
 
 
 
 
 
 
 
 
0b25161
0b98bc4
 
0b25161
0b98bc4
 
0b25161
0b98bc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b25161
 
 
 
0b98bc4
0b25161
 
0b98bc4
0b25161
0b98bc4
 
 
 
0b25161
 
 
0b98bc4
 
 
 
0b25161
0b98bc4
 
0b25161
 
0b98bc4
0b25161
0b98bc4
 
 
 
 
0b25161
0b98bc4
 
0b25161
 
 
 
 
0b98bc4
0b25161
 
 
0b98bc4
0b25161
 
0b98bc4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import signal
import sys
import torch
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer

# Importing Sophia optimizer
from sophia import SophiaG

# Signal handler function
def signal_handler(sig, frame):
    """Announce the interrupt on stdout and end the process with status 0.

    Args:
        sig: Signal number passed in by the ``signal`` module (not inspected).
        frame: Stack frame active when the signal arrived (not inspected).

    Raises:
        SystemExit: Always, with exit code 0.
    """
    farewell = 'You pressed Ctrl+C! Exiting...'
    print(farewell)
    # sys.exit(0) is exactly "raise SystemExit(0)"; spelled out so the control
    # flow is visible at a glance.
    raise SystemExit(0)

# Route SIGINT (Ctrl+C) through signal_handler so the script prints a message
# and exits instead of surfacing a KeyboardInterrupt traceback.
signal.signal(signal.SIGINT, signal_handler)

# Load the "train" split of the Crystalcareai/Orca-Reka dataset
dataset = load_dataset("Crystalcareai/Orca-Reka", split="train")
# The model and its tokenizer are both loaded from this same path
model_id = "./outkannn"
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Pad on the right-hand side of each sequence
tokenizer.padding_side = 'right'

# Prompt template used to render each training example. It has three
# positional "{}" slots, filled in this order by formatting_prompts_func:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
def formatting_prompts_func(examples):
    """Render a batch of examples into full prompt strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences (the form ``dataset.map(..., batched=True)``
            passes in).

    Returns:
        A dict with a single "text" key holding one rendered prompt per
        example: the filled-in ``alpaca_prompt`` followed by the tokenizer's
        EOS token.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Looked up once for the whole batch; appended so each sample ends with EOS.
    eos = tokenizer.eos_token
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + eos
            for task, context, answer in rows
        ]
    }

# Run formatting_prompts_func over the dataset in batches; this adds a "text"
# column containing the rendered prompt for every example.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Load the causal LM weights in bfloat16 from the same path as the tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True  # NOTE(review): permits model code bundled with the checkpoint to execute - confirm the checkpoint is trusted
)

# Define training arguments
args = TrainingArguments(
    output_dir="./out",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_checkpointing=True,
    logging_steps=2,
    save_strategy="steps",
    save_steps=300,  # with save_strategy="steps": write a checkpoint every 300 steps
    bf16=True,  # matches the bfloat16 dtype the model is loaded in
    tf32=True,  # NOTE(review): presumably needs TF32-capable GPU hardware - confirm on the target machine
    learning_rate=1e-4,
    max_grad_norm=0.1,
    warmup_ratio=0.00,  # no warmup phase
    lr_scheduler_type="cosine",
    push_to_hub=False
)

# Sequence length limit passed to the trainer as max_seq_length
max_seq_length = 2048

# Custom Trainer Class
class CustomTrainer(SFTTrainer):
    """SFTTrainer that trains with the SophiaG optimizer."""

    def create_optimizer(self):
        """Create the SophiaG optimizer if none is set, and return it.

        Mirrors the contract of the base ``Trainer.create_optimizer``: an
        optimizer that is already present on ``self.optimizer`` (for example
        one supplied by the caller) is left untouched, and the optimizer in
        use is returned.

        Returns:
            The optimizer stored on ``self.optimizer``.
        """
        # NOTE(review): the stock training loop does not appear to call
        # SophiaG.update_hessian(); confirm whether the Hessian estimate is
        # expected to be refreshed periodically for this optimizer.
        if self.optimizer is None:
            self.optimizer = SophiaG(
                self.model.parameters(),
                lr=self.args.learning_rate,
                betas=(0.965, 0.99),
                rho=0.01,
                weight_decay=0.1,
            )
        return self.optimizer

# Trainer configuration
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    # Train on the "text" column built by formatting_prompts_func (prompt
    # template + instruction + input + response + EOS). Pointing this at the
    # raw "output" column would discard the formatted prompts entirely.
    dataset_text_field="text",
    packing=False,
)

# Start training
trainer.train()

# Save the final model to args.output_dir
trainer.save_model()