# train_llama4.py

from transformers import (
    AutoTokenizer,
    Llama4ForConditionalGeneration,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import huggingface_hub
import os

print("Running train_llama4.py with CPU offloading (version: 2025-04-22 v1)")

# -- Authenticate with Hugging Face
LLAMA = os.getenv("LLama")
if not LLAMA:
    raise ValueError("LLama token not found. Set it in environment as 'LLama'.")
huggingface_hub.login(token=LLAMA)

# -- Tokenizer
MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as padding; avoids resizing the quantized embedding matrix

# -- Quantization + CPU offload config
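# llm_int8_enable_fp32_cpu_offload lets any layers that do not fit on the GPU
# stay on the CPU in fp32 instead of failing the 8-bit load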
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

print("Loading model with 8-bit quantization, CPU offload, and automatic device mapping")
model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quant_config,
    offload_folder="./offload"
)

# -- EOS is reused as padding, so no new tokens were added and the quantized
#    embedding matrix does not need to be resized

# -- No manual Accelerate prep is needed: device_map="auto" has already
#    dispatched the quantized model, and Trainer handles device placement
#    and mixed precision through Accelerate internally

# -- Load training data
dataset = datasets.load_dataset('json', data_files="Bingaman_training_data.json")['train']
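
# The Trainer below expects tokenized inputs. A minimal tokenization sketch,
# assuming each record in Bingaman_training_data.json has a "text" field and
# using an arbitrary 1024-token truncation length -- adjust both to the data.
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=1024)

dataset = dataset.map(tokenize, remove_columns=dataset.column_names)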

# -- LoRA setup
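# Attach low-rank adapters to the attention query/value projections; only the
# adapter weights are trained while the quantized base model stays frozen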
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
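# prepare_model_for_kbit_training freezes the base weights, upcasts the small
# non-quantized layers (e.g. norms) to fp32, and enables input gradients so the
# 8-bit model can be trained stably with the adapters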
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# -- Training arguments
training_args = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,   # effective batch size of 2 x 8 = 16 per device
    "optim": "adamw_torch",
    "save_steps": 500,
    "logging_steps": 100,
    "learning_rate": 2e-4,
    "bf16": True,                       # match the bfloat16 load dtype; fp16 AMP can cause dtype mismatches here
    "max_grad_norm": 0.3,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine"
}

# -- Initialize Trainer (Trainer drives Accelerate internally, so no extra prepare() call is needed)
trainer = Trainer(
    model=model,
    args=TrainingArguments(**training_args),
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),  # pads batches and builds causal-LM labels
)

# -- Run training
trainer.train()
model.save_pretrained("./fine_tuned_model")
print("Training completed!")