from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    PreTrainedTokenizer,
)
from peft import LoraConfig, get_peft_model, TaskType
import torch


def initialize_deepseek_model(model, device, tokenizer, train_dataset, val_dataset, MODEL_DIR):
    """Attach LoRA adapters to the base model and build a Trainer for fine-tuning."""
    # LoRA configuration: rank-16 adapters on every attention and MLP projection
    # of the causal-LM backbone; only these adapter weights will be trained.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.0,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )

    # Wrap the base model with the adapters and move it to the target device.
    model = get_peft_model(model, lora_config)
    model = model.to(device)

    # Evaluate and checkpoint once per epoch; effective batch size is
    # 1 x 16 gradient-accumulation steps per device.
    training_args = TrainingArguments(
        output_dir=MODEL_DIR,
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        num_train_epochs=10,
        learning_rate=5e-5,
        weight_decay=0.001,
        logging_steps=50,
        save_total_limit=2,
        # Use bfloat16 only when the GPU actually supports it (Ampere or newer).
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        push_to_hub=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    # Stop training early if eval_loss fails to improve for 2 consecutive epochs;
    # the best checkpoint is reloaded at the end (load_best_model_at_end=True).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    return model, trainer
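

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The checkpoint name, the `datasets` import,
# and the toy training texts below are assumptions for demonstration, not part
# of the original code; swap in the real checkpoint and tokenized datasets.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from datasets import Dataset  # assumed available; used only for this demo

    BASE_MODEL = "deepseek-ai/deepseek-llm-7b-base"  # hypothetical checkpoint
    MODEL_DIR = "./deepseek-lora-finetuned"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float32,
    )

    # Toy causal-LM dataset: each example needs input_ids, attention_mask, labels.
    def tokenize(example):
        tokens = tokenizer(example["text"], truncation=True, max_length=512)
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    texts = {"text": ["Example instruction and response.", "Another training example."]}
    train_dataset = Dataset.from_dict(texts).map(tokenize, remove_columns=["text"])
    val_dataset = Dataset.from_dict(texts).map(tokenize, remove_columns=["text"])

    model, trainer = initialize_deepseek_model(
        model, device, tokenizer, train_dataset, val_dataset, MODEL_DIR
    )
    trainer.train()
    trainer.save_model(MODEL_DIR)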