# Runa_max_train / fine_tune.py
import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# Paths
MODEL_NAME = "Gryphe/MythoMax-L2-13b"
OUTPUT_DIR = "/app/output" # Path where the fine-tuned model will be saved
DATASET_NAME = "mteb/toxic_conversations_50k"
# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # LLaMA-family tokenizers ship without a pad token; reuse EOS so padding="max_length" works
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Load dataset
print("Loading dataset...")
dataset = load_dataset(DATASET_NAME)
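# The script assumes the dataset ships train/test splits with "text", "label", and
# "label_text" columns (validated below); the test split is used for evaluation.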
# Validate dataset columns
print("Validating dataset columns...")
required_columns = ["text", "label", "label_text"]
for column in required_columns:
    if column not in dataset["train"].column_names:
        if column == "label":
            print("Warning: Dataset missing 'label' column. Proceeding without it.")
        else:
            raise ValueError(f"Dataset missing required column: {column}")
# Tokenize the dataset
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    # For causal language modeling the labels are the input ids themselves; Trainer shifts them internally
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
print("Tokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
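# map(batched=True) adds the tokenizer outputs (input_ids, attention_mask, labels)
# alongside the original dataset columns.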
# Prepare data for training
print("Preparing dataset for training...")
# Drop the raw text and classification columns; the causal LM only needs
# input_ids, attention_mask, and the labels produced during tokenization.
columns_to_drop = [col for col in ["text", "label", "label_text"] if col in tokenized_datasets["train"].column_names]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
tokenized_datasets.set_format("torch")
# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    per_device_train_batch_size=1,  # Adjust based on available GPU memory
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
)
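# Effective batch size per device = per_device_train_batch_size * gradient_accumulation_steps = 1 * 16 = 16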
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
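# With no tokenizer or data_collator passed, Trainer falls back to default_data_collator,
# which simply stacks the already max_length-padded features into batches.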
# Fine-tune the model
print("Starting training...")
trainer.train()
# Save the fine-tuned model
print("Saving fine-tuned model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Fine-tuning complete!")