#!/usr/bin/env python3
"""
Train GPT-2 Medium (355M) on expression dataset to compare with base GPT-2 (124M).
"""
import os
import sys
import json
import argparse
from pathlib import Path
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_size", type=str, default="gpt2-medium",
                        choices=["gpt2", "gpt2-medium", "gpt2-large"],
                        help="Model size to train")
    parser.add_argument("--dataset_repo", type=str, default="augustocsc/sintetico_natural")
    parser.add_argument("--data_dir", type=str, default="700K")
    parser.add_argument("--data_column", type=str, default="i_prompt_n")
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--per_device_train_batch_size", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--lora_r", type=int, default=8)
    parser.add_argument("--lora_alpha", type=int, default=32)
    args = parser.parse_args()

    # Set output dir based on model size
    if args.output_dir is None:
        model_name = args.model_size.replace("-", "_")
        args.output_dir = f"./output/{model_name}_700K_json"

    print("="*80)
    print(f"Training {args.model_size} on expression dataset")
    print("="*80)
    print(f"Output dir: {args.output_dir}")
    print()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_size)
    tokenizer.pad_token = tokenizer.eos_token
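    # GPT-2 has no dedicated pad token, so EOS is reused for padding; the
    # data collator below relies on it when padding batches to equal length.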
    # Load base model
    print(f"Loading {args.model_size}...")
    model = AutoModelForCausalLM.from_pretrained(args.model_size)

    # Add LoRA
    lora_config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        target_modules=["c_attn"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
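    # "c_attn" is GPT-2's fused query/key/value projection, so a single LoRA
    # target module covers the attention inputs; with bias="none" only the
    # low-rank adapter matrices are trainable.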
    # Print trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} / {total_params:,} "
          f"({100*trainable_params/total_params:.2f}%)")
    print()

    # Load dataset
    print(f"Loading dataset: {args.dataset_repo}/{args.data_dir}")
    dataset = load_dataset(args.dataset_repo, data_dir=args.data_dir)
    train_dataset = dataset["train"]
    print(f"Dataset size: {len(train_dataset)} examples")
    print(f"Sample: {train_dataset[0][args.data_column][:100]}...")
    print()

    # Tokenize
    def tokenize_function(examples):
        return tokenizer(
            examples[args.data_column],
            truncation=True,
            max_length=512,
            padding=False,
        )
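    # padding=False defers padding to the collator, so each batch is only
    # padded to the length of its longest sequence rather than to max_length.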
print("Tokenizing dataset...")
tokenized_dataset = train_dataset.map(
tokenize_function,
batched=True,
remove_columns=train_dataset.column_names,
desc="Tokenizing",
)
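    # remove_columns drops the raw text columns, leaving only input_ids and
    # attention_mask for the Trainer.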
    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
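    # mlm=False means standard causal LM: the collator copies input_ids into
    # labels (replacing pad-token positions with -100 so they are ignored by
    # the loss), and the model shifts the labels internally.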
    # Training arguments
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=4,
        learning_rate=args.learning_rate,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=100,
        save_steps=1000,
        save_total_limit=2,
        fp16=True,
        report_to="none",
    )
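    # Effective batch size is per_device_train_batch_size * 4 gradient
    # accumulation steps (4 * 4 = 16 per device with the defaults).
    # fp16=True assumes a CUDA-capable GPU; mixed precision is not
    # supported on CPU.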
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # Train
    print("Starting training...")
    trainer.train()

    # Save final model
    print(f"\nSaving final model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
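    # Note: with a PEFT-wrapped model, save_model() writes only the LoRA
    # adapter weights and config, not a merged full model; reload with the
    # base GPT-2 plus PeftModel.from_pretrained (or call merge_and_unload()
    # first if a standalone model is needed).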
print("\nTraining completed!")
print(f"Model saved to: {args.output_dir}")
if __name__ == "__main__":
    main()