quiz_model / train.py
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
# Load dataset from Hugging Face Hub
dataset = load_dataset("Percy3822/quiz_model")
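# Note: load_dataset returns a DatasetDict; the code below assumes the repo
# exposes a "train" split (used for both training and eval further down).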
# Preprocess: combine prompt + completion into single string
def format_for_training(example):
    # Convert dict completion to string if needed
    if isinstance(example["completion"], dict):
        example["completion"] = str(example["completion"])
    return {"text": example["prompt"] + "\n" + example["completion"]}
dataset = dataset.map(format_for_training)
# Load tokenizer (a small model is used to keep VRAM needs low)
model_name = "distilgpt2" # Small and fast for testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2-style models have no pad token, so reuse EOS for padding
# Tokenize
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)
dataset = dataset.map(tokenize, batched=True)
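# The tokenized dataset still carries the original string columns ("prompt",
# "completion", "text"); Trainer drops columns the model doesn't accept by
# default (remove_unused_columns=True), so they never reach the data collator.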
# Load model
model = AutoModelForCausalLM.from_pretrained(model_name)
# Data collator for causal LM: mlm=False copies input_ids into labels (the model shifts them internally)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Training args
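# push_to_hub requires being logged in to the Hugging Face Hub beforehand
# (e.g. via `huggingface-cli login` or an HF_TOKEN environment variable).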
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=5,
    push_to_hub=True,
    hub_model_id="Percy3822/quiz_model",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],  # Use train for eval in testing
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
# Push trained model to Hub
trainer.push_to_hub()
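# Optional sanity check (a sketch, not part of the original setup): generate a
# completion from the fine-tuned model to eyeball the output. The prompt below
# is a placeholder, not necessarily the dataset's real prompt format.
sample_prompt = "Write a quiz question about the water cycle:\n"
inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))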