|
|
from datasets import load_dataset |
|
|
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments |
|
|
|
|
|
|
|
|
# Fetch the question dataset from the Hub and keep only its "train" split.
dataset_splits = load_dataset("DetectiveShadow/MVPQuestion")
dataset = dataset_splits["train"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Checkpoint identifier shared by the tokenizer and the model so both are
# loaded from the same pretrained source.
model_name = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
|
|
|
|
|
|
|
|
def tokenize(example):
    """Tokenize one example (or a batch) into model inputs and labels.

    Reads the ``"input"`` and ``"output"`` fields of ``example`` and encodes
    each with the module-level ``tokenizer``, truncating and padding to a
    fixed length of 64 tokens.

    Padding positions in the labels are replaced with ``-100`` so that they
    are ignored by the cross-entropy loss; otherwise the model would be
    trained to predict pad tokens for most of every target sequence.

    Args:
        example: Mapping with ``"input"`` and ``"output"`` entries, each
            either a single string or a list of strings (batched mapping).

    Returns:
        The input encoding, extended with a ``"labels"`` entry holding the
        target token ids with padding masked out.
    """
    input_enc = tokenizer(
        example["input"], truncation=True, padding="max_length", max_length=64
    )
    target_enc = tokenizer(
        example["output"], truncation=True, padding="max_length", max_length=64
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # -100 is the ignore index of the loss used for the labels.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    label_ids = target_enc["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of token ids per example.
        input_enc["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        input_enc["labels"] = _mask_padding(label_ids)

    return input_enc
|
|
|
|
|
# Run `tokenize` over every example of the training split.
tokenized = dataset.map(function=tokenize)
|
|
|
|
|
|
|
|
# Training configuration: local output directory, schedule, logging,
# checkpointing, and Hub publishing settings.
training_args = TrainingArguments(
    output_dir="./MVPTrivia",
    # Schedule and batch size.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Logging and checkpointing.
    logging_steps=10,
    save_strategy="epoch",
    # Hub publishing.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id="DetectiveShadow/MVPTrivia",
)
|
|
|
|
|
|
|
|
# Bundle the model, the training configuration, and the tokenized training
# data into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized,
)
|
|
|
|
|
|
|
|
# Fine-tune the model, then publish the result.
trainer.train()
trainer.push_to_hub()

# Upload the tokenizer to the repository id that is also configured as
# `hub_model_id` in the training arguments.
tokenizer.push_to_hub("DetectiveShadow/MVPTrivia")
|
|
|
|
|
|