|
import json
import os

import torch
from datasets import load_dataset, Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
|
|
|
|
|
# Location the tokenizer is loaded from (passed verbatim to from_pretrained).
TOKENIZER_PATH = "minicoderx-tokenizer"

# Shared tokenizer instance used by the tokenization helpers and the Trainer.
tokenizer = T5TokenizerFast.from_pretrained(TOKENIZER_PATH)
|
|
|
|
|
def load_jsonl(path):
    """Read a JSON Lines file into a ``Dataset`` with "input" and "output" columns.

    Each non-blank line of the file is parsed with ``json.loads``; the parsed
    record is expected to provide the keys ``"input"`` and ``"output"``.

    Args:
        path: Location of the JSON Lines file to read.

    Returns:
        The object produced by ``Dataset.from_dict`` for the two collected
        columns, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"input"`` or ``"output"``.
    """
    inputs = []
    outputs = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an extra newline at the end of the file) hold
            # no record; passing them to json.loads would raise.
            if not line.strip():
                continue
            record = json.loads(line)
            inputs.append(record["input"])
            outputs.append(record["output"])
    return Dataset.from_dict({"input": inputs, "output": outputs})
|
|
|
# Training examples are read from this JSON Lines file.
TRAIN_FILE = "data/train.jsonl"

dataset = load_jsonl(TRAIN_FILE)
|
|
|
|
|
def tokenize(batch):
    """Tokenize the ``"input"`` field of *batch* with the module-level tokenizer.

    The text is truncated and padded to exactly 128 positions
    (``padding="max_length"``, ``truncation=True``, ``max_length=128``).

    Args:
        batch: Mapping that provides the source text under ``"input"``.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    source_text = batch["input"]
    encoding = tokenizer(
        source_text,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoding
|
|
|
def tokenize_labels(batch):
    """Tokenize the ``"output"`` field of *batch* and store it under ``"labels"``.

    The target text is truncated and padded to exactly 128 positions. Padding
    positions are then replaced by ``-100`` so that they are ignored by the
    loss instead of being trained on as if they were real target tokens.

    Both shapes that ``Dataset.map`` can pass are supported: a single example
    (``"output"`` is one string) and a batch (``"output"`` is a list of
    strings).

    Args:
        batch: Mapping that provides the target text under ``"output"``.

    Returns:
        The same *batch* object, with ``"labels"`` added.
    """
    encoded = tokenizer(
        batch["output"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    label_ids = encoded["input_ids"]
    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # -100 is the index the cross-entropy loss skips.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one id sequence per example.
        batch["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat list of ids.
        batch["labels"] = _mask_padding(label_ids)
    return batch
|
|
|
# Encode the source text first, then attach the encoded targets as "labels".
dataset = dataset.map(tokenize).map(tokenize_labels)
|
|
|
|
|
# Load the pretrained T5 checkpoint that will be fine-tuned.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# The tokenizer is loaded from a different location than this checkpoint, so
# its vocabulary may be larger than the checkpoint's embedding table. Grow the
# embeddings in that case; otherwise token ids beyond the table would fail at
# lookup time. The table is never shrunk here.
# NOTE(review): presumably the tokenizer's id-to-token mapping also differs
# from the one "t5-small" was pretrained with -- confirm this is intended.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
|
|
|
|
|
# Hyperparameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="minicoderx-model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    # Write a checkpoint at the end of every epoch, keeping at most two.
    save_strategy="epoch",
    # NOTE(review): newer transformers releases spell this `eval_strategy` --
    # confirm against the version this project pins.
    evaluation_strategy="no",
    save_total_limit=2,
    # fp16 mixed precision requires a CUDA device; enable it only when one is
    # present so the script also runs on CPU-only hosts.
    fp16=torch.cuda.is_available(),
    overwrite_output_dir=True,
)
|
|
|
# Collator for seq2seq batches, built from the shared tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Bundle the model, run configuration, training data and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
|
|
|
|
|
# Run the fine-tuning loop.
trainer.train()

# Store the final model and the tokenizer side by side in one directory.
SAVE_DIR = "minicoderx-model"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("Training complete and model saved.")