from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset
import json

# === Load your custom tokenizer ===
tokenizer = T5TokenizerFast.from_pretrained("minicoderx-tokenizer")

# === Load or create dataset ===
def load_jsonl(path):
    """Read a JSONL file of {"input": ..., "output": ...} records into a Dataset."""
    with open(path) as f:
        data = [json.loads(line) for line in f]
    return Dataset.from_dict({
        "input": [x["input"] for x in data],
        "output": [x["output"] for x in data],
    })

dataset = load_jsonl("data/train.jsonl")

# === Tokenize dataset ===
def tokenize(batch):
    # Encoder inputs, padded/truncated to a fixed length
    return tokenizer(batch["input"], padding="max_length", truncation=True, max_length=128)

def tokenize_labels(batch):
    labels = tokenizer(batch["output"], padding="max_length", truncation=True, max_length=128)
    # Replace padding token ids with -100 so padding is ignored by the loss
    batch["labels"] = [
        (token if token != tokenizer.pad_token_id else -100)
        for token in labels["input_ids"]
    ]
    return batch

dataset = dataset.map(tokenize)
dataset = dataset.map(tokenize_labels)

# === Load pre-trained T5-small ===
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# === Training configuration ===
training_args = TrainingArguments(
    output_dir="minicoderx-model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="no",
    save_total_limit=2,
    fp16=True,
    overwrite_output_dir=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# === Train ===
trainer.train()

# === Save model ===
trainer.save_model("minicoderx-model")
tokenizer.save_pretrained("minicoderx-model")

print("Training complete and model saved.")
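
# === Quick sanity check (optional) ===
# A minimal sketch of how the saved checkpoint might be loaded for inference;
# it could also live in a separate script. The prompt below is purely
# illustrative and assumes the model was saved to "minicoderx-model" as above;
# adjust it to match the input format used in data/train.jsonl.
model = T5ForConditionalGeneration.from_pretrained("minicoderx-model")
tokenizer = T5TokenizerFast.from_pretrained("minicoderx-model")

prompt = "write a python function that adds two numbers"  # hypothetical example input
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))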