"""Fine-tune the 'gpt2-medium' checkpoint on the 'InnerI/synCAI_144kda' dataset."""

from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Load the tokenizer that matches the 'gpt2-medium' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Fall back to the end-of-sequence token when no padding token is configured,
# so that padding='max_length' during tokenization has a token to pad with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the 'Question' field of a batch of examples.

    Args:
        examples: A batch from the dataset, indexable by column name; only
            the 'Question' column is read.

    Returns:
        The result of calling the module-level ``tokenizer`` on the
        'Question' values, padded and truncated to 128 tokens.
    """
    return tokenizer(
        examples['Question'],
        padding='max_length',  # pad every sequence up to max_length
        truncation=True,  # cut sequences longer than max_length
        max_length=128,
    )

# Load the dataset by its Hugging Face Hub identifier.
dataset = load_dataset('InnerI/synCAI_144kda')

# Tokenize the dataset; batched=True passes batches of examples to
# tokenize_function instead of one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Start from the pretrained 'gpt2-medium' weights.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# mlm=False turns off masked-language-model masking, so the collator prepares
# batches for causal language modeling instead.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Hyperparameters and checkpointing policy for the training run.
training_args = TrainingArguments(
    output_dir=r"InnerI/synCAI-144k-gpt2.5",  # checkpoints are written here
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,  # checkpoint interval, in training steps
    save_total_limit=2,  # cap on the number of checkpoints kept on disk
    prediction_loss_only=True,
)

# Wire the model, arguments, collator and the tokenized 'train' split together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
)

trainer.train()

# The tokenizer is not passed to the Trainer, so save_model() does not write
# it; save the tokenizer alongside the model so the directory holds both.
final_model_dir = r"CAI-gpt2.5"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)