from torch.optim import Adam
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    get_scheduler,
)
from datasets import load_from_disk

from configuration_gpt1 import GPT1Config
from modeling_gpt1 import GPT1Model, GPT1ForCausalLM

# register the custom classes so their code is saved with the checkpoint and the
# model can later be loaded through the Auto* API
GPT1Config.register_for_auto_class()
GPT1Model.register_for_auto_class('AutoModel')
GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')

# load the already tokenized dataset (see training_preprocessing.py)
tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')
print(tokenized_datasets)
# the tokenizer files are expected to sit next to this script (current directory)
tokenizer = AutoTokenizer.from_pretrained('.')

config = GPT1Config()
model = GPT1ForCausalLM(config)
print(model)

_total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {_total_params}")

batch_size = 32
epochs = 100

# GPT-style tokenizers have no padding token, so reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

optimizer = Adam(model.parameters(), lr=2.5e-4, weight_decay=0.01)
# schedule over optimizer steps, not examples: with a single device and no gradient
# accumulation there is one step per batch of `batch_size` examples
scheduler = get_scheduler(
    'cosine',
    optimizer=optimizer,
    num_warmup_steps=4000,
    num_training_steps=epochs * (len(tokenized_datasets['train']) // batch_size),
)

args = TrainingArguments(
    output_dir='checkpoints',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    gradient_accumulation_steps=1,
    num_train_epochs=epochs,
    save_total_limit=10,
    max_grad_norm=1.0,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
)

print("Starting training...")
trainer.train()
trainer.save_model('trained')
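
# Optional reload check (a minimal sketch, not part of the original script): since the
# classes were registered for the Auto* API above, save_model also copies
# configuration_gpt1.py / modeling_gpt1.py into 'trained', so the checkpoint can be
# loaded back with AutoModelForCausalLM; trust_remote_code=True tells transformers to
# use that copied code when loading on a machine without these modules on the path.
from transformers import AutoModelForCausalLM
reloaded = AutoModelForCausalLM.from_pretrained('trained', trust_remote_code=True)
print(f"Reloaded model parameters: {sum(p.numel() for p in reloaded.parameters())}")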