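# Train a from-scratch GPT-1 reimplementation for causal language modeling
# on a pre-tokenized BookCorpus-open dataset, using the Hugging Face Trainer.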
from torch.optim import Adam
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    get_scheduler,
)
from datasets import load_from_disk

from configuration_gpt1 import GPT1Config
from modeling_gpt1 import GPT1Model, GPT1ForCausalLM


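# Register the custom config/model classes so AutoConfig/AutoModel can resolve
# them (via trust_remote_code) and the defining .py files are copied alongside
# any saved checkpoint.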
GPT1Config.register_for_auto_class()
GPT1Model.register_for_auto_class('AutoModel')
GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')

# load the already tokenized dataset (see training_preprocessing.py)
tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')

print(tokenized_datasets)

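# Load the tokenizer saved in the current directory and build a freshly
# initialised (untrained) model from the default GPT-1 config.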
tokenizer = AutoTokenizer.from_pretrained('.')
config = GPT1Config()
model = GPT1ForCausalLM(config)

print(model)

_total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {_total_params}")

batch_size = 32
epochs = 100

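# GPT-style tokenizers have no pad token, so reuse EOS for padding; with
# mlm=False the collator prepares causal-LM batches (labels are a copy of
# input_ids, shifted inside the model).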
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

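# Optimiser and schedule roughly follow the GPT-1 paper: Adam at a peak
# learning rate of 2.5e-4 with linear warmup followed by cosine annealing.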
optimizer = Adam(model.parameters(), lr=2.5e-4, weight_decay=0.01)
# get_scheduler expects the total number of optimiser updates, i.e. one step
# per batch (single device, no gradient accumulation), not one per example.
steps_per_epoch = len(tokenized_datasets['train']) // batch_size
scheduler = get_scheduler('cosine',
                          optimizer=optimizer,
                          num_warmup_steps=4000,
                          num_training_steps=epochs * steps_per_epoch)

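# Trainer settings: evaluate once per epoch, clip gradients at 1.0, and keep
# only the 10 most recent checkpoints.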
args = TrainingArguments(
    output_dir='checkpoints',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    gradient_accumulation_steps=1,
    num_train_epochs=epochs,
    save_total_limit=10,
    max_grad_norm=1.0,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
)

print("Starting training...")

trainer.train()

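# Save the final model and tokenizer to ./trained; the registered custom code
# files are copied along with the weights.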
trainer.save_model('trained')