import torch
from torch.optim import Adam
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    get_scheduler,
)
from datasets import load_from_disk

from configuration_gpt1 import GPT1Config
from modeling_gpt1 import GPT1Model, GPT1ForCausalLM


GPT1Config.register_for_auto_class()
GPT1Model.register_for_auto_class('AutoModel')
GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
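# Registering the classes above stamps an `auto_map` entry into the saved config
# and copies configuration_gpt1.py / modeling_gpt1.py next to the checkpoint, so
# the trained model can later be reloaded through the Auto* classes with
# trust_remote_code=True.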

# load the already tokenized dataset (see preprocessing.py)
tokenized_datasets = load_from_disk('data')

# shuffle for good measure
tokenized_datasets = tokenized_datasets.shuffle(seed=42)
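# note: shuffling a DatasetDict shuffles each split independently; no examples
# move between the train and test splits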

print(tokenized_datasets)

# tokenizer files live in the current working directory; the model itself is
# freshly initialised from the default GPT1Config (no pretrained weights)
tokenizer = AutoTokenizer.from_pretrained('.')
config = GPT1Config()
model = GPT1ForCausalLM(config)

print(model)

_total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {_total_params}")

batch_size = 16
gradient_accumulation_steps = 4
epochs = 100

# GPT-style tokenizers ship without a pad token, so reuse EOS for padding
tokenizer.pad_token = tokenizer.eos_token

# mlm=False gives plain causal-LM batches: the collator pads input_ids and
# copies them into labels, masking padding positions with -100
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

optimizer = Adam(model.parameters(), lr=2.5e-4, weight_decay=0.01)

# the Trainer steps the LR scheduler once per optimizer update, so the schedule
# length is counted in update steps (examples / effective batch size), not in
# raw example counts; for multi-GPU runs divide further by the number of processes
num_update_steps = (epochs * len(tokenized_datasets['train'])
                    // (batch_size * gradient_accumulation_steps))
scheduler = get_scheduler('cosine',
                          optimizer=optimizer,
                          num_warmup_steps=2000,
                          num_training_steps=num_update_steps)

args = TrainingArguments(
    output_dir='checkpoints',
    per_device_train_batch_size=batch_size,  # per GPU; total batch = batch_size * n_gpus * accumulation
    per_device_eval_batch_size=batch_size,   # per GPU
    evaluation_strategy='epoch',
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=epochs,
    save_total_limit=10,
    max_grad_norm=1.0,
    logging_strategy='steps',
    logging_steps=100,
    logging_first_step=True,
    logging_nan_inf_filter=False,
    fp16=False,
)
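# effective batch: batch_size * gradient_accumulation_steps = 16 * 4 = 64 sequences
# per optimizer update on each device (multiplied by the number of GPUs under DDP);
# evaluation runs once per epoch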

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
)

print("Starting training...")

trainer.train()

# log_history is a list of dicts (loss, learning rate, eval metrics at each
# logging step); keep it so the training curves can be plotted later
torch.save(trainer.state.log_history, 'trainer_history.pt')

trainer.save_model('trained')
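
# Optional sanity check: reload the saved model through the Auto classes. This is
# a sketch and assumes that 'trained' contains both the model and tokenizer files
# (Trainer.save_model writes the tokenizer when one is passed) and that
# GPT1ForCausalLM supports generate() via the standard GenerationMixin.
from transformers import AutoModelForCausalLM

reloaded_tokenizer = AutoTokenizer.from_pretrained('trained')
reloaded_model = AutoModelForCausalLM.from_pretrained('trained', trust_remote_code=True)

prompt = reloaded_tokenizer('The meaning of life is', return_tensors='pt')
generated = reloaded_model.generate(**prompt, max_new_tokens=40, do_sample=True, top_k=40)
print(reloaded_tokenizer.decode(generated[0], skip_special_tokens=True))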