# gpt-fi / train.py
# Author: Vaino Hatanpaa
# Training script for a GPT-style causal language model trained from scratch on a
# pre-tokenized dataset using the Hugging Face Trainer.
import datasets
import torch
from transformers import (
    GPT2TokenizerFast,
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
#config_name = "C:\\Users\\vin\\Documents\\Projects\\NLP\\kielimalli\\config.json"
#tokenizer_file = "C:\\Users\\vin\\Documents\\Projects\\NLP\\models\\tokens.json"
#input_dir = "H:\\Data_temp\\tokenized_dataset"
#output_dir = "H:\\Data_temp\\checkpoints\\model1"
def main():
    import os

    # Enable these if required by your environment:
    #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    #torch.backends.cuda.matmul.allow_tf32 = True
    #torch.backends.cudnn.allow_tf32 = True

    config_name = "config_large_bpe.json"         # model architecture config (JSON)
    tokenizer_files = "/path/to/tokenizer/files"  # directory with the trained tokenizer files
    input_dir = "/data/dir"                       # tokenized dataset saved with datasets' save_to_disk
    output_dir = "/out/dir"                       # checkpoints and logs are written here
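
    # For reference, a GPT-2 style architecture config is a JSON file along the lines of
    # the sketch below (values are illustrative; the actual contents of
    # config_large_bpe.json are not shown here):
    #   {
    #     "model_type": "gpt2",
    #     "vocab_size": 50257,
    #     "n_positions": 1024,
    #     "n_embd": 1280,
    #     "n_layer": 36,
    #     "n_head": 20,
    #     "bos_token_id": 50256,
    #     "eos_token_id": 50256
    #   }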
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2.067e-5,
        lr_scheduler_type="linear",
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        gradient_accumulation_steps=32,  # effective batch size = 4 * 32 * number of devices
        num_train_epochs=6.7,
        save_total_limit=2,              # keep only the two most recent checkpoints
        dataloader_num_workers=10,
        save_steps=100,
        warmup_steps=1000,
        do_eval=True,
        eval_steps=1000,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
        bf16=True,                       # bfloat16 mixed precision (Ampere or newer GPUs)
        tf32=True,                       # allow TF32 matmuls on supporting hardware
        fp16_opt_level="O2",             # only used by the apex backend; no effect with "amp"
        half_precision_backend="amp",
        bf16_full_eval=True,
    )
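
    # Alternative (sketch, not used here): the same hyperparameters could be supplied on
    # the command line via transformers.HfArgumentParser instead of being hard-coded:
    #   parser = HfArgumentParser(TrainingArguments)
    #   (training_args,) = parser.parse_args_into_dataclasses()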
print("setting up tokenizer...")
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_files)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})#Probably wrong
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
from tokenizers.processors import TemplateProcessing
tokenizer._tokenizer.post_processor = TemplateProcessing(
single="$0 "+tokenizer.eos_token,
pair="$A "+tokenizer.eos_token+" $B:1 "+tokenizer.eos_token,
special_tokens=[(tokenizer.eos_token, 0)],
)
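
    # Quick sanity check (sketch, example string is illustrative): the post-processor
    # should now append the EOS token to every encoded example.
    #   ids = tokenizer("esimerkkilause")["input_ids"]
    #   assert ids[-1] == tokenizer.eos_token_id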
print("loading model...")
config = AutoConfig.from_pretrained(config_name)
model = AutoModelForCausalLM.from_config(config)
#model = AutoModelForCausalLM.from_pretrained("/checkpoint/dir") if restarting training completely and loading weights from a checkpoints
model.gradient_checkpointing_enable() #Optional, affects performance
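
    # Note (assumption): with gradient checkpointing enabled, transformers disables the
    # attention cache during training anyway; setting it explicitly avoids the warning:
    #   model.config.use_cache = False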
print("loading data...")
dataset = datasets.load_from_disk(input_dir)
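
    # Expected dataset layout (assumption, based on the collator used below): a DatasetDict
    # with "train" and "test" splits whose examples already hold fixed-length "input_ids",
    # "attention_mask" and "labels" columns, produced roughly like:
    #   tokenized = raw.map(lambda batch: tokenizer(batch["text"]), batched=True,
    #                       remove_columns=["text"])
    #   ... group into fixed-size blocks, copy "input_ids" to "labels",
    #   then tokenized.save_to_disk(input_dir)
    # default_data_collator does no padding or label creation, so the dataset itself must
    # provide everything the model needs.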
print("starting training...")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset["train"],
data_collator=default_data_collator,
eval_dataset=dataset["test"].select(range(10000)), #To save time do not evaluate on whole test set during training
tokenizer=tokenizer
)
#checkpoint = None
checkpoint = get_last_checkpoint(output_dir)
print("checkpoint:", checkpoint)
trainer.train(resume_from_checkpoint=checkpoint)
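
    # Possible follow-up (sketch, not part of the original flow): save the final model and
    # run a full evaluation over the entire test split once training has finished.
    #   trainer.save_model(output_dir)
    #   metrics = trainer.evaluate(eval_dataset=dataset["test"])
    #   print(metrics)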
if __name__ == "__main__":
    main()
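
# Launch example (sketch, values illustrative): a single-GPU run is just `python train.py`;
# multi-GPU data-parallel training can be launched with torchrun, e.g.
#   torchrun --nproc_per_node=4 train.py
# and the Trainer picks up the distributed environment automatically.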