Alexandru Gherghescu committed on
Commit a80c40f (parent: 06c181f)

Add training and preprocessing scripts

Files changed (2)
  1. pre_training.py +71 -0
  2. preprocessing.py +29 -0
pre_training.py ADDED
@@ -0,0 +1,71 @@
+ from torch.optim import Adam
+ from transformers import (
+     AutoTokenizer,
+     Trainer,
+     TrainingArguments,
+     DataCollatorForLanguageModeling,
+     get_scheduler,
+ )
+ from datasets import load_from_disk
+
+ from configuration_gpt1 import GPT1Config
+ from modeling_gpt1 import GPT1Model, GPT1ForCausalLM
+
+
+ # register the custom classes so the saved model can later be loaded
+ # through the Auto* classes (with trust_remote_code=True)
+ GPT1Config.register_for_auto_class()
+ GPT1Model.register_for_auto_class('AutoModel')
+ GPT1ForCausalLM.register_for_auto_class('AutoModelForCausalLM')
+
+ # load the already tokenized dataset (see preprocessing.py)
+ tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')
+
+ print(tokenized_datasets)
+
+ tokenizer = AutoTokenizer.from_pretrained('.')
+ config = GPT1Config()
+ model = GPT1ForCausalLM(config)
+
+ print(model)
+
+ _total_params = sum(p.numel() for p in model.parameters())
+ print(f"Model parameters: {_total_params}")
+
+ batch_size = 32
+ epochs = 100
+
+ # causal LM: pad with EOS and let the collator build the labels (mlm=False)
+ tokenizer.pad_token = tokenizer.eos_token
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ optimizer = Adam(model.parameters(), lr=2.5e-4, weight_decay=0.01)
+ scheduler = get_scheduler('cosine',
+                           optimizer=optimizer,
+                           num_warmup_steps=4000,
+                           num_training_steps=epochs * len(tokenized_datasets['train']))
+
+ args = TrainingArguments(
+     output_dir='checkpoints',
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     evaluation_strategy='epoch',
+     gradient_accumulation_steps=1,
+     num_train_epochs=epochs,
+     save_total_limit=10,
+     max_grad_norm=1.0,
+     fp16=False,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=args,
+     data_collator=data_collator,
+     train_dataset=tokenized_datasets['train'],
+     eval_dataset=tokenized_datasets['test'],
+     tokenizer=tokenizer,
+     optimizers=(optimizer, scheduler),
+ )
+
+ print("Starting training...")
+
+ trainer.train()
+
+ trainer.save_model('trained')
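
For reference (not part of this commit): because the custom classes are registered with register_for_auto_class above, the directory written by trainer.save_model('trained') should be loadable through the Auto* classes with trust_remote_code=True. A minimal sketch, assuming training has finished and the tokenizer was saved alongside the model; the prompt and generation settings are arbitrary:

from transformers import AutoModelForCausalLM, AutoTokenizer

# load the artifacts written by trainer.save_model('trained')
tokenizer = AutoTokenizer.from_pretrained('trained')
model = AutoModelForCausalLM.from_pretrained('trained', trust_remote_code=True)

# quick sanity check: generate a short continuation
inputs = tokenizer('The book begins', return_tensors='pt')
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0]))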
preprocessing.py ADDED
@@ -0,0 +1,29 @@
+ from transformers import (
+     AutoTokenizer,
+ )
+ from datasets import load_dataset
+
+
+ raw_datasets = load_dataset('lucadiliello/bookcorpusopen')
+ raw_datasets = raw_datasets['train'].train_test_split(test_size=0.05)
+
+ print(raw_datasets)
+
+ tokenizer = AutoTokenizer.from_pretrained('.')
+
+ seq_len = 512
+
+ # truncation + return_overflowing_tokens splits each book into multiple
+ # chunks of at most seq_len tokens instead of dropping the overflow
+ def tokenize_fn(examples):
+     return tokenizer(examples['text'],
+                      max_length=seq_len,
+                      return_overflowing_tokens=True,
+                      truncation=True)
+
+ # the original columns are removed because chunking changes the row count
+ tokenized_datasets = raw_datasets.map(
+     tokenize_fn,
+     batched=True,
+     batch_size=500,
+     remove_columns=raw_datasets['train'].column_names,
+ )
+
+ tokenized_datasets.save_to_disk('tokenized_bookcorpusopen')
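
For context (not part of this commit): the truncation=True plus return_overflowing_tokens=True combination is what turns one long book into several rows of at most seq_len tokens, which is also why remove_columns is needed in the map call above. A small sketch of that behaviour in isolation, assuming the same local tokenizer directory '.' and a fast tokenizer:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('.')

# a long dummy text comes back as several chunks of at most 512 tokens
enc = tokenizer('word ' * 2000,
                max_length=512,
                truncation=True,
                return_overflowing_tokens=True)
print(len(enc['input_ids']))                   # number of chunks
print([len(ids) for ids in enc['input_ids']])  # each chunk's length (<= 512)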