Something2109 committed on
Commit ea12d46
1 Parent(s): 2f6e5c3

Upload 3 files

Files changed (3)
  1. BERT.py +84 -0
  2. tokenizer.py +30 -0
  3. train.py +106 -0
BERT.py ADDED
@@ -0,0 +1,84 @@
+ from transformers import (
+     AutoTokenizer,
+     BertConfig,
+     BertForMaskedLM,
+     Trainer,
+     TrainingArguments,
+     DataCollatorForLanguageModeling,
+ )
+ from datasets import load_dataset
+ from laonlp import word_tokenize
+ import random
+
+ tokenizer = AutoTokenizer.from_pretrained("bert/models/tokenizer")
+ print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")
+
+
+ def group_texts(examples):
+     tokenized_inputs = [" ".join(word_tokenize(x)) for x in examples["text"]]
+
+     tokenized_inputs = tokenizer(
+         tokenized_inputs,
+         return_special_tokens_mask=True,
+         padding=True,
+         truncation=True,
+         max_length=tokenizer.model_max_length,
+         return_tensors="pt",
+     )
+
+     return tokenized_inputs
+
+
+ if __name__ == "__main__":
+     train_dataset = load_dataset(path="bert/dataset/CulturaX", split="train")
+     eval_dataset = load_dataset(path="bert/dataset/laonlp", split="validation")
+
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=tokenizer, mlm=True, mlm_probability=0.15
+     )
+
+     config_encoder = BertConfig(vocab_size=tokenizer.vocab_size)
+
+     model = BertForMaskedLM.from_pretrained("BERT\\models\\bert-culturaX-data")
+
+     train_dataset = train_dataset.map(
+         group_texts,
+         batched=True,
+         remove_columns=[
+             "text",
+             "timestamp",
+             "url",
+             "source",
+         ],
+         num_proc=12,
+     ).shuffle(seed=random.randint(0, 1000))
+     eval_dataset = eval_dataset.map(
+         group_texts, batched=True, remove_columns=["text"]
+     ).shuffle(seed=random.randint(0, 1000))
+
+     print(
+         f"the dataset contains in total {len(train_dataset)*tokenizer.model_max_length} tokens"
+     )
+
+     model_name = "bert-culturaX-data"
+
+     training_args = TrainingArguments(
+         output_dir=f"bert/models/{model_name}",
+         evaluation_strategy="epoch",
+         per_device_train_batch_size=16,
+         per_device_eval_batch_size=16,
+         weight_decay=0.01,
+         save_total_limit=3,
+         num_train_epochs=2,
+         push_to_hub=True,
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         data_collator=data_collator,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+     )
+
+     trainer.train()
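BERT.py continues masked-language-model pretraining on laonlp-segmented Lao text. The sketch below is not part of the commit; it is one way to sanity-check the resulting checkpoint with a fill-mask pipeline, assuming the model and tokenizer were saved under the paths used above. The Lao probe sentence is only an illustrative example.

```python
# Minimal sanity check (a sketch, not from this commit) for the MLM trained by BERT.py.
# Assumes the checkpoint sits under bert/models/bert-culturaX-data and the
# retrained tokenizer under bert/models/tokenizer, as in the script above.
from transformers import pipeline
from laonlp import word_tokenize

fill_mask = pipeline(
    "fill-mask",
    model="bert/models/bert-culturaX-data",
    tokenizer="bert/models/tokenizer",
)

# The training data was pre-segmented with laonlp, so segment the probe sentence
# the same way before masking its last word.
words = " ".join(word_tokenize("ຂ້ອຍມັກກິນເຂົ້າໜຽວ")).split()
words[-1] = fill_mask.tokenizer.mask_token
for prediction in fill_mask(" ".join(words), top_k=5):
    print(prediction["token_str"], round(prediction["score"], 3))
```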
tokenizer.py ADDED
@@ -0,0 +1,30 @@
+ from datasets import load_dataset
+ from transformers import BertTokenizerFast
+ from laonlp import word_tokenize
+ from tqdm import tqdm
+
+
+ def tokenize(examples):
+     examples["text"] = [" ".join(word_tokenize(x)) for x in examples["text"]]
+     return examples
+
+
+ # create a python generator to dynamically load the data
+ def batch_iterator(batch_size=10000):
+     for i in tqdm(range(0, len(raw_datasets), batch_size)):
+         yield raw_datasets[i : i + batch_size]["text"]
+
+
+ if __name__ == "__main__":
+     raw_datasets = load_dataset(path="bert/dataset/culturaX", split="train")
+
+     raw_datasets = raw_datasets.map(tokenize, batched=True, num_proc=12)
+
+     # create a tokenizer from existing one to re-use special tokens
+     tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+
+     bert_tokenizer = tokenizer.train_new_from_iterator(
+         text_iterator=batch_iterator(),
+         vocab_size=32_000,
+     )
+     bert_tokenizer.save_pretrained("bert/models/tokenizer")
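tokenizer.py retrains a BERT WordPiece tokenizer on the laonlp-segmented CulturaX corpus. As a quick check of the saved artifact (a sketch, not part of the commit), one might reload it and inspect how it splits a segmented Lao sentence; the sample sentence is illustrative, not taken from the corpus.

```python
# Reload the retrained tokenizer saved by tokenizer.py and inspect its output
# on a laonlp-segmented Lao sentence (illustrative sample).
from transformers import BertTokenizerFast
from laonlp import word_tokenize

tokenizer = BertTokenizerFast.from_pretrained("bert/models/tokenizer")
print("vocab size:", tokenizer.vocab_size)

sample = " ".join(word_tokenize("ສະບາຍດີ ຂ້ອຍມາຈາກວຽງຈັນ"))
encoding = tokenizer(sample)
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
print(tokenizer.decode(encoding["input_ids"], skip_special_tokens=True))
```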
train.py ADDED
@@ -0,0 +1,106 @@
+ from transformers import (
+     AutoTokenizer,
+     AutoModel,
+     AutoModelForCausalLM,
+     BertModel,
+     GPT2Model,
+     EncoderDecoderModel,
+     DataCollatorForSeq2Seq,
+     Seq2SeqTrainer,
+     Seq2SeqTrainingArguments,
+ )
+ from datasets import load_dataset
+ from laonlp import word_tokenize
+ from functools import partial
+ import random
+
+
+ def group_texts(tokenizer, examples):
+     # Pre-segment the text with laonlp (as in BERT.py) and tokenize the segmented text.
+     tokenized_inputs = [" ".join(word_tokenize(x)) for x in examples["text"]]
+
+     tokenized_inputs = tokenizer(
+         tokenized_inputs,
+         # return_special_tokens_mask=True,
+         # padding="max_length",
+         # truncation=True,
+         # max_length=tokenizer.model_max_length,
+         # return_tensors="pt",
+     )
+
+     return tokenized_inputs
+
+
+ if __name__ == "__main__":
+     encoder_src = "BERT\\models\\bert-culturaX-data"
+     decoder_src = "NlpHUST/gpt2-vietnamese"
+
+     encoder_tokenizer = AutoTokenizer.from_pretrained(encoder_src)
+     decoder_tokenizer = AutoTokenizer.from_pretrained(decoder_src)
+     decoder_tokenizer.model_max_length = encoder_tokenizer.model_max_length
+     decoder_tokenizer.pad_token = decoder_tokenizer.eos_token
+     print(f"The max length for the tokenizer is: {encoder_tokenizer.model_max_length}")
+
+     encoder = AutoModel.from_pretrained(encoder_src)
+     # The decoder needs a language-modeling head and cross-attention layers to be
+     # usable inside EncoderDecoderModel.
+     decoder = AutoModelForCausalLM.from_pretrained(
+         decoder_src, is_decoder=True, add_cross_attention=True
+     )
+     decoder.config.max_length = decoder_tokenizer.model_max_length
+
+     model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+     model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
+     model.config.pad_token_id = decoder_tokenizer.pad_token_id
+     model.config.vocab_size = decoder_tokenizer.vocab_size
+
+     data_collator = DataCollatorForSeq2Seq(decoder_tokenizer, model=model)
+
+     raw_lo_dataset = load_dataset("bert/dataset/original/lo")
+     raw_vi_dataset = load_dataset("bert/dataset/original/vi")
+
+     train_dataset = raw_lo_dataset["train"].map(
+         partial(group_texts, encoder_tokenizer),
+         remove_columns=["text"],
+         batched=True,
+         num_proc=12,
+     )
+     eval_dataset = raw_lo_dataset["validation"].map(
+         partial(group_texts, encoder_tokenizer),
+         batched=True,
+         remove_columns=["text"],
+     )
+     train_labels = raw_vi_dataset["train"].map(
+         partial(group_texts, decoder_tokenizer),
+         remove_columns=["text"],
+         batched=True,
+         num_proc=12,
+     )
+     eval_labels = raw_vi_dataset["validation"].map(
+         partial(group_texts, decoder_tokenizer),
+         batched=True,
+         remove_columns=["text"],
+     )
+     # The lo/vi datasets are parallel, so the Vietnamese token ids serve as labels.
+     train_dataset = train_dataset.add_column("labels", train_labels["input_ids"])
+     eval_dataset = eval_dataset.add_column("labels", eval_labels["input_ids"])
+
+     print(
+         f"the dataset contains in total {len(train_dataset)*encoder_tokenizer.model_max_length} tokens"
+     )
+
+     model_name = "transformer-bert-gpt"
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=f"bert/models/{model_name}",
+         evaluation_strategy="epoch",
+         per_device_train_batch_size=16,
+         per_device_eval_batch_size=16,
+         weight_decay=0.01,
+         save_total_limit=3,
+         num_train_epochs=2,
+         push_to_hub=True,
+     )
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         data_collator=data_collator,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+     )
+
+     trainer.train()
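train.py fine-tunes a BERT-encoder / Vietnamese-GPT-2-decoder model on the parallel lo/vi data. The inference sketch below is not part of the commit: it assumes the trained weights end up directly under bert/models/transformer-bert-gpt (the Trainer's output_dir; a checkpoint-* subdirectory may be needed instead) and that the tokenizer paths match the ones above. The Lao source sentence is illustrative.

```python
# Sketch of Lao -> Vietnamese generation with the encoder-decoder trained by train.py.
# The checkpoint location is an assumption based on output_dir in the script above.
import torch
from transformers import AutoTokenizer, EncoderDecoderModel
from laonlp import word_tokenize

encoder_tokenizer = AutoTokenizer.from_pretrained("bert/models/tokenizer")
decoder_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/gpt2-vietnamese")
model = EncoderDecoderModel.from_pretrained("bert/models/transformer-bert-gpt")
model.eval()

# Segment the Lao source the same way the training inputs were prepared.
source = " ".join(word_tokenize("ຂ້ອຍຮັກພາສາລາວ"))
inputs = encoder_tokenizer(source, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=64,
    )
print(decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True))
```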