Something2109 committed
Commit ea12d46 • 1 Parent(s): 2f6e5c3
Upload 3 files
- BERT.py +84 -0
- tokenizer.py +30 -0
- train.py +106 -0
BERT.py
ADDED
@@ -0,0 +1,84 @@
from transformers import (
    AutoTokenizer,
    BertConfig,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from laonlp import word_tokenize
import random

tokenizer = AutoTokenizer.from_pretrained("bert/models/tokenizer")
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")


def group_texts(examples):
    # Pre-segment each Lao sentence with laonlp so the WordPiece tokenizer
    # sees whitespace-separated words, then tokenize to fixed-length inputs.
    tokenized_inputs = [" ".join(word_tokenize(x)) for x in examples["text"]]

    tokenized_inputs = tokenizer(
        tokenized_inputs,
        return_special_tokens_mask=True,
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )

    return tokenized_inputs


if __name__ == "__main__":
    train_dataset = load_dataset(path="bert/dataset/CulturaX", split="train")
    eval_dataset = load_dataset(path="bert/dataset/laonlp", split="validation")

    # Masked-language-modeling collator: 15% of input tokens are masked.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # Config for training from scratch; not used below, since the model is
    # loaded from an existing checkpoint instead.
    config_encoder = BertConfig(vocab_size=tokenizer.vocab_size)

    model = BertForMaskedLM.from_pretrained("BERT\\models\\bert-culturaX-data")

    train_dataset = train_dataset.map(
        group_texts,
        batched=True,
        remove_columns=[
            "text",
            "timestamp",
            "url",
            "source",
        ],
        num_proc=12,
    ).shuffle(seed=random.randint(0, 1000))
    eval_dataset = eval_dataset.map(
        group_texts, batched=True, remove_columns=["text"]
    ).shuffle(seed=random.randint(0, 1000))

    print(
        f"The dataset contains in total {len(train_dataset) * tokenizer.model_max_length} tokens"
    )

    model_name = "bert-culturaX-data"

    training_args = TrainingArguments(
        output_dir=f"bert/models/{model_name}",
        evaluation_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()
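Once this run finishes and the final weights are written out (for example with trainer.save_model() or the push_to_hub upload), a quick sanity check is to run a fill-mask pipeline against the checkpoint. This is a minimal sketch, not part of the committed files: both paths reuse directories from the scripts above, and the Lao sample sentence ("I like [MASK] Lao") is only illustrative.

from transformers import pipeline

# Hypothetical sanity check: load the masked-LM checkpoint produced by BERT.py
# (assumed to be saved under the script's output_dir) together with the
# tokenizer trained by tokenizer.py, and fill one masked token.
fill_mask = pipeline(
    "fill-mask",
    model="bert/models/bert-culturaX-data",
    tokenizer="bert/models/tokenizer",
)

sentence = f"ຂ້ອຍ ມັກ {fill_mask.tokenizer.mask_token} ລາວ"  # "I like [MASK] Lao"
for prediction in fill_mask(sentence, top_k=5):
    print(prediction["token_str"], round(prediction["score"], 4))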
tokenizer.py
ADDED
@@ -0,0 +1,30 @@
from datasets import load_dataset
from transformers import BertTokenizerFast
from laonlp import word_tokenize
from tqdm import tqdm


def tokenize(examples):
    # Pre-segment the Lao text into whitespace-separated words with laonlp.
    examples["text"] = [" ".join(word_tokenize(x)) for x in examples["text"]]
    return examples


# create a python generator to dynamically load the data
def batch_iterator(batch_size=10000):
    for i in tqdm(range(0, len(raw_datasets), batch_size)):
        yield raw_datasets[i : i + batch_size]["text"]


if __name__ == "__main__":
    raw_datasets = load_dataset(path="bert/dataset/culturaX", split="train")

    raw_datasets = raw_datasets.map(tokenize, batched=True, num_proc=12)

    # create a tokenizer from an existing one to re-use its special tokens
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # train a new 32k WordPiece vocabulary on the segmented corpus
    bert_tokenizer = tokenizer.train_new_from_iterator(
        text_iterator=batch_iterator(),
        vocab_size=32_000,
    )
    bert_tokenizer.save_pretrained("bert/models/tokenizer")
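To verify the result, the saved tokenizer can be reloaded and applied to a laonlp-segmented sample. This is a minimal sketch rather than part of the commit; the path matches the save_pretrained call above, and the sample sentence ("hello world") is illustrative.

from transformers import AutoTokenizer
from laonlp import word_tokenize

# Hypothetical check: reload the freshly trained tokenizer and inspect how it
# splits a laonlp-segmented Lao sentence.
tokenizer = AutoTokenizer.from_pretrained("bert/models/tokenizer")
print("vocab size:", tokenizer.vocab_size)

sample = " ".join(word_tokenize("ສະບາຍດີໂລກ"))  # "hello world", segmented first
print(tokenizer.tokenize(sample))
print(tokenizer(sample)["input_ids"])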
train.py
ADDED
@@ -0,0 +1,106 @@
from transformers import (
    AutoTokenizer,
    AutoModel,
    BertModel,
    GPT2Model,
    EncoderDecoderModel,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import load_dataset
from laonlp import word_tokenize
from functools import partial
import random


def group_texts(tokenizer, examples):
    # Pre-segment the Lao text with laonlp; note that the tokenizer below is
    # applied to the raw text, so this segmented version is currently unused.
    tokenized_inputs = [" ".join(word_tokenize(x)) for x in examples["text"]]

    tokenized_inputs = tokenizer(
        examples["text"],
        # return_special_tokens_mask=True,
        # padding="max_length",
        # truncation=True,
        # max_length=tokenizer.model_max_length,
        # return_tensors="pt",
    )

    return tokenized_inputs


if __name__ == "__main__":
    encoder_src = "BERT\\models\\bert-culturaX-data"
    decoder_src = "NlpHUST/gpt2-vietnamese"

    encoder_tokenizer = AutoTokenizer.from_pretrained(encoder_src)
    decoder_tokenizer = AutoTokenizer.from_pretrained(decoder_src)
    decoder_tokenizer.model_max_length = encoder_tokenizer.model_max_length
    decoder_tokenizer.pad_token = decoder_tokenizer.eos_token
    print(f"The max length for the tokenizer is: {encoder_tokenizer.model_max_length}")

    encoder = AutoModel.from_pretrained(encoder_src)
    decoder = AutoModel.from_pretrained(decoder_src)
    decoder.config.max_length = decoder_tokenizer.model_max_length

    # Tie the pretrained Lao BERT encoder to the Vietnamese GPT-2 decoder.
    model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
    model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
    model.config.pad_token_id = decoder_tokenizer.pad_token_id
    model.config.vocab_size = decoder_tokenizer.vocab_size

    data_collator = DataCollatorForSeq2Seq(decoder_tokenizer, model=model)

    raw_lo_dataset = load_dataset("bert/dataset/original/lo")
    raw_vi_dataset = load_dataset("bert/dataset/original/vi")

    # Source (Lao) side is tokenized with the encoder tokenizer ...
    train_dataset = raw_lo_dataset["train"].map(
        partial(group_texts, encoder_tokenizer),
        remove_columns=["text"],
        batched=True,
        num_proc=12,
    )
    eval_dataset = raw_lo_dataset["validation"].map(
        partial(group_texts, encoder_tokenizer),
        batched=True,
        remove_columns=["text"],
    )
    # ... and the target (Vietnamese) side with the decoder tokenizer.
    train_labels = raw_vi_dataset["train"].map(
        partial(group_texts, decoder_tokenizer),
        remove_columns=["text"],
        batched=True,
        num_proc=12,
    )
    eval_labels = raw_vi_dataset["validation"].map(
        partial(group_texts, decoder_tokenizer),
        batched=True,
        remove_columns=["text"],
    )
    # The target input_ids become the labels of the parallel source examples.
    train_dataset = train_dataset.add_column("labels", train_labels["input_ids"])
    eval_dataset = eval_dataset.add_column("labels", eval_labels["input_ids"])

    print(
        f"The dataset contains in total {len(train_dataset) * encoder_tokenizer.model_max_length} tokens"
    )

    model_name = "transformer-bert-gpt"

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"bert/models/{model_name}",
        evaluation_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        push_to_hub=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()
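After training, translation with the resulting encoder-decoder would follow the standard EncoderDecoderModel generate pattern: encode the Lao source with the encoder tokenizer, generate, then decode the output ids with the Vietnamese decoder tokenizer. This is a minimal sketch under assumptions not confirmed by the commit: the final model is assumed to have been saved to the script's output_dir, the encoder tokenizer is assumed to live at bert/models/tokenizer, and the sample sentence is illustrative.

from transformers import AutoTokenizer, EncoderDecoderModel

# Hypothetical inference paths: the trained seq2seq model and the two
# tokenizers saved/used by the training run above.
model = EncoderDecoderModel.from_pretrained("bert/models/transformer-bert-gpt")
encoder_tokenizer = AutoTokenizer.from_pretrained("bert/models/tokenizer")
decoder_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/gpt2-vietnamese")
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

# Encode the Lao source, generate with beam search, decode to Vietnamese.
inputs = encoder_tokenizer("ສະບາຍດີ", return_tensors="pt")  # "hello"
output_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=64,
    num_beams=4,
    decoder_start_token_id=decoder_tokenizer.bos_token_id,
)
print(decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True))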