from transformers import (
    AutoTokenizer,
    BertConfig,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from laonlp import word_tokenize
import random

tokenizer = AutoTokenizer.from_pretrained("bert/models/tokenizer")
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")


def group_texts(examples):
    # Pre-segment each Lao text with laonlp's word_tokenize, then run the result
    # through the BERT tokenizer, keeping the special-tokens mask for the MLM collator.
    tokenized_inputs = [" ".join(word_tokenize(x)) for x in examples["text"]]
    tokenized_inputs = tokenizer(
        tokenized_inputs,
        return_special_tokens_mask=True,
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )
    return tokenized_inputs


if __name__ == "__main__":
    train_dataset = load_dataset(path="bert/dataset/CulturaX", split="train")
    eval_dataset = load_dataset(path="bert/dataset/laonlp", split="validation")

    # Masked-language-modelling collator: randomly masks 15% of tokens in each batch.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # Config matching the tokenizer's vocabulary; pass it as BertForMaskedLM(config_encoder)
    # to train from scratch instead of resuming from the saved checkpoint below.
    config_encoder = BertConfig(vocab_size=tokenizer.vocab_size)
    model = BertForMaskedLM.from_pretrained("bert/models/bert-culturaX-data")

    train_dataset = train_dataset.map(
        group_texts,
        batched=True,
        remove_columns=[
            "text",
            "timestamp",
            "url",
            "source",
        ],
        num_proc=12,
    ).shuffle(seed=random.randint(0, 1000))
    eval_dataset = eval_dataset.map(
        group_texts, batched=True, remove_columns=["text"]
    ).shuffle(seed=random.randint(0, 1000))

    # Rough upper bound: assumes every example is padded/truncated to model_max_length.
    print(
        f"the dataset contains at most {len(train_dataset) * tokenizer.model_max_length} tokens"
    )

    model_name = "bert-culturaX-data"
    training_args = TrainingArguments(
        output_dir=f"bert/models/{model_name}",
        evaluation_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()