from datasets import load_from_disk
from transformers import AutoTokenizer

dataset = load_from_disk("/researchdisk/training_dataset_full_deduplicated")
dataset = dataset["train"]
# Feed the tokenizer trainer batches of 1,000 texts at a time via a generator.
batch_size = 1000
corpus = (dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size))
# ConvBERT uses the BERT (WordPiece) tokenizer, so start from a pretrained BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# Use the same vocab size as Finnish-NLP/roberta-large-finnish-v2, which is also
# very close to that of TurkuNLP/bert-base-finnish-cased-v1.
new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=50265)
new_tokenizer.save_pretrained("./")
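
# Optional sanity check (a minimal sketch; the example sentence is just an
# illustration, not taken from the training data): reload the saved tokenizer
# and confirm it produces sensible Finnish subword tokens.
reloaded_tokenizer = AutoTokenizer.from_pretrained("./")
print(reloaded_tokenizer.tokenize("Tämä on esimerkkilause suomeksi."))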