from datasets import load_from_disk
from transformers import AutoTokenizer

dataset = load_from_disk("/researchdisk/training_dataset_full_deduplicated")
dataset = dataset["train"]

# Feed the tokenizer trainer batches of 1,000 texts at a time.
batch_size = 1000
corpus = (dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size))

# ConvBERT uses the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Use the same vocab size as Finnish-NLP/roberta-large-finnish-v2, which is also
# very close to that of TurkuNLP/bert-base-finnish-cased-v1
new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=50265)
new_tokenizer.save_pretrained("./")
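
# Optional sanity check (a minimal sketch, not part of the original script):
# reload the saved tokenizer from the output directory and inspect how it
# segments a sample sentence. The Finnish sentence below is an arbitrary
# illustrative example, not taken from the training data.
reloaded_tokenizer = AutoTokenizer.from_pretrained("./")
sample = "Tämä on esimerkkilause suomeksi."
print(reloaded_tokenizer.tokenize(sample))
print(reloaded_tokenizer(sample)["input_ids"])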