from datasets import load_dataset, concatenate_datasets
from tokenizers import ByteLevelBPETokenizer
import os

model_dir = "./scandinavian"  # ${MODEL_DIR}
os.makedirs(model_dir, exist_ok=True)  # make sure the output directory exists before saving

# Load datasets
# dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")
# mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
# yoruba_dataset = load_dataset("mc4", "yo", split="train[0:10]")
# yoruba_dataset2 = load_dataset("mc4", "yo", split="train[10:20]")
danish_dataset = load_dataset("mc4", "da", split="train")  # , download_mode="force_redownload")
norwegian_dataset = load_dataset("mc4", "no", split="train")  # , download_mode="force_redownload")
swedish_dataset = load_dataset("mc4", "sv", split="train")  # , download_mode="force_redownload")

# all_datasets = concatenate_datasets([yoruba_dataset, yoruba_dataset2])
all_datasets = concatenate_datasets([danish_dataset, norwegian_dataset, swedish_dataset])
all_datasets = all_datasets.shuffle()

# Instantiate tokenizer
tokenizer = ByteLevelBPETokenizer()


def batch_iterator(batch_size=1000):
    # Yield batches of raw text so the trainer can stream over the corpus
    for i in range(0, len(all_datasets), batch_size):
        yield all_datasets[i : i + batch_size]["text"]


# Customized training
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk
tokenizer.save(f"{model_dir}/tokenizer.json")
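

# Optional sanity check: a minimal sketch, assuming training above has completed
# and tokenizer.json exists in model_dir. Reloads the saved tokenizer and encodes
# an arbitrary sample sentence (the Danish text below is just an illustration).
from tokenizers import Tokenizer

loaded_tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")
encoding = loaded_tokenizer.encode("Dette er en test.")
print(encoding.tokens)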