"""Train a byte-level BPE tokenizer on a Hugging Face dataset saved to disk."""
import argparse
import os

from datasets import load_from_disk
from tokenizers import ByteLevelBPETokenizer


def parse_arguments(parser):
    parser.add_argument(
        "--dataset-dir",
        required=True,
        help="Path to the dataset folder (as saved with `save_to_disk`)",
    )
    parser.add_argument(
        "--out",
        default="./",
        type=str,
        help="Path to the output directory, where the files will be saved",
    )
    parser.add_argument(
        "--name",
        default="bpe-bytelevel",
        type=str,
        help="The name of the output vocab files",
    )
    return parser.parse_args()


def main(args):
    if not os.path.isdir(args.dataset_dir):
        raise Exception("--dataset-dir does not exist")
    os.makedirs(args.out, exist_ok=True)

    print("LOADING %s..." % args.dataset_dir)
    raw_dataset = load_from_disk(args.dataset_dir)
    print(raw_dataset)

    # Yield the "text" column in batches so the whole corpus never has to
    # sit in memory at once during training.
    def batch_iterator(batch_size=10000):
        for i in range(0, len(raw_dataset), batch_size):
            yield raw_dataset[i : i + batch_size]["text"]

    tokenizer = ByteLevelBPETokenizer(lowercase=False)
    tokenizer.enable_truncation(max_length=512)
    tokenizer.train_from_iterator(
        batch_iterator(),
        vocab_size=50265,
        min_frequency=2,
        # RoBERTa-style special tokens (vocab_size 50265 matches RoBERTa).
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    print("SAVING TOKENIZER CONFIG INTO %s..." % args.out)
    tokenizer.save(os.path.join(args.out, "tokenizer.json"))
    tokenizer.save_model(args.out, args.name)
    print("SO FAR SO GOOD...")


if __name__ == "__main__":
    args = parse_arguments(argparse.ArgumentParser())
    print(args)
    try:
        main(args)
    except Exception as excp:
        print(excp)
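# Example invocation (a minimal sketch; the script name and paths below are
# hypothetical, and --dataset-dir must point at a dataset with a "text"
# column previously saved via `datasets.Dataset.save_to_disk`):
#
#   python train_bpe_tokenizer.py --dataset-dir ./my_dataset \
#       --out ./tokenizer-out --name bpe-bytelevel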
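# Sanity check after training (a sketch, assuming the default --name above;
# `save_model` writes <name>-vocab.json and <name>-merges.txt into --out):
#
#   from tokenizers import ByteLevelBPETokenizer
#   tok = ByteLevelBPETokenizer(
#       "./tokenizer-out/bpe-bytelevel-vocab.json",
#       "./tokenizer-out/bpe-bytelevel-merges.txt",
#   )
#   print(tok.encode("Hello world").tokens)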