from datasets import load_dataset
from transformers import AutoTokenizer

# Load BookCorpusOpen and hold out 5% of it as a test split.
raw_datasets = load_dataset('lucadiliello/bookcorpusopen')
raw_datasets = raw_datasets['train'].train_test_split(test_size=0.05)
print(raw_datasets)

# Load the tokenizer saved in the current working directory.
tokenizer = AutoTokenizer.from_pretrained('.')

seq_len = 512


def tokenize_fn(examples):
    # Truncate each book to seq_len tokens; with return_overflowing_tokens=True
    # the leftover tokens come back as extra sequences instead of being dropped,
    # so a long book yields several training examples.
    return tokenizer(
        examples['text'],
        max_length=seq_len,
        return_overflowing_tokens=True,
        truncation=True,
    )


# Tokenize in batches and drop the original columns, since the tokenizer can
# return more rows than it receives once books overflow into multiple chunks.
tokenized_datasets = raw_datasets.map(
    tokenize_fn,
    batched=True,
    batch_size=500,
    remove_columns=raw_datasets['train'].column_names,
)

tokenized_datasets.save_to_disk('tokenized_bookcorpusopen')
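
# Optional sanity check (a minimal sketch, not part of the original script):
# a dataset saved with save_to_disk can be reloaded later with load_from_disk,
# which avoids re-tokenizing before training.
from datasets import load_from_disk

reloaded = load_from_disk('tokenized_bookcorpusopen')
print(reloaded)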