from transformers import AutoTokenizer
from datasets import load_dataset

# Load the BookCorpusOpen dataset and hold out 5% of it as a test split.
raw_datasets = load_dataset('lucadiliello/bookcorpusopen')
raw_datasets = raw_datasets['train'].train_test_split(test_size=0.05)
print(raw_datasets)

# Load the tokenizer saved in the current directory.
tokenizer = AutoTokenizer.from_pretrained('.')
seq_len = 512

def tokenize_fn(examples):
    # Truncate each book to seq_len tokens; return_overflowing_tokens=True
    # keeps the overflow as additional seq_len-sized chunks instead of
    # discarding it, so long books become multiple training examples.
    return tokenizer(examples['text'],
                     max_length=seq_len,
                     return_overflowing_tokens=True,
                     truncation=True)

# Tokenize in batches of 500 examples and drop the raw text columns,
# leaving only the tokenizer's outputs.
tokenized_datasets = raw_datasets.map(
    tokenize_fn,
    batched=True,
    batch_size=500,
    remove_columns=raw_datasets['train'].column_names,
)
tokenized_datasets.save_to_disk('tokenized_bookcorpusopen')
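
# A minimal sketch of reloading the saved dataset in a later script (the
# reload step is not part of this file, just an illustration; it assumes
# the same 'tokenized_bookcorpusopen' path used above):
#
#     from datasets import load_from_disk
#     tokenized_datasets = load_from_disk('tokenized_bookcorpusopen')
#     print(tokenized_datasets['train'][0].keys())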