gpt1 / preprocessing.py
from datasets import load_dataset
from transformers import AutoTokenizer

# Download BookCorpusOpen and hold out 5% of it as a test split.
raw_datasets = load_dataset('lucadiliello/bookcorpusopen')
raw_datasets = raw_datasets['train'].train_test_split(test_size=0.05)
print(raw_datasets)

# Load the tokenizer saved in the current directory (next to this script).
tokenizer = AutoTokenizer.from_pretrained('.')

seq_len = 512


def tokenize_fn(examples):
    # Truncate each book to seq_len tokens; return_overflowing_tokens keeps
    # the overflow as additional rows instead of discarding it.
    return tokenizer(examples['text'],
                     max_length=seq_len,
                     return_overflowing_tokens=True,
                     truncation=True)


# Tokenize in batches, dropping the original text columns so only the
# tokenizer output (input_ids, attention_mask, ...) remains.
tokenized_datasets = raw_datasets.map(
    tokenize_fn,
    batched=True,
    batch_size=500,
    remove_columns=raw_datasets['train'].column_names,
)

tokenized_datasets.save_to_disk('data')
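
# A minimal sketch (not part of the original script) of how downstream code
# can reload the saved splits; load_from_disk is the datasets counterpart of
# the save_to_disk call above.
from datasets import load_from_disk

reloaded_datasets = load_from_disk('data')
print(reloaded_datasets)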