Spaces:
Sleeping
Sleeping
File size: 587 Bytes
fb4a3c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
from transformers import AutoTokenizer

from .ingest_data import get_data  # was `from.ingest_data` — missing space after `from`

# Checkpoint name shared by tokenizer and (presumably) the downstream model —
# keep the two in sync. TODO confirm the training script also uses 't5-small'.
model_nm = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)
def tokenize_data(x):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Args:
        x: Batch dict from ``datasets.Dataset.map(batched=True)`` with
           ``'document'`` (source texts) and ``'summary'`` (target texts).

    Returns:
        dict with ``input_ids``/``attention_mask`` for the documents and
        ``labels`` built from the tokenized summaries.
    """
    model_inputs = tokenizer(
        x['document'],
        max_length=512,
        padding=True,
        truncation=True,
    )
    labels = tokenizer(
        x['summary'],
        max_length=512,
        padding=True,
        truncation=True,
    )
    # Fix: padding token ids in the labels must be replaced with -100 so the
    # cross-entropy loss ignores padded positions (standard HF summarization
    # recipe); otherwise the model is trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels['input_ids']
    ]
    return model_inputs
def preprocess():
    """Load the raw dataset and return it tokenized, ready for training."""
    raw_ds = get_data()
    tokenized = raw_ds.map(tokenize_data, batched=True)
    return tokenized