Spaces:
Sleeping
Sleeping
from transformers import AutoTokenizer | |
from.ingest_data import get_data | |
# Name of the pretrained checkpoint whose tokenizer this module uses.
model_nm = 't5-small'

# Built once at import time so every tokenize_data() call reuses the same
# tokenizer instance instead of reloading it.
tokenizer = AutoTokenizer.from_pretrained(model_nm)
def tokenize_data(x):
    """Tokenize documents and their reference summaries for seq2seq training.

    Args:
        x: Mapping with a 'document' entry and a 'summary' entry. Both are
            passed straight to the module-level tokenizer; when this function
            is used through a batched map they are presumably lists of
            strings (confirm against the dataset returned by get_data()).

    Returns:
        The tokenizer output for x['document'] with an extra 'labels' entry
        holding the token ids of x['summary']. Both are truncated to at most
        512 tokens. For batched input, padding positions inside 'labels' are
        replaced with -100.
    """
    model_inputs = tokenizer(
        x['document'],
        max_length=512,
        padding=True,
        truncation=True,
    )
    labels = tokenizer(
        x['summary'],
        max_length=512,
        padding=True,
        truncation=True,
    )

    label_ids = labels['input_ids']
    pad_id = tokenizer.pad_token_id
    # With padding=True a batch of summaries is padded to its longest member.
    # Left as-is, those pad ids would be treated as real targets by the loss.
    # -100 is the label value Transformers seq2seq models skip when computing
    # the loss, so padded positions are mapped to it. A single (non-batched)
    # example yields a flat list with no padding and is left untouched.
    if pad_id is not None and label_ids and isinstance(label_ids[0], list):
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in seq]
            for seq in label_ids
        ]

    model_inputs['labels'] = label_ids
    return model_inputs
def preprocess():
    """Load the dataset and return it with tokenized inputs and labels.

    The dataset comes from get_data(); tokenize_data is applied to it through
    its map() method in batched mode, and the mapped result is returned.
    """
    return get_data().map(tokenize_data, batched=True)