The preprocessing step loads the raw document/summary pairs and tokenizes both sides with the `t5-small` tokenizer:

```python
from transformers import AutoTokenizer

from .ingest_data import get_data

model_nm = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)


def tokenize_data(x):
    # Tokenize the source documents, padding each batch to its longest
    # example and truncating to 512 tokens.
    model_inputs = tokenizer(
        x['document'],
        max_length=512,
        padding=True,
        truncation=True
    )
    # Tokenize the target summaries the same way; T5 shares one
    # vocabulary between inputs and targets.
    labels = tokenizer(
        x['summary'],
        max_length=512,
        padding=True,
        truncation=True
    )
    # The summary token ids become the labels for seq2seq training.
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


def preprocess():
    dataset = get_data()
    # Map the tokenizer over the dataset in batches.
    tok_ds = dataset.map(tokenize_data, batched=True)
    return tok_ds
```
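A minimal sketch of running this preprocessing, assuming `tokenize_data` from the module above is importable and that `get_data` returns a Hugging Face dataset with `document` and `summary` columns (those column names match the XSum summarization dataset, used here as a hypothetical stand-in since `ingest_data` itself is not shown):

```python
from datasets import load_dataset

# Hypothetical stand-in for the data ingest_data.get_data might return:
# a small XSum slice with "document" and "summary" columns.
dataset = load_dataset("xsum", split="train[:100]")

tok_ds = dataset.map(tokenize_data, batched=True)
print(tok_ds[0]['input_ids'][:10])  # token ids of the first document
print(tok_ds[0]['labels'][:10])     # token ids of the first summary
```

One caveat worth noting: with `padding=True`, padded label positions keep the tokenizer's pad token id rather than `-100`, so they still contribute to the loss during fine-tuning. A common refinement is to skip padding at this stage and let `transformers.DataCollatorForSeq2Seq` pad each training batch dynamically, since it fills label padding with `-100` by default.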