from transformers import AutoTokenizer

from .ingest_data import get_data

# Checkpoint used for the tokenizer (and, later, for fine-tuning).
model_nm = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)

def tokenize_data(x):
    """Tokenize a batch of examples, producing model inputs and summary labels."""
    model_inputs = tokenizer(
        x['document'],
        max_length=512,
        padding=True,
        truncation=True,
    )
    labels = tokenizer(
        x['summary'],
        max_length=512,
        padding=True,
        truncation=True,
    )
    # The tokenized summaries become the targets the model is trained against.
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def preprocess():
    """Load the raw dataset and apply tokenization in batches."""
    dataset = get_data()
    tok_ds = dataset.map(tokenize_data, batched=True)
    return tok_ds
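

# Illustrative usage sketch, not part of the original pipeline: assuming
# get_data() returns a Hugging Face DatasetDict with 'document' and 'summary'
# columns and a 'train' split, running this module directly tokenizes the
# dataset and prints one processed example as a quick sanity check.
if __name__ == '__main__':
    tokenized = preprocess()
    # Inspect the first training example's input_ids and labels.
    print(tokenized['train'][0])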