|
# Token budget for the tokenized model input; preprocess_function pads and
# truncates every input sequence to exactly this length.
MAX_INPUT_LENGTH = 256

# Token budget for the tokenized target (labels); preprocess_function pads and
# truncates every target sequence to exactly this length.
MAX_TARGET_LENGTH = 128
|
|
|
def preprocess_function(examples):
    """
    Preprocess entries of the given dataset (should be used with a `map` function)

    Builds one input string and one target string per entry, then tokenizes
    both with the `tokenizer` object defined outside this function.

    Params:
        examples (Dataset): dataset to be preprocessed. It is read column-wise:
            'question', 'provided_answer', 'reference_answer', 'score' and
            'answer_feedback' must each map to a sequence with one value per
            entry (presumably a batched `map` call; confirm at the call site).

    Returns:
        model_inputs (BatchEncoding): tokenized dataset entries, with the
            tokenized targets stored under the 'labels' key
    """
    # Input prompt layout: "Answer: ... Reference: ... Question: ..."
    inputs = [
        f"Answer: {answer} Reference: {reference} Question: {question}"
        for answer, reference, question in zip(
            examples['provided_answer'],
            examples['reference_answer'],
            examples['question'],
        )
    ]
    # Target layout: "<score> Feedback: <feedback text>"
    targets = [
        f"{score} Feedback: {feedback}"
        for score, feedback in zip(examples['score'], examples['answer_feedback'])
    ]

    # Pad/truncate so that every sequence has a fixed length.
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        padding='max_length',
        truncation=True,
    )
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        padding='max_length',
        truncation=True,
    )

    # NOTE(review): the labels keep the tokenizer's padding ids as-is. If the
    # training loss is meant to ignore padding, these ids usually have to be
    # replaced with -100 somewhere; confirm whether that happens downstream.
    model_inputs['labels'] = labels['input_ids']

    return model_inputs
|
|