MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    """
    Preprocess entries of the given dataset (intended for use with `Dataset.map(..., batched=True)`).

    Params:
        examples: batch of dataset rows as a column -> list-of-values mapping

    Returns:
        model_inputs (BatchEncoding): tokenized dataset entries
    """
    inputs, targets = [], []
    for i in range(len(examples['question'])):
        # build the German prompt: Antwort = student answer, Lösung = reference answer, Frage = question
        inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
        # target sequence: the score followed by the textual feedback
        targets.append(f"{examples['score'][i]} Feedback: {examples['answer_feedback'][i]}")

    # apply tokenization to inputs and labels
    # (`tokenizer` is assumed to be defined in the surrounding scope)
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding='max_length', truncation=True)
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs
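For reference, here is a minimal sketch of how this function would typically be wired up. The checkpoint name (google/mt5-base) and the toy rows below are assumptions for illustration, not part of the original; any seq2seq tokenizer and any dataset providing the five columns read above would work the same way.

from datasets import Dataset
from transformers import AutoTokenizer

# assumption: a multilingual seq2seq checkpoint, plausible for German inputs;
# the original snippet does not name the model
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

# hypothetical toy rows with the columns preprocess_function reads
toy = Dataset.from_dict({
    "question": ["Was ist ein Router?"],
    "provided_answer": ["Ein Gerät, das Pakete weiterleitet."],
    "reference_answer": ["Ein Netzwerkgerät, das Pakete zwischen Netzen vermittelt."],
    "score": [1.0],
    "answer_feedback": ["Korrekt, aber unvollständig."],
})

# batched=True hands the function a column -> list mapping, which is
# exactly what the per-index loop inside preprocess_function expects
tokenized = toy.map(preprocess_function, batched=True, remove_columns=toy.column_names)

print(tokenized)                          # columns: input_ids, attention_mask, labels
print(len(tokenized[0]["input_ids"]))     # 256, due to padding='max_length'

One design note: with padding='max_length', pad token ids also end up in labels and will contribute to the loss during training. A common refinement is to replace those ids with -100 (the index ignored by the cross-entropy loss in transformers), or to pad dynamically with DataCollatorForSeq2Seq, which uses -100 as its default label padding.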