gptj-mnli / data_utils.py
sofial's picture
Upload data_utils.py
31b0b7e
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
from typing import Dict, Any
def form_text(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Given an example from the glue mnli dataset, generate a prompt version example in the format:
    mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>
    This format can be used to finetune the model as a Causal Language Model.

    Args:
        example: mapping with 'hypothesis', 'premise' and an integer 'label'
            (0 = entailment, 1 = neutral, 2 = contradiction).

    Returns:
        The same ``example`` object, with the generated prompt stored under 'text'.

    Raises:
        KeyError: if 'hypothesis', 'premise' or 'label' is missing.
        IndexError: if 'label' is not 0, 1 or 2 (for instance the -1 placeholder
            used for unlabeled rows).
    """
    class_names = ('entailment', 'neutral', 'contradiction')
    hypothesis = example['hypothesis']
    premise = example['premise']
    label = example['label']
    # Reject negative labels explicitly: plain indexing would wrap around and
    # silently turn -1 (unlabeled) into 'contradiction'.
    if not 0 <= label < len(class_names):
        raise IndexError(
            f'mnli label must be in [0, {len(class_names) - 1}], got {label!r}'
        )
    class_label = class_names[label]
    example['text'] = (
        f'mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>'
    )
    return example
def split_text(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Separate a formatted example into its validation prompt and its class label.

    The input 'text' is expected to look like
    mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>
    Everything before the last space is stored under 'prompt_text' (so the prompt
    excludes the target), and whatever follows the last space, with the
    '<|endoftext|>' marker removed, is stored under 'class_label'.

    The same ``example`` object is updated in place and returned. If 'text'
    contains no space at all, 'prompt_text' is the empty string.
    """
    # Split once on the final space; the separator itself is discarded.
    prompt, _sep, target = example['text'].rpartition(' ')
    example['prompt_text'] = prompt
    example['class_label'] = target.replace('<|endoftext|>', '')
    return example