sofial committed on
Commit
31b0b7e
1 Parent(s): 809adbb

Upload data_utils.py

Browse files
Files changed (1) hide show
  1. data_utils.py +28 -0
data_utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Graphcore Ltd. All rights reserved.
2
+
3
+ from typing import Dict, Any
4
+
5
def form_text(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Given an example from the glue mnli dataset, generate a prompt version example in the format:
    mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>
    This format can be used to finetune the model as a Causal Language Model.

    The prompt is stored under the 'text' key of `example`; the same dict is
    modified in place and returned.

    Raises:
        KeyError: if 'hypothesis', 'premise' or 'label' is missing from `example`.
        IndexError: if example['label'] is not 0, 1 or 2 (for instance a
            negative placeholder label on an unlabelled example).
    """
    class_names = ['entailment', 'neutral', 'contradiction']

    hypothesis = example['hypothesis']
    premise = example['premise']
    label = example['label']

    # A negative index would silently wrap around the list (-1 -> 'contradiction')
    # and fabricate a target, so reject it the same way an index >= 3 is rejected.
    if label < 0:
        raise IndexError(
            f'mnli label must be 0, 1 or 2, got {label!r}')
    class_label = class_names[label]

    example['text'] = (
        f'mnli hypothesis: {hypothesis} premise: {premise} '
        f'target: {class_label}<|endoftext|>'
    )
    return example
18
+
19
def split_text(example: Dict[str, Any]) -> Dict[str, Any]:
    """
    Separate a formatted mnli prompt into its prompt part and its class label.

    The 'text' entry is expected to look like
    mnli hypothesis: {hypothesis} premise: {premise} target: {class_label}<|endoftext|>
    Everything before the last space is stored under 'prompt_text' (to be used
    for validation, without the target value); what follows the last space,
    with the end-of-text marker removed, is stored under 'class_label'.
    The same dict is modified in place and returned.
    """
    end_marker = '<|endoftext|>'

    # Split on the final space: the head is the prompt, the tail is the label
    # followed by the end-of-text marker.
    head, _, tail = example['text'].rpartition(' ')

    example['prompt_text'] = head
    example['class_label'] = tail.replace(end_marker, '')
    return example