Elron committed on
Commit
ee71e67
1 Parent(s): c727338

Upload formats.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. formats.py +101 -72
formats.py CHANGED
@@ -1,82 +1,111 @@
1
- from .artifact import Artifact
 
 
 
 
 
2
 
 
 
3
 
4
class Format(Artifact):
    """Common base class for formats; adds nothing on top of ``Artifact``."""
6
 
7
 
8
class SizeLimitingFormat(Format):
    """A format that may carry an optional size limiter artifact."""

    # Optional artifact used by subclasses to decide whether a candidate text
    # is acceptable; None (the default) means no limiter is configured.
    size_limiter: Artifact = None
10
-
11
-
12
class ICLFormat(SizeLimitingFormat):
    """Build an in-context-learning prompt from constant strings and instance fields.

    The prompt is assembled as: ``prefix``, an optional instruction line,
    zero or more formatted demos, the formatted query, and ``suffix``.
    """

    # Text placed at the very beginning of the prompt.
    prefix: str = ""
    # Text placed before every source (demos and query alike).
    input_prefix: str = ""
    # Text placed after the input/output separator, right before the answer slot.
    output_prefix: str = ""
    # Text placed between a demo's output prefix and its target.
    target_prefix: str = " "
    # Text placed before the instruction when it is emitted at the start.
    instruction_prefix: str = ""
    # Separator between a source and its output prefix.
    input_output_separator: str = "\n"
    # Separator appended after each demo (and after the instruction).
    demo_separator: str = "\n\n"
    # Text appended at the very end of the prompt.
    suffix: str = ""
    # When True, a non-empty instruction is emitted before the demos.
    add_instruction_at_start: bool = True
    # When True, a non-empty instruction is repeated inside the final query.
    add_instruction_after_demos: bool = False

    def single_source_str(self, source):
        """Return ``source`` wrapped as: input prefix, source, separator, output prefix."""
        return (
            self.input_prefix
            + source
            + self.input_output_separator
            + self.output_prefix
        )

    def single_source_str_with_instruction(self, source, instruction):
        """Return the query wrapping with ``instruction`` inserted before ``source``.

        The instruction follows the input prefix and is separated from the
        source by ``demo_separator``.
        """
        return (
            self.input_prefix
            + instruction
            + self.demo_separator
            + source
            + self.input_output_separator
            + self.output_prefix
        )

    def format(self, instance, demos_instances=None):
        """Render ``instance`` (and optional demos) into a single prompt string.

        Args:
            instance: mapping with a "source" entry and, optionally, an
                "instruction" entry. The "instruction" entry is popped from
                the mapping. When a size limiter is configured and demos are
                given, "target" is read as well.
            demos_instances: iterable of mappings with "source" and "target"
                entries; ``None`` is treated as no demos.

        Returns:
            The assembled prompt string.

        Raises:
            AssertionError: if an "instruction" entry is present but is None.
        """
        if demos_instances is None:
            demos_instances = []
        source = self.prefix

        instruction = ""
        if "instruction" in instance:
            instruction = instance.pop("instruction")
            # Validate the popped value itself; a None instruction would
            # otherwise surface later as an opaque TypeError on concatenation.
            assert (
                instruction is not None
            ), f"instruction field can not be none : {instance}"

        if self.add_instruction_at_start and instruction != "":
            source += self.instruction_prefix + instruction + self.demo_separator

        if self.add_instruction_after_demos and instruction != "":
            query_str = self.single_source_str_with_instruction(
                instance["source"], instruction
            )
        else:
            query_str = self.single_source_str(instance["source"])

        for demo_instance in demos_instances:
            demo_str = (
                self.single_source_str(demo_instance["source"])
                + self.target_prefix
                + demo_instance["target"]
                + self.demo_separator
            )

            if self.size_limiter is not None:
                # Skip this demo when the limiter rejects the prompt that
                # would result from adding it (query and target included).
                if not self.size_limiter.check(
                    source + demo_str + query_str + instance["target"]
                ):
                    continue

            source += demo_str

        source += query_str
        source += self.suffix
        return source
 
1
+ from typing import (
2
+ Any,
3
+ Dict,
4
+ List,
5
+ Optional,
6
+ )
7
 
8
+ from .operator import StreamInstanceOperator
9
+ from .type_utils import isoftype
10
 
11
+
12
class Format(StreamInstanceOperator):
    """Common base class for formats; adds nothing on top of ``StreamInstanceOperator``."""
14
 
15
 
16
class SystemFormat(Format):
    r"""Compose the full model input from constant format strings and instance fields.

    The input instance is expected to contain:
        1. A field "source" holding a (non-None) string.
        2. Optionally, a field "instruction" holding a (non-None) string; when
           the field is absent an empty string is used instead.
        3. Optionally, a field named by 'demos_field' holding a list of dicts;
           each dict supplies the values referenced by 'demo_format'
           (by default "source" and "target").

    Each demo is rendered with 'demo_format', the rendered demos are
    concatenated, and the result is combined with the instruction and the
    source through 'model_input_format'. The resulting string overwrites
    field "source" of the instance. Field "instruction" and the demos field
    are removed from the instance.

    Args:
        demos_field (str): name of the field that holds the list of demos
        demo_format (str): format string for a single demo
        model_input_format (str): format string for the whole model input; may
            reference {instruction}, {demos} and {source}

    Example:
        when the input instance
            {
                "source": "1+1",
                "target": "2",
                "instruction": "Solve the math exercises.",
                "demos": [{"source": "1+2", "target": "3"}, {"source": "4-2", "target": "2"}]
            }
        is processed by
            system_format = SystemFormat(
                demos_field="demos",
                demo_format="Input: {source}\nOutput: {target}\n\n",
                model_input_format="Instruction: {instruction}\n\n{demos}Input: {source}\nOutput: ",
            )
        the resulting instance is
            {
                "target": "2",
                "source": "Instruction: Solve the math exercises.\n\nInput: 1+2\nOutput: 3\n\nInput: 4-2\nOutput: 2\n\nInput: 1+1\nOutput: ",
            }
    """

    demos_field: str = "demos"
    demo_format: str = (
        "{source}\n{target}\n\n"  # example: "User: {source}\nAgent: {target}\n\n"
    )
    model_input_format: str = "{instruction}{demos}{source}\n"

    @staticmethod
    def _retrieve_field_and_assert_not_none(instance, field_name) -> str:
        """Return ``instance[field_name]``, or "" when the field is not available.

        Raises:
            AssertionError: if the field is present but its value is None.
        """
        # Guard clause: no field name, or field missing from the instance.
        if field_name is None or field_name not in instance:
            return ""
        value = instance[field_name]
        assert (
            value is not None
        ), f"Value in field '{field_name}' should not be none. Received instance: {instance}"
        return value

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Replace ``instance["source"]`` with the fully formatted model input.

        Pops "instruction" and the demos field from ``instance`` (when
        present) and returns the same, mutated, instance.

        Raises:
            AssertionError: if "source" is missing, if "source" or
                "instruction" is None, or if the demos field does not hold a
                list of dicts.
        """
        assert (
            "source" in instance
        ), f"field 'source' is expected to be in the input instance. Received instance: {instance}"
        source_text = self._retrieve_field_and_assert_not_none(instance, "source")

        instruction_text = self._retrieve_field_and_assert_not_none(
            instance, "instruction"
        )
        # The instruction is consumed here; drop it from the instance if present.
        instance.pop("instruction", None)

        demo_list = []
        has_demos = self.demos_field is not None and self.demos_field in instance
        if has_demos:
            candidate = instance[self.demos_field]
            assert (
                candidate is not None and isoftype(candidate, List[Dict[str, Any]])
            ), f"A list of dict-s is expected in field '{self.demos_field}'. Received instance: {instance}"
            demo_list = candidate
            # The demos are consumed here as well; drop them from the instance.
            instance.pop(self.demos_field)

        rendered_demos = "".join(
            self.demo_format.format(**demo) for demo in demo_list
        )

        instance["source"] = self.model_input_format.format(
            instruction=instruction_text,
            demos=rendered_demos,
            source=source_text,
        )
        return instance