Elron committed on
Commit
eee0bf8
1 Parent(s): 43b496d

Upload standard.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. standard.py +81 -17
standard.py CHANGED
@@ -1,11 +1,12 @@
 
1
  from typing import List
2
 
3
  from .card import TaskCard
4
  from .dataclass import InternalField, OptionalField
5
  from .formats import ICLFormat
6
  from .instructions import Instruction
7
- from .operator import SourceSequntialOperator, StreamingOperator
8
- from .operators import StreamRefiner
9
  from .recipe import Recipe
10
  from .renderers import StandardRenderer
11
  from .schema import ToUnitxtGroup
@@ -13,21 +14,30 @@ from .splitters import Sampler, SeparateSplit, SpreadSplit
13
  from .templates import Template
14
 
15
 
16
- class BaseRecipe(Recipe, SourceSequntialOperator):
 
 
 
 
 
 
 
 
 
17
  card: TaskCard
18
  template: Template = None
19
  instruction: Instruction = None
20
  format: ICLFormat = ICLFormat()
21
 
 
 
22
  max_train_instances: int = None
23
  max_validation_instances: int = None
24
  max_test_instances: int = None
25
 
26
- train_refiner: StreamRefiner = OptionalField(default_factory=lambda: StreamRefiner(apply_to_streams=["train"]))
27
- validation_refiner: StreamRefiner = OptionalField(
28
- default_factory=lambda: StreamRefiner(apply_to_streams=["validation"])
29
- )
30
- test_refiner: StreamRefiner = OptionalField(default_factory=lambda: StreamRefiner(apply_to_streams=["test"]))
31
 
32
  demos_pool_size: int = None
33
  num_demos: int = 0
@@ -37,6 +47,8 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
37
  demos_field: str = "demos"
38
  sampler: Sampler = None
39
 
 
 
40
  steps: List[StreamingOperator] = InternalField(default_factory=list)
41
 
42
  def verify(self):
@@ -48,7 +60,31 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
48
  )
49
  if self.demos_pool_size < self.num_demos:
50
  raise ValueError(
51
- f"demos_pool_size must be bigger than num_demos={self.num_demos}, Got demos_pool_size={self.demos_pool_size}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  )
53
 
54
  def prepare(self):
@@ -56,14 +92,23 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
56
  self.card.loader,
57
  ]
58
 
 
 
 
 
 
59
  if self.card.preprocess_steps is not None:
60
  self.steps.extend(self.card.preprocess_steps)
61
 
62
  self.steps.append(self.card.task)
63
 
 
 
 
 
64
  if self.demos_pool_size is not None:
65
  self.steps.append(
66
- SeparateSplit(
67
  from_split=self.demos_taken_from,
68
  to_split_names=[self.demos_pool_name, self.demos_taken_from],
69
  to_split_sizes=[int(self.demos_pool_size)],
@@ -79,7 +124,7 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
79
  sampler.set_size(self.num_demos)
80
 
81
  self.steps.append(
82
- SpreadSplit(
83
  source_stream=self.demos_pool_name,
84
  target_field=self.demos_field,
85
  sampler=sampler,
@@ -87,12 +132,15 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
87
  )
88
 
89
  self.train_refiner.max_instances = self.max_train_instances
 
90
  self.steps.append(self.train_refiner)
91
 
92
  self.validation_refiner.max_instances = self.max_validation_instances
 
93
  self.steps.append(self.validation_refiner)
94
 
95
  self.test_refiner.max_instances = self.max_test_instances
 
96
  self.steps.append(self.test_refiner)
97
 
98
  render = StandardRenderer(
@@ -104,6 +152,9 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
104
 
105
  self.steps.append(render)
106
 
 
 
 
107
  postprocessors = render.get_postprocessors()
108
 
109
  self.steps.append(
@@ -122,10 +173,21 @@ class StandardRecipeWithIndexes(BaseRecipe):
122
  def prepare(self):
123
  assert (
124
  self.template_card_index is None or self.template is None
125
- ), "Specify either template or template_card_index"
 
 
 
126
  if self.template_card_index is not None:
127
- self.template = self.card.templates[int(self.template_card_index)]
128
-
 
 
 
 
 
 
 
 
129
  assert (
130
  self.instruction_card_index is None or self.instruction is None
131
  ), "Specify either instruction or instruction_card_index"
@@ -136,9 +198,9 @@ class StandardRecipeWithIndexes(BaseRecipe):
136
 
137
 
138
  class StandardRecipe(StandardRecipeWithIndexes):
139
- """
140
  This class represents a standard recipe for data processing and preparation.
141
- This class can be used to prepare a recipe
142
  with all necessary steps, refiners and renderers included. It allows to set various
143
  parameters and steps in a sequential manner for preparing the recipe.
144
 
@@ -146,6 +208,7 @@ class StandardRecipe(StandardRecipeWithIndexes):
146
  card (TaskCard): TaskCard object associated with the recipe.
147
  template (Template, optional): Template object to be used for the recipe.
148
  instruction (Instruction, optional): Instruction object to be used for the recipe.
 
149
  format (ICLFormat, optional): ICLFormat object to be used for the recipe.
150
  train_refiner (StreamRefiner, optional): Train refiner to be used in the recipe.
151
  max_train_instances (int, optional): Maximum training instances for the refiner.
@@ -160,6 +223,7 @@ class StandardRecipe(StandardRecipeWithIndexes):
160
  demos_field (str, optional): Field name for demos. Default is "demos".
161
  sampler (Sampler, optional): Sampler object to be used in the recipe.
162
  steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe.
 
163
  instruction_card_index (int, optional): Index of instruction card to be used
164
  for preparing the recipe.
165
  template_card_index (int, optional): Index of template card to be used for
 
1
+ import logging
2
  from typing import List
3
 
4
  from .card import TaskCard
5
  from .dataclass import InternalField, OptionalField
6
  from .formats import ICLFormat
7
  from .instructions import Instruction
8
+ from .operator import SourceSequentialOperator, StreamingOperator
9
+ from .operators import Augmentor, NullAugmentor, StreamRefiner
10
  from .recipe import Recipe
11
  from .renderers import StandardRenderer
12
  from .schema import ToUnitxtGroup
 
14
  from .templates import Template
15
 
16
 
17
+ # Used to give meaningful names to recipe steps
18
+ class CreateDemosPool(SeparateSplit):
19
+ pass
20
+
21
+
22
+ class AddDemosField(SpreadSplit):
23
+ pass
24
+
25
+
26
+ class BaseRecipe(Recipe, SourceSequentialOperator):
27
  card: TaskCard
28
  template: Template = None
29
  instruction: Instruction = None
30
  format: ICLFormat = ICLFormat()
31
 
32
+ loader_limit: int = None
33
+
34
  max_train_instances: int = None
35
  max_validation_instances: int = None
36
  max_test_instances: int = None
37
 
38
+ train_refiner: StreamRefiner = OptionalField(default_factory=StreamRefiner)
39
+ validation_refiner: StreamRefiner = OptionalField(default_factory=StreamRefiner)
40
+ test_refiner: StreamRefiner = OptionalField(default_factory=StreamRefiner)
 
 
41
 
42
  demos_pool_size: int = None
43
  num_demos: int = 0
 
47
  demos_field: str = "demos"
48
  sampler: Sampler = None
49
 
50
+ augmentor: Augmentor = OptionalField(default_factory=NullAugmentor)
51
+
52
  steps: List[StreamingOperator] = InternalField(default_factory=list)
53
 
54
  def verify(self):
 
60
  )
61
  if self.demos_pool_size < self.num_demos:
62
  raise ValueError(
63
+ f"demos_pool_size must be bigger than num_demos ({self.num_demos}), Got demos_pool_size={self.demos_pool_size}"
64
+ )
65
+ if self.loader_limit and self.demos_pool_size > self.loader_limit:
66
+ raise ValueError(
67
+ f"demos_pool_size must be bigger than loader_limit ({self.loader_limit}), Got demos_pool_size={self.demos_pool_size}"
68
+ )
69
+
70
+ if self.loader_limit:
71
+ if self.max_test_instances and self.max_test_instances > self.loader_limit:
72
+ raise ValueError(
73
+ f"max_test_instances must be bigger than loader_limit ({self.loader_limit}), Got max_test_instances={self.max_test_instances}"
74
+ )
75
+ if (
76
+ self.max_validation_instances
77
+ and self.max_validation_instances > self.loader_limit
78
+ ):
79
+ raise ValueError(
80
+ f"max_validation_instances must be bigger than loader_limit ({self.loader_limit}), Got max_validation_instances={self.max_validation_instances}"
81
+ )
82
+ if (
83
+ self.max_train_instances
84
+ and self.max_train_instances > self.loader_limit
85
+ ):
86
+ raise ValueError(
87
+ f"max_train_instances must be bigger than loader_limit ({self.loader_limit}), Got max_train_instances={self.max_train_instances}"
88
  )
89
 
90
  def prepare(self):
 
92
  self.card.loader,
93
  ]
94
 
95
+ if self.loader_limit:
96
+ self.card.loader.loader_limit = self.loader_limit
97
+ logging.info(f"Loader line limit was set to {self.loader_limit}")
98
+ self.steps.append(StreamRefiner(max_instances=self.loader_limit))
99
+
100
  if self.card.preprocess_steps is not None:
101
  self.steps.extend(self.card.preprocess_steps)
102
 
103
  self.steps.append(self.card.task)
104
 
105
+ if self.augmentor.augment_task_input:
106
+ self.augmentor.set_task_input_fields(self.card.task.augmentable_inputs)
107
+ self.steps.append(self.augmentor)
108
+
109
  if self.demos_pool_size is not None:
110
  self.steps.append(
111
+ CreateDemosPool(
112
  from_split=self.demos_taken_from,
113
  to_split_names=[self.demos_pool_name, self.demos_taken_from],
114
  to_split_sizes=[int(self.demos_pool_size)],
 
124
  sampler.set_size(self.num_demos)
125
 
126
  self.steps.append(
127
+ AddDemosField(
128
  source_stream=self.demos_pool_name,
129
  target_field=self.demos_field,
130
  sampler=sampler,
 
132
  )
133
 
134
  self.train_refiner.max_instances = self.max_train_instances
135
+ self.train_refiner.apply_to_streams = ["train"]
136
  self.steps.append(self.train_refiner)
137
 
138
  self.validation_refiner.max_instances = self.max_validation_instances
139
+ self.validation_refiner.apply_to_streams = ["validation"]
140
  self.steps.append(self.validation_refiner)
141
 
142
  self.test_refiner.max_instances = self.max_test_instances
143
+ self.test_refiner.apply_to_streams = ["test"]
144
  self.steps.append(self.test_refiner)
145
 
146
  render = StandardRenderer(
 
152
 
153
  self.steps.append(render)
154
 
155
+ if self.augmentor.augment_model_input:
156
+ self.steps.append(self.augmentor)
157
+
158
  postprocessors = render.get_postprocessors()
159
 
160
  self.steps.append(
 
173
  def prepare(self):
174
  assert (
175
  self.template_card_index is None or self.template is None
176
+ ), f"Specify either template ({self.template}) or template_card_index ({self.template_card_index}) but not both"
177
+ assert not (
178
+ self.template_card_index is None and self.template is None
179
+ ), "Specify either template or template_card_index in card"
180
  if self.template_card_index is not None:
181
+ try:
182
+ self.template = self.card.templates[self.template_card_index]
183
+ except Exception as e:
184
+ if isinstance(self.card.templates, dict):
185
+ options = self.card.templates.keys()
186
+ else:
187
+ options = list(range(0, len(self.card.templates)))
188
+ raise ValueError(
189
+ f"card_template_index '{self.template_card_index}' is not in card. Available options: {options}"
190
+ ) from e
191
  assert (
192
  self.instruction_card_index is None or self.instruction is None
193
  ), "Specify either instruction or instruction_card_index"
 
198
 
199
 
200
  class StandardRecipe(StandardRecipeWithIndexes):
201
+ """This class represents a standard recipe for data processing and preparation.
202
+
203
+ This class can be used to prepare a recipe
204
  with all necessary steps, refiners and renderers included. It allows setting various
205
  parameters and steps in a sequential manner for preparing the recipe.
206
 
 
208
  card (TaskCard): TaskCard object associated with the recipe.
209
  template (Template, optional): Template object to be used for the recipe.
210
  instruction (Instruction, optional): Instruction object to be used for the recipe.
211
+ loader_limit (int, optional): Specifies the maximum number of instances per stream to be returned from the loader (used to reduce loading time in large datasets)
212
  format (ICLFormat, optional): ICLFormat object to be used for the recipe.
213
  train_refiner (StreamRefiner, optional): Train refiner to be used in the recipe.
214
  max_train_instances (int, optional): Maximum training instances for the refiner.
 
223
  demos_field (str, optional): Field name for demos. Default is "demos".
224
  sampler (Sampler, optional): Sampler object to be used in the recipe.
225
  steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe.
226
+ augmentor (Augmentor, optional): Augmentor used to pseudo-randomly augment the source text.
227
  instruction_card_index (int, optional): Index of instruction card to be used
228
  for preparing the recipe.
229
  template_card_index (int, optional): Index of template card to be used for