Elron committed on
Commit dcd3b86 · verified · 1 Parent(s): a66b8be

Upload operators.py with huggingface_hub

Files changed (1)
  1. operators.py +625 -235
operators.py CHANGED
@@ -1,11 +1,47 @@
1
  import collections
2
  import importlib
 
 
3
  import uuid
4
  from abc import abstractmethod
5
  from collections import Counter
6
  from copy import deepcopy
7
  from dataclasses import field
8
  from itertools import zip_longest
 
9
  from typing import (
10
  Any,
11
  Callable,
@@ -32,17 +68,20 @@ from .operator import (
32
  StreamInstanceOperator,
33
  StreamSource,
34
  )
35
- from .random_utils import get_random, nested_seed
36
  from .stream import Stream
37
  from .text_utils import nested_tuple_to_string
 
38
  from .utils import flatten_dict
39
 
40
 
41
  class FromIterables(StreamInitializerOperator):
42
- """Creates a MultiStream from iterables.
 
 
 
 
43
 
44
- Args:
45
- iterables (Dict[str, Iterable]): A dictionary where each key-value pair represents a stream name and its corresponding iterable.
46
  """
47
 
48
  def process(self, iterables: Dict[str, Iterable]) -> MultiStream:
@@ -50,6 +89,19 @@ class FromIterables(StreamInitializerOperator):
50
 
51
 
52
 class IterableSource(StreamSource):
53
  iterables: Dict[str, Iterable]
54
 
55
  def __call__(self) -> MultiStream:
@@ -57,7 +109,7 @@ class IterableSource(StreamSource):
57
 
58
 
59
  class MapInstanceValues(StreamInstanceOperator):
60
- """A class used to map instance values into a stream.
61
 
62
  This class is a type of StreamInstanceOperator,
63
  it maps values of instances in a stream using predefined mappers.
@@ -87,6 +139,11 @@ class MapInstanceValues(StreamInstanceOperator):
87
  To ensure that all values of field 'a' are mapped in every instance, use strict=True.
88
  Input instance {"a":"3", "b": 2} will raise an exception per the above call,
89
  because "3" is not a key in the mapper of "a".
 
 
 
 
 
90
  """
91
 
92
  mappers: Dict[str, Dict[str, str]]
@@ -115,34 +172,31 @@ class MapInstanceValues(StreamInstanceOperator):
115
  raise ValueError(
116
  f"'process_every_field' == True is allowed only when all fields which have mappers, i.e., {list(self.mappers.keys())} are lists. Instace = {instance}"
117
  )
118
- if isinstance(value, list):
119
- if self.process_every_value:
120
- for i, val in enumerate(value):
121
- val = str(val) # make sure the value is a string
122
- if self.strict and (val not in mapper):
123
- raise KeyError(
124
- f"value '{val}' in instance '{instance}' is not found in mapper '{mapper}', associated with field '{key}'."
125
- )
126
- if val in mapper:
127
- # replace just that member of value (value is a list)
128
- value[i] = mapper[val]
129
- dict_set(instance, key, value, use_dpath=self.use_query)
130
- else: # field is a list, and process_every_value == False
131
- if self.strict: # whole lists can not be mapped by a string-to-something mapper
132
- raise KeyError(
133
- f"A whole list ({value}) in the instance can not be mapped by a field mapper."
134
- )
135
- else: # value is not a list, implying process_every_value == False
136
- value = str(value) # make sure the value is a string
137
- if self.strict and (value not in mapper):
138
- raise KeyError(
139
- f"value '{value}' in instance '{instance}' is not found in mapper '{mapper}', associated with field '{key}'."
140
- )
141
- if value in mapper:
142
- dict_set(instance, key, mapper[value], use_dpath=self.use_query)
143
 
144
  return instance
145
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  class FlattenInstances(StreamInstanceOperator):
148
  """Flattens each instance in a stream, making nested dictionary entries into top-level entries.
@@ -182,6 +236,7 @@ class AddFields(StreamInstanceOperator):
182
  # Add a 'classes' field on a given list, prevent modification of original list
183
  # from changing the instance.
184
  AddFields(fields={"classes": alist}), use_deepcopy=True)
 
185
  """
186
 
187
  fields: Dict[str, object]
@@ -204,7 +259,7 @@ class AddFields(StreamInstanceOperator):
204
 
205
 
206
  class RemoveFields(StreamInstanceOperator):
207
- """Remove specified fields to each instance in a stream.
208
 
209
  Args:
210
  fields (List[str]): The fields to remove from each instance.
@@ -221,19 +276,32 @@ class RemoveFields(StreamInstanceOperator):
221
 
222
 
223
  class FieldOperator(StreamInstanceOperator):
224
- """A general stream that processes the values of a field (or multiple ones.
225
 
226
  Args:
227
- field (Optional[str]): The field to process, if only a single one is passed Defaults to None
228
- to_field (Optional[str]): Field name to save, if only one field is to be saved, if None is passed the operator would happen in-place and replace "field" Defaults to None
229
- field_to_field (Optional[Union[List[Tuple[str, str]], Dict[str, str]]]): Mapping from fields to process to their names after this process, duplicates are allowed. Defaults to None
230
  process_every_value (bool): Processes the values in a list instead of the list as a value, similar to *var. Defaults to False
231
  use_query (bool): Whether to use dpath style queries. Defaults to False.
 
 
 
232
  """
233
 
234
  field: Optional[str] = None
235
  to_field: Optional[str] = None
236
- field_to_field: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None
237
  process_every_value: bool = False
238
  use_query: bool = False
239
  get_default: Any = None
@@ -250,25 +318,67 @@ class FieldOperator(StreamInstanceOperator):
250
  ), f"Can not apply operator to create both on {self.to_field} and on the mapping from fields to fields {self.field_to_field}"
251
  assert (
252
  self.field is None or self.field_to_field is None
253
- ), f"Can not apply operator both on {self.field} and on the mapping from fields to fields {self.field_to_field}"
 
254
  assert (
255
- self._field_to_field
256
- ), f"the from and to fields must be defined got: {self._field_to_field}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  @abstractmethod
259
  def process_value(self, value: Any) -> Any:
260
  pass
261
 
262
  def prepare(self):
263
- if self.to_field is None:
264
- self.to_field = self.field
265
  if self.field_to_field is None:
266
- self._field_to_field = [(self.field, self.to_field)]
 
 
267
  else:
268
- try:
269
- self._field_to_field = list(self.field_to_field.items())
270
- except AttributeError:
271
- self._field_to_field = self.field_to_field
 
272
 
273
  def process(
274
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
@@ -295,7 +405,7 @@ class FieldOperator(StreamInstanceOperator):
295
  raise ValueError(
296
  f"Failed to process '{from_field}' from {instance} due to : {e}"
297
  ) from e
298
- if self.use_query and is_subpath(from_field, to_field):
299
  dict_delete(instance, from_field)
300
  dict_set(
301
  instance,
@@ -308,7 +418,25 @@ class FieldOperator(StreamInstanceOperator):
308
 
309
 
310
  class RenameFields(FieldOperator):
311
- """Renames fields."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
  def process_value(self, value: Any) -> Any:
314
  return value
@@ -317,20 +445,31 @@ class RenameFields(FieldOperator):
317
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
318
  ) -> Dict[str, Any]:
319
  res = super().process(instance=instance, stream_name=stream_name)
320
- vals = [x[1] for x in self._field_to_field]
321
- for key, _ in self._field_to_field:
322
- if self.use_query and "/" in key:
323
- continue
324
- if key not in vals:
325
- res.pop(key)
326
  return res
327
 
328
 
329
  class AddConstant(FieldOperator):
330
- """Adds a value, similar to add + field.
331
 
332
  Args:
333
- add: sum to add.
334
  """
335
 
336
  add: Any
@@ -396,19 +535,15 @@ class Augmentor(StreamInstanceOperator):
396
  default="",
397
  not_exist_ok=False,
398
  )
399
- except TypeError as e:
400
  raise TypeError(f"Failed to get {field_name} from {instance}") from e
401
 
402
- # We are setting a nested seed based on the value processed, to ensure that
403
- # the augmentation randomizations do not effect other randomization choices and
404
- # to make the augmentation randomization choices different for each text.
405
- with nested_seed(str(hash(old_value))):
406
- try:
407
- new_value = self.process_value(old_value)
408
- except Exception as e:
409
- raise RuntimeError(
410
- f"Error augmenting value '{old_value}' from '{field_name}' in instance: {instance}"
411
- ) from e
412
  dict_set(instance, field_name, new_value, use_dpath=True, not_exist_ok=True)
413
  return instance
414
 
@@ -433,90 +568,146 @@ class AugmentWhitespace(Augmentor):
433
  words = re.split(r"(\s+)", value)
434
  new_value = ""
435
 
 
436
  for word in words:
437
  if word.isspace():
438
- new_value += get_random().choice(
439
  ["\n", "\t", " "]
440
- ) * get_random().randint(1, 3)
441
  else:
442
  new_value += word
443
  return new_value
444
 
445
 
446
- class AugmentSuffix(Augmentor):
447
- r"""Augments the input by appending to it a randomly selected (typically, whitespace) pattern.
448
 
449
  Args:
450
- suffixes : the potential (typically, whitespace) patterns to select from.
451
  The dictionary version allows to specify relative weights of the different patterns.
452
- remove_existing_trailing_whitespaces : allows to first clean existing trailing whitespaces.
453
- The selected pattern is then appended to the potentially trimmed at its end input.
454
-
 
 
 
455
 
456
  Examples:
457
- to append a '\n' or a '\t' to the end of the input, employ
458
- AugmentSuffix(augment_model_input=True, suffixes=['\n','\t'])
459
- If '\n' is preferred over '\t', at 2:1 ratio, employ
460
- AugmentSuffix(augment_model_input=True, suffixes={'\n':2,'\t':1})
461
- which will append '\n' twice as often as '\t'.
 
462
 
463
  """
464
 
465
- suffixes: Optional[Union[List[str], Dict[str, int]]] = [" ", "\n", "\t"]
466
- remove_existing_trailing_whitespaces: Optional[bool] = False
467
 
468
  def verify(self):
469
  assert (
470
- isinstance(self.suffixes, list) or isinstance(self.suffixes, dict)
471
- ), f"Argument 'suffixes' should be either a list or a dictionary, whereas it is of type {type(self.suffixes)}"
472
-
473
- if isinstance(self.suffixes, dict):
474
- for k, v in self.suffixes.items():
475
- assert isinstance(
476
- k, str
477
- ), f"suffixes should map strings, whereas key {k!s} is of type {type(k)}"
478
- assert isinstance(
479
- v, int
480
- ), f"suffixes should map to ints, whereas value {v!s} is of type {type(v)}"
481
- else:
482
- for k in self.suffixes:
483
- assert isinstance(
484
- k, str
485
- ), f"suffixes should be a list of strings, whereas member {k!s} is of type {type(k)}"
486
 
487
- self.pats = (
488
- self.suffixes
489
- if isinstance(self.suffixes, list)
490
- else [k for k, v in self.suffixes.items()]
 
 
 
491
  )
492
  total_weight = (
493
- len(self.pats)
494
- if isinstance(self.suffixes, list)
495
- else sum([v for k, v in self.suffixes.items()])
496
  )
497
- self.weights = (
498
- [1.0 / total_weight] * len(self.pats)
499
- if isinstance(self.suffixes, list)
500
- else [float(self.suffixes[p]) / total_weight for p in self.pats]
501
  )
502
- super().verify()
503
 
504
  def process_value(self, value: Any) -> Any:
505
  assert value is not None, "input value should not be None"
506
  new_value = str(value)
507
- if self.remove_existing_trailing_whitespaces:
508
- new_value = new_value.rstrip()
509
- new_value += get_random().choices(self.pats, self.weights, k=1)[0]
510
-
511
- return new_value
512
 
513
 
514
  class ShuffleFieldValues(FieldOperator):
515
- """Shuffles an iterable value."""
516
 
517
  def process_value(self, value: Any) -> Any:
518
  res = list(value)
519
- get_random().shuffle(res)
 
520
  return res
521
 
522
 
@@ -621,9 +812,18 @@ class ListFieldValues(StreamInstanceOperator):
621
 
622
 
623
  class ZipFieldValues(StreamInstanceOperator):
624
- """Zips values of multiple fields similar to list(zip(*fields))."""
 
 
 
625
 
626
- fields: str
627
  to_field: str
628
  longest: bool = False
629
  use_query: bool = False
@@ -643,7 +843,7 @@ class ZipFieldValues(StreamInstanceOperator):
643
 
644
 
645
  class IndexOf(StreamInstanceOperator):
646
- """Finds the location of one value in another (iterable) value similar to to_field=search_in.index(index_of)."""
647
 
648
  search_in: str
649
  index_of: str
@@ -660,7 +860,7 @@ class IndexOf(StreamInstanceOperator):
660
 
661
 
662
  class TakeByField(StreamInstanceOperator):
663
- """Takes value from one field based on another field similar to field[index]."""
664
 
665
  field: str
666
  index: str
@@ -681,11 +881,24 @@ class TakeByField(StreamInstanceOperator):
681
 
682
 
683
  class CopyFields(FieldOperator):
684
- """Copies specified fields from one field to another.
685
 
686
- Args:
687
  field_to_field (Union[List[List], Dict[str, str]]): A list of lists, where each sublist contains the source field and the destination field, or a dictionary mapping source fields to destination fields.
688
- use_dpath (bool): Whether to use dpath for accessing fields. Defaults to False.
689
  """
690
 
691
  def process_value(self, value: Any) -> Any:
@@ -693,6 +906,8 @@ class CopyFields(FieldOperator):
693
 
694
 
695
  class AddID(StreamInstanceOperator):
 
 
696
  id_field_name: str = "id"
697
 
698
  def process(
@@ -706,22 +921,31 @@ class CastFields(StreamInstanceOperator):
706
  """Casts specified fields to specified types.
707
 
708
  Args:
709
- types (Dict[str, str]): A dictionary mapping fields to their new types.
710
- nested (bool): Whether to cast nested fields. Defaults to False.
711
- fields (Dict[str, str]): A dictionary mapping fields to their new types.
712
- defaults (Dict[str, object]): A dictionary mapping types to their default values for cases of casting failure.
713
  """
714
 
715
- types = {
716
- "int": int,
717
- "float": float,
718
- "str": str,
719
- "bool": bool,
720
- }
721
  fields: Dict[str, str] = field(default_factory=dict)
722
  failure_defaults: Dict[str, object] = field(default_factory=dict)
723
  use_nested_query: bool = False
724
- cast_multiple: bool = False
 
 
 
725
 
726
  def _cast_single(self, value, type, field):
727
  try:
@@ -734,14 +958,17 @@ class CastFields(StreamInstanceOperator):
734
  return self.failure_defaults[field]
735
 
736
  def _cast_multiple(self, values, type, field):
737
- values = [self._cast_single(value, type, field) for value in values]
738
 
739
  def process(
740
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
741
  ) -> Dict[str, Any]:
742
  for field_name, type in self.fields.items():
743
  value = dict_get(instance, field_name, use_dpath=self.use_nested_query)
744
- if self.cast_multiple:
 
 
 
745
  casted_value = self._cast_multiple(value, type, field_name)
746
  else:
747
  casted_value = self._cast_single(value, type, field_name)
@@ -751,29 +978,46 @@ class CastFields(StreamInstanceOperator):
751
  return instance
752
 
753
 
754
- def recursive_divide(instance, divisor, strict=False):
755
- if isinstance(instance, dict):
756
- for key, value in instance.items():
757
- instance[key] = recursive_divide(value, divisor, strict=strict)
758
- elif isinstance(instance, list):
759
- for i, value in enumerate(instance):
760
- instance[i] = recursive_divide(value, divisor, strict=strict)
761
- elif isinstance(instance, float):
762
- instance /= divisor
763
- elif strict:
764
- raise ValueError(f"Cannot divide instance of type {type(instance)}")
765
- return instance
766
767
 
768
- class DivideAllFieldsBy(StreamInstanceOperator):
769
  divisor: float = 1.0
770
  strict: bool = False
771
- recursive: bool = True
772
 
773
  def process(
774
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
775
  ) -> Dict[str, Any]:
776
- return recursive_divide(instance, self.divisor, strict=self.strict)
777
 
778
 
779
  class ArtifactFetcherMixin:
@@ -797,13 +1041,21 @@ class ApplyOperatorsField(StreamInstanceOperator, ArtifactFetcherMixin):
797
  """Applies value operators to each instance in a stream based on specified fields.
798
 
799
  Args:
800
- value_field (str): The field containing the value to be operated on.
801
- operators_field (str): The field containing the operators to be applied.
 
 
 
802
  default_operators (List[str]): A list of default operators to be used if no operators are found in the instance.
803
- """
804
 
805
- inputs_fields: str
806
807
  operators_field: str
808
  default_operators: List[str] = None
809
  fields_to_treat_as_list: List[str] = NonPositionalField(default_factory=list)
@@ -815,7 +1067,7 @@ class ApplyOperatorsField(StreamInstanceOperator, ArtifactFetcherMixin):
815
  if operator_names is None:
816
  assert (
817
  self.default_operators is not None
818
- ), f"No operators found in {self.field} field and no default operators provided"
819
  operator_names = self.default_operators
820
 
821
  if isinstance(operator_names, str):
@@ -828,35 +1080,155 @@ class ApplyOperatorsField(StreamInstanceOperator, ArtifactFetcherMixin):
828
  if field_name in self.fields_to_treat_as_list:
829
  instance[field_name] = [operator.process(v) for v in value]
830
  else:
831
- instance[field_name] = operator.process(instance[field_name])
832
 
833
  return instance
834
 
835
 
836
- class FilterByValues(SingleStreamOperator):
837
- """Filters a stream, yielding only instances that match specified values in the provided fields.
 
 
838
 
839
  Args:
840
- values (Dict[str, Any]): For each field, the values that instances should match to be included in the output.
841
  """
842
 
843
- required_values: Dict[str, Any]
844
 
845
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
 
846
  for instance in stream:
847
- filter = False
848
- for key, value in self.required_values.items():
849
- if key not in instance:
850
  raise ValueError(
851
- f"Required filter field ('{key}') in FilterByValues is not found in {instance}"
852
  )
853
- if instance[key] != value:
854
- filter = True
855
- if not filter:
856
 yield instance
857
858
 
859
- class ExtractFieldValues(MultiStreamOperator):
860
  field: str
861
  stream_name: str
862
  overall_top_frequency_percent: Optional[int] = 100
@@ -877,21 +1249,21 @@ class ExtractFieldValues(MultiStreamOperator):
877
 
878
  Examples:
879
 
880
- ExtractFieldValues(stream_name="train", field="label", to_field="classes") - extracts all the unique values of
881
  field 'label', sorts them by decreasing frequency, and stores the resulting list in field 'classes' of each and
882
  every instance in all streams.
883
 
884
- ExtractFieldValues(stream_name="train", field="labels", to_field="classes", process_every_value=True) -
885
  in case that field 'labels' contains a list of values (and not a single value) - track the occurrences of all the possible
886
  value members in these lists, and report the most frequent values.
887
  if process_every_value=False, track the most frequent whole lists, and report those (as a list of lists) in field
888
  'to_field' of each instance of all streams.
889
 
890
- ExtractFieldValues(stream_name="train", field="label", to_field="classes",overall_top_frequency_percent=80) -
891
  extracts the most frequent possible values of field 'label' that together cover at least 80% of the instances of stream_name,
892
  and stores them in field 'classes' of each instance of all streams.
893
 
894
- ExtractFieldValues(stream_name="train", field="label", to_field="classes",min_frequency_percent=5) -
895
  extracts all possible values of field 'label' that cover, each, at least 5% of the instances.
896
  Stores these values, sorted by decreasing order of frequency, in field 'classes' of each instance in all streams.
897
  """
@@ -952,41 +1324,18 @@ class ExtractFieldValues(MultiStreamOperator):
952
  [*ele[0]] if isinstance(ele[0], tuple) else ele[0]
953
  for ele in values_and_counts
954
  ]
955
- for name in multi_stream:
956
- for instance in multi_stream[name]:
957
- instance[self.to_field] = values_to_keep
958
- return multi_stream
959
 
 
 
960
 
961
- class FilterByListsOfValues(SingleStreamOperator):
962
- """Filters a stream, yielding only instances that whose field values are included in the specified value lists.
963
-
964
- Args:
965
- required_values (Dict[str, List]): For each field, the list of values that instances should match to be included in the output.
966
- """
967
-
968
- required_values: Dict[str, List]
969
 
 
970
  def verify(self):
971
  super().verify()
972
- for key, value in self.required_values.items():
973
- if not isinstance(value, list):
974
- raise ValueError(
975
- f"The filter for key ('{key}') in FilterByListsOfValues is not a list but '{value}'"
976
- )
977
 
978
- def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
979
- for instance in stream:
980
- filter = False
981
- for key, value in self.required_values.items():
982
- if key not in instance:
983
- raise ValueError(
984
- f"Required filter field ('{key}') in FilterByListsOfValues is not found in {instance}"
985
- )
986
- if instance[key] not in value:
987
- filter = True
988
- if not filter:
989
- yield instance
990
 
991
 
992
  class Intersect(FieldOperator):
@@ -1011,6 +1360,7 @@ class Intersect(FieldOperator):
1011
  )
1012
 
1013
  def process_value(self, value: Any) -> Any:
 
1014
  if not isinstance(value, list):
1015
  raise ValueError(f"The value in field is not a list but '{value}'")
1016
  return [e for e in value if e in self.allowed_values]
@@ -1020,7 +1370,7 @@ class RemoveValues(FieldOperator):
1020
  """Removes elements in a field, which must be a list, using a given list of unallowed.
1021
 
1022
  Args:
1023
- unallowed_values (list) - removed_values.
1024
  """
1025
 
1026
  unallowed_values: List[Any]
@@ -1089,8 +1439,8 @@ class SplitByValue(MultiStreamOperator):
1089
  stream_unique_values = uniques[stream_name]
1090
  for unique_values in stream_unique_values:
1091
  filtering_values = dict(zip(self.fields, unique_values))
1092
- filtered_streams = FilterByValues(
1093
- required_values=filtering_values
1094
  )._process_single_stream(stream)
1095
  filtered_stream_name = (
1096
  stream_name + "_" + nested_tuple_to_string(unique_values)
@@ -1112,7 +1462,7 @@ class ApplyStreamOperatorsField(SingleStreamOperator, ArtifactFetcherMixin):
1112
  reversed: bool = False
1113
 
1114
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1115
- first_instance = stream.peak()
1116
 
1117
  operators = first_instance.get(self.field, [])
1118
  if isinstance(operators, str):
@@ -1146,7 +1496,7 @@ class ApplyMetric(SingleStreamOperator, ArtifactFetcherMixin):
1146
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1147
  from .metrics import Metric, MetricPipeline, MetricWithConfidenceInterval
1148
 
1149
- first_instance = stream.peak()
1150
 
1151
  metric_names = first_instance.get(self.metric_field, [])
1152
  if not metric_names:
@@ -1182,27 +1532,6 @@ class ApplyMetric(SingleStreamOperator, ArtifactFetcherMixin):
1182
  yield from stream
1183
 
1184
 
1185
- class AddFieldNamePrefix(StreamInstanceOperator):
1186
- """Adds a prefix to each field name in each instance of a stream.
1187
-
1188
- Args:
1189
- prefix_dict (Dict[str, str]): A dictionary mapping stream names to prefixes.
1190
- """
1191
-
1192
- prefix_dict: Dict[str, str]
1193
-
1194
- def prepare(self):
1195
- return super().prepare()
1196
-
1197
- def process(
1198
- self, instance: Dict[str, Any], stream_name: Optional[str] = None
1199
- ) -> Dict[str, Any]:
1200
- return {
1201
- self.prefix_dict[stream_name] + key: value
1202
- for key, value in instance.items()
1203
- }
1204
-
1205
-
1206
  class MergeStreams(MultiStreamOperator):
1207
  """Merges multiple streams into a single stream.
1208
 
@@ -1238,20 +1567,39 @@ class MergeStreams(MultiStreamOperator):
1238
  class Shuffle(PagedStreamOperator):
1239
  """Shuffles the order of instances in each page of a stream.
1240
 
1241
- Args:
1242
  page_size (int): The size of each page in the stream. Defaults to 1000.
1243
  """
1244
 
 
 
 
 
 
 
1245
  def process(self, page: List[Dict], stream_name: Optional[str] = None) -> Generator:
1246
- get_random().shuffle(page)
1247
  yield from page
1248
 
1249
 
1250
  class EncodeLabels(StreamInstanceOperator):
1251
- """Encode labels of specified fields together a into integers.
 
 
 
1252
 
1253
  Args:
1254
 fields (List[str]): The fields to encode together.
1255
  """
1256
 
1257
  fields: List[str]
@@ -1279,7 +1627,23 @@ class EncodeLabels(StreamInstanceOperator):
1279
 
1280
 
1281
 class StreamRefiner(SingleStreamOperator):
1282
  max_instances: int = None
 
1283
 
1284
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1285
  if self.max_instances is not None:
@@ -1291,13 +1655,23 @@ class StreamRefiner(SingleStreamOperator):
1291
  class DeterministicBalancer(StreamRefiner):
1292
  """A class used to balance streams deterministically.
1293
 
 
 
 
 
 
1294
  Attributes:
1295
- fields (List[str]): A list of field names to be used in determining the signature of an instance.
1296
- streams (List[str]): A list of stream names to be processed by the balancer.
1297
 
1298
  Usage:
1299
- balancer = DeterministicBalancer(fields=["field1", "field2"], streams=["stream1", "stream2"])
1300
 balanced_stream = balancer.process(stream)
1301
  """
1302
 
1303
  fields: List[str]
@@ -1334,7 +1708,23 @@ class DeterministicBalancer(StreamRefiner):
1334
 
1335
 
1336
 class LengthBalancer(DeterministicBalancer):
1337
  segments_boundaries: List[int]
 
1338
 
1339
  def signature(self, instance):
1340
  total_len = 0
 
1
+ """This section describes unitxt operators.
2
+
3
+ Operators: Building Blocks of Unitxt Processing Pipelines
4
+ ==============================================================
5
+
6
+ Within the Unitxt framework, operators serve as the foundational elements used to assemble processing pipelines.
7
+ Each operator is designed to perform specific manipulations on dictionary structures within a stream.
8
+ These operators are callable entities that receive a MultiStream as input.
9
+ The output is a MultiStream, augmented with the operator's manipulations, which are then systematically applied to each instance in the stream when pulled.
10
+
11
+ Creating Custom Operators
12
+ -------------------------------
13
+ To enhance the functionality of Unitxt, users are encouraged to develop custom operators.
14
+ This can be achieved by inheriting from any of the existing operators listed below or from one of the fundamental :class:`base operators<unitxt.operator>`.
15
+ The primary task in any operator development is to implement the `process` function, which defines the unique manipulations the operator will perform.
16
+
17
+ General or Specialized Operators
18
+ --------------------------------
19
+ Some operators are specialized in specific tasks, such as:
20
+
21
+ - :class:`loaders<unitxt.loaders>` for loading data.
22
+ - :class:`splitters<unitxt.splitters>` for fixing data splits.
23
+
24
+ Other specialized operators are used by unitxt internally:
25
+
26
+ - :class:`templates<unitxt.templates>` for verbalizing data examples.
27
+ - :class:`formats<unitxt.formats>` for preparing data for models.
28
+
29
+ The rest of this section is dedicated to general operators.
30
+
31
+ General Operators List:
32
+ ------------------------
33
+ """
34
  import collections
35
  import importlib
36
+ import operator
37
+ import os
38
  import uuid
39
  from abc import abstractmethod
40
  from collections import Counter
41
  from copy import deepcopy
42
  from dataclasses import field
43
  from itertools import zip_longest
44
+ from random import Random
45
  from typing import (
46
  Any,
47
  Callable,
 
68
  StreamInstanceOperator,
69
  StreamSource,
70
  )
71
+ from .random_utils import new_random_generator
72
  from .stream import Stream
73
  from .text_utils import nested_tuple_to_string
74
+ from .type_utils import isoftype
75
  from .utils import flatten_dict
76
 
77
 
78
  class FromIterables(StreamInitializerOperator):
79
+ """Creates a MultiStream from a dict of named iterables.
80
+
81
+ Example:
82
+ operator = FromIterables()
83
+ ms = operator.process(iterables)
84
 
 
 
85
  """
86
 
87
  def process(self, iterables: Dict[str, Iterable]) -> MultiStream:
 
89
 
90
 
91
  class IterableSource(StreamSource):
92
+ """Creates a MultiStream from a dict of named iterables.
93
+
94
+ It is a callable.
95
+
96
+ Args:
97
+ iterables (Dict[str, Iterable]): A dictionary mapping stream names to iterables.
98
+
99
+ Example:
100
+ operator = IterableSource(input_dict)
101
+ ms = operator()
102
+
103
+ """
104
+
105
  iterables: Dict[str, Iterable]
106
 
107
  def __call__(self) -> MultiStream:
 
109
 
110
 
111
  class MapInstanceValues(StreamInstanceOperator):
112
+ """A class used to map instance values into other values.
113
 
114
  This class is a type of StreamInstanceOperator,
115
  it maps values of instances in a stream using predefined mappers.
 
139
  To ensure that all values of field 'a' are mapped in every instance, use strict=True.
140
  Input instance {"a":"3", "b": 2} will raise an exception per the above call,
141
  because "3" is not a key in the mapper of "a".
142
+
143
+ MapInstanceValues(mappers={"a": {str([1,2,3,4]): 'All', str([]): 'None'}}, strict=True)
144
+ replaces a list [1,2,3,4] with the string 'All' and an empty list by string 'None'.
145
+ Note that values to be mapped are matched by their string representation, so the
146
+ keys of each mapper must be strings.
147
  """
148
 
149
  mappers: Dict[str, Dict[str, str]]
 
172
  raise ValueError(
173
  f"'process_every_field' == True is allowed only when all fields which have mappers, i.e., {list(self.mappers.keys())} are lists. Instace = {instance}"
174
  )
175
+ if isinstance(value, list) and self.process_every_value:
176
+ for i, val in enumerate(value):
177
+ value[i] = self.get_mapped_value(instance, key, mapper, val)
178
+ else:
179
+ value = self.get_mapped_value(instance, key, mapper, value)
180
+ dict_set(
181
+ instance,
182
+ key,
183
+ value,
184
+ use_dpath=self.use_query,
185
+ )
186
 
187
  return instance
188
 
189
+ def get_mapped_value(self, instance, key, mapper, val):
190
+ val_as_str = str(val) # make sure the value is a string
191
+ if self.strict and (val_as_str not in mapper):
192
+ raise KeyError(
193
+ f"value '{val}' in instance '{instance}' is not found in mapper '{mapper}', associated with field '{key}'."
194
+ )
195
+ # By default deep copy the value in mapper to avoid shared modifications
196
+ if val_as_str in mapper:
197
+ return deepcopy(mapper[val_as_str])
198
+ return val
199
+
200
 
201
  class FlattenInstances(StreamInstanceOperator):
202
  """Flattens each instance in a stream, making nested dictionary entries into top-level entries.
 
236
  # Add a 'classes' field on a given list, prevent modification of original list
237
  # from changing the instance.
238
  AddFields(fields={"classes": alist}), use_deepcopy=True)
239
+ # if alist is later modified, the instances remain intact.
240
  """
241
 
242
  fields: Dict[str, object]
 
259
 
260
 
261
  class RemoveFields(StreamInstanceOperator):
262
+ """Remove specified fields from each instance in a stream.
263
 
264
  Args:
265
  fields (List[str]): The fields to remove from each instance.
 
276
 
277
 
278
  class FieldOperator(StreamInstanceOperator):
279
+ """A general stream instance operator that processes the values of a field (or multiple ones).
280
 
281
  Args:
282
+ field (Optional[str]): The field to process, if only a single one is passed. Defaults to None
283
+ to_field (Optional[str]): Field name to save result into, if only one field is processed, if None is passed the
284
+ operation would happen in-place and its result would replace the value of "field". Defaults to None
285
+ field_to_field (Optional[Union[List[List[str]], Dict[str, str]]]): Mapping from names of fields to process,
286
+ to names of fields to save the results into. Inner List, if used, should be of length 2.
287
+ A field is processed by feeding its value into method 'process_value' and storing the result in to_field that
288
+ is mapped to the field.
289
+ When the type of argument 'field_to_field' is List, the order by which the fields are processed is their order
290
+ in the (outer) List. But when the type of argument 'field_to_field' is Dict, there is no uniquely determined
291
+ order. The end result might depend on that order if either (1) two different fields are mapped to the same
292
+ to_field, or (2) a field shows both as a key and as a value in different mappings.
293
+ The operator throws an AssertionError in either of these cases.
294
+ field_to_field defaults to None
295
  process_every_value (bool): Processes the values in a list instead of the list as a value, similar to *var. Defaults to False
296
  use_query (bool): Whether to use dpath style queries. Defaults to False.
297
+
298
+ Note: if 'field' and 'to_field' (or both members of a pair in 'field_to_field') are equal (or share a common
299
+ prefix if 'use_query'=True), then the result of the operation is saved within 'field'
300
  """
301
 
302
  field: Optional[str] = None
303
  to_field: Optional[str] = None
304
+ field_to_field: Optional[Union[List[List[str]], Dict[str, str]]] = None
305
  process_every_value: bool = False
306
  use_query: bool = False
307
  get_default: Any = None
 
318
  ), f"Can not apply operator to create both on {self.to_field} and on the mapping from fields to fields {self.field_to_field}"
319
  assert (
320
  self.field is None or self.field_to_field is None
321
+ ), f"Can not apply operator both on {self.field} and on the from fields in the mapping {self.field_to_field}"
322
+ assert self._field_to_field, f"the from and to fields must be defined or implied from the other inputs, got: {self._field_to_field}"
323
  assert (
324
+ len(self._field_to_field) > 0
325
+ ), f"'input argument 'field_to_field' should convey at least one field to process. Got {self.field_to_field}"
326
+ # self._field_to_field is built explicitly by pairs, or copied from argument 'field_to_field'
327
+ if self.field_to_field is None:
328
+ return
329
+ # for backward compatibility, also allow a list of tuples of two strings
330
+ if isoftype(self.field_to_field, List[List[str]]) or isoftype(
331
+ self.field_to_field, List[Tuple[str, str]]
332
+ ):
333
+ for pair in self._field_to_field:
334
+ assert (
335
+ len(pair) == 2
336
+ ), f"when 'field_to_field' is defined as a list of lists, the inner lists should all be of length 2. {self.field_to_field}"
337
+ # order of field processing is uniquely determined by the input field_to_field when a list
338
+ return
339
+ if isoftype(self.field_to_field, Dict[str, str]):
340
+ if len(self.field_to_field) < 2:
341
+ return
342
+ for ff, tt in self.field_to_field.items():
343
+ for f, t in self.field_to_field.items():
344
+ if f == ff:
345
+ continue
346
+ assert (
347
+ t != ff
348
+ ), f"In input argument 'field_to_field': {self.field_to_field}, field {f} is mapped to field {t}, while the latter is mapped to {tt}. Whether {f} or {t} is processed first might impact end result."
349
+ assert (
350
+ tt != t
351
+ ), f"In input argument 'field_to_field': {self.field_to_field}, two different fields: {ff} and {f} are mapped to field {tt}. Whether {ff} or {f} is processed last might impact end result."
352
+ return
353
+ raise ValueError(
354
+ "Input argument 'field_to_field': {self.field_to_field} is neither of type List{List[str]] nor of type Dict[str, str]."
355
+ )
356
 
357
  @abstractmethod
358
  def process_value(self, value: Any) -> Any:
359
  pass
360
 
361
  def prepare(self):
362
+ super().prepare()
363
+
364
+ # prepare is invoked before verify, hence we must make some checks here, before the changes below
365
+ assert (
366
+ (self.field is None) != (self.field_to_field is None)
367
+ ), "Must uniquely define the field to work on, through exactly one of either 'field' or 'field_to_field'"
368
+ assert (
369
+ self.to_field is None or self.field_to_field is None
370
+ ), f"Can not apply operator to create both {self.to_field} and the to fields in the mapping {self.field_to_field}"
371
+
372
  if self.field_to_field is None:
373
+ self._field_to_field = [
374
+ (self.field, self.to_field if self.to_field is not None else self.field)
375
+ ]
376
  else:
377
+ self._field_to_field = (
378
+ list(self.field_to_field.items())
379
+ if isinstance(self.field_to_field, dict)
380
+ else self.field_to_field
381
+ )
382
 
383
  def process(
384
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
 
405
  raise ValueError(
406
  f"Failed to process '{from_field}' from {instance} due to : {e}"
407
  ) from e
408
+ if is_subpath(from_field, to_field) or is_subpath(to_field, from_field):
409
  dict_delete(instance, from_field)
410
  dict_set(
411
  instance,
 
418
 
419
 
420
  class RenameFields(FieldOperator):
421
+ """Renames fields.
422
+
423
+ Moves the value from one field to another, potentially (if 'use_query'=True) from one branch into another.
424
+ The from-field is removed; with use_query=True, ancestor dictionaries left empty by the removal are deleted as well.
425
+
426
+ Examples:
427
+ RenameFields(field_to_field={"b": "c"})
428
+ will change inputs [{"a": 1, "b": 2}, {"a": 2, "b": 3}] to [{"a": 1, "c": 2}, {"a": 2, "c": 3}]
429
+
430
+ RenameFields(field_to_field={"b": "c/d"}, use_query=True)
431
+ will change inputs [{"a": 1, "b": 2}, {"a": 2, "b": 3}] to [{"a": 1, "c": {"d": 2}}, {"a": 2, "c": {"d": 3}}]
432
+
433
+ RenameFields(field_to_field={"b": "b/d"}, use_query=True)
434
+ will change inputs [{"a": 1, "b": 2}, {"a": 2, "b": 3}] to [{"a": 1, "b": {"d": 2}}, {"a": 2, "b": {"d": 3}}]
435
+
436
+ RenameFields(field_to_field={"b/c/e": "b/d"}, use_query=True)
437
+ will change inputs [{"a": 1, "b": {"c": {"e": 2, "f": 20}}}] to [{"a": 1, "b": {"c": {"f": 20}, "d": 2}}]
438
+
439
+ """
440
 
441
  def process_value(self, value: Any) -> Any:
442
  return value
 
445
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
446
  ) -> Dict[str, Any]:
447
  res = super().process(instance=instance, stream_name=stream_name)
448
+ for from_field, to_field in self._field_to_field:
449
+ if (not is_subpath(from_field, to_field)) and (
450
+ not is_subpath(to_field, from_field)
451
+ ):
452
+ dict_delete(res, from_field)
453
+ if self.use_query:
454
+ from_field_components = list(
455
+ os.path.normpath(from_field).split(os.path.sep)
456
+ )
457
+ while len(from_field_components) > 1:
458
+ from_field_components.pop()
459
+ parent = dict_get(res, os.path.sep.join(from_field_components))
460
+ if isinstance(parent, dict) and not parent:
461
+ dict_delete(res, os.path.sep.join(from_field_components))
462
+ else:
463
+ break
464
+
465
  return res
466
 
467
 
468
  class AddConstant(FieldOperator):
469
+ """Adds a constant, being argument 'add', to the processed value.
470
 
471
  Args:
472
+ add: the constant to add.
473
  """
474
 
475
  add: Any
 
535
  default="",
536
  not_exist_ok=False,
537
  )
538
+ except ValueError as e:
539
  raise TypeError(f"Failed to get {field_name} from {instance}") from e
540
 
541
+ try:
542
+ new_value = self.process_value(old_value)
543
+ except Exception as e:
544
+ raise RuntimeError(
545
+ f"Error augmenting value '{old_value}' from '{field_name}' in instance: {instance}"
546
+ ) from e
 
 
 
 
547
  dict_set(instance, field_name, new_value, use_dpath=True, not_exist_ok=True)
548
  return instance
549
 
 
568
  words = re.split(r"(\s+)", value)
569
  new_value = ""
570
 
571
+ random_generator = new_random_generator(sub_seed=value)
572
  for word in words:
573
  if word.isspace():
574
+ new_value += random_generator.choice(
575
  ["\n", "\t", " "]
576
+ ) * random_generator.randint(1, 3)
577
  else:
578
  new_value += word
579
  return new_value
580
 
581
 
582
+ class AugmentPrefixSuffix(Augmentor):
583
+ r"""Augments the input by prepending and appending to it a randomly selected (typically, whitespace) patterns.
584
 
585
  Args:
586
+ prefixes, suffixes (list or dict) : the potential (typically, whitespace) patterns to select from.
587
  The dictionary version allows to specify relative weights of the different patterns.
588
+ prefix_len, suffix_len (positive int) : The added prefix or suffix will be of length
589
+ prefix_len or suffix_len, respectively, repetitions of the randomly selected patterns.
590
+ remove_existing_whitespaces : allows to first clean any existing leading and trailing whitespaces.
591
+ The strings made of repetitions of the selected pattern(s) are then prepended and/or appended to the potentially
592
+ trimmed input.
593
+ If only one of prefixes/suffixes is needed, set the other to None.
594
 
595
  Examples:
596
+ To prepend the input with a prefix made of 4 '\n'-s or '\t'-s, employ
597
+ AugmentPrefixSuffix(augment_model_input=True, prefixes=['\n','\t'], prefix_len=4, suffixes = None)
598
+ To append the input with a suffix made of 3 '\n'-s or '\t'-s, with triple '\n' suffixes
599
+ being preferred over triple '\t', at 2:1 ratio, employ
600
+ AugmentPrefixSuffix(augment_model_input=True, suffixes={'\n':2,'\t':1}, suffix_len=3, prefixes = None)
601
+ which will append '\n'-s twice as often as '\t'-s.
602
 
603
  """
604
 
605
+ prefixes: Optional[Union[List[str], Dict[str, int]]] = {
606
+ " ": 20,
607
+ "\\t": 10,
608
+ "\\n": 40,
609
+ "": 30,
610
+ }
611
+ prefix_len: Optional[int] = 3
612
+ suffixes: Optional[Union[List[str], Dict[str, int]]] = {
613
+ " ": 20,
614
+ "\\t": 10,
615
+ "\\n": 40,
616
+ "": 30,
617
+ }
618
+ suffix_len: Optional[int] = 3
619
+ remove_existing_whitespaces: Optional[bool] = False
620
 
621
  def verify(self):
622
  assert (
623
+ self.prefixes or self.suffixes
624
+ ), "At least one of prefixes/suffixes should be not None."
625
+ for arg, arg_name in zip(
626
+ [self.prefixes, self.suffixes], ["prefixes", "suffixes"]
627
+ ):
628
+ assert (
629
+ arg is None or isoftype(arg, List[str]) or isoftype(arg, Dict[str, int])
630
+ ), f"Argument {arg_name} should be either None or a list of strings or a dictionary str->int. {arg} is none of the above."
631
+ assert (
632
+ self.prefix_len > 0
633
+ ), f"prefix_len must be positive, got {self.prefix_len}"
634
+ assert (
635
+ self.suffix_len > 0
636
+ ), f"suffix_len must be positive, got {self.suffix_len}"
637
+ super().verify()
 
638
 
639
+ def _calculate_distributions(self, prefs_or_suffs):
640
+ if prefs_or_suffs is None:
641
+ return None, None
642
+ patterns = (
643
+ prefs_or_suffs
644
+ if isinstance(prefs_or_suffs, list)
645
+ else [k for k, v in prefs_or_suffs.items()]
646
  )
647
  total_weight = (
648
+ len(patterns)
649
+ if isinstance(prefs_or_suffs, list)
650
+ else sum([v for k, v in prefs_or_suffs.items()])
651
  )
652
+ weights = (
653
+ [1.0 / total_weight] * len(patterns)
654
+ if isinstance(prefs_or_suffs, list)
655
+ else [float(prefs_or_suffs[p]) / total_weight for p in patterns]
656
  )
657
+ return patterns, weights
658
+
659
+ def prepare(self):
660
+ # Being an artifact, prepare is invoked before verify. Here we need to verify before acting
661
+ self.verify()
662
+ self._prefix_pattern_distribution = {"length": self.prefix_len}
663
+ self._suffix_pattern_distribution = {"length": self.suffix_len}
664
+
665
+ (
666
+ self._prefix_pattern_distribution["patterns"],
667
+ self._prefix_pattern_distribution["weights"],
668
+ ) = self._calculate_distributions(self.prefixes)
669
+ (
670
+ self._suffix_pattern_distribution["patterns"],
671
+ self._suffix_pattern_distribution["weights"],
672
+ ) = self._calculate_distributions(self.suffixes)
673
+ super().prepare()
674
+
675
+ def _get_random_pattern(
676
+ self, pattern_distribution, random_generator: Random
677
+ ) -> str:
678
+ string_to_add = ""
679
+ if pattern_distribution["patterns"]:
680
+ string_to_add = "".join(
681
+ random_generator.choices(
682
+ pattern_distribution["patterns"],
683
+ pattern_distribution["weights"],
684
+ k=pattern_distribution["length"],
685
+ )
686
+ )
687
+ return string_to_add
688
 
689
  def process_value(self, value: Any) -> Any:
690
  assert value is not None, "input value should not be None"
691
  new_value = str(value)
692
+ if self.remove_existing_whitespaces:
693
+ new_value = new_value.strip()
694
+ random_generator = new_random_generator(sub_seed=value)
695
+ prefix = self._get_random_pattern(
696
+ self._prefix_pattern_distribution, random_generator
697
+ )
698
+ suffix = self._get_random_pattern(
699
+ self._suffix_pattern_distribution, random_generator
700
+ )
701
+ return prefix + new_value + suffix
702
 
703
 
704
  class ShuffleFieldValues(FieldOperator):
705
+ """Shuffles a list of values found in a field."""
706
 
707
  def process_value(self, value: Any) -> Any:
708
  res = list(value)
709
+ random_generator = new_random_generator(sub_seed=res)
710
+ random_generator.shuffle(res)
711
  return res
712
 
713
 
 
812
 
813
 
814
  class ZipFieldValues(StreamInstanceOperator):
815
+ """Zips values of multiple fields in a given instance, similar to list(zip(*fields)).
816
+
817
+ The value in each of the specified 'fields' is assumed to be a list. The lists from all 'fields'
818
+ are zipped, and stored into 'to_field'.
819
 
820
+ If 'longest'=False, the length of the zipped result is determined by the shortest input value.
821
+ If 'longest'=True, the length of the zipped result is determined by the longest input, padding shorter
822
+ inputs with None-s.
823
+
824
+ """
825
+
826
+ fields: List[str]
827
  to_field: str
828
  longest: bool = False
829
  use_query: bool = False
 
843
 
844
 
845
  class IndexOf(StreamInstanceOperator):
846
+ """For a given instance, finds the offset of value of field 'index_of', within the value of field 'search_in'."""
847
 
848
  search_in: str
849
  index_of: str
 
860
 
861
 
862
  class TakeByField(StreamInstanceOperator):
863
+ """From field 'field' of a given instance, select the member indexed by field 'index', and store to field 'to_field'."""
864
 
865
  field: str
866
  index: str
 
881
 
882
 
883
  class CopyFields(FieldOperator):
884
+ """Copies values from specified fields to specified fields.
885
 
886
+ Args (of parent class):
887
  field_to_field (Union[List[List], Dict[str, str]]): A list of lists, where each sublist contains the source field and the destination field, or a dictionary mapping source fields to destination fields.
888
+ use_query (bool): Whether to use dpath for accessing fields. Defaults to False.
889
+
890
+ Examples:
891
+ An input instance {"a": 2, "b": 3}, when processed by
892
+ CopyField(field_to_field={"a": "b"}
893
+ would yield {"a": 2, "b": 2}, and when processed by
894
+ CopyField(field_to_field={"a": "c"} would yield
895
+ {"a": 2, "b": 3, "c": 2}
896
+
897
+ with use_query=True, we can also copy inside the field:
898
+ CopyFields(field_to_field={"a/0": "a"}, use_query=True)
899
+ would process instance {"a": [1, 3]} into {"a": 1}
900
+
901
+
902
  """
903
 
904
  def process_value(self, value: Any) -> Any:
 
906
 
907
 
908
  class AddID(StreamInstanceOperator):
909
+ """Stores a unique id value in the designated 'id_field_name' field of the given instance."""
910
+
911
  id_field_name: str = "id"
912
 
913
  def process(
 
921
  """Casts specified fields to specified types.
922
 
923
  Args:
924
+ use_nested_query (bool): Whether to cast nested fields, expressed in dpath. Defaults to False.
925
+ fields (Dict[str, str]): A dictionary mapping field names to the names of the types to cast the fields to.
926
+ e.g: "int", "str", "float", "bool". Basic names of types
927
+ defaults (Dict[str, object]): A dictionary mapping field names to default values for cases of casting failure.
928
+ process_every_value (bool): If true, all fields involved must contain lists, and each value in the list is then casted. Defaults to False.
929
+
930
+ Examples:
931
+ CastFields(
932
+ fields={"a/d": "float", "b": "int"},
933
+ failure_defaults={"a/d": 0.0, "b": 0},
934
+ process_every_value=True,
935
+ use_nested_query=True
936
+ )
937
+ would process the input instance: {"a": {"d": ["half", "0.6", 1, 12]}, "b": ["2"]}
938
+ into {"a": {"d": [0.0, 0.6, 1.0, 12.0]}, "b": [2]}
939
+
940
  """
941
 
 
 
 
 
 
 
942
  fields: Dict[str, str] = field(default_factory=dict)
943
  failure_defaults: Dict[str, object] = field(default_factory=dict)
944
  use_nested_query: bool = False
945
+ process_every_value: bool = False
946
+
947
+ def prepare(self):
948
+ self.types = {"int": int, "float": float, "str": str, "bool": bool}
949
 
950
  def _cast_single(self, value, type, field):
951
  try:
 
958
  return self.failure_defaults[field]
959
 
960
  def _cast_multiple(self, values, type, field):
961
+ return [self._cast_single(value, type, field) for value in values]
962
 
963
  def process(
964
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
965
  ) -> Dict[str, Any]:
966
  for field_name, type in self.fields.items():
967
  value = dict_get(instance, field_name, use_dpath=self.use_nested_query)
968
+ if self.process_every_value:
969
+ assert isinstance(
970
+ value, list
971
+ ), f"'process_every_value' can be set to True only for fields that contain lists, whereas in instance {instance}, the contents of field '{field_name}' is of type '{type(value)}'"
972
  casted_value = self._cast_multiple(value, type, field_name)
973
  else:
974
  casted_value = self._cast_single(value, type, field_name)
 
978
  return instance
979
 
980
 
981
+ class DivideAllFieldsBy(StreamInstanceOperator):
982
+ """Recursively reach down to all fields that are float, and divide each by 'divisor'.
 
 
 
 
 
 
 
 
 
 
983
 
984
+ The given instance is viewed as a tree whose internal nodes are dictionaries and lists, and
985
+ the leaves are either 'float' and then divided, or other basic type, in which case, a ValueError is raised
986
+ if input flag 'strict' is True, or -- left alone, if 'strict' is False.
987
+
988
+ Args:
989
+ divisor (float): the value to divide by
990
+ strict (bool): whether to raise an error upon visiting a leaf that is not float. Defaults to False.
991
+
992
+ Example:
993
+ when instance {"a": 10.0, "b": [2.0, 4.0, 7.0], "c": 5} is processed by operator:
994
+ operator = DivideAllFieldsBy(divisor=2.0)
995
+ the output is: {"a": 5.0, "b": [1.0, 2.0, 3.5], "c": 5}
996
+ If the operator were defined with strict=True, through:
997
+ operator = DivideAllFieldsBy(divisor=2.0, strict=True),
998
+ the processing of the above instance would raise a ValueError, for the integer at "c".
999
+ """
1000
 
 
1001
  divisor: float = 1.0
1002
  strict: bool = False
1003
+
1004
+ def _recursive_divide(self, instance, divisor):
1005
+ if isinstance(instance, dict):
1006
+ for key, value in instance.items():
1007
+ instance[key] = self._recursive_divide(value, divisor)
1008
+ elif isinstance(instance, list):
1009
+ for i, value in enumerate(instance):
1010
+ instance[i] = self._recursive_divide(value, divisor)
1011
+ elif isinstance(instance, float):
1012
+ instance /= divisor
1013
+ elif self.strict:
1014
+ raise ValueError(f"Cannot divide instance of type {type(instance)}")
1015
+ return instance
1016
 
1017
  def process(
1018
  self, instance: Dict[str, Any], stream_name: Optional[str] = None
1019
  ) -> Dict[str, Any]:
1020
+ return self._recursive_divide(instance, self.divisor)
1021
 
1022
 
1023
  class ArtifactFetcherMixin:
 
1041
  """Applies value operators to each instance in a stream based on specified fields.
1042
 
1043
  Args:
1044
+ inputs_fields (List[str]): list of field names, the values in which are to be processed
1045
+ fields_to_treat_as_list (List[str]): sublist of input_fields, each member of this sublist is supposed to contain
1046
+ a list of values, each of which is to be processed.
1047
+ operators_field (str): name of the field that contains the list of names of the operators to be applied,
1048
+ one after the other, for the processing.
1049
  default_operators (List[str]): A list of default operators to be used if no operators are found in the instance.
 
1050
 
1051
+ Example:
1052
+ when instance {"a": 111, "b": 2, "c": ["processors.to_string", "processors.first_character"]} is processed by operator:
1053
+ operator = ApplyOperatorsField(inputs_fields=["a"], operators_field="c", default_operators=["add"]),
1054
+ the resulting instance is: {"a": "1", "b": 2, "c": ["processors.to_string", "processors.first_character"]}
1055
 
1056
+ """
1057
+
1058
+ inputs_fields: List[str]
1059
  operators_field: str
1060
  default_operators: List[str] = None
1061
  fields_to_treat_as_list: List[str] = NonPositionalField(default_factory=list)
 
1067
  if operator_names is None:
1068
  assert (
1069
  self.default_operators is not None
1070
+ ), f"No operators found in field '{self.operators_field}', and no default operators provided."
1071
  operator_names = self.default_operators
1072
 
1073
  if isinstance(operator_names, str):
 
1080
  if field_name in self.fields_to_treat_as_list:
1081
  instance[field_name] = [operator.process(v) for v in value]
1082
  else:
1083
+ instance[field_name] = operator.process(value)
1084
 
1085
  return instance
1086
 
1087
 
1088
+ class FilterByCondition(SingleStreamOperator):
1089
+ """Filters a stream, yielding only instances for which the required values follows the required condition operator.
1090
+
1091
+ Raises an error if a required key is missing.
1092
 
1093
  Args:
1094
+ values (Dict[str, Any]): Values that instances must match using the condition to be included in the output.
1095
+ condition: the name of the desired condition operator between the key and the value in values ("gt", "ge", "lt", "le", "ne", "eq", "in", "not in")
1096
+ error_on_filtered_all (bool, optional): If True, raises an error if all instances are filtered out. Defaults to True.
1097
+
1098
+ Examples:
1099
+ FilterByCondition(values = {"a":4}, condition = "gt") will yield only instances where "a">4
1100
+ FilterByCondition(values = {"a":4}, condition = "le") will yield only instances where "a"<=4
1101
+ FilterByCondition(values = {"a":[4,8]}, condition = "in") will yield only instances where "a" is 4 or 8
1102
+ FilterByCondition(values = {"a":[4,8]}, condition = "not in") will yield only instances where "a" different from 4 or 8
1103
+
1104
  """
1105
 
1106
+ values: Dict[str, Any]
1107
+ condition: str
1108
+ condition_to_func = {
1109
+ "gt": operator.gt,
1110
+ "ge": operator.ge,
1111
+ "lt": operator.lt,
1112
+ "le": operator.le,
1113
+ "eq": operator.eq,
1114
+ "ne": operator.ne,
1115
+ "in": None, # Handled as special case
1116
+ "not in": None, # Handled as special case
1117
+ }
1118
+ error_on_filtered_all: bool = True
1119
 
1120
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1121
+ yielded = False
1122
  for instance in stream:
1123
+ if self._is_required(instance):
1124
+ yielded = True
1125
+ yield instance
1126
+
1127
+ if not yielded and self.error_on_filtered_all:
1128
+ raise RuntimeError(
1129
+ f"{self.__class__.__name__} filtered out every instance in stream '{stream_name}'. If this is intended set error_on_filtered_all=False"
1130
+ )
1131
+
1132
+ def verify(self):
1133
+ if self.condition not in self.condition_to_func:
1134
+ raise ValueError(
1135
+ f"Unsupported condition operator '{self.condition}', supported {list(self.condition_to_func.keys())}"
1136
+ )
1137
+
1138
+ for key, value in self.values.items():
1139
+ if self.condition in ["in", "not it"] and not isinstance(value, list):
1140
+ raise ValueError(
1141
+ f"The filter for key ('{key}') in FilterByCondition with condition '{self.condition}' must be list but is not : '{value}'"
1142
+ )
1143
+ return super().verify()
+
+ def _is_required(self, instance: dict) -> bool:
+ for key, value in self.values.items():
+ if key not in instance:
+ raise ValueError(
+ f"Required filter field ('{key}') in FilterByCondition is not found in {instance}"
+ )
+ if self.condition == "in":
+ if instance[key] not in value:
+ return False
+ elif self.condition == "not in":
+ if instance[key] in value:
+ return False
+ else:
+ func = self.condition_to_func[self.condition]
+ if func is None:
  raise ValueError(
+ f"Function not defined for condition '{self.condition}'"
  )
+ if not func(instance[key], value):
+ return False
+ return True
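+ # A minimal usage sketch (kept as a comment so the module stays side-effect
+ # free; the stream contents are illustrative):
+ #
+ #     ms = MultiStream.from_iterables({"train": [{"a": 3}, {"a": 5}]})
+ #     op = FilterByCondition(values={"a": 4}, condition="gt")
+ #     list(op(ms)["train"])  # -> [{"a": 5}]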
+
+
+ class FilterByQuery(SingleStreamOperator):
+ """Filters a stream, yielding only instances that fulfill a condition specified as a string to be evaluated with Python's eval().
+
+ Raises an error if a field participating in the specified condition is missing from the instance.
+
+ Args:
+ query (str): a condition over fields of the instance, to be processed by Python's eval()
+ error_on_filtered_all (bool, optional): If True, raises an error if all instances are filtered out. Defaults to True.
+
+ Examples:
+ FilterByQuery(query = "a > 4") will yield only instances where "a">4
+ FilterByQuery(query = "a <= 4 and b > 5") will yield only instances where the value of field "a" does not exceed 4 and the value of field "b" is greater than 5
+ FilterByQuery(query = "a in [4, 8]") will yield only instances where "a" is 4 or 8
+ FilterByQuery(query = "a not in [4, 8]") will yield only instances where "a" is neither 4 nor 8
+
+ """
+
+ query: str
+ error_on_filtered_all: bool = True
+
+ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
+ yielded = False
+ for instance in stream:
+ if eval(self.query, None, instance):
+ yielded = True
  yield instance
 
+ if not yielded and self.error_on_filtered_all:
+ raise RuntimeError(
+ f"{self.__class__.__name__} filtered out every instance in stream '{stream_name}'. If this is intended, set error_on_filtered_all=False"
+ )
+
+
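+ # A minimal usage sketch (illustrative; the query is handed to Python's
+ # eval(), so it should come only from trusted configuration):
+ #
+ #     ms = MultiStream.from_iterables({"train": [{"a": 3, "b": 9}, {"a": 5, "b": 2}]})
+ #     op = FilterByQuery(query="a <= 4 and b > 5")
+ #     list(op(ms)["train"])  # -> [{"a": 3, "b": 9}]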
+ class ExecuteQuery(StreamInstanceOperator):
+ """Compute an expression (query), expressed as a string to be evaluated with Python's eval(), over the instance's fields, and store the result in field to_field.
+
+ Raises an error if a field mentioned in the query is missing from the instance.
+
+ Args:
+ query (str): an expression to be evaluated over the fields of the instance
+ to_field (str): the field into which the result is to be stored
+
+ Examples:
+ When instance {"a": 2, "b": 3} is processed by operator
+ ExecuteQuery(query="a+b", to_field = "c")
+ the result is {"a": 2, "b": 3, "c": 5}
+
+ When instance {"a": "hello", "b": "world"} is processed by operator
+ ExecuteQuery(query = "a+' '+b", to_field = "c")
+ the result is {"a": "hello", "b": "world", "c": "hello world"}
+
+ """
+
+ query: str
+ to_field: str
+
+ def process(
+ self, instance: Dict[str, Any], stream_name: Optional[str] = None
+ ) -> Dict[str, Any]:
+ instance[self.to_field] = eval(self.query, None, instance)
+ return instance
+
 
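+ # A minimal usage sketch (illustrative values; the same eval() trust caveat
+ # as in FilterByQuery applies):
+ #
+ #     op = ExecuteQuery(query="a + b", to_field="c")
+ #     op.process({"a": 2, "b": 3})  # -> {"a": 2, "b": 3, "c": 5}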
+ class ExtractMostCommonFieldValues(MultiStreamOperator):
  field: str
  stream_name: str
  overall_top_frequency_percent: Optional[int] = 100
 
 
  Examples:
 
+ ExtractMostCommonFieldValues(stream_name="train", field="label", to_field="classes") - extracts all the unique values of
  field 'label', sorts them by decreasing frequency, and stores the resulting list in field 'classes' of each and
  every instance in all streams.
 
+ ExtractMostCommonFieldValues(stream_name="train", field="labels", to_field="classes", process_every_value=True) -
  in case field 'labels' contains a list of values (rather than a single value) - track the occurrences of all the possible
  value members in these lists, and report the most frequent values.
  if process_every_value=False, track the most frequent whole lists, and report those (as a list of lists) in field
  'to_field' of each instance of all streams.
 
+ ExtractMostCommonFieldValues(stream_name="train", field="label", to_field="classes",overall_top_frequency_percent=80) -
  extracts the most frequent possible values of field 'label' that together cover at least 80% of the instances of stream_name,
  and stores them in field 'classes' of each instance of all streams.
 
+ ExtractMostCommonFieldValues(stream_name="train", field="label", to_field="classes",min_frequency_percent=5) -
  extracts all possible values of field 'label' that each cover at least 5% of the instances.
  Stores these values, sorted by decreasing order of frequency, in field 'classes' of each instance in all streams.
  """
 
  [*ele[0]] if isinstance(ele[0], tuple) else ele[0]
  for ele in values_and_counts
  ]
 
+ addmostcommons = AddFields(fields={self.to_field: values_to_keep})
+ return addmostcommons(multi_stream)
 
 
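+ # A minimal usage sketch (kept as a comment; the stream name and fields are
+ # illustrative). Frequencies are counted over the named stream only, and the
+ # kept values are written into 'to_field' of every instance in all streams:
+ #
+ #     op = ExtractMostCommonFieldValues(
+ #         stream_name="train", field="label", to_field="classes",
+ #         overall_top_frequency_percent=80,
+ #     )
+ #     multi_stream = op(multi_stream)  # every instance now carries "classes"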
+ class ExtractFieldValues(ExtractMostCommonFieldValues):
  def verify(self):
  super().verify()
 
+ def prepare(self):
+ self.overall_top_frequency_percent = 100
+ self.min_frequency_percent = 0
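+ # With overall_top_frequency_percent=100 and min_frequency_percent=0, no value
+ # is dropped by frequency, so ExtractFieldValues collects all unique values of
+ # 'field', still sorted by decreasing frequency.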
 
 
  class Intersect(FieldOperator):
 
  )
 
  def process_value(self, value: Any) -> Any:
+ super().process_value(value)
  if not isinstance(value, list):
  raise ValueError(f"The value in field is not a list but '{value}'")
  return [e for e in value if e in self.allowed_values]
 
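+ # A minimal usage sketch (illustrative field name and values, assuming
+ # Intersect is configured like other FieldOperators with 'field' and the
+ # 'allowed_values' list referenced above):
+ #
+ #     op = Intersect(field="labels", allowed_values=["positive", "negative"])
+ #     op.process_value(["positive", "neutral"])  # -> ["positive"]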
  """Removes elements in a field, which must be a list, using a given list of unallowed values.
 
  Args:
+ unallowed_values (list) - values to be removed.
  """
 
  unallowed_values: List[Any]
 
  stream_unique_values = uniques[stream_name]
  for unique_values in stream_unique_values:
  filtering_values = dict(zip(self.fields, unique_values))
+ filtered_streams = FilterByCondition(
+ values=filtering_values, condition="eq"
  )._process_single_stream(stream)
  filtered_stream_name = (
  stream_name + "_" + nested_tuple_to_string(unique_values)
 
  reversed: bool = False
 
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
+ first_instance = stream.peek()
 
  operators = first_instance.get(self.field, [])
  if isinstance(operators, str):
 
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
  from .metrics import Metric, MetricPipeline, MetricWithConfidenceInterval
 
+ first_instance = stream.peek()
 
  metric_names = first_instance.get(self.metric_field, [])
  if not metric_names:
 
  yield from stream
 
 
  class MergeStreams(MultiStreamOperator):
  """Merges multiple streams into a single stream.
 
 
  class Shuffle(PagedStreamOperator):
  """Shuffles the order of instances in each page of a stream.
 
+ Args (of superclass):
  page_size (int): The size of each page in the stream. Defaults to 1000.
  """
 
+ random_generator: Random = None
+
+ def before_process_multi_stream(self):
+ super().before_process_multi_stream()
+ self.random_generator = new_random_generator(sub_seed="shuffle")
+
  def process(self, page: List[Dict], stream_name: Optional[str] = None) -> Generator:
+ self.random_generator.shuffle(page)
  yield from page
 
 
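+ # Note: shuffling is reproducible across runs, since each page is shuffled with
+ # a dedicated generator derived via new_random_generator(sub_seed="shuffle"),
+ # rather than with the shared global random state.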
  class EncodeLabels(StreamInstanceOperator):
+ """Encode each value encountered in any field in 'fields' into the integers 0,1,...
+
+ Encoding is determined by a str->int map that is built on the fly, as different values are
+ first encountered in the stream, either as list members or as values in single-value fields.
 
  Args:
  fields (List[str]): The fields to encode together.
+
+ Example: applying
+ EncodeLabels(fields = ["a", "b/*"])
+ on input stream = [{"a": "red", "b": ["red", "blue"], "c":"bread"},
+ {"a": "blue", "b": ["green"], "c":"water"}] will yield the
+ output stream = [{'a': 0, 'b': [0, 1], 'c': 'bread'}, {'a': 1, 'b': [2], 'c': 'water'}]
+
+ Note: dpath is applied here, and hence fields that are lists should be included in
+ input 'fields' with the suffix "/*", as in the above example.
+
  """
 
  fields: List[str]
 
 
 
  class StreamRefiner(SingleStreamOperator):
+ """Discard from the input stream all instances beyond the leading 'max_instances' instances.
+
+ Thereby, if the input stream consists of no more than 'max_instances' instances, the resulting stream is the whole input
+ stream. If the input stream consists of more than 'max_instances' instances, the resulting stream consists of exactly
+ the leading 'max_instances' instances of the input stream.
+
+ Args:
+ max_instances (int)
+ apply_to_streams (optional, list(str)): names of streams to refine.
+
+ Examples:
+ when input = [{"a": 1},{"a": 2},{"a": 3},{"a": 4},{"a": 5},{"a": 6}] is fed into
+ StreamRefiner(max_instances=4)
+ the resulting stream is [{"a": 1},{"a": 2},{"a": 3},{"a": 4}]
+ """
+
  max_instances: int = None
+ apply_to_streams: Optional[List[str]] = None
 
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
  if self.max_instances is not None:
 
  class DeterministicBalancer(StreamRefiner):
  """A class used to balance streams deterministically.
 
+ For each instance, a signature is constructed from the values of the instance in the specified input 'fields'.
+ By discarding instances from the input stream, DeterministicBalancer maintains an equal number of instances for all signatures.
+ When 'max_instances' is also specified, DeterministicBalancer keeps the total instance count from exceeding
+ 'max_instances', while discarding as few instances as possible.
+
  Attributes:
+ fields (List[str]): A list of field names to be used in producing the instance's signature.
+ max_instances (Optional, int)
 
  Usage:
+ balancer = DeterministicBalancer(fields=["field1", "field2"], max_instances=200)
  balanced_stream = balancer.process(stream)
+
+ Example:
+ When input [{"a": 1, "b": 1},{"a": 1, "b": 2},{"a": 2},{"a": 3},{"a": 4}] is fed into
+ DeterministicBalancer(fields=["a"])
+ the resulting stream will be: [{"a": 1, "b": 1},{"a": 2},{"a": 3},{"a": 4}]
  """
 
  fields: List[str]
 
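+ # A minimal usage sketch (illustrative): with fields=["a"], instances are
+ # grouped by the value of "a", and each group is truncated to the size of the
+ # smallest group, so all signatures end up equally represented:
+ #
+ #     op = DeterministicBalancer(fields=["a"])
+ #     ms = MultiStream.from_iterables({"train": [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2}]})
+ #     list(op(ms)["train"])  # -> [{"a": 1, "b": 1}, {"a": 2}]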
 
 
  class LengthBalancer(DeterministicBalancer):
+ """Balances by a signature that reflects the total length of the fields' values, quantized into integer segments.
+
+ Args:
+ segments_boundaries (List[int]): distinct integers sorted in increasing order. A given total length is mapped
+ into the index of the smallest boundary that exceeds it, or, if no boundary exceeds it, into one index
+ beyond, namely, the length of segments_boundaries.
+
+ fields (Optional, List[str])
+
+ Example:
+ when input [{"a": [1, 3], "b": 0, "id": 0}, {"a": [1, 3], "b": 0, "id": 1}, {"a": [], "b": "a", "id": 2}] is fed into
+ LengthBalancer(fields=["a"], segments_boundaries=[1])
+ input instances will be counted and balanced across two categories: empty total length (less than 1), and non-empty.
+ """
+
  segments_boundaries: List[int]
+ fields: Optional[List[str]]
 
  def signature(self, instance):
  total_len = 0